{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Classification: finetune\n", "========================\n", "\n", "In this notebook we illustrate how to re-train the models on user's data. Specifically, we remap the last layer of the model to the desired classes, without modifying the model's internal weights; this operation is called finetuning and is not as computationally intensive as re-training the full model. \n", "Regardless, this module greatly benefits from GPU compute, as long as the GPU(s) support CUDA and `nvidia-smi` is configured correctly. \n", "\n", "This module uses two scripts: `classification/main_prepare_learning_sets.py` for preparing the data for training, and `classification/main_classification_finetune.py`, that need to be executed in that order. \n", "\n", "The first step is to import the necessary libraries for `main_prepare_learning_sets.py`: " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import argparse\n", "import shutil\n", "import sys\n", "import os\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import yaml\n", "\n", "from mzbsuite.utils import cfg_to_arguments" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We need to declare the running parameters for the script, " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'glob_random_seed': 222,\n", " 'glob_root_folder': '/Users/mivolpi/Projects/BioDetect/mzb-workflow',\n", " 'glob_blobs_folder': '/Users/mivolpi/Projects/BioDetect/mzb-workflow/data/derived/blobs/',\n", " 'glob_local_format': 'pdf',\n", " 'model_logger': 'wandb',\n", " 'impa_image_format': 'jpg',\n", " 'impa_clip_areas': [2700, 4700, -1, -1],\n", " 'impa_area_threshold': 5000,\n", " 'impa_gaussian_blur': [21, 21],\n", " 'impa_gaussian_blur_passes': 3,\n", " 'impa_adaptive_threshold_block_size': 351,\n", " 'impa_mask_postprocess_kernel': [11, 11],\n", " 'impa_mask_postprocess_passes': 5,\n", " 'impa_bounding_box_buffer': 200,\n", " 'impa_save_clips_plus_features': True,\n", " 'lset_class_cut': 'order',\n", " 'lset_val_size': 0.1,\n", " 'trcl_learning_rate': 0.0001,\n", " 'trcl_batch_size': 8,\n", " 'trcl_weight_decay': 0,\n", " 'trcl_step_size_decay': 5,\n", " 'trcl_number_epochs': 75,\n", " 'trcl_save_topk': 1,\n", " 'trcl_num_classes': 8,\n", " 'trcl_model_pretrarch': 'convnext-small',\n", " 'trcl_num_workers': 16,\n", " 'trcl_wandb_project_name': 'mzb-classifiers',\n", " 'trcl_logger': 'wandb',\n", " 'trsk_learning_rate': 0.001,\n", " 'trsk_batch_size': 32,\n", " 'trsk_weight_decay': 0,\n", " 'trsk_step_size_decay': 25,\n", " 'trsk_number_epochs': 400,\n", " 'trsk_save_topk': 1,\n", " 'trsk_num_classes': 2,\n", " 'trsk_model_pretrarch': 'mit_b2',\n", " 'trsk_num_workers': 16,\n", " 'trsk_wandb_project_name': 'mzb-skeletons',\n", " 'trsk_logger': 'wandb',\n", " 'infe_model_ckpt': 'last',\n", " 'infe_num_classes': 8,\n", " 'infe_image_glob': '*_rgb.jpg',\n", " 'skel_class_exclude': 'errors',\n", " 'skel_conv_rate': 131.6625,\n", " 'skel_label_thickness': 3,\n", " 'skel_label_buffer_on_preds': 25,\n", " 'skel_label_clip_with_mask': False,\n", " 'trcl_gpu_ids': None}" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ROOT_DIR = Path(\"/Users/mivolpi/Projects/BioDetect/mzb-workflow\")\n", "MODEL=\"convnext-small-vtest-1\"\n", "LSET_FOLD=Path(f\"{ROOT_DIR}/data/mzb_example_data\")\n", "\n", "arguments = {\n", " \"input_dir\": LSET_FOLD / \"curated_learning_sets\", \n", " \"taxonomy_file\": ROOT_DIR.absolute() / \"data/mzb_example_data/MZB_taxonomy.csv\", \n", " \"output_dir\": ROOT_DIR.absolute() / \"data/mzb_example_data/agg_lsets\", \n", " \"save_model\": ROOT_DIR.absolute() / f\"models/mzb-classification-models/{MODEL}\", \n", " \"config_file\": ROOT_DIR.absolute() / \"configs/mzb_example_config.yaml\"\n", "}\n", "\n", "with open(str(arguments[\"config_file\"]), \"r\") as f:\n", " cfg = yaml.load(f, Loader=yaml.FullLoader)\n", "\n", "cfg[\"trcl_gpu_ids\"] = None # this sets the number of available GPUs to zero, since this part of the module doesn't benefit from GPU compute. \n", "cfg" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert these parameters to a dictionary: " ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'glob_random_seed': 222, 'glob_root_folder': '/Users/mivolpi/Projects/BioDetect/mzb-workflow', 'glob_blobs_folder': '/Users/mivolpi/Projects/BioDetect/mzb-workflow/data/derived/blobs/', 'glob_local_format': 'pdf', 'model_logger': 'wandb', 'impa_image_format': 'jpg', 'impa_clip_areas': [2700, 4700, -1, -1], 'impa_area_threshold': 5000, 'impa_gaussian_blur': [21, 21], 'impa_gaussian_blur_passes': 3, 'impa_adaptive_threshold_block_size': 351, 'impa_mask_postprocess_kernel': [11, 11], 'impa_mask_postprocess_passes': 5, 'impa_bounding_box_buffer': 200, 'impa_save_clips_plus_features': True, 'lset_class_cut': 'order', 'lset_val_size': 0.1, 'trcl_learning_rate': 0.0001, 'trcl_batch_size': 8, 'trcl_weight_decay': 0, 'trcl_step_size_decay': 5, 'trcl_number_epochs': 75, 'trcl_save_topk': 1, 'trcl_num_classes': 8, 'trcl_model_pretrarch': 'convnext-small', 'trcl_num_workers': 16, 'trcl_wandb_project_name': 'mzb-classifiers', 'trcl_logger': 'wandb', 'trsk_learning_rate': 0.001, 'trsk_batch_size': 32, 'trsk_weight_decay': 0, 'trsk_step_size_decay': 25, 'trsk_number_epochs': 400, 'trsk_save_topk': 1, 'trsk_num_classes': 2, 'trsk_model_pretrarch': 'mit_b2', 'trsk_num_workers': 16, 'trsk_wandb_project_name': 'mzb-skeletons', 'trsk_logger': 'wandb', 'infe_model_ckpt': 'last', 'infe_num_classes': 8, 'infe_image_glob': '*_rgb.jpg', 'skel_class_exclude': 'errors', 'skel_conv_rate': 131.6625, 'skel_label_thickness': 3, 'skel_label_buffer_on_preds': 25, 'skel_label_clip_with_mask': False, 'trcl_gpu_ids': None}\n" ] } ], "source": [ "# Transforms configurations dicts to argparse arguments\n", "args = cfg_to_arguments(arguments)\n", "cfg = cfg_to_arguments(cfg)\n", "print(str(cfg))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We next check whether the target directories already exist, and if not create them: " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "np.random.seed(cfg.glob_random_seed)\n", "\n", "# root of raw clip data\n", "root_data = Path(args.input_dir)\n", "outdir = Path(args.output_dir)\n", "outdir.mkdir(parents=True, exist_ok=True)\n", "\n", "# target folders definition\n", "target_trn = outdir / \"trn_set/\"\n", "target_val = outdir / \"val_set/\"\n", "\n", "# check if trn_set and val_set subfolders exist. If so, then interrupt the script.\n", "# This is to make sure that no overwriting happens; prompt the user that they need to specify a different output directory.\n", "if target_trn.exists() or target_val.exists():\n", " raise ValueError(\n", " # print in red and back to normal\n", " f\"\\033[91m Output directory {outdir} already exists. Please specify a different output directory.\\033[0m\"\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now use the specified taxonomic rank in the `lset_class_cut` parameter in the configuration file to cut the provided phylogenetic tree, and reorganize the images in directories corresponding to the this rank. \n", "See the documentation for further details. " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cutting phylogenetic tree at: order\n" ] } ], "source": [ "\n", "# make dictionary to recode: key is current classification, value is target reclassification.\n", "# forward fill to get last valid entry and subset to desired column\n", "mzb_taxonomy = pd.read_csv(Path(args.taxonomy_file))\n", "if \"Unnamed: 0\" in mzb_taxonomy.columns:\n", " mzb_taxonomy = mzb_taxonomy.drop(columns=[\"Unnamed: 0\"])\n", "mzb_taxonomy = mzb_taxonomy.ffill(axis=1)\n", "recode_order = dict(\n", " zip(mzb_taxonomy[\"query\"], mzb_taxonomy[cfg.lset_class_cut].str.lower())\n", ")\n", "\n", "print(f\"Cutting phylogenetic tree at: {cfg.lset_class_cut}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we copy the images over into the new folder structure according to the taxonomy: " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Move files to target folders for all files in the curated learning set\n", "for s_fo in recode_order:\n", " target_folder = target_trn / recode_order[s_fo]\n", " target_folder.mkdir(exist_ok=True, parents=True)\n", "\n", " for file in list((root_data / s_fo).glob(\"*\")):\n", " shutil.copy(file, target_folder)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, we split the validation set based on the proportion of total images specified by the `lset_val_size` parameter in the configuration file. We recommend at least 10% of the total images for each class. " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/trn_set/errors/32_hf2_protonemura_01_clip_11_rgb.png into /Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/val_set/errors\n", "/Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/trn_set/errors/31_b1_isoperla_01_clip_11_rgb.png into /Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/val_set/errors\n", "/Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/trn_set/errors/32_hf2_plecoptera_01_clip_5_rgb.png into /Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/val_set/errors\n" ] } ], "source": [ "# move out the validation set\n", "# make a small val set, 10% or 1 file, what is possible...\n", "size = cfg.lset_val_size\n", "trn_folds = [a.name for a in sorted(list(target_trn.glob(\"*\")))]\n", "\n", "for s_fo in trn_folds:\n", " target_folder = target_val / s_fo\n", " target_folder.mkdir(exist_ok=True, parents=True)\n", "\n", " list_class = list((target_trn / s_fo).glob(\"*\"))\n", " n_val_sam = np.max((1, np.ceil(0.1 * len(list_class))))\n", " \n", " val_files = np.random.choice(list_class, int(n_val_sam))\n", "\n", " for file in val_files:\n", " try:\n", " shutil.move(str(file), target_folder)\n", " except:\n", " print(f\"{str(file)} into {target_folder}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we have the training dataset ready for model training, with a training set and a validation set containing the same classes. \n", "\n", "We move on to the model finetuning, using the script `classification/main_classification_finetune.py`. First we import some additional libraries from PyTorch; " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import pytorch_lightning as pl\n", "from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger\n", "from pytorch_lightning.strategies.ddp import DDPStrategy\n", "\n", "from mzbsuite.classification.mzb_classification_pilmodel import MZBModel\n", "from mzbsuite.utils import cfg_to_arguments, SaveLogCallback\n", "\n", "# Set the thread layer used by MKL\n", "os.environ[\"MKL_THREADING_LAYER\"] = \"GNU\" # this time we set the GPU computing layer to active" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before we can launch the training, we need to define a few special parameters, relating to finding the specified monitoring the model training progress over time: " ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# args.output_dir\n", "args.input_dir_tr = args.output_dir" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Define checkpoints callbacks\n", "# best model on validation\n", "best_val_cb = pl.callbacks.ModelCheckpoint(\n", " dirpath=args.save_model,\n", " filename=\"best-val-{epoch}-{step}-{val_loss:.1f}\",\n", " monitor=\"val_loss\",\n", " mode=\"min\",\n", " save_top_k=cfg.trcl_save_topk,\n", ")\n", "\n", "# latest model in training\n", "last_mod_cb = pl.callbacks.ModelCheckpoint(\n", " dirpath=args.save_model,\n", " filename=\"last-{step}\",\n", " every_n_train_steps=50,\n", " save_top_k=cfg.trcl_save_topk,\n", ")\n", "\n", "# Define progress bar callback\n", "pbar_cb = pl.callbacks.progress.TQDMProgressBar(refresh_rate=5)\n", "\n", "# Define logger callback to log training date\n", "trdatelog = SaveLogCallback(model_folder=args.save_model)\n", "\n", "# Define model from config\n", "model = MZBModel(\n", " data_dir=args.input_dir_tr,\n", " pretrained_network=cfg.trcl_model_pretrarch,\n", " learning_rate=cfg.trcl_learning_rate,\n", " batch_size=cfg.trcl_batch_size,\n", " weight_decay=cfg.trcl_weight_decay,\n", " num_workers_loader=cfg.trcl_num_workers,\n", " step_size_decay=cfg.trcl_step_size_decay,\n", " num_classes=cfg.trcl_num_classes,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now check wether a pre-trained model is available, and if there is load the weights from that model. Note that logging model progress requires either a [Weights & Biases](https://wandb.ai/) or [Tensorflow](https://www.tensorflow.org/) account. See the documentation for more details. " ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mmivolpi\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.\n" ] }, { "data": { "text/html": [ "Tracking run with wandb version 0.19.4" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run data is saved locally in ./wandb/run-20250122_162849-ed2na7k7" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Syncing run classifier-convnext-small to Weights & Biases (docs)
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View project at https://wandb.ai/mivolpi/mzb-classifiers" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ " View run at https://wandb.ai/mivolpi/mzb-classifiers/runs/ed2na7k7" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "\u001b[34m\u001b[1mwandb\u001b[0m: logging graph, to disable use `wandb.watch(log_graph=False)`\n" ] } ], "source": [ "# Check if there is a model to load, if there is, load it and train from there\n", "if args.save_model.is_dir():\n", " if args.verbose:\n", " print(f\"Loading model from {args.save_model}\")\n", " try:\n", " fmodel = list(args.save_model.glob(\"last-*.ckpt\"))[0]\n", " except:\n", " print(\"No last-* model in folder, loading best model\")\n", " fmodel = list(\n", " args.save_model.glob(\"best-val-epoch=*-step=*-val_loss=*.*.ckpt\")\n", " )[-1]\n", "\n", " model = model.load_from_checkpoint(fmodel)\n", "\n", "# Define logger and name of run\n", "name_run = f\"classifier-{cfg.trcl_model_pretrarch}\" # f\"{model.pretrained_network}\"\n", "cbacks = [pbar_cb, best_val_cb, last_mod_cb, trdatelog]\n", "\n", "# Define logger, and use either wandb or tensorboard\n", "if cfg.trcl_logger == \"wandb\":\n", " logger = WandbLogger(\n", " project=cfg.trcl_wandb_project_name, name=name_run if name_run else None\n", " )\n", " logger.watch(model, log=\"all\")\n", "\n", "elif cfg.trcl_logger == \"tensorboard\":\n", " logger = TensorBoardLogger(\n", " save_dir=args.save_model,\n", " name=name_run if name_run else None,\n", " log_graph=True,\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We are now finally ready to train our model! " ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/mivolpi/micromamba/envs/str-mzb/lib/python3.10/site-packages/lightning_fabric/connector.py:572: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!\n", "/Users/mivolpi/micromamba/envs/str-mzb/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:513: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.\n", "Using bfloat16 Automatic Mixed Precision (AMP)\n", "GPU available: False, used: False\n", "TPU available: False, using: 0 TPU cores\n", "HPU available: False, using: 0 HPUs\n", "Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n", "----------------------------------------------------------------------------------------------------\n", "distributed_backend=gloo\n", "All distributed processes registered. Starting with 1 processes\n", "----------------------------------------------------------------------------------------------------\n", "\n", "[W122 16:28:50.686888000 ProcessGroupGloo.cpp:745] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n", "\n", " | Name | Type | Params | Mode \n", "-------------------------------------------------------\n", "0 | model | ConvNeXt | 49.5 M | train\n", "1 | accuracy | MulticlassF1Score | 0 | train\n", "-------------------------------------------------------\n", "6.2 K Trainable params\n", "49.5 M Non-trainable params\n", "49.5 M Total params\n", "197.843 Total estimated model params size (MB)\n", "384 Modules in train mode\n", "0 Modules in eval mode\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "696c367f6a60492d8d4eaa73b5283101", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Sanity Checking: | | 0/? [00:00 46\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstrategy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlauncher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlaunch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrainer_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrainer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trainer_fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py:144\u001b[0m, in \u001b[0;36m_MultiProcessingLauncher.launch\u001b[0;34m(self, function, trainer, *args, **kwargs)\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprocs \u001b[38;5;241m=\u001b[39m process_context\u001b[38;5;241m.\u001b[39mprocesses\n\u001b[0;32m--> 144\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mprocess_context\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n", "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/site-packages/torch/multiprocessing/spawn.py:132\u001b[0m, in \u001b[0;36mProcessContext.join\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;66;03m# Wait for any process to fail or all of them to succeed.\u001b[39;00m\n\u001b[0;32m--> 132\u001b[0m ready \u001b[38;5;241m=\u001b[39m \u001b[43mmultiprocessing\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconnection\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 133\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msentinels\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 134\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 135\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 137\u001b[0m error_index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/multiprocessing/connection.py:931\u001b[0m, in \u001b[0;36mwait\u001b[0;34m(object_list, timeout)\u001b[0m\n\u001b[1;32m 930\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 931\u001b[0m ready \u001b[38;5;241m=\u001b[39m \u001b[43mselector\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 932\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ready:\n", "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/selectors.py:416\u001b[0m, in \u001b[0;36m_PollLikeSelector.select\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 415\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 416\u001b[0m fd_event_list \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_selector\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpoll\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 417\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mInterruptedError\u001b[39;00m:\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[12], line 15\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# instantiate trainer and train\u001b[39;00m\n\u001b[1;32m 4\u001b[0m trainer \u001b[38;5;241m=\u001b[39m pl\u001b[38;5;241m.\u001b[39mTrainer(\n\u001b[1;32m 5\u001b[0m accelerator\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# cfg.trcl_num_gpus outdated\u001b[39;00m\n\u001b[1;32m 6\u001b[0m max_epochs\u001b[38;5;241m=\u001b[39mcfg\u001b[38;5;241m.\u001b[39mtrcl_number_epochs,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# profiler=\"simple\",\u001b[39;00m\n\u001b[1;32m 13\u001b[0m )\n\u001b[0;32m---> 15\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:539\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m 537\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m=\u001b[39m TrainerStatus\u001b[38;5;241m.\u001b[39mRUNNING\n\u001b[1;32m 538\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 539\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 540\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m 541\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:64\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(launcher, _SubprocessScriptLauncher):\n\u001b[1;32m 63\u001b[0m launcher\u001b[38;5;241m.\u001b[39mkill(_get_sigkill_signal())\n\u001b[0;32m---> 64\u001b[0m \u001b[43mexit\u001b[49m(\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 66\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exception:\n\u001b[1;32m 67\u001b[0m _interrupt(trainer, exception)\n", "\u001b[0;31mNameError\u001b[0m: name 'exit' is not defined" ] } ], "source": [ "\n", "# instantiate trainer and train\n", "trainer = pl.Trainer(\n", " accelerator=\"auto\", # cfg.trcl_num_gpus outdated\n", " max_epochs=cfg.trcl_number_epochs,\n", " strategy=\"ddp_notebook\",\n", " precision=16,\n", " callbacks=cbacks,\n", " logger=logger,\n", " log_every_n_steps=1\n", " # profiler=\"simple\",\n", ")\n", "\n", "trainer.fit(model)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 2 }