{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Classification: finetune\n",
    "========================\n",
    "\n",
    "In this notebook we illustrate how to re-train the models on user's data. Specifically, we remap the last layer of the model to the desired classes, without modifying the model's internal weights; this operation is called finetuning and is not as computationally intensive as re-training the full model. \n",
    "Regardless, this module greatly benefits from GPU compute, as long as the GPU(s) support CUDA and `nvidia-smi` is configured correctly. \n",
    "\n",
    "This module uses two scripts: `classification/main_prepare_learning_sets.py` for preparing the data for training, and `classification/main_classification_finetune.py`, that need to be executed in that order. \n",
    "\n",
    "The first step is to import the necessary libraries for `main_prepare_learning_sets.py`: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import argparse\n",
    "import shutil\n",
    "import sys\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import yaml\n",
    "\n",
    "from mzbsuite.utils import cfg_to_arguments"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We need to declare the running parameters for the script, "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'glob_random_seed': 222,\n",
       " 'glob_root_folder': '/Users/mivolpi/Projects/BioDetect/mzb-workflow',\n",
       " 'glob_blobs_folder': '/Users/mivolpi/Projects/BioDetect/mzb-workflow/data/derived/blobs/',\n",
       " 'glob_local_format': 'pdf',\n",
       " 'model_logger': 'wandb',\n",
       " 'impa_image_format': 'jpg',\n",
       " 'impa_clip_areas': [2700, 4700, -1, -1],\n",
       " 'impa_area_threshold': 5000,\n",
       " 'impa_gaussian_blur': [21, 21],\n",
       " 'impa_gaussian_blur_passes': 3,\n",
       " 'impa_adaptive_threshold_block_size': 351,\n",
       " 'impa_mask_postprocess_kernel': [11, 11],\n",
       " 'impa_mask_postprocess_passes': 5,\n",
       " 'impa_bounding_box_buffer': 200,\n",
       " 'impa_save_clips_plus_features': True,\n",
       " 'lset_class_cut': 'order',\n",
       " 'lset_val_size': 0.1,\n",
       " 'trcl_learning_rate': 0.0001,\n",
       " 'trcl_batch_size': 8,\n",
       " 'trcl_weight_decay': 0,\n",
       " 'trcl_step_size_decay': 5,\n",
       " 'trcl_number_epochs': 75,\n",
       " 'trcl_save_topk': 1,\n",
       " 'trcl_num_classes': 8,\n",
       " 'trcl_model_pretrarch': 'convnext-small',\n",
       " 'trcl_num_workers': 16,\n",
       " 'trcl_wandb_project_name': 'mzb-classifiers',\n",
       " 'trcl_logger': 'wandb',\n",
       " 'trsk_learning_rate': 0.001,\n",
       " 'trsk_batch_size': 32,\n",
       " 'trsk_weight_decay': 0,\n",
       " 'trsk_step_size_decay': 25,\n",
       " 'trsk_number_epochs': 400,\n",
       " 'trsk_save_topk': 1,\n",
       " 'trsk_num_classes': 2,\n",
       " 'trsk_model_pretrarch': 'mit_b2',\n",
       " 'trsk_num_workers': 16,\n",
       " 'trsk_wandb_project_name': 'mzb-skeletons',\n",
       " 'trsk_logger': 'wandb',\n",
       " 'infe_model_ckpt': 'last',\n",
       " 'infe_num_classes': 8,\n",
       " 'infe_image_glob': '*_rgb.jpg',\n",
       " 'skel_class_exclude': 'errors',\n",
       " 'skel_conv_rate': 131.6625,\n",
       " 'skel_label_thickness': 3,\n",
       " 'skel_label_buffer_on_preds': 25,\n",
       " 'skel_label_clip_with_mask': False,\n",
       " 'trcl_gpu_ids': None}"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ROOT_DIR = Path(\"/Users/mivolpi/Projects/BioDetect/mzb-workflow\")\n",
    "MODEL=\"convnext-small-vtest-1\"\n",
    "LSET_FOLD=Path(f\"{ROOT_DIR}/data/mzb_example_data\")\n",
    "\n",
    "arguments = {\n",
    "    \"input_dir\": LSET_FOLD / \"curated_learning_sets\", \n",
    "    \"taxonomy_file\": ROOT_DIR.absolute() / \"data/mzb_example_data/MZB_taxonomy.csv\", \n",
    "    \"output_dir\": ROOT_DIR.absolute() / \"data/mzb_example_data/agg_lsets\", \n",
    "    \"save_model\": ROOT_DIR.absolute() / f\"models/mzb-classification-models/{MODEL}\", \n",
    "    \"config_file\": ROOT_DIR.absolute() / \"configs/mzb_example_config.yaml\"\n",
    "}\n",
    "\n",
    "with open(str(arguments[\"config_file\"]), \"r\") as f:\n",
    "    cfg = yaml.load(f, Loader=yaml.FullLoader)\n",
    "\n",
    "cfg[\"trcl_gpu_ids\"] = None # this sets the number of available GPUs to zero, since this part of the module doesn't benefit from GPU compute. \n",
    "cfg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert these parameters to a dictionary: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'glob_random_seed': 222, 'glob_root_folder': '/Users/mivolpi/Projects/BioDetect/mzb-workflow', 'glob_blobs_folder': '/Users/mivolpi/Projects/BioDetect/mzb-workflow/data/derived/blobs/', 'glob_local_format': 'pdf', 'model_logger': 'wandb', 'impa_image_format': 'jpg', 'impa_clip_areas': [2700, 4700, -1, -1], 'impa_area_threshold': 5000, 'impa_gaussian_blur': [21, 21], 'impa_gaussian_blur_passes': 3, 'impa_adaptive_threshold_block_size': 351, 'impa_mask_postprocess_kernel': [11, 11], 'impa_mask_postprocess_passes': 5, 'impa_bounding_box_buffer': 200, 'impa_save_clips_plus_features': True, 'lset_class_cut': 'order', 'lset_val_size': 0.1, 'trcl_learning_rate': 0.0001, 'trcl_batch_size': 8, 'trcl_weight_decay': 0, 'trcl_step_size_decay': 5, 'trcl_number_epochs': 75, 'trcl_save_topk': 1, 'trcl_num_classes': 8, 'trcl_model_pretrarch': 'convnext-small', 'trcl_num_workers': 16, 'trcl_wandb_project_name': 'mzb-classifiers', 'trcl_logger': 'wandb', 'trsk_learning_rate': 0.001, 'trsk_batch_size': 32, 'trsk_weight_decay': 0, 'trsk_step_size_decay': 25, 'trsk_number_epochs': 400, 'trsk_save_topk': 1, 'trsk_num_classes': 2, 'trsk_model_pretrarch': 'mit_b2', 'trsk_num_workers': 16, 'trsk_wandb_project_name': 'mzb-skeletons', 'trsk_logger': 'wandb', 'infe_model_ckpt': 'last', 'infe_num_classes': 8, 'infe_image_glob': '*_rgb.jpg', 'skel_class_exclude': 'errors', 'skel_conv_rate': 131.6625, 'skel_label_thickness': 3, 'skel_label_buffer_on_preds': 25, 'skel_label_clip_with_mask': False, 'trcl_gpu_ids': None}\n"
     ]
    }
   ],
   "source": [
    "# Transforms configurations dicts to argparse arguments\n",
    "args = cfg_to_arguments(arguments)\n",
    "cfg = cfg_to_arguments(cfg)\n",
    "print(str(cfg))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We next check whether the target directories already exist, and if not create them: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(cfg.glob_random_seed)\n",
    "\n",
    "# root of raw clip data\n",
    "root_data = Path(args.input_dir)\n",
    "outdir = Path(args.output_dir)\n",
    "outdir.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# target folders definition\n",
    "target_trn = outdir / \"trn_set/\"\n",
    "target_val = outdir / \"val_set/\"\n",
    "\n",
    "# check if trn_set and val_set subfolders exist. If so, then interrupt the script.\n",
    "# This is to make sure that no overwriting happens; prompt the user that they need to specify a different output directory.\n",
    "if target_trn.exists() or target_val.exists():\n",
    "    raise ValueError(\n",
    "        # print in red and back to normal\n",
    "        f\"\\033[91m Output directory {outdir} already exists. Please specify a different output directory.\\033[0m\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We now use the specified taxonomic rank in the `lset_class_cut` parameter in the configuration file to cut the provided phylogenetic tree, and reorganize the images in directories corresponding to the this rank. \n",
    "See the documentation for further details. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cutting phylogenetic tree at: order\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# make dictionary to recode: key is current classification, value is target reclassification.\n",
    "# forward fill to get last valid entry and subset to desired column\n",
    "mzb_taxonomy = pd.read_csv(Path(args.taxonomy_file))\n",
    "if \"Unnamed: 0\" in mzb_taxonomy.columns:\n",
    "    mzb_taxonomy = mzb_taxonomy.drop(columns=[\"Unnamed: 0\"])\n",
    "mzb_taxonomy = mzb_taxonomy.ffill(axis=1)\n",
    "recode_order = dict(\n",
    "    zip(mzb_taxonomy[\"query\"], mzb_taxonomy[cfg.lset_class_cut].str.lower())\n",
    ")\n",
    "\n",
    "print(f\"Cutting phylogenetic tree at: {cfg.lset_class_cut}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we copy the images over into the new folder structure according to the taxonomy: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Move files to target folders for all files in the curated learning set\n",
    "for s_fo in recode_order:\n",
    "    target_folder = target_trn / recode_order[s_fo]\n",
    "    target_folder.mkdir(exist_ok=True, parents=True)\n",
    "\n",
    "    for file in list((root_data / s_fo).glob(\"*\")):\n",
    "        shutil.copy(file, target_folder)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we split the validation set based on the proportion of total images specified by the `lset_val_size` parameter in the configuration file. We recommend at least 10% of the total images for each class. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/trn_set/errors/32_hf2_protonemura_01_clip_11_rgb.png into /Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/val_set/errors\n",
      "/Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/trn_set/errors/31_b1_isoperla_01_clip_11_rgb.png into /Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/val_set/errors\n",
      "/Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/trn_set/errors/32_hf2_plecoptera_01_clip_5_rgb.png into /Users/mivolpi/Projects/BioDetect/mzb-workflow/data/mzb_example_data/agg_lsets/val_set/errors\n"
     ]
    }
   ],
   "source": [
    "# move out the validation set\n",
    "# make a small val set, 10% or 1 file, what is possible...\n",
    "size = cfg.lset_val_size\n",
    "trn_folds = [a.name for a in sorted(list(target_trn.glob(\"*\")))]\n",
    "\n",
    "for s_fo in trn_folds:\n",
    "    target_folder = target_val / s_fo\n",
    "    target_folder.mkdir(exist_ok=True, parents=True)\n",
    "\n",
    "    list_class = list((target_trn / s_fo).glob(\"*\"))\n",
    "    n_val_sam = np.max((1, np.ceil(0.1 * len(list_class))))\n",
    "    \n",
    "    val_files = np.random.choice(list_class, int(n_val_sam))\n",
    "\n",
    "    for file in val_files:\n",
    "        try:\n",
    "            shutil.move(str(file), target_folder)\n",
    "        except:\n",
    "            print(f\"{str(file)} into {target_folder}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we have the training dataset ready for model training, with a training set and a validation set containing the same classes. \n",
    "\n",
    "We move on to the model finetuning, using the script `classification/main_classification_finetune.py`. First we import some additional libraries from PyTorch; "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import pytorch_lightning as pl\n",
    "from pytorch_lightning.loggers import WandbLogger, TensorBoardLogger\n",
    "from pytorch_lightning.strategies.ddp import DDPStrategy\n",
    "\n",
    "from mzbsuite.classification.mzb_classification_pilmodel import MZBModel\n",
    "from mzbsuite.utils import cfg_to_arguments, SaveLogCallback\n",
    "\n",
    "# Set the thread layer used by MKL\n",
    "os.environ[\"MKL_THREADING_LAYER\"] = \"GNU\" # this time we set the GPU computing layer to active"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before we can launch the training, we need to define a few special parameters, relating to finding the specified monitoring the model training progress over time:  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# args.output_dir\n",
    "args.input_dir_tr = args.output_dir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define checkpoints callbacks\n",
    "# best model on validation\n",
    "best_val_cb = pl.callbacks.ModelCheckpoint(\n",
    "    dirpath=args.save_model,\n",
    "    filename=\"best-val-{epoch}-{step}-{val_loss:.1f}\",\n",
    "    monitor=\"val_loss\",\n",
    "    mode=\"min\",\n",
    "    save_top_k=cfg.trcl_save_topk,\n",
    ")\n",
    "\n",
    "# latest model in training\n",
    "last_mod_cb = pl.callbacks.ModelCheckpoint(\n",
    "    dirpath=args.save_model,\n",
    "    filename=\"last-{step}\",\n",
    "    every_n_train_steps=50,\n",
    "    save_top_k=cfg.trcl_save_topk,\n",
    ")\n",
    "\n",
    "# Define progress bar callback\n",
    "pbar_cb = pl.callbacks.progress.TQDMProgressBar(refresh_rate=5)\n",
    "\n",
    "# Define logger callback to log training date\n",
    "trdatelog = SaveLogCallback(model_folder=args.save_model)\n",
    "\n",
    "# Define model from config\n",
    "model = MZBModel(\n",
    "    data_dir=args.input_dir_tr,\n",
    "    pretrained_network=cfg.trcl_model_pretrarch,\n",
    "    learning_rate=cfg.trcl_learning_rate,\n",
    "    batch_size=cfg.trcl_batch_size,\n",
    "    weight_decay=cfg.trcl_weight_decay,\n",
    "    num_workers_loader=cfg.trcl_num_workers,\n",
    "    step_size_decay=cfg.trcl_step_size_decay,\n",
    "    num_classes=cfg.trcl_num_classes,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We now check wether a pre-trained model is available, and if there is load the weights from that model. Note that logging model progress requires either a [Weights & Biases](https://wandb.ai/) or [Tensorflow](https://www.tensorflow.org/) account. See the documentation for more details. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mmivolpi\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
      "\u001b[34m\u001b[1mwandb\u001b[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "Tracking run with wandb version 0.19.4"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Run data is saved locally in <code>./wandb/run-20250122_162849-ed2na7k7</code>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Syncing run <strong><a href='https://wandb.ai/mivolpi/mzb-classifiers/runs/ed2na7k7' target=\"_blank\">classifier-convnext-small</a></strong> to <a href='https://wandb.ai/mivolpi/mzb-classifiers' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/developer-guide' target=\"_blank\">docs</a>)<br>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       " View project at <a href='https://wandb.ai/mivolpi/mzb-classifiers' target=\"_blank\">https://wandb.ai/mivolpi/mzb-classifiers</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       " View run at <a href='https://wandb.ai/mivolpi/mzb-classifiers/runs/ed2na7k7' target=\"_blank\">https://wandb.ai/mivolpi/mzb-classifiers/runs/ed2na7k7</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\u001b[34m\u001b[1mwandb\u001b[0m: logging graph, to disable use `wandb.watch(log_graph=False)`\n"
     ]
    }
   ],
   "source": [
    "# Check if there is a model to load, if there is, load it and train from there\n",
    "if args.save_model.is_dir():\n",
    "    if args.verbose:\n",
    "        print(f\"Loading model from {args.save_model}\")\n",
    "    try:\n",
    "        fmodel = list(args.save_model.glob(\"last-*.ckpt\"))[0]\n",
    "    except:\n",
    "        print(\"No last-* model in folder, loading best model\")\n",
    "        fmodel = list(\n",
    "            args.save_model.glob(\"best-val-epoch=*-step=*-val_loss=*.*.ckpt\")\n",
    "        )[-1]\n",
    "\n",
    "    model = model.load_from_checkpoint(fmodel)\n",
    "\n",
    "# Define logger and name of run\n",
    "name_run = f\"classifier-{cfg.trcl_model_pretrarch}\"  # f\"{model.pretrained_network}\"\n",
    "cbacks = [pbar_cb, best_val_cb, last_mod_cb, trdatelog]\n",
    "\n",
    "# Define logger, and use either wandb or tensorboard\n",
    "if cfg.trcl_logger == \"wandb\":\n",
    "    logger = WandbLogger(\n",
    "        project=cfg.trcl_wandb_project_name, name=name_run if name_run else None\n",
    "    )\n",
    "    logger.watch(model, log=\"all\")\n",
    "\n",
    "elif cfg.trcl_logger == \"tensorboard\":\n",
    "    logger = TensorBoardLogger(\n",
    "        save_dir=args.save_model,\n",
    "        name=name_run if name_run else None,\n",
    "        log_graph=True,\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We are now finally ready to train our model! "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/mivolpi/micromamba/envs/str-mzb/lib/python3.10/site-packages/lightning_fabric/connector.py:572: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!\n",
      "/Users/mivolpi/micromamba/envs/str-mzb/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py:513: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.\n",
      "Using bfloat16 Automatic Mixed Precision (AMP)\n",
      "GPU available: False, used: False\n",
      "TPU available: False, using: 0 TPU cores\n",
      "HPU available: False, using: 0 HPUs\n",
      "Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1\n",
      "----------------------------------------------------------------------------------------------------\n",
      "distributed_backend=gloo\n",
      "All distributed processes registered. Starting with 1 processes\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\n",
      "[W122 16:28:50.686888000 ProcessGroupGloo.cpp:745] Warning: Unable to resolve hostname to a (local) address. Using the loopback address as fallback. Manually set the network interface to bind to with GLOO_SOCKET_IFNAME. (function operator())\n",
      "\n",
      "  | Name     | Type              | Params | Mode \n",
      "-------------------------------------------------------\n",
      "0 | model    | ConvNeXt          | 49.5 M | train\n",
      "1 | accuracy | MulticlassF1Score | 0      | train\n",
      "-------------------------------------------------------\n",
      "6.2 K     Trainable params\n",
      "49.5 M    Non-trainable params\n",
      "49.5 M    Total params\n",
      "197.843   Total estimated model params size (MB)\n",
      "384       Modules in train mode\n",
      "0         Modules in eval mode\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "696c367f6a60492d8d4eaa73b5283101",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Sanity Checking: |          | 0/? [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Validation set size: 45\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/mivolpi/micromamba/envs/str-mzb/lib/python3.10/site-packages/torch/utils/data/dataloader.py:617: UserWarning: This DataLoader will create 16 worker processes in total. Our suggested max number of worker in current system is 8 (`cpuset` is not taken into account), which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.\n",
      "  warnings.warn(\n",
      "[rank0]:[W122 16:28:52.059777000 NNPACK.cpp:61] Could not initialize NNPACK! Reason: Unsupported hardware.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training set size: 385\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1c93dce5e34245389c10a5d083295416",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Training: |          | 0/? [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "Detected KeyboardInterrupt, attempting graceful shutdown ...\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'exit' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:46\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m     45\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m trainer\u001b[38;5;241m.\u001b[39mstrategy\u001b[38;5;241m.\u001b[39mlauncher \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 46\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstrategy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlauncher\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlaunch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrainer_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrainer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     47\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trainer_fn(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/site-packages/pytorch_lightning/strategies/launchers/multiprocessing.py:144\u001b[0m, in \u001b[0;36m_MultiProcessingLauncher.launch\u001b[0;34m(self, function, trainer, *args, **kwargs)\u001b[0m\n\u001b[1;32m    143\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprocs \u001b[38;5;241m=\u001b[39m process_context\u001b[38;5;241m.\u001b[39mprocesses\n\u001b[0;32m--> 144\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mprocess_context\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m    145\u001b[0m     \u001b[38;5;28;01mpass\u001b[39;00m\n",
      "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/site-packages/torch/multiprocessing/spawn.py:132\u001b[0m, in \u001b[0;36mProcessContext.join\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    131\u001b[0m \u001b[38;5;66;03m# Wait for any process to fail or all of them to succeed.\u001b[39;00m\n\u001b[0;32m--> 132\u001b[0m ready \u001b[38;5;241m=\u001b[39m \u001b[43mmultiprocessing\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconnection\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    133\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msentinels\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    134\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    135\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    137\u001b[0m error_index \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
      "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/multiprocessing/connection.py:931\u001b[0m, in \u001b[0;36mwait\u001b[0;34m(object_list, timeout)\u001b[0m\n\u001b[1;32m    930\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 931\u001b[0m     ready \u001b[38;5;241m=\u001b[39m \u001b[43mselector\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    932\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m ready:\n",
      "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/selectors.py:416\u001b[0m, in \u001b[0;36m_PollLikeSelector.select\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    415\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 416\u001b[0m     fd_event_list \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_selector\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpoll\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    417\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mInterruptedError\u001b[39;00m:\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[12], line 15\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m# instantiate trainer and train\u001b[39;00m\n\u001b[1;32m      4\u001b[0m trainer \u001b[38;5;241m=\u001b[39m pl\u001b[38;5;241m.\u001b[39mTrainer(\n\u001b[1;32m      5\u001b[0m     accelerator\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauto\u001b[39m\u001b[38;5;124m\"\u001b[39m,  \u001b[38;5;66;03m# cfg.trcl_num_gpus outdated\u001b[39;00m\n\u001b[1;32m      6\u001b[0m     max_epochs\u001b[38;5;241m=\u001b[39mcfg\u001b[38;5;241m.\u001b[39mtrcl_number_epochs,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     12\u001b[0m     \u001b[38;5;66;03m# profiler=\"simple\",\u001b[39;00m\n\u001b[1;32m     13\u001b[0m )\n\u001b[0;32m---> 15\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py:539\u001b[0m, in \u001b[0;36mTrainer.fit\u001b[0;34m(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)\u001b[0m\n\u001b[1;32m    537\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mstatus \u001b[38;5;241m=\u001b[39m TrainerStatus\u001b[38;5;241m.\u001b[39mRUNNING\n\u001b[1;32m    538\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 539\u001b[0m \u001b[43mcall\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_and_handle_interrupt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    540\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_impl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mval_dataloaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdatamodule\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mckpt_path\u001b[49m\n\u001b[1;32m    541\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/micromamba/envs/str-mzb/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py:64\u001b[0m, in \u001b[0;36m_call_and_handle_interrupt\u001b[0;34m(trainer, trainer_fn, *args, **kwargs)\u001b[0m\n\u001b[1;32m     62\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(launcher, _SubprocessScriptLauncher):\n\u001b[1;32m     63\u001b[0m         launcher\u001b[38;5;241m.\u001b[39mkill(_get_sigkill_signal())\n\u001b[0;32m---> 64\u001b[0m     \u001b[43mexit\u001b[49m(\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m     66\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exception:\n\u001b[1;32m     67\u001b[0m     _interrupt(trainer, exception)\n",
      "\u001b[0;31mNameError\u001b[0m: name 'exit' is not defined"
     ]
    }
   ],
   "source": [
    "\n",
    "# Instantiate the Lightning trainer and fine-tune the model.\n",
    "# `cfg`, `cbacks`, `logger` and `model` are not defined in this cell; they are\n",
    "# expected to exist in the kernel already (presumably from earlier cells).\n",
    "trainer = pl.Trainer(\n",
    "    accelerator=\"auto\",  # replaces the outdated cfg.trcl_num_gpus setting\n",
    "    max_epochs=cfg.trcl_number_epochs,\n",
    "    strategy=\"ddp_notebook\",  # NOTE(review): notebook-specific DDP strategy; confirm it suits your hardware\n",
    "    precision=\"16-mixed\",  # explicit spelling of the legacy `precision=16` alias (avoids the Lightning 2.x warning)\n",
    "    callbacks=cbacks,\n",
    "    logger=logger,\n",
    "    log_every_n_steps=1,\n",
    "    # profiler=\"simple\",  # uncomment to profile the run\n",
    ")\n",
    "\n",
    "# No dataloaders are passed here, so the model presumably provides its own -- confirm.\n",
    "trainer.fit(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}