|
156 | 156 | "source": [ |
157 | 157 | "batch_char_size = 50000 # Batch size (BS) in number of characters\n", |
158 | 158 | "for text_id, text in data_iterator(df, doc_id_column, doc_text_column):\n", |
159 | | - " # NOTE: get_entities_multi_text returns an iterator\n", |
160 | | - " # so no work gets done until the iterator use materialised\n", |
| 159 | + "    # NOTE: get_entities_multi_texts returns a generator\n", |
| 160 | + "    # so no work gets done until the generator is materialised\n", |
161 | 161 | " output = cat.get_entities_multi_texts(text,\n", |
162 | 162 | " only_cui=False,\n", |
163 | 163 | " # nproc=8, # Number of processors\n", |
164 | 164 | " # out_split_size_chars=20*batch_char_size,\n", |
165 | | - " # save_dir_path=ann_folder_path,\n", |
| 165 | + " save_dir_path=ann_folder_path,\n", |
166 | 166 | " # min_free_memory=0.1,\n", |
167 | 167 | " )\n", |
168 | 168 | " # so if we're doing a small amount of data and/or not saving it on disk\n", |
169 | 169 | " # we probably want to just convert it to a list\n", |
170 | 170 | " output = list(output)\n", |
| 171 | + "    # However, if we're saving the data on disk and don't\n", |
| 172 | + "    # want to duplicate in memory (i.e. there's a lot of data\n", |
| 173 | + " # and it can't all be held in memory), we may want to\n", |
| 174 | + " # just exhaust the generator\n", |
| 175 | + "    # NOTE: uncomment to use, but comment the `output = list(output)` line\n", |
| 176 | + " # for _ in output:\n", |
| 177 | + " # pass\n", |
171 | 178 | "\n", |
172 | 179 | "medcat_logger.warning(f'Annotation process complete!')\n" |
173 | 180 | ] |
|
0 commit comments