Skip to content

Commit ff8eed5

Browse files
committed
CU-8699049kf: Allow saving multiprocessing results.
Also add comment regarding materialising the output without keeping it all in memory
1 parent 298ede0 commit ff8eed5

File tree

1 file changed

+10
-3
lines changed

1 file changed

+10
-3
lines changed

medcat/3_run_model/run_model.ipynb

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,18 +156,25 @@
156156
"source": [
157157
"batch_char_size = 50000 # Batch size (BS) in number of characters\n",
158158
"for text_id, text in data_iterator(df, doc_id_column, doc_text_column):\n",
159-
" # NOTE: get_entities_multi_text returns an iterator\n",
160-
" # so no work gets done until the iterator use materialised\n",
159+
" # NOTE: get_entities_multi_texts returns a generator\n",
160+
" # so no work gets done until the generator is materialised\n",
161161
" output = cat.get_entities_multi_texts(text,\n",
162162
" only_cui=False,\n",
163163
" # nproc=8, # Number of processors\n",
164164
" # out_split_size_chars=20*batch_char_size,\n",
165-
" # save_dir_path=ann_folder_path,\n",
165+
" save_dir_path=ann_folder_path,\n",
166166
" # min_free_memory=0.1,\n",
167167
" )\n",
168168
" # so if we're doing a small amount of data and/or not saving it on disk\n",
169169
" # we probably want to just convert it to a list\n",
170170
" output = list(output)\n",
171+
" # However, if we're saving the data on disk and don't\n",
172+
" # want to duplicate it in memory (i.e. there's a lot of data\n",
173+
" # and it can't all be held in memory), we may want to\n",
174+
" # just exhaust the generator\n",
175+
" # NOTE: uncomment to use, but comment out the `output = list(output)` line\n",
176+
" # for _ in output:\n",
177+
" # pass\n",
171178
"\n",
172179
"medcat_logger.warning(f'Annotation process complete!')\n"
173180
]

0 commit comments

Comments
 (0)