|
156 | 156 | "source": [ |
157 | 157 | "batch_char_size = 50000 # Batch size (BS) in number of characters\n", |
158 | 158 | "for text_id, text in data_iterator(df, doc_id_column, doc_text_column):\n", |
159 | | - " # NOTE: get_entities_multi_text returns an iterator\n", |
160 | | - " # so no work gets done until the iterator use materialised\n", |
| 159 | + "    # NOTE: get_entities_multi_texts returns a generator\n", |
| 160 | + "    # so no work gets done until the generator is materialised\n", |
161 | 161 | " output = cat.get_entities_multi_texts(text,\n", |
162 | 162 | " only_cui=False,\n", |
163 | 163 | " # nproc=8, # Number of processors\n", |
164 | 164 | " # out_split_size_chars=20*batch_char_size,\n", |
165 | | - " # save_dir_path=ann_folder_path,\n", |
| 165 | + " save_dir_path=ann_folder_path,\n", |
166 | 166 | " # min_free_memory=0.1,\n", |
167 | 167 | " )\n", |
168 | 168 | " # so if we're doing a small amount of data and/or not saving it on disk\n", |
169 | 169 | " # we probably want to just convert it to a list\n", |
170 | 170 | " output = list(output)\n", |
| 171 | + "    # However, if we're saving the data on disk and don't\n", |
| 172 | + "    # want to duplicate in memory (i.e. there's a lot of data\n", |
| 173 | + " # and it can't all be held in memory), we may want to\n", |
| 174 | + " # just exhaust the generator\n", |
| 175 | + "    # NOTE: uncomment to use, but comment the `output = list(output)` line\n", |
| 176 | + " # for _ in output:\n", |
| 177 | + " # pass\n", |
171 | 178 | "\n", |
172 | 179 | "medcat_logger.warning(f'Annotation process complete!')\n" |
173 | 180 | ] |
|
0 commit comments