diff --git a/examples/clinical_pipeline/README.md b/examples/clinical_pipeline/README.md
new file mode 100644
index 00000000..c1c2e863
--- /dev/null
+++ b/examples/clinical_pipeline/README.md
@@ -0,0 +1,144 @@
+## A Clinical Information Processing Example
+
+This example shows how we can construct a project to make ForteHealth and Stave work side by side.
+
+## Install extra dependencies
+
+To install the latest code directly from source,
+
+```bash
+pip install git+https://git@github.com/asyml/forte-wrappers#egg=forte.elastic\&subdirectory=src/elastic
+pip install git+https://git@github.com/asyml/forte-wrappers#egg=forte.spacy\&subdirectory=src/spacy
+pip install git+https://git@github.com/asyml/forte-wrappers#egg=forte.nltk\&subdirectory=src/nltk
+pip install git+https://github.com/asyml/ForteHealth.git
+pip install git+https://github.com/asyml/stave.git
+```
+
+To install from PyPI,
+```bash
+pip install forte.elastic
+pip install forte.spacy
+pip install forte.nltk
+pip install forte.health
+pip install stave
+```
+
+## Downloading the models
+
+This example includes the following six functions:
+1. Sentence Segmentation
+2. Tokenization
+3. Pos Tag
+4. Bio Named Entity Recognition
+5. Negation Context Analysis
+6. ICD Coding Detection
+
+Before running the pipeline, we need to download some of the models:
+
+```bash
+python ./download_models.py
+```
+
+**Note**: The above script will save the model in `resources/NCBI-disease`. Use the `--path` option to save the model into a different directory.
+
+## Set up the configuration
+Before running the Elastic Searcher and Stave, we need to ensure that the current configuration is compatible with the environment of our computer.
+
+Please check and change the following configurations in `clinical_config.yml`:
+
+1. Ensure `LastUtteranceSearcher.stave_db_path`(line 16) is the correct path -> `$Home/.stave`, e.g., `"/home/name/.stave"`
+2. Ensure `Stave.username`(line 26) and `Stave.pw`(line 27) are `"admin"` and `"admin"`. 
+
+
+
+
+## Prepare elastic searcher
+Download the corresponding elasticsearch archive from https://www.elastic.co/downloads/past-releases/elasticsearch-7-17-2, unzip it and run `elasticsearch-7-17-2/bin/elasticsearch` to start the service.
+
+Run the following to check if elasticsearch is running properly:
+```bash
+curl -XGET localhost:9200/_cluster/health?pretty
+```
+
+Make sure you create the index 'elastic_indexer' in the cluster before working with this example; you can run the following command:
+```bash
+curl -X PUT localhost:9200/elastic_indexer
+```
+
+You can also follow the online blog for more information:
+
+https://www.elastic.co/guide/en/elasticsearch/reference/current/starting-elasticsearch.html
+
+## Generate ontologies and config
+We also need to generate the Stave configuration file.
+
+See the three json files in the directory:
+1. `clinical_pipeline_ontology.json`: this file defines the ontology we define in the clinical domain.
+2. `stave_chat_config.json`: this file defines the configuration of the chatbox in stave.
+3. `stave_onto_config.json`: this file defines the configuration of the default nlp interface in stave.
+
+Now run
+```bash
+python generate_stave_config.py
+```
+
+This command can merge clinical ontologies and base ontologies, and generate the configuration of the stave project. If it executes successfully, you can see two new json files in the current directory: `default_onto_project.json` and `chat_project.json`.
+
+## Run indexer and Stave
+First, you should start an Elastic Indexer backend.
+
+Then, to start the Stave server that our pipeline will connect to for visualization purposes, run
+```bash
+stave -s start -o -l -n 8899
+```
+
+Here, you need to make sure `LastUtteranceSearcher.url` and `Stave.url` in `clinical_config.yml` are both `"http://localhost:8899"`. Or you can change the port 8899 to any port you like.
+
+
+
+## Run demo pipeline
+
+Now, open a new terminal, other than the one running the stave server. 
You can run the following command to parse some files and index them. +```bash +python clinical_processing_pipeline.py path_to_mimiciii/1.4/NOTEEVENTS.csv.gz path_to_mimiciii_output 100 1 +``` + +The last argument, `use_mimiciii_reader` is whether to use the `Mimic3DischargeNoteReader()`. If you set the argument to `1`, you will need to make sure the input data is mimic iii dataset, else `0` for `PlainTextReader()`. + +If you do not have the mimic iii datasets and just want to test the function, you can run the following command to test the function with the given sample data: + +```bash +python clinical_processing_pipeline.py sample_data/ path_to_sample_output/ -1 0 +``` + +If we just need to check the remote pipeline connection to Stave. + +You can mask out Line 76 to Line 118 in `clinical_processing_pipeline.py`. + +Hence, if you just wish to run the demo pipeline with existing database entries, and wish to just connect with Stave for visualization, You can mask out Line 74 to Line 118 in `clinical_processing_pipeline.py` and run this command: + +```bash +python clinical_processing_pipeline.py ./ ./ 100 0 +``` + +Here, we also write out the raw data pack to `/path_to_sample_output`, and only +index the first 100 notes. Remove the `100` parameter to index all documents. + +## Visualization + +You can go ahead and open `http://localhost:8899` on your browser to access Stave UI. +Next, you will see 2 projects, named as `clinical_pipeline_base` and `clinical_pipeline_chat` by default. + +image + +Click on `clinical_pipeline_chat` and then the document that resides within to go to the chatbot/search UI. Enter the keywords you want to search for in the elasticsearch indices. The pipeline would then return a bunch of documents that match your keywords. Click on those document links to access the Annotation Viewer UI for those documents. 
+ +image + +image + + +## Add the output data +We write out the raw data pack to `/path_to_sample_output`, so you can see many json files in the directory. + +Click on `clinical_pipeline_base` and add the json file to the documents. Click on those document links to access the Annotation Viewer UI for those documents. \ No newline at end of file diff --git a/examples/clinical_pipeline/chat_query_document.json b/examples/clinical_pipeline/chat_query_document.json new file mode 100644 index 00000000..3b8dfbf4 --- /dev/null +++ b/examples/clinical_pipeline/chat_query_document.json @@ -0,0 +1,44 @@ +{ + "name": "query_chatbot.json", + "project_id": 99, + "textPack": { + "py/object": "forte.data.data_pack.DataPack", + "py/state": { + "creation_records": {}, + "field_records": {}, + "links": [], + "groups": [], + "meta": { + "py/object": "forte.data.data_pack.Meta", + "py/state": { + "pack_name": "query_chatbot", + "_pack_id": 3, + "language": "eng", + "span_unit": "character" + } + }, + "_text": "Welcome! 
Please type in a query to retrieve relevant clinical reports.", + "annotations": [ + { + "py/object": "ft.onto.base_ontology.Utterance", + "py/state": { + "_span": { + "py/object": "forte.data.span.Span", + "begin": 0, + "end": 70 + }, + "_tid": 0, + "speaker": "ai" + } + } + ], + "generics": [], + "replace_back_operations": [], + "processed_original_spans": [], + "orig_text_len": 70, + "serialization": { + "next_id": 1 + } + } + } +} \ No newline at end of file diff --git a/examples/clinical_pipeline/clinical_config.yml b/examples/clinical_pipeline/clinical_config.yml new file mode 100644 index 00000000..31ecdd65 --- /dev/null +++ b/examples/clinical_pipeline/clinical_config.yml @@ -0,0 +1,31 @@ +BERTTokenizer: + model_path: "resources/NCBI-disease" + +Spacy: + processors: ["sentence", "tokenize", "pos", "dep", "ner", "umls_link"] + medical_onto_type: "ftx.medical.clinical_ontology.MedicalEntityMention" + umls_onto_type: "ftx.medical.clinical_ontology.UMLSConceptLink" + lang: "en_ner_bc5cdr_md" + +BioBERTNERPredictor: + model_path: "resources/NCBI-disease" + ner_type: "DISEASE" + ignore_labels: ["O"] + +LastUtteranceSearcher: + stave_db_path: "//home//name//.stave//db.sqlite3" + url: "http://localhost:8899" + +Remote: + port: 8008 + input_format: "DataPack" + service_name: "Medical_Chatbot" + +Stave: + url: "http://localhost:8899" + username: admin + pw: admin + +viewer_project_json: "default_onto_project.json" +chat_project_json: "chat_project.json" +chat_document_json: "chat_query_document.json" diff --git a/examples/clinical_pipeline/clinical_pipeline_ontology.json b/examples/clinical_pipeline/clinical_pipeline_ontology.json new file mode 100644 index 00000000..1d6ebf45 --- /dev/null +++ b/examples/clinical_pipeline/clinical_pipeline_ontology.json @@ -0,0 +1,49 @@ +{ + "name": "clinical_pipeline_ontology", + "definitions": [ + { + "entry_name": "ftx.medical.clinical_ontology.NegationContext", + "parent_entry": "forte.data.ontology.top.Annotation", + 
"description": "A span based annotation `NegationContext`, used to represent the negation context of a named entity.", + "attributes": [ + { + "name": "polarity", + "type": "bool" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalEntityMention", + "parent_entry": "ft.onto.base_ontology.EntityMention", + "description": "A span based annotation class MedicalEntityMention, used to represent an Entity Mention in medical domain", + "attributes": [ + { + "name": "umls_link", + "type": "str" + }, + { + "name": "umls_entities", + "type": "List", + "item_type": "ftx.medical.clinical_ontology.UMLSConceptLink" + } + ] + }, + { + "entry_name": "ftx.medical.clinical_ontology.MedicalArticle", + "parent_entry": "forte.data.ontology.top.Annotation", + "description": "An annotation based representation for the whole medical text chunk/document", + "attributes": [ + { + "name": "icd_version", + "type": "int", + "description": "The version of ICD-Coding being used." + }, + { + "name": "icd_code", + "type": "str", + "description": "The ICD code assigned to current medical article." 
+ } + ] + } + ] +} \ No newline at end of file diff --git a/examples/clinical_pipeline/clinical_processing_pipeline.py b/examples/clinical_pipeline/clinical_processing_pipeline.py new file mode 100644 index 00000000..4b7a2420 --- /dev/null +++ b/examples/clinical_pipeline/clinical_processing_pipeline.py @@ -0,0 +1,143 @@ +import json +import sys +import time + +import yaml +from forte.common.configuration import Config +from forte.data.data_pack import DataPack +from forte.data.readers import PlainTextReader, RawDataDeserializeReader +from forte.pipeline import Pipeline +from forte.processors.writers import PackIdJsonPackWriter +from fortex.elastic import ElasticSearchPackIndexProcessor +from fortex.health.processors.icd_coding_processor import ICDCodingProcessor +from fortex.health.processors.negation_context_analyzer import \ + NegationContextAnalyzer +from fortex.huggingface import BioBERTNERPredictor +from fortex.nltk import NLTKNER, NLTKPOSTagger, NLTKSentenceSegmenter +from fortex.spacy import SpacyProcessor +from ft.onto.base_ontology import EntityMention, Sentence +from ftx.medical.clinical_ontology import (MedicalArticle, + MedicalEntityMention, + NegationContext) +from stave_backend.lib.stave_session import StaveSession + +from mimic3_note_reader import Mimic3DischargeNoteReader +from utterance_searcher import LastUtteranceSearcher + + +def get_json(path: str): + file_obj = open(path) + data = json.load(file_obj) + file_obj.close() + return data + + +def update_stave_db(default_project_json, chat_project_json, chat_doc_json, config): + project_id_base = 0 + with StaveSession(url=config.Stave.url) as session: + session.login(username=config.Stave.username, password=config.Stave.pw) + + projects = session.get_project_list().json() + project_names = [project["name"] for project in projects] + + if ( + default_project_json["name"] in project_names + and chat_project_json["name"] in project_names + ): + + base_project = [ + proj + for proj in projects + if 
proj["name"] == default_project_json["name"] + ][0] + return base_project["id"] + + resp1 = session.create_project(default_project_json) + project_id_base = json.loads(resp1.text)["id"] + + resp2 = session.create_project(chat_project_json) + project_id_chat = json.loads(resp2.text)["id"] + + chat_doc_json["project_id"] = project_id_chat + doc_id = session.create_document(chat_doc_json) + project_list = session.get_project_list().json() + + return project_id_base + + +def main( + input_path: str, output_path: str, max_packs: int = -1, use_mimiciii_reader=1 + ): + print("Starting demo pipeline example..") + config = yaml.safe_load(open("clinical_config.yml", "r")) + config = Config(config, default_hparams=None) + print("Running pipeline...") + pl = Pipeline[DataPack]() + if use_mimiciii_reader == 1: + pl.set_reader( + Mimic3DischargeNoteReader(), config={"max_num_notes": max_packs} + ) + else: + pl.set_reader(PlainTextReader()) + + pl.add(SpacyProcessor(), {"processors": ["sentence", "tokenize"]}) + pl.add(NLTKPOSTagger()) + pl.add(BioBERTNERPredictor(), config=config.BioBERTNERPredictor) + pl.add(NegationContextAnalyzer()) + pl.add( + ICDCodingProcessor(), + { + "entry_type": "ft.onto.base_ontology.Sentence", + }, + ) + pl.add( + ElasticSearchPackIndexProcessor(), + { + "indexer": { + "other_kwargs": {"refresh": True}, + } + }, + ) + pl.add( + PackIdJsonPackWriter(), + { + "output_dir": output_path, + "indent": 2, + "overwrite": True, + "drop_record": True, + "zip_pack": False, + }, + ) + + pl.initialize() + + for idx, pack in enumerate(pl.process_dataset(input_path)): + if (idx + 1) % 50 == 0: + print(f"{time.strftime('%m-%d %H:%M')}: Processed {idx + 1} packs") + + default_project_json = get_json(config.viewer_project_json) + chat_project_json = get_json(config.chat_project_json) + chat_doc_json = get_json(config.chat_document_json) + + base_project_id = update_stave_db( + default_project_json, chat_project_json, chat_doc_json, config + ) + + remote_pl = 
Pipeline[DataPack]() + remote_pl.set_reader(RawDataDeserializeReader()) + remote_pl.add( + LastUtteranceSearcher(), + config={ + "query_result_project_id": base_project_id, + "stave_db_path": config.LastUtteranceSearcher.stave_db_path, + "url_stub": config.LastUtteranceSearcher.url, + }, + ) + remote_pl.serve( + port=config.Remote.port, + input_format=config.Remote.input_format, + service_name=config.Remote.service_name, + ) + + +main(sys.argv[1], sys.argv[2], int(sys.argv[3]), int(sys.argv[4])) diff --git a/examples/clinical_pipeline/demo/__init__.py b/examples/clinical_pipeline/demo/__init__.py new file mode 100644 index 00000000..49ecbbf8 --- /dev/null +++ b/examples/clinical_pipeline/demo/__init__.py @@ -0,0 +1 @@ +# ***automatically_generated*** diff --git a/examples/clinical_pipeline/demo/clinical.py b/examples/clinical_pipeline/demo/clinical.py new file mode 100644 index 00000000..68541b46 --- /dev/null +++ b/examples/clinical_pipeline/demo/clinical.py @@ -0,0 +1,49 @@ +# ***automatically_generated*** +# ***source json:examples/clinical_pipeline/clinical_onto.json*** +# flake8: noqa +# mypy: ignore-errors +# pylint: skip-file +""" +Automatically generated ontology clinical. Do not change manually. +""" + +from dataclasses import dataclass +from forte.data.data_pack import DataPack +from forte.data.ontology.top import Annotation +from ft.onto.base_ontology import EntityMention + +__all__ = [ + "ClinicalEntityMention", + "Description", + "Body", +] + + +@dataclass +class ClinicalEntityMention(EntityMention): + """ + A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Description(Annotation): + """ + A span based annotation `Description`, used to represent the description in a piece of clinical note. 
+ """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Body(Annotation): + """ + A span based annotation `Body`, used to represent the actual content in a piece of clinical note. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) diff --git a/examples/clinical_pipeline/download_models.py b/examples/clinical_pipeline/download_models.py new file mode 100644 index 00000000..db0d7cca --- /dev/null +++ b/examples/clinical_pipeline/download_models.py @@ -0,0 +1,25 @@ +from forte.data.data_utils import maybe_download + +# download resources +urls = [ + "https://drive.google.com/file/d/15RSfFkW9syQKtx-_fQ9KshN3BJ27Jf8t/" + "view?usp=sharing", + "https://drive.google.com/file/d/1Nh7D6Xam5JefdoSXRoL7S0DZK1d4i2UK/" + "view?usp=sharing", + "https://drive.google.com/file/d/1YWcI60lGKtTFH01Ai1HnwOKBsrFf2r29/" + "view?usp=sharing", + "https://drive.google.com/file/d/1ElHUEMPQIuWmV0GimroqFphbCvFKskYj/" + "view?usp=sharing", + "https://drive.google.com/file/d/1EhMXlieoEg-bGUbbQ2vN-iyNJvC4Dajl/" + "view?usp=sharing", +] + +filenames = [ + "config.json", + "pytorch_model.bin", + "special_tokens_map.json", + "tokenizer_config.json", + "vocab.txt", +] + +maybe_download(urls=urls, path="resources/NCBI-disease", filenames=filenames) diff --git a/examples/clinical_pipeline/generate_stave_config.py b/examples/clinical_pipeline/generate_stave_config.py new file mode 100644 index 00000000..07031b51 --- /dev/null +++ b/examples/clinical_pipeline/generate_stave_config.py @@ -0,0 +1,44 @@ +import json +from forte import ontology_specs + + +def get_json(path: str): + file_obj = open(path) + data = json.load(file_obj) + file_obj.close() + return data + + +def main(): + clinical_ontology = get_json("clinical_pipeline_ontology.json") + base_ontology = get_json( + ontology_specs.__path__[0] + "//base_ontology.json" + ) + + merged_ontology = dict() + merged_ontology["name"] = 
"clinical_ontology" + merged_ontology["definitions"] = ( + base_ontology["definitions"] + clinical_ontology["definitions"] + ) + + default_onto_project = dict() + default_onto_project["name"] = "clinical_pipeline_base" + default_onto_project["project_type"] = "single_pack" + default_onto_project["ontology"] = merged_ontology + default_onto_project["config"] = get_json("stave_onto_config.json") + + with open("defualt_onto_project.json", "w") as fp: + json.dump(default_onto_project, fp) + + chat_project = dict() + chat_project["name"] = "chat_project" + chat_project["project_type"] = "single_pack" + chat_project["ontology"] = merged_ontology + chat_project["config"] = get_json("stave_chat_config.json") + + with open("chat_project.json", "w") as fp: + json.dump(chat_project, fp) + + +if __name__ == "__main__": + main() diff --git a/examples/clinical_pipeline/mimic3_note_reader.py b/examples/clinical_pipeline/mimic3_note_reader.py new file mode 100644 index 00000000..b3f02de6 --- /dev/null +++ b/examples/clinical_pipeline/mimic3_note_reader.py @@ -0,0 +1,80 @@ +# Copyright 2021 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import csv +import logging +from pathlib import Path +from typing import Any, Iterator, Union, List + +from smart_open import open + +from demo.clinical import Description, Body +from forte.data.data_pack import DataPack +from forte.data.base_reader import PackReader +from ft.onto.base_ontology import Document + + +class Mimic3DischargeNoteReader(PackReader): + """This class is designed to read the discharge notes from MIMIC3 dataset + as plain text packs. + + For more information for the dataset, visit: + https://mimic.physionet.org/ + """ + + def __init__(self): + super().__init__() + self.headers: List[str] = [] + self.text_col = -1 # Default to be last column. + self.description_col = 0 # Default to be first column. + self.__note_count = 0 # Count number of notes processed. + + def _collect( # type: ignore + self, mimic3_path: Union[Path, str] + ) -> Iterator[Any]: + with open(mimic3_path) as f: + for r in csv.reader(f): + if 0 < self.configs.max_num_notes <= self.__note_count: + break + yield r + + def _parse_pack(self, row: List[str]) -> Iterator[DataPack]: + if len(self.headers) == 0: + self.headers.extend(row) + for i, h in enumerate(self.headers): + if h == "TEXT": + self.text_col = i + logging.info("Text Column is %d", i) + if h == "DESCRIPTION": + self.description_col = i + logging.info("Description Column is %d", i) + else: + pack: DataPack = DataPack() + description: str = row[self.description_col] + text: str = row[self.text_col] + delimiter = "\n-----------------\n" + full_text = description + delimiter + text + pack.set_text(full_text) + + Description(pack, 0, len(description)) + Body(pack, len(description) + len(delimiter), len(full_text)) + Document(pack, 0, len(pack.text)) + self.__note_count += 1 + yield pack + + @classmethod + def default_configs(cls): + # If this is set (>0), the reader will only read up to + # the number specified. 
+ return {'max_num_notes':-1} diff --git a/examples/clinical_pipeline/requirements.txt b/examples/clinical_pipeline/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/examples/clinical_pipeline/sample_data/notes.txt b/examples/clinical_pipeline/sample_data/notes.txt new file mode 100644 index 00000000..66f03c54 --- /dev/null +++ b/examples/clinical_pipeline/sample_data/notes.txt @@ -0,0 +1,6 @@ +ADDENDUM: +RADIOLOGIC STUDIES: Radiologic studies also included a chest CT, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. +This also moderate-sized left pleural effusion. +HEAD CT: Head CT showed no intracranial hemorrhage and no mass effect, but old infarction consistent with past medical history. +ABDOMINAL CT: Abdominal CT showed no lesions of T10 and sacrum most likely secondary to steoporosis. +These can be followed by repeat imaging as an outpatient. \ No newline at end of file diff --git a/examples/clinical_pipeline/stave_chat_config.json b/examples/clinical_pipeline/stave_chat_config.json new file mode 100644 index 00000000..7450dab9 --- /dev/null +++ b/examples/clinical_pipeline/stave_chat_config.json @@ -0,0 +1,219 @@ +{ + "legendConfigs": { + "ft.onto.base_ontology.Token": { + "is_selected": false, + "is_shown": true, + "attributes": { + "pos": false, + "ud_xpos": false, + "lemma": false, + "chunk": false, + "ner": false, + "sense": false + } + }, + "ft.onto.base_ontology.Subword": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Classification": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Document": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Sentence": { + "is_selected": false, + "is_shown": true, + "attributes": { + "speaker": false + } + }, + "ft.onto.base_ontology.Phrase": { + "is_selected": false, + "is_shown": true, + 
"attributes": { + "phrase_type": false + } + }, + "ft.onto.base_ontology.UtteranceContext": { + "is_selected": false, + "is_shown": true + }, + "ft.onto.base_ontology.Utterance": { + "is_selected": false, + "is_shown": true, + "attributes": { + "speaker": false + } + }, + "ft.onto.base_ontology.PredicateArgument": { + "is_selected": false, + "is_shown": true, + "attributes": { + "ner_type": false, + "predicate_lemma": false + } + }, + "ft.onto.base_ontology.EntityMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "ner_type": false + } + }, + "ft.onto.base_ontology.EventMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "event_type": false + } + }, + "ft.onto.base_ontology.PredicateMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "predicate_lemma": false, + "framenet_id": false + } + }, + "ft.onto.base_ontology.PredicateLink": { + "is_selected": false, + "is_shown": true, + "attributes": { + "arg_type": false + } + }, + "ft.onto.base_ontology.Dependency": { + "is_selected": false, + "is_shown": true, + "attributes": { + "dep_label": false, + "rel_type": false + } + }, + "ft.onto.base_ontology.EnhancedDependency": { + "is_selected": false, + "is_shown": true, + "attributes": { + "dep_label": false + } + }, + "ft.onto.base_ontology.RelationLink": { + "is_selected": false, + "is_shown": true, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CrossDocEntityRelation": { + "is_selected": false, + "is_shown": true, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CoreferenceGroup": { + "is_selected": false, + "is_shown": true + }, + "ft.onto.base_ontology.EventRelation": { + "is_selected": false, + "is_shown": true, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CrossDocEventRelation": { + "is_selected": false, + "is_shown": true, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.ConstituentNode": { + 
"is_selected": false, + "is_shown": true, + "attributes": { + "label": false + } + }, + "ft.onto.base_ontology.Title": { + "is_selected": false, + "is_shown": true + }, + "ft.onto.base_ontology.Body": { + "is_selected": false, + "is_shown": true + }, + "ft.onto.base_ontology.MCOption": { + "is_selected": false, + "is_shown": true + }, + "ft.onto.base_ontology.MCQuestion": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.MRCQuestion": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Recording": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.AudioUtterance": { + "is_selected": false, + "is_shown": true, + "attributes": { + "speaker": false + } + }, + "ftx.medical.clinical_ontology.NegationContext": { + "is_selected": false, + "is_shown": true, + "attributes": {} + } + }, + "scopeConfigs": { + "ft.onto.base_ontology.Token": false, + "ft.onto.base_ontology.Subword": false, + "ft.onto.base_ontology.Document": false, + "ft.onto.base_ontology.Sentence": false, + "ft.onto.base_ontology.Phrase": false, + "ft.onto.base_ontology.UtteranceContext": false, + "ft.onto.base_ontology.Utterance": false, + "ft.onto.base_ontology.PredicateArgument": false, + "ft.onto.base_ontology.EntityMention": false, + "ft.onto.base_ontology.EventMention": false, + "ft.onto.base_ontology.PredicateMention": false, + "ft.onto.base_ontology.ConstituentNode": false, + "ft.onto.base_ontology.Title": false, + "ft.onto.base_ontology.Body": false, + "ft.onto.base_ontology.MCOption": false, + "ft.onto.base_ontology.MCQuestion": false, + "ft.onto.base_ontology.MRCQuestion": false, + "ftx.medical.clinical_ontology.NegationContext": false + }, + "layoutConfigs": { + "center-middle": "DialogueBox", + "left": "disable", + "right": "disable", + "center-bottom": "disable" + }, + "remoteConfigs": { + "pipelineUrl": "http://localhost:8008", + "doValidation": false, + 
"expectedName": "", + "inputFormat": "string", + "expectedRecords": {} + } +} \ No newline at end of file diff --git a/examples/clinical_pipeline/stave_onto_config.json b/examples/clinical_pipeline/stave_onto_config.json new file mode 100644 index 00000000..6981c8b4 --- /dev/null +++ b/examples/clinical_pipeline/stave_onto_config.json @@ -0,0 +1,235 @@ +{ + "legendConfigs": { + "ft.onto.base_ontology.Token": { + "is_selected": false, + "is_shown": true, + "attributes": { + "pos": false, + "ud_xpos": false, + "lemma": false, + "chunk": false, + "ner": false, + "sense": false + } + }, + "ft.onto.base_ontology.Subword": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Classification": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.Document": { + "is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ft.onto.base_ontology.Sentence": { + "is_selected": false, + "is_shown": true, + "attributes": { + "speaker": false + } + }, + "ft.onto.base_ontology.Phrase": { + "is_selected": false, + "is_shown": true, + "attributes": { + "phrase_type": false + } + }, + "ft.onto.base_ontology.UtteranceContext": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.Utterance": { + "is_selected": false, + "is_shown": false, + "attributes": { + "speaker": false + } + }, + "ft.onto.base_ontology.PredicateArgument": { + "is_selected": false, + "is_shown": false, + "attributes": { + "ner_type": false, + "predicate_lemma": false + } + }, + "ft.onto.base_ontology.EntityMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "ner_type": false + } + }, + "ft.onto.base_ontology.EventMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "event_type": false + } + }, + "ft.onto.base_ontology.PredicateMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "predicate_lemma": false, + "framenet_id": false + } + 
}, + "ft.onto.base_ontology.PredicateLink": { + "is_selected": false, + "is_shown": false, + "attributes": { + "arg_type": false + } + }, + "ft.onto.base_ontology.Dependency": { + "is_selected": false, + "is_shown": false, + "attributes": { + "dep_label": false, + "rel_type": false + } + }, + "ft.onto.base_ontology.EnhancedDependency": { + "is_selected": false, + "is_shown": false, + "attributes": { + "dep_label": false + } + }, + "ft.onto.base_ontology.RelationLink": { + "is_selected": false, + "is_shown": true, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CrossDocEntityRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CoreferenceGroup": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.EventRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.CrossDocEventRelation": { + "is_selected": false, + "is_shown": false, + "attributes": { + "rel_type": false + } + }, + "ft.onto.base_ontology.ConstituentNode": { + "is_selected": false, + "is_shown": false, + "attributes": { + "label": false + } + }, + "ft.onto.base_ontology.Title": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.Body": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.MCOption": { + "is_selected": false, + "is_shown": false + }, + "ft.onto.base_ontology.MCQuestion": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.MRCQuestion": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.Recording": { + "is_selected": false, + "is_shown": false, + "attributes": {} + }, + "ft.onto.base_ontology.AudioUtterance": { + "is_selected": false, + "is_shown": false, + "attributes": { + "speaker": false + } + }, + "ftx.medical.clinical_ontology.NegationContext": { + 
"is_selected": false, + "is_shown": true, + "attributes": {} + }, + "ftx.medical.clinical_ontology.MedicalEntityMention": { + "is_selected": false, + "is_shown": true, + "attributes": { + "umls_link": false + } + }, + "ftx.medical.clinical_ontology.MedicalArticle": { + "is_selected": false, + "is_shown": true, + "attributes": { + "icd_code": false + } + } + }, + "scopeConfigs": { + "ft.onto.base_ontology.Token": false, + "ft.onto.base_ontology.Subword": false, + "ft.onto.base_ontology.Document": false, + "ft.onto.base_ontology.Sentence": false, + "ft.onto.base_ontology.Phrase": false, + "ft.onto.base_ontology.UtteranceContext": false, + "ft.onto.base_ontology.Utterance": false, + "ft.onto.base_ontology.PredicateArgument": false, + "ft.onto.base_ontology.EntityMention": false, + "ft.onto.base_ontology.EventMention": false, + "ft.onto.base_ontology.PredicateMention": false, + "ft.onto.base_ontology.ConstituentNode": false, + "ft.onto.base_ontology.Title": false, + "ft.onto.base_ontology.Body": false, + "ft.onto.base_ontology.MCOption": false, + "ft.onto.base_ontology.MCQuestion": false, + "ft.onto.base_ontology.MRCQuestion": false, + "ftx.medical.clinical_ontology.NegationContext": false, + "ftx.medical.clinical_ontology.MedicalEntityMention": false, + "ftx.medical.clinical_ontology.MedicalArticle": false + }, + "layoutConfigs": { + "center-middle": "default-nlp", + "left": "default-meta", + "right": "default-attribute", + "center-bottom": "disable" + }, + "remoteConfigs": { + "pipelineUrl": "", + "doValidation": false, + "expectedName": "", + "inputFormat": "string", + "expectedRecords": {} + } +} \ No newline at end of file diff --git a/examples/clinical_pipeline/utterance_searcher.py b/examples/clinical_pipeline/utterance_searcher.py new file mode 100644 index 00000000..b38a9a24 --- /dev/null +++ b/examples/clinical_pipeline/utterance_searcher.py @@ -0,0 +1,122 @@ +import os +import logging +import sqlite3 +from typing import Dict, Any, Optional, List +from 
fortex.elastic import ElasticSearchIndexer + +from forte.common import Resources, ProcessorConfigError +from forte.common.configuration import Config +from forte.data.common_entry_utils import create_utterance, get_last_utterance +from forte.data.data_pack import DataPack +from forte.processors.base import PackProcessor +from ft.onto.base_ontology import Utterance + + +def sqlite_insert(conn, table, row): + cols: str = ", ".join('"{}"'.format(col) for col in row.keys()) + vals: str = ", ".join(":{}".format(col) for col in row.keys()) + sql: str = 'INSERT INTO "{0}" ({1}) VALUES ({2})'.format(table, cols, vals) + cursor = conn.cursor() + cursor.execute(sql, row) + conn.commit() + return cursor.lastrowid + + +def create_links(url_stub: str, ids: List[int]) -> List[str]: + links: List[str] = [] + + url_stub: str = url_stub.strip("/") + for temp_idm in ids: + links.append( + f"Report #{temp_idm}" + ) + return links + + +class LastUtteranceSearcher(PackProcessor): + # pylint: disable=attribute-defined-outside-init + + def initialize(self, resources: Resources, configs: Config): + super().initialize(resources, configs) + self.index = ElasticSearchIndexer(self.configs.indexer.hparams) + if self.configs.query_result_project_id < 0: + raise ProcessorConfigError("Query Result Project is not set.") + + if not os.path.exists(self.configs.stave_db_path): + raise ProcessorConfigError( + f"Cannot find Stave DB at: {self.configs.stave_db_path}" + ) + + def _process(self, input_pack: DataPack): + # Make sure we take the last utterance from the user. + utterance: Optional[Utterance] = get_last_utterance(input_pack, "user") + + if utterance is not None: + logging.info("The last utterance is %s", utterance) + # Create the query using the last utterance from user. + size = self.configs.size or 1000 + field = self.configs.field or "content" + query_value = { + "query": {"match": {field: utterance.text}}, + "size": size, + } + + # Search against the index. 
+ results = self.index.search(query_value) + hits = results["hits"]["hits"] + + conn = sqlite3.connect(self.configs.stave_db_path) + + answers = [] + for idx, hit in enumerate(hits): + source = hit["_source"] + # The raw pack string and pack id (not database id) + raw_pack_str: str = source["pack_info"] + pack_id: str = source["doc_id"] + + # Now you can write the pack into the database and generate url. + item = { + "name": f"clinical_results_{idx}", + "textPack": raw_pack_str, + "project_id": self.configs.query_result_project_id, + } + + db_id = sqlite_insert(conn, "stave_backend_document", item) + answers += [db_id] + + if len(answers) == 0: + create_utterance( + input_pack, + "No results found. Please try another query.", + "ai", + ) + else: + links: List[str] = create_links(self.configs.url_stub, answers) + response_text: str = ( + "I found the following results:
-- " + + "
-- ".join(links) + ) + + create_utterance(input_pack, response_text, "ai") + else: + logging.info("Cannot get another utterance.") + create_utterance( + input_pack, + "Hey, I didn't get what you say, could you try again?", + "ai", + ) + + @classmethod + def default_configs(cls) -> Dict[str, Any]: + return { + "size": 5, + "field": "content", + "indexer": { + "name": "ElasticSearchIndexer", + "hparams": ElasticSearchIndexer.default_configs(), + "other_kwargs": {"request_timeout": 10, "refresh": False}, + }, + "stave_db_path": "~/projects/stave/simple-backend/db.sqlite3", + "url_stub": "http://localhost:3000", + "query_result_project_id": -1, + } diff --git a/examples/search_engine_streamlit/clinical_config.yml b/examples/search_engine_streamlit/clinical_config.yml new file mode 100644 index 00000000..ef8641ea --- /dev/null +++ b/examples/search_engine_streamlit/clinical_config.yml @@ -0,0 +1,13 @@ +BioBERTNERPredictor: + model_path: "resources/NCBI-disease" + ner_type: "DISEASE" + ignore_labels: ["O"] + +BERTTokenizer: + model_path: "resources/NCBI-disease" + +Stave: + stave_db_path: "C://Users//Leo//.stave//db.sqlite3" + url: "http://localhost:8899" + username: admin + pw: admin diff --git a/examples/search_engine_streamlit/clinical_pipeline_processor.py b/examples/search_engine_streamlit/clinical_pipeline_processor.py new file mode 100644 index 00000000..38907b48 --- /dev/null +++ b/examples/search_engine_streamlit/clinical_pipeline_processor.py @@ -0,0 +1,56 @@ +import time + + +import yaml +from forte.common.configuration import Config +from forte.data.data_pack import DataPack +from forte.pipeline import Pipeline +from forte.processors.writers import PackIdJsonPackWriter +from fortex.elastic import ElasticSearchPackIndexProcessor +from fortex.huggingface import BioBERTNERPredictor +from fortex.huggingface.transformers_processor import BERTTokenizer + +from mimic3_note_reader import Mimic3DischargeNoteReader +from fortex.nltk import NLTKSentenceSegmenter + + 
+def main( + input_path: str, output_path: str, max_packs: int = -1 + ): + + config = yaml.safe_load(open("clinical_config.yml", "r")) + config = Config(config, default_hparams=None) + + pl = Pipeline[DataPack]() + pl.set_reader( + Mimic3DischargeNoteReader(), config={"max_num_notes": max_packs} + ) + + pl.add(NLTKSentenceSegmenter()) + pl.add(BERTTokenizer(), config=config.BERTTokenizer) + + pl.add(BioBERTNERPredictor(), config=config.BioBERTNERPredictor) + pl.add( + ElasticSearchPackIndexProcessor(), + { + "indexer": { + "other_kwargs": {"refresh": True}, + } + }, + ) + pl.add( + PackIdJsonPackWriter(), + { + "output_dir": output_path, + "indent": 2, + "overwrite": True, + "drop_record": True, + "zip_pack": False, + }, + ) + + pl.initialize() + + for idx, pack in enumerate(pl.process_dataset(input_path)): + if (idx + 1) % 50 == 0: + print(f"{time.strftime('%m-%d %H:%M')}: Processed {idx + 1} packs") \ No newline at end of file diff --git a/examples/search_engine_streamlit/demo/__init__.py b/examples/search_engine_streamlit/demo/__init__.py new file mode 100644 index 00000000..49ecbbf8 --- /dev/null +++ b/examples/search_engine_streamlit/demo/__init__.py @@ -0,0 +1 @@ +# ***automatically_generated*** diff --git a/examples/search_engine_streamlit/demo/clinical.py b/examples/search_engine_streamlit/demo/clinical.py new file mode 100644 index 00000000..68541b46 --- /dev/null +++ b/examples/search_engine_streamlit/demo/clinical.py @@ -0,0 +1,49 @@ +# ***automatically_generated*** +# ***source json:examples/clinical_pipeline/clinical_onto.json*** +# flake8: noqa +# mypy: ignore-errors +# pylint: skip-file +""" +Automatically generated ontology clinical. Do not change manually. 
+""" + +from dataclasses import dataclass +from forte.data.data_pack import DataPack +from forte.data.ontology.top import Annotation +from ft.onto.base_ontology import EntityMention + +__all__ = [ + "ClinicalEntityMention", + "Description", + "Body", +] + + +@dataclass +class ClinicalEntityMention(EntityMention): + """ + A span based annotation `ClinicalEntityMention`, normally used to represent an Entity Mention in a piece of clinical text. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Description(Annotation): + """ + A span based annotation `Description`, used to represent the description in a piece of clinical note. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) + + +@dataclass +class Body(Annotation): + """ + A span based annotation `Body`, used to represent the actual content in a piece of clinical note. + """ + + def __init__(self, pack: DataPack, begin: int, end: int): + super().__init__(pack, begin, end) diff --git a/examples/search_engine_streamlit/download_models.py b/examples/search_engine_streamlit/download_models.py new file mode 100644 index 00000000..db0d7cca --- /dev/null +++ b/examples/search_engine_streamlit/download_models.py @@ -0,0 +1,25 @@ +from forte.data.data_utils import maybe_download + +# download resources +urls = [ + "https://drive.google.com/file/d/15RSfFkW9syQKtx-_fQ9KshN3BJ27Jf8t/" + "view?usp=sharing", + "https://drive.google.com/file/d/1Nh7D6Xam5JefdoSXRoL7S0DZK1d4i2UK/" + "view?usp=sharing", + "https://drive.google.com/file/d/1YWcI60lGKtTFH01Ai1HnwOKBsrFf2r29/" + "view?usp=sharing", + "https://drive.google.com/file/d/1ElHUEMPQIuWmV0GimroqFphbCvFKskYj/" + "view?usp=sharing", + "https://drive.google.com/file/d/1EhMXlieoEg-bGUbbQ2vN-iyNJvC4Dajl/" + "view?usp=sharing", +] + +filenames = [ + "config.json", + "pytorch_model.bin", + "special_tokens_map.json", + "tokenizer_config.json", + "vocab.txt", 
+] + +maybe_download(urls=urls, path="resources/NCBI-disease", filenames=filenames) diff --git a/examples/search_engine_streamlit/mimic3_note_reader.py b/examples/search_engine_streamlit/mimic3_note_reader.py new file mode 100644 index 00000000..c3733e1f --- /dev/null +++ b/examples/search_engine_streamlit/mimic3_note_reader.py @@ -0,0 +1,80 @@ +# Copyright 2021 The Forte Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import logging +from pathlib import Path +from typing import Any, Iterator, Union, List + +from smart_open import open + +from demo.clinical import Description, Body +from forte.data.data_pack import DataPack +from forte.data.base_reader import PackReader +from ft.onto.base_ontology import Document + + +class Mimic3DischargeNoteReader(PackReader): + """This class is designed to read the discharge notes from MIMIC3 dataset + as plain text packs. + + For more information for the dataset, visit: + https://mimic.physionet.org/ + """ + + def __init__(self): + super().__init__() + self.headers: List[str] = [] + self.text_col = -1 # Default to be last column. + self.description_col = 0 # Default to be first column. + self.__note_count = 0 # Count number of notes processed. 
+ + def _collect(self, mimic3_path: Union[Path, str]) -> Iterator[Any]: # type: ignore + with open(mimic3_path) as f: + for r in csv.reader(f): + if 0 < self.configs.max_num_notes <= self.__note_count: + break + yield r + + def _parse_pack(self, row: List[str]) -> Iterator[DataPack]: + if len(self.headers) == 0: + self.headers.extend(row) + for i, h in enumerate(self.headers): + if h == "TEXT": + self.text_col = i + logging.info("Text Column is %d", i) + if h == "DESCRIPTION": + self.description_col = i + logging.info("Description Column is %d", i) + else: + pack: DataPack = DataPack() + description: str = row[self.description_col] + text: str = row[self.text_col] + delimiter = "\n-----------------\n" + full_text = description + delimiter + text + pack.set_text(full_text) + + Description(pack, 0, len(description)) + Body(pack, len(description) + len(delimiter), len(full_text)) + Document(pack, 0, len(pack.text)) + self.__note_count += 1 + yield pack + + @classmethod + def default_configs(cls): + config = {} + # If this is set (>0), the reader will only read up to + # the number specified. 
+ config["max_num_notes"] = -1 + return config diff --git a/examples/search_engine_streamlit/multiple_pages.py b/examples/search_engine_streamlit/multiple_pages.py new file mode 100644 index 00000000..eb00409e --- /dev/null +++ b/examples/search_engine_streamlit/multiple_pages.py @@ -0,0 +1,180 @@ +from __future__ import annotations +import streamlit as st +from elasticsearch import Elasticsearch +from search_utils import all_search, index_search +import templates +import spacy_streamlit +import json +from pipelines import process_data + + +st.set_page_config(page_title="ForteHealth_Search_Engine", layout="wide") + +PAGES = ["Search Engine", "Plain Text Input"] +es = Elasticsearch(hosts=["http://localhost:9200/"]) +INDEX = "elastic_indexer" + +page = st.sidebar.selectbox("Functions:", PAGES) + +# search engine +if page == PAGES[0]: + r1c1, r1c2 = st.columns([6, 6]) + with r1c1: + st.title("Search the MIMIC III Data...") + search = st.text_input("Enter search words:") + + if not search: + records = {} + results = all_search(es, INDEX) + for i in range(len(results["hits"]["hits"])): + result = results["hits"]["hits"][i] + res = result["_source"] + annotations = json.loads(res["pack_info"])["py/state"]["annotations"] + + ents = [] + for annotation in annotations: + if "EntityMention" in annotation["py/object"]: + ents.append( + { + "start": annotation["py/state"]["_span"]["begin"], + "end": annotation["py/state"]["_span"]["end"], + "label": annotation["py/state"]["ner_type"], + } + ) + records[res["doc_id"]] = [res["content"], ents] + + options = [] + for key in records: + options.append(key) + + if options: + myradio = st.radio( + label="Select a report:", + options=options, + index=0, + format_func=lambda x: f"Report# {x}", + key="radio_demo", + help="Click the radio button please", + ) + + with r1c2: + data = [ + { + "text": records[myradio][0], + "ents": records[myradio][1], + "title": None, + } + ] + + spacy_streamlit.visualize_ner( + data, + labels=["DISEASE"], 
+ show_table=False, + manual=True, + title="Disease NER Detection", + ) + else: + st.warning("No results") + + if search: + records = {} + results = index_search(es, INDEX, search) + total_hits = results["aggregations"]["match_count"]["value"] + # show number of results and time taken + st.write( + templates.number_of_results(total_hits, results["took"] / 1000), + unsafe_allow_html=True, + ) + # search results + for i in range(len(results["hits"]["hits"])): + result = results["hits"]["hits"][i] + res = result["_source"] + annotations = json.loads(res["pack_info"])["py/state"]["annotations"] + + ents = [] + for annotation in annotations: + if "EntityMention" in annotation["py/object"]: + ents.append( + { + "start": annotation["py/state"]["_span"]["begin"], + "end": annotation["py/state"]["_span"]["end"], + "label": annotation["py/state"]["ner_type"], + } + ) + records[res["doc_id"]] = [res["content"], ents] + + options = [] + for key in records: + options.append(key) + + if options: + myradio = st.radio( + label="Select a report:", + options=options, + index=0, + format_func=lambda x: f"Report# {x}", + key="radio_demo", + help="Click the radio button please", + ) + + with r1c2: + data = [ + { + "text": records[myradio][0], + "ents": records[myradio][1], + "title": None, + } + ] + + spacy_streamlit.visualize_ner( + data, + labels=["DISEASE"], + show_table=False, + manual=True, + title="Disease NER Detection", + ) + else: + st.warning("No results") + + +# Plain Text +if page == PAGES[1]: + + st.title("Named Entity Recognition Visualization") + form = st.form("ner") + text = form.text_area( + "Input your text here:", value="The CEO of Tesla is Ellon Musk." 
+ ) + + if form.form_submit_button("Visualize"): + pass + + ents = process_data(text) + + doc = [ + { + "text": text, + "ents": ents, + "title": None, + } + ] + + spacy_streamlit.visualize_ner( + doc, + labels=[ + "ORG", + "DATE", + "NORP", + "ORDINAL", + "CARDINAL", + "PERSON", + "PERSENT", + "GPE", + "QUANTITY", + "LAW", + "MONEY", + ], + show_table=False, + title="NER", + manual=True, + ) diff --git a/examples/search_engine_streamlit/pack_searcher.py b/examples/search_engine_streamlit/pack_searcher.py new file mode 100644 index 00000000..741514c4 --- /dev/null +++ b/examples/search_engine_streamlit/pack_searcher.py @@ -0,0 +1,124 @@ +import os +import logging +import sqlite3 +from typing import Dict, Any, Optional, List +from fortex.elastic import ElasticSearchIndexer + +from forte.common import Resources, ProcessorConfigError +from forte.common.configuration import Config +from forte.data.common_entry_utils import create_utterance, get_last_utterance +from forte.data.data_pack import DataPack +from forte.processors.base import PackProcessor +from ft.onto.base_ontology import Utterance + + +def sqlite_insert(conn, table, row): + cols: str = ", ".join('"{}"'.format(col) for col in row.keys()) + vals: str = ", ".join(":{}".format(col) for col in row.keys()) + sql: str = 'INSERT INTO "{0}" ({1}) VALUES ({2})'.format(table, cols, vals) + cursor = conn.cursor() + cursor.execute(sql, row) + conn.commit() + return cursor.lastrowid + + +def create_links(url_stub: str, ids: List[int]) -> List[str]: + links: List[str] = [] + + url_stub: str = url_stub.strip("/") + for temp_idm in ids: + links.append( + f"Report #{temp_idm}" + ) + return links + + +class LastUtteranceSearcher(PackProcessor): + # pylint: disable=attribute-defined-outside-init + + def initialize(self, resources: Resources, configs: Config): + super().initialize(resources, configs) + self.index = ElasticSearchIndexer(self.configs.indexer.hparams) + if self.configs.query_result_project_id < 0: + raise 
ProcessorConfigError("Query Result Project is not set.") + + if not os.path.exists(self.configs.stave_db_path): + raise ProcessorConfigError( + f"Cannot find Stave DB at: {self.configs.stave_db_path}" + ) + + def _process(self, input_pack: DataPack): + # Make sure we take the last utterance from the user. + utterance: Optional[Utterance] = get_last_utterance(input_pack, "user") + + if utterance is not None: + logging.info("The last utterance is %s", utterance) + # Create the query using the last utterance from user. + size = self.configs.size or 1000 + field = self.configs.field or "content" + query_value = { + "query": {"match": {field: utterance.text}}, + "size": size, + } + + # Search against the index. + results = self.index.search(query_value) + hits = results["hits"]["hits"] + + conn = sqlite3.connect(self.configs.stave_db_path) + + answers = [] + for idx, hit in enumerate(hits): + source = hit["_source"] + # The raw pack string and pack id (not database id) + raw_pack_str: str = source["pack_info"] + pack_id: str = source["doc_id"] + + # Now you can write the pack into the database and generate url. + item = { + "name": f"clinical_results_{idx}", + "textPack": raw_pack_str, + "project_id": self.configs.query_result_project_id, + } + + db_id = sqlite_insert(conn, "stave_backend_document", item) + answers += [db_id] + print(pack_id, db_id) + + if len(answers) == 0: + create_utterance( + input_pack, + "No results found. Please try another query.", + "ai", + ) + else: + links: List[str] = create_links(self.configs.url_stub, answers) + response_text: str = ( + "I found the following results:
-- " + + "
-- ".join(links) + ) + print(response_text) + + create_utterance(input_pack, response_text, "ai") + else: + logging.info("Cannot get another utterance.") + create_utterance( + input_pack, + "Hey, I didn't get what you say, could you try again?", + "ai", + ) + + @classmethod + def default_configs(cls) -> Dict[str, Any]: + return { + "size": 5, + "field": "content", + "indexer": { + "name": "ElasticSearchIndexer", + "hparams": ElasticSearchIndexer.default_configs(), + "other_kwargs": {"request_timeout": 10, "refresh": False}, + }, + "stave_db_path": "~/projects/stave/simple-backend/db.sqlite3", + "url_stub": "http://localhost:3000", + "query_result_project_id": -1, + } diff --git a/examples/search_engine_streamlit/pipelines.py b/examples/search_engine_streamlit/pipelines.py new file mode 100644 index 00000000..5e711fea --- /dev/null +++ b/examples/search_engine_streamlit/pipelines.py @@ -0,0 +1,26 @@ +from forte.data.readers import StringReader +from fortex.spacy import SpacyProcessor +from forte.data.data_pack import DataPack +from forte import Pipeline +from ft.onto.base_ontology import EntityMention +from typing import List + + +def process_data(text: str): + + pipeline: Pipeline = Pipeline[DataPack]() + pipeline.set_reader(StringReader()) + pipeline.add(SpacyProcessor(), {"processors": ["sentence", "tokenize", "ner"]}) + + for pack in pipeline.initialize().process_dataset(text): + pack_ents: List[EntityMention] = list(pack.get(EntityMention)) + + begin = [x.begin for x in pack_ents] + end = [x.end for x in pack_ents] + ner_type = [x.ner_type for x in pack_ents] + + res = [] + for i in range(len(begin)): + res.append({"start": int(begin[i]), "end": int(end[i]), "label": ner_type[i]}) + + return res diff --git a/examples/search_engine_streamlit/search_utils.py b/examples/search_engine_streamlit/search_utils.py new file mode 100644 index 00000000..5aacdca1 --- /dev/null +++ b/examples/search_engine_streamlit/search_utils.py @@ -0,0 +1,56 @@ +def all_search(es, 
index: str) -> dict: + """ + Args: + es: Elasticsearch client instance. + index: Name of the index we are going to use. + size: Number of results returned in each search. + """ + # search query + body = {"query": {"match_all": {}}} + + res = es.search(index=index, body=body) + + return res + + +def index_search(es, index: str, keywords: str) -> dict: + """ + Args: + es: Elasticsearch client instance. + index: Name of the index we are going to use. + keywords: Search keywords. + from_i: Start index of the results for pagination. + size: Number of results returned in each search. + """ + # search query + body = { + "query": { + "bool": { + "must": [ + { + "query_string": { + "query": keywords, + "fields": ["content"], + "default_operator": "AND", + } + } + ], + } + }, + "highlight": { + "pre_tags": [' '], + "post_tags": [""], + "fields": {"content": {}}, + }, + # "from": from_i, + # "size": size, + "aggs": {"match_count": {"value_count": {"field": "_id"}}}, + } + + res = es.search(index=index, body=body) + + return res + + +def do(): + return "no" diff --git a/examples/search_engine_streamlit/templates.py b/examples/search_engine_streamlit/templates.py new file mode 100644 index 00000000..f5013f9c --- /dev/null +++ b/examples/search_engine_streamlit/templates.py @@ -0,0 +1,17 @@ +def number_of_results(total_hits: int, duration: float) -> str: + """HTML scripts to display number of results and duration.""" + return f""" +
+ {total_hits} results ({duration:.2f} seconds) +

+ """ + + +def search_result(highlights: str) -> str: + """HTML scripts to display search results.""" + return f""" + +
+ {highlights} +
+ """ diff --git a/fortex/health/readers/__init__.py b/fortex/health/readers/__init__.py index 076a48e7..d3745f4b 100644 --- a/fortex/health/readers/__init__.py +++ b/fortex/health/readers/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from fortex.health.readers.mimic3_note_reader import *