Change parse methods to support StringIO (#385)

matentzn · web-flow · commit badf527a0ac4 · 2023-06-26T10:24:38.000-05:00
* Change parse methods to support StringIO

* Fix flake8, black and mypy

* add test

* Update parsers.py

* Update util.py

* Handle comments correctly

* Fix issues

* Tox fixes

* Change test paths

* Update parsers.py

* Update util.py

* making get separator method private

* fix comment

* Fixing some more tests

* Update util.py

* Update util.py
diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
@@ -1,25 +1,24 @@
 """SSSOM parsers."""
 
+import io
 import json
 import logging
 import re
 import typing
 from collections import Counter
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, Union, cast
-from urllib.request import urlopen
 from xml.dom import Node, minidom
 from xml.dom.minidom import Document
 
 import numpy as np
 import pandas as pd
-import validators
+import requests
 import yaml
 from deprecation import deprecated
 from linkml_runtime.loaders.json_loader import JSONLoader
+from pandas.errors import EmptyDataError
 from rdflib import Graph, URIRef
-
-# from .sssom_datamodel import Mapping, MappingSet
 from sssom_schema import Mapping, MappingSet
 
 from sssom.constants import (
@@ -70,7 +69,6 @@
     get_file_extension,
     is_multivalued_slot,
     raise_for_bad_path,
-    read_pandas,
     to_mapping_set_dataframe,
 )
 
@@ -86,10 +84,9 @@ def read_sssom_table(
     file_path: Union[str, Path],
     prefix_map: Optional[PrefixMap] = None,
     meta: Optional[MetadataType] = None,
-    **kwargs,
 ) -> MappingSetDataFrame:
     """DEPRECATE."""
-    return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta, kwargs=kwargs)
+    return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta)
 
 
 @deprecated(
@@ -134,22 +131,130 @@ def read_sssom_json(
 # Parsers (from file)
 
 
+def _open_input(input: Union[str, Path, TextIO]) -> io.StringIO:
+    """Transform a URL, a filepath (from pathlib), or a string (with file contents) to a StringIO object.
+
+    :param input: A string representing a URL, a filepath, or file contents,
+                              or a Path object representing a filepath.
+    :return: A StringIO object containing the input data.
+    """
+    # If the input is already a StringIO, return it unchanged
+    if isinstance(input, io.StringIO):
+        return input
+    elif isinstance(input, Path):
+        input = str(input)
+
+    if isinstance(input, str):
+        if input.startswith("http://") or input.startswith("https://"):
+            # It's a URL
+            data = requests.get(input, timeout=30).content
+            return io.StringIO(data.decode("utf-8"))
+        elif "\n" in input or "\r" in input:
+            # It's string data
+            return io.StringIO(input)
+        else:
+            # It's a local file path
+            with open(input, "r") as file:
+                file_content = file.read()
+            return io.StringIO(file_content)
+
+    raise IOError(f"Could not determine the type of input {input}")
+
+
+def _separate_metadata_and_table_from_stream(s: io.StringIO):
+    s.seek(0)
+
+    # Create a new StringIO object for filtered data
+    table_component = io.StringIO()
+    metadata_component = io.StringIO()
+
+    header_section = True
+
+    # Filter out lines starting with '#'
+    for line in s:
+        if not line.startswith("#"):
+            table_component.write(line)
+            if header_section:
+                header_section = False
+        elif header_section:
+            metadata_component.write(line)
+        else:
+            logging.info(
+                f"Line {line} is starting with hash symbol, but header section is already passed. "
+                f"This line is skipped"
+            )
+
+    # Reset the cursor to the start of the new StringIO object
+    table_component.seek(0)
+    metadata_component.seek(0)
+    return table_component, metadata_component
+
+
+def _read_pandas_and_metadata(input: io.StringIO, sep: str = None):
+    """Read a tabular data stream by wrapping :func:`pd.read_csv` to handle comment lines correctly.
+
+    :param input: The stream to read; leading lines starting with '#' are treated as metadata.
+    :param sep: Column separator passed on to :func:`pd.read_csv`
+    :return: A tuple of the pandas dataframe and the SSSOM metadata read from the comment header
+    """
+    table_stream, metadata_stream = _separate_metadata_and_table_from_stream(input)
+
+    try:
+        df = pd.read_csv(table_stream, sep=sep)
+        df.fillna("", inplace=True)
+    except EmptyDataError as e:
+        logging.warning(f"Seems like the dataframe is empty: {e}")
+        df = pd.DataFrame(
+            columns=[
+                SUBJECT_ID,
+                SUBJECT_LABEL,
+                PREDICATE_ID,
+                OBJECT_ID,
+                MAPPING_JUSTIFICATION,
+            ]
+        )
+
+    if isinstance(df, pd.DataFrame):
+        sssom_metadata = _read_metadata_from_table(metadata_stream)
+        return df, sssom_metadata
+
+    return None, None
+
+
+def _get_seperator_symbol_from_file_path(file):
+    r"""
+    Take a file path as input and return the separator symbol used, for example, by pandas.
+
+    :param file: the file path
+    :return: the separator symbol as a string, e.g. '\t', or None if it cannot be determined
+    """
+    if isinstance(file, Path) or isinstance(file, str):
+        extension = get_file_extension(file)
+        if extension == "tsv":
+            return "\t"
+        elif extension == "csv":
+            return ","
+        logging.warning(f"Could not guess file extension for {file}")
+    return None
+
+
 def parse_sssom_table(
-    file_path: Union[str, Path],
+    file_path: Union[str, Path, TextIO],
     prefix_map: Optional[PrefixMap] = None,
     meta: Optional[MetadataType] = None,
-    **kwargs
-    # mapping_predicates: Optional[List[str]] = None,
+    **kwargs,
 ) -> MappingSetDataFrame:
     """Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
-    raise_for_bad_path(file_path)
-    df = read_pandas(file_path)
+    if isinstance(file_path, Path) or isinstance(file_path, str):
+        raise_for_bad_path(file_path)
+    stream: io.StringIO = _open_input(file_path)
+    sep_new = _get_seperator_symbol_from_file_path(file_path)
+    df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
     # if mapping_predicates:
     #     # Filter rows based on presence of predicate_id list provided.
     #     df = df[df["predicate_id"].isin(mapping_predicates)]
 
     # If SSSOM external metadata is provided, merge it with the internal metadata
-    sssom_metadata = _read_metadata_from_table(file_path)
 
     if sssom_metadata:
         if meta:
@@ -733,24 +838,13 @@ def _swap_object_subject(mapping: Mapping) -> Mapping:
     return mapping
 
 
-def _read_metadata_from_table(path: Union[str, Path]) -> Dict[str, Any]:
-    if isinstance(path, Path) or not validators.url(path):
-        with open(path) as file:
-            yamlstr = ""
-            for line in file:
-                if line.startswith("#"):
-                    yamlstr += re.sub("^#", "", line)
-                else:
-                    break
-    else:
-        response = urlopen(path)
-        yamlstr = ""
-        for lin in response:
-            line = lin.decode("utf-8")
-            if line.startswith("#"):
-                yamlstr += re.sub("^#", "", line)
-            else:
-                break
+def _read_metadata_from_table(stream: io.StringIO) -> Dict[str, Any]:
+    yamlstr = ""
+    for line in stream:
+        if line.startswith("#"):
+            yamlstr += re.sub("^#", "", line)
+        else:
+            break
 
     if yamlstr:
         meta = yaml.safe_load(yamlstr)
diff --git a/src/sssom/util.py b/src/sssom/util.py
@@ -9,6 +9,7 @@
 from functools import reduce
 from io import StringIO
 from pathlib import Path
+from string import punctuation
 from typing import (
     Any,
     ChainMap,
@@ -24,6 +25,7 @@
 )
 from urllib.request import urlopen
 
+import deprecation
 import numpy as np
 import pandas as pd
 import validators
@@ -852,23 +854,28 @@ def get_file_extension(file: Union[str, Path, TextIO]) -> str:
     """Get file extension.
 
     :param file: File path
-    :raises Exception: Cannot determine extension exception
-    :return: format of the file passed
+    :return: format of the file passed, default tsv
     """
-    if isinstance(file, str):
+    if isinstance(file, Path):
+        if file.suffix:
+            return file.suffix.strip(punctuation)
+        else:
+            logging.warning(
+                f"Cannot guess format from {file}, despite appearing to be a Path-like object."
+            )
+    elif isinstance(file, str):
         filename = file
-    elif isinstance(file, Path):
-        return file.suffix
-    else:
-        filename = file.name
-    parts = filename.split(".")
-    if len(parts) > 0:
-        f_format = parts[-1]
-        return f_format
-    else:
-        raise Exception(f"Cannot guess format from {filename}")
+        parts = filename.split(".")
+        if len(parts) > 0:
+            f_format = parts[-1]
+            return f_format.strip(punctuation)
+        else:
+            logging.warning(f"Cannot guess format from {filename}")
+    logging.info("Cannot guess format extension for this file, assuming TSV.")
+    return "tsv"
 
 
+@deprecation.deprecated(details="Use pandas.read_csv() instead.")
 def read_csv(
     filename: Union[str, Path, TextIO], comment: str = "#", sep: str = ","
 ) -> pd.DataFrame:
@@ -923,6 +930,7 @@ def read_metadata(filename: str) -> Metadata:
     return Metadata(prefix_map=prefix_map, metadata=metadata)
 
 
+@deprecation.deprecated(details="Use pandas.read_csv() instead.")
 def read_pandas(file: Union[str, Path, TextIO], sep: Optional[str] = None) -> pd.DataFrame:
     """Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
 
@@ -931,15 +939,14 @@ def read_pandas(file: Union[str, Path, TextIO], sep: Optional[str] = None) -> pd
     :return: A pandas dataframe
     """
     if sep is None:
-        extension = get_file_extension(file)
-        if extension == "tsv":
-            sep = "\t"
-        elif extension == "csv":
-            sep = ","
-        else:
-            sep = "\t"
-            logging.warning("Cannot automatically determine table format, trying tsv.")
-        df = read_csv(file, comment="#", sep=sep).fillna("")
+        if isinstance(file, Path) or isinstance(file, str):
+            extension = get_file_extension(file)
+            if extension == "tsv":
+                sep = "\t"
+            elif extension == "csv":
+                sep = ","
+            logging.warning(f"Could not guess file extension for {file}")
+    df = read_csv(file, comment="#", sep=sep).fillna("")
     return sort_df_rows_columns(df)
 
 
@@ -1188,7 +1195,7 @@ def filter_prefixes(
     return pd.DataFrame(rows) if rows else pd.DataFrame(columns=features)
 
 
-# TODO this is not used anywhere
+@deprecation.deprecated(details="This is no longer used and will be removed from the public API.")
 def guess_file_format(filename: Union[str, TextIO]) -> str:
     """Get file format.
 
@@ -1259,6 +1266,8 @@ def raise_for_bad_path(file_path: Union[str, Path]) -> None:
     if isinstance(file_path, Path):
         if not file_path.is_file():
             raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
+    elif not isinstance(file_path, str):
+        logging.info("Path provided to raise_for_bad_path() is neither a Path nor str-like object.")
     elif not validators.url(file_path) and not os.path.exists(file_path):
         raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
 
diff --git a/src/sssom/writers.py b/src/sssom/writers.py
@@ -7,6 +7,7 @@
 
 import pandas as pd
 import yaml
+from deprecation import deprecated
 from jsonasobj2 import JsonObj
 from linkml_runtime.dumpers import JSONDumper, rdflib_dumper
 from linkml_runtime.utils.schemaview import SchemaView
@@ -161,6 +162,9 @@ def write_ontoportal_json(
 # Converters convert a mappingsetdataframe to an object of the supportes types (json, pandas dataframe)
 
 
+@deprecated(
+    details="Use df variable of 'MappingSetDataFrame' instead (msdf.df).",
+)
 def to_dataframe(msdf: MappingSetDataFrame) -> pd.DataFrame:
     """Convert a mapping set dataframe to a dataframe."""
     data = []
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -27,6 +27,7 @@
     split,
     validate,
 )
+from tests.constants import data_dir
 from tests.test_data import (
     RECON_YAML,
     SSSOMTestCase,
@@ -35,8 +36,6 @@
     test_out_dir,
 )
 
-from .constants import data_dir
-
 
 class SSSOMCLITestSuite(unittest.TestCase):
     """A test case for the dynamic CLI tests."""
diff --git a/tests/test_conversion.py b/tests/test_conversion.py
diff --git a/tests/test_parsers.py b/tests/test_parsers.py

Original file line number	Diff line number	Diff line change
`@@ -27,6 +27,7 @@`
`27`	`27`	`split,`
`28`	`28`	`validate,`
`29`	`29`	`)`
	`30`	`+from tests.constants import data_dir`
`30`	`31`	`from tests.test_data import (`
`31`	`32`	`RECON_YAML,`
`32`	`33`	`SSSOMTestCase,`
`@@ -35,8 +36,6 @@`
`35`	`36`	`test_out_dir,`
`36`	`37`	`)`
`37`	`38`
`38`		`-from .constants import data_dir`
`39`		`-`
`40`	`39`
`41`	`40`	`class SSSOMCLITestSuite(unittest.TestCase):`
`42`	`41`	`"""A test case for the dynamic CLI tests."""`