Skip to content

Commit badf527

Browse files
authored
Change parse methods to support StringIO (#385)
* Change parse methods to support StringIO * Fix flake8, black and mypy * add test * Update parsers.py * Update util.py * Handle comments correctly * Fix issues * Tox fixes * Change test paths * Update parsers.py * Update util.py * making get separator method private * fix comment * Fixing some more tests * Update util.py * Update util.py
1 parent a983827 commit badf527

File tree

6 files changed

+188
-67
lines changed

6 files changed

+188
-67
lines changed

src/sssom/parsers.py

Lines changed: 125 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,24 @@
11
"""SSSOM parsers."""
22

3+
import io
34
import json
45
import logging
56
import re
67
import typing
78
from collections import Counter
89
from pathlib import Path
910
from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, Union, cast
10-
from urllib.request import urlopen
1111
from xml.dom import Node, minidom
1212
from xml.dom.minidom import Document
1313

1414
import numpy as np
1515
import pandas as pd
16-
import validators
16+
import requests
1717
import yaml
1818
from deprecation import deprecated
1919
from linkml_runtime.loaders.json_loader import JSONLoader
20+
from pandas.errors import EmptyDataError
2021
from rdflib import Graph, URIRef
21-
22-
# from .sssom_datamodel import Mapping, MappingSet
2322
from sssom_schema import Mapping, MappingSet
2423

2524
from sssom.constants import (
@@ -70,7 +69,6 @@
7069
get_file_extension,
7170
is_multivalued_slot,
7271
raise_for_bad_path,
73-
read_pandas,
7472
to_mapping_set_dataframe,
7573
)
7674

@@ -86,10 +84,9 @@ def read_sssom_table(
8684
file_path: Union[str, Path],
8785
prefix_map: Optional[PrefixMap] = None,
8886
meta: Optional[MetadataType] = None,
89-
**kwargs,
9087
) -> MappingSetDataFrame:
9188
"""DEPRECATE."""
92-
return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta, kwargs=kwargs)
89+
return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta)
9390

9491

9592
@deprecated(
@@ -134,22 +131,130 @@ def read_sssom_json(
134131
# Parsers (from file)
135132

136133

134+
def _open_input(input: Union[str, Path, TextIO]) -> io.StringIO:
    """Transform a URL, a file path, raw file contents or a text stream into a StringIO object.

    :param input: One of

        - an ``io.StringIO``, which is returned unchanged,
        - a ``Path`` object representing a local file path,
        - a string holding an ``http(s)://`` URL, a local file path, or the file
          contents themselves (recognised by the presence of a line break),
        - any other object with a ``read()`` method returning text, e.g. an open
          file handle; whatever it has left to read is copied into the result.
    :return: A StringIO object containing the input data.
    :raises IOError: If the type of the input cannot be determined. A failed HTTP
        request raises a ``requests`` exception.
    """
    # NOTE: the parameter shadows the builtin ``input``; the name is kept so that
    # keyword callers keep working.

    # If the input already is a StringIO, return it as is.
    if isinstance(input, io.StringIO):
        return input
    if isinstance(input, Path):
        input = str(input)

    if isinstance(input, str):
        if input.startswith(("http://", "https://")):
            # It's a URL. Fail on HTTP error codes instead of handing the body
            # of an error page to the table parser.
            response = requests.get(input, timeout=30)
            response.raise_for_status()
            return io.StringIO(response.content.decode("utf-8"))
        if "\n" in input or "\r" in input:
            # A line break means the string is the data itself, not a location.
            return io.StringIO(input)
        # It's a local file path. Decode as UTF-8, like the URL branch, rather
        # than with the platform dependent default encoding.
        with open(input, "r", encoding="utf-8") as file:
            return io.StringIO(file.read())

    # Any other readable text stream (e.g. the result of ``open(...)``).
    reader = getattr(input, "read", None)
    if callable(reader):
        content = reader()
        if isinstance(content, str):
            return io.StringIO(content)

    raise IOError(f"Could not determine the type of input {input}")
162+
163+
164+
def _separate_metadata_and_table_from_stream(s: io.StringIO) -> Tuple[io.StringIO, io.StringIO]:
    """Split a stream into its leading ``#`` comment block and its table rows.

    The stream is read from its beginning. Lines starting with ``#`` that come
    before the first line without that prefix are copied, unchanged, to the
    metadata component. Every line not starting with ``#`` is copied to the
    table component. Lines starting with ``#`` that appear after the first
    table line are dropped and reported at info level.

    :param s: The stream to split.
    :return: The table component and the metadata component, both rewound to
        their start.
    """
    s.seek(0)

    table_buffer = io.StringIO()
    metadata_buffer = io.StringIO()
    in_header = True

    for raw_line in s:
        if raw_line.startswith("#"):
            if in_header:
                metadata_buffer.write(raw_line)
            else:
                logging.info(
                    f"Line {raw_line} is starting with hash symbol, but header section is already passed. "
                    f"This line is skipped"
                )
            continue

        # The first line without a hash symbol closes the header section.
        table_buffer.write(raw_line)
        in_header = False

    # Rewind both buffers so that consumers read them from the start.
    for buffer in (table_buffer, metadata_buffer):
        buffer.seek(0)
    return table_buffer, metadata_buffer
191+
192+
193+
def _read_pandas_and_metadata(
    input: io.StringIO, sep: Optional[str] = None
) -> Tuple[pd.DataFrame, Dict[str, Any]]:
    """Read a table and its embedded metadata from a stream.

    The stream is first split into its leading ``#`` comment block and the table
    rows. The rows are parsed with :func:`pd.read_csv`; the comment block is
    handed to ``_read_metadata_from_table``.

    :param input: The stream to read. It is read from its beginning.
    :param sep: Column separator for pandas. If ``None``, pandas is left to
        detect the separator itself.
    :return: A pair of the pandas dataframe, with missing values replaced by
        empty strings, and the metadata read from the comment block. If the
        stream contains no table at all, the dataframe is empty and only has
        the subject, predicate, object and justification columns.
    """
    table_stream, metadata_stream = _separate_metadata_and_table_from_stream(input)

    # With ``sep=None`` only the python engine of pandas can detect the
    # separator. Asking for it explicitly avoids the ParserWarning pandas emits
    # when it has to fall back from the C engine on its own.
    read_options: Dict[str, Any] = {"sep": sep}
    if sep is None:
        read_options["engine"] = "python"

    try:
        df = pd.read_csv(table_stream, **read_options)
    except EmptyDataError as e:
        logging.warning(f"Seems like the dataframe is empty: {e}")
        df = pd.DataFrame(
            columns=[
                SUBJECT_ID,
                SUBJECT_LABEL,
                PREDICATE_ID,
                OBJECT_ID,
                MAPPING_JUSTIFICATION,
            ]
        )
    else:
        df = df.fillna("")

    sssom_metadata = _read_metadata_from_table(metadata_stream)
    return df, sssom_metadata
222+
223+
224+
def _get_seperator_symbol_from_file_path(file: Union[str, Path, TextIO]) -> Optional[str]:
    r"""
    Take a file path as input and return the separator symbol used, for example, by pandas.

    :param file: the file path (or URL), as a string or a Path object. Any other
        object, e.g. an open stream, has no file name to inspect.
    :return: the separator symbol as a string, i.e. '\t' for the extension ``tsv``
        and ',' for ``csv``, or None if no separator can be derived from the input.
    """
    if not isinstance(file, (Path, str)):
        return None

    if isinstance(file, str) and ("\n" in file or "\r" in file):
        # A string with line breaks is file content, not a location
        # (``_open_input`` reads it as data). Whatever follows its last "." is
        # not a file extension, and the content must not end up in the log.
        return None

    extension = get_file_extension(file)
    if extension == "tsv":
        return "\t"
    if extension == "csv":
        return ","

    logging.warning(f"Could not guess file extension for {file}")
    return None
239+
240+
137241
def parse_sssom_table(
138-
file_path: Union[str, Path],
242+
file_path: Union[str, Path, TextIO],
139243
prefix_map: Optional[PrefixMap] = None,
140244
meta: Optional[MetadataType] = None,
141-
**kwargs
142-
# mapping_predicates: Optional[List[str]] = None,
245+
**kwargs,
143246
) -> MappingSetDataFrame:
144247
"""Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
145-
raise_for_bad_path(file_path)
146-
df = read_pandas(file_path)
248+
if isinstance(file_path, Path) or isinstance(file_path, str):
249+
raise_for_bad_path(file_path)
250+
stream: io.StringIO = _open_input(file_path)
251+
sep_new = _get_seperator_symbol_from_file_path(file_path)
252+
df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
147253
# if mapping_predicates:
148254
# # Filter rows based on presence of predicate_id list provided.
149255
# df = df[df["predicate_id"].isin(mapping_predicates)]
150256

151257
# If SSSOM external metadata is provided, merge it with the internal metadata
152-
sssom_metadata = _read_metadata_from_table(file_path)
153258

154259
if sssom_metadata:
155260
if meta:
@@ -733,24 +838,13 @@ def _swap_object_subject(mapping: Mapping) -> Mapping:
733838
return mapping
734839

735840

736-
def _read_metadata_from_table(path: Union[str, Path]) -> Dict[str, Any]:
737-
if isinstance(path, Path) or not validators.url(path):
738-
with open(path) as file:
739-
yamlstr = ""
740-
for line in file:
741-
if line.startswith("#"):
742-
yamlstr += re.sub("^#", "", line)
743-
else:
744-
break
745-
else:
746-
response = urlopen(path)
747-
yamlstr = ""
748-
for lin in response:
749-
line = lin.decode("utf-8")
750-
if line.startswith("#"):
751-
yamlstr += re.sub("^#", "", line)
752-
else:
753-
break
841+
def _read_metadata_from_table(stream: io.StringIO) -> Dict[str, Any]:
842+
yamlstr = ""
843+
for line in stream:
844+
if line.startswith("#"):
845+
yamlstr += re.sub("^#", "", line)
846+
else:
847+
break
754848

755849
if yamlstr:
756850
meta = yaml.safe_load(yamlstr)

src/sssom/util.py

Lines changed: 32 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from functools import reduce
1010
from io import StringIO
1111
from pathlib import Path
12+
from string import punctuation
1213
from typing import (
1314
Any,
1415
ChainMap,
@@ -24,6 +25,7 @@
2425
)
2526
from urllib.request import urlopen
2627

28+
import deprecation
2729
import numpy as np
2830
import pandas as pd
2931
import validators
@@ -852,23 +854,28 @@ def get_file_extension(file: Union[str, Path, TextIO]) -> str:
852854
"""Get file extension.
853855
854856
:param file: File path
855-
:raises Exception: Cannot determine extension exception
856-
:return: format of the file passed
857+
:return: format of the file passed, default tsv
857858
"""
858-
if isinstance(file, str):
859+
if isinstance(file, Path):
860+
if file.suffix:
861+
return file.suffix.strip(punctuation)
862+
else:
863+
logging.warning(
864+
f"Cannot guess format from {file}, despite appearing to be a Path-like object."
865+
)
866+
elif isinstance(file, str):
859867
filename = file
860-
elif isinstance(file, Path):
861-
return file.suffix
862-
else:
863-
filename = file.name
864-
parts = filename.split(".")
865-
if len(parts) > 0:
866-
f_format = parts[-1]
867-
return f_format
868-
else:
869-
raise Exception(f"Cannot guess format from {filename}")
868+
parts = filename.split(".")
869+
if len(parts) > 0:
870+
f_format = parts[-1]
871+
return f_format.strip(punctuation)
872+
else:
873+
logging.warning(f"Cannot guess format from {filename}")
874+
logging.info("Cannot guess format extension for this file, assuming TSV.")
875+
return "tsv"
870876

871877

878+
@deprecation.deprecated(details="Use pandas.read_csv() instead.")
872879
def read_csv(
873880
filename: Union[str, Path, TextIO], comment: str = "#", sep: str = ","
874881
) -> pd.DataFrame:
@@ -923,6 +930,7 @@ def read_metadata(filename: str) -> Metadata:
923930
return Metadata(prefix_map=prefix_map, metadata=metadata)
924931

925932

933+
@deprecation.deprecated(details="Use pandas.read_csv() instead.")
926934
def read_pandas(file: Union[str, Path, TextIO], sep: Optional[str] = None) -> pd.DataFrame:
927935
"""Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
928936
@@ -931,15 +939,14 @@ def read_pandas(file: Union[str, Path, TextIO], sep: Optional[str] = None) -> pd
931939
:return: A pandas dataframe
932940
"""
933941
if sep is None:
934-
extension = get_file_extension(file)
935-
if extension == "tsv":
936-
sep = "\t"
937-
elif extension == "csv":
938-
sep = ","
939-
else:
940-
sep = "\t"
941-
logging.warning("Cannot automatically determine table format, trying tsv.")
942-
df = read_csv(file, comment="#", sep=sep).fillna("")
942+
if isinstance(file, Path) or isinstance(file, str):
943+
extension = get_file_extension(file)
944+
if extension == "tsv":
945+
sep = "\t"
946+
elif extension == "csv":
947+
sep = ","
948+
logging.warning(f"Could not guess file extension for {file}")
949+
df = read_csv(file, comment="#", sep=sep).fillna("")
943950
return sort_df_rows_columns(df)
944951

945952

@@ -1188,7 +1195,7 @@ def filter_prefixes(
11881195
return pd.DataFrame(rows) if rows else pd.DataFrame(columns=features)
11891196

11901197

1191-
# TODO this is not used anywhere
1198+
@deprecation.deprecated(details="This is no longer used and will be removed from the public API.")
11921199
def guess_file_format(filename: Union[str, TextIO]) -> str:
11931200
"""Get file format.
11941201
@@ -1259,6 +1266,8 @@ def raise_for_bad_path(file_path: Union[str, Path]) -> None:
12591266
if isinstance(file_path, Path):
12601267
if not file_path.is_file():
12611268
raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
1269+
elif not isinstance(file_path, str):
1270+
logging.info("Path provided to raise_for_bad_path() is neither a Path nor str-like object.")
12621271
elif not validators.url(file_path) and not os.path.exists(file_path):
12631272
raise FileNotFoundError(f"{file_path} is not a valid file path or url.")
12641273

src/sssom/writers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import pandas as pd
99
import yaml
10+
from deprecation import deprecated
1011
from jsonasobj2 import JsonObj
1112
from linkml_runtime.dumpers import JSONDumper, rdflib_dumper
1213
from linkml_runtime.utils.schemaview import SchemaView
@@ -161,6 +162,9 @@ def write_ontoportal_json(
161162
# Converters convert a mappingsetdataframe to an object of the supportes types (json, pandas dataframe)
162163

163164

165+
@deprecated(
166+
details="Use df variable of 'MappingSetDataFrame' instead (msdf.df).",
167+
)
164168
def to_dataframe(msdf: MappingSetDataFrame) -> pd.DataFrame:
165169
"""Convert a mapping set dataframe to a dataframe."""
166170
data = []

tests/test_cli.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
split,
2828
validate,
2929
)
30+
from tests.constants import data_dir
3031
from tests.test_data import (
3132
RECON_YAML,
3233
SSSOMTestCase,
@@ -35,8 +36,6 @@
3536
test_out_dir,
3637
)
3738

38-
from .constants import data_dir
39-
4039

4140
class SSSOMCLITestSuite(unittest.TestCase):
4241
"""A test case for the dynamic CLI tests."""

0 commit comments

Comments
 (0)