11"""SSSOM parsers."""
22
3+ import io
34import json
45import logging
56import re
67import typing
78from collections import Counter
89from pathlib import Path
910from typing import Any , Callable , Dict , List , Optional , TextIO , Tuple , Union , cast
10- from urllib .request import urlopen
1111from xml .dom import Node , minidom
1212from xml .dom .minidom import Document
1313
1414import numpy as np
1515import pandas as pd
16- import validators
16+ import requests
1717import yaml
1818from deprecation import deprecated
1919from linkml_runtime .loaders .json_loader import JSONLoader
20+ from pandas .errors import EmptyDataError
2021from rdflib import Graph , URIRef
21-
22- # from .sssom_datamodel import Mapping, MappingSet
2322from sssom_schema import Mapping , MappingSet
2423
2524from sssom .constants import (
7069 get_file_extension ,
7170 is_multivalued_slot ,
7271 raise_for_bad_path ,
73- read_pandas ,
7472 to_mapping_set_dataframe ,
7573)
7674
@@ -86,10 +84,9 @@ def read_sssom_table(
     file_path: Union[str, Path],
     prefix_map: Optional[PrefixMap] = None,
     meta: Optional[MetadataType] = None,
-    **kwargs,
 ) -> MappingSetDataFrame:
     """DEPRECATE."""
-    return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta, kwargs=kwargs)
+    return parse_sssom_table(file_path=file_path, prefix_map=prefix_map, meta=meta)


 @deprecated(
@@ -134,22 +131,130 @@ def read_sssom_json(
 # Parsers (from file)


+def _open_input(input: Union[str, Path, TextIO]) -> io.StringIO:
+    """Transform a URL, a filepath (from pathlib), or a string (with file contents) to a StringIO object.
+
+    :param input: A string representing a URL, a filepath, or file contents,
+        or a Path object representing a filepath.
+    :return: A StringIO object containing the input data.
+    """
+    # If the input already is a StringIO, return it as-is
+    if isinstance(input, io.StringIO):
+        return input
+    elif isinstance(input, Path):
+        input = str(input)
+
+    if isinstance(input, str):
+        if input.startswith("http://") or input.startswith("https://"):
+            # It's a URL
+            data = requests.get(input, timeout=30).content
+            return io.StringIO(data.decode("utf-8"))
+        elif "\n" in input or "\r" in input:
+            # It's string data
+            return io.StringIO(input)
+        else:
+            # It's a local file path
+            with open(input, "r") as file:
+                file_content = file.read()
+            return io.StringIO(file_content)
+
+    raise IOError(f"Could not determine the type of input {input}")
+
+
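A minimal usage sketch of the new helper, assuming the module is importable as
sssom.parsers; the file name and URL below are placeholders:

    from pathlib import Path
    from sssom.parsers import _open_input

    buf = _open_input(Path("mappings.sssom.tsv"))                # local file
    buf = _open_input("https://example.org/mappings.sssom.tsv")  # remote URL
    buf = _open_input("subject_id\tobject_id\nA:1\tB:2\n")       # raw file contents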
+def _separate_metadata_and_table_from_stream(s: io.StringIO):
+    s.seek(0)
+
+    # Create new StringIO objects for the filtered data
+    table_component = io.StringIO()
+    metadata_component = io.StringIO()
+
+    header_section = True
+
+    # Lines starting with '#' belong to the metadata header; all other lines are table data
+    for line in s:
+        if not line.startswith("#"):
+            table_component.write(line)
+            if header_section:
+                header_section = False
+        elif header_section:
+            metadata_component.write(line)
+        else:
+            logging.info(
+                f"Line {line} starts with a hash symbol, but the header section has already been passed. "
+                f"This line is skipped."
+            )
+
+    # Reset the cursors to the start of the new StringIO objects
+    table_component.seek(0)
+    metadata_component.seek(0)
+    return table_component, metadata_component
+
+
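A quick illustration of the split, using a made-up two-line input; note that
the metadata component keeps its leading hash symbols, which are only stripped
later by _read_metadata_from_table:

    s = io.StringIO("#mapping_set_id: https://example.org/ms\nsubject_id\tobject_id\nA:1\tB:2\n")
    table, metadata = _separate_metadata_and_table_from_stream(s)
    table.read()     # 'subject_id\tobject_id\nA:1\tB:2\n'
    metadata.read()  # '#mapping_set_id: https://example.org/ms\n'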
+def _read_pandas_and_metadata(input: io.StringIO, sep: Optional[str] = None):
+    """Read a tabular data stream by wrapping :func:`pd.read_csv` to handle comment lines correctly.
+
+    :param input: The stream to read
+    :param sep: File separator for pandas
+    :return: A pandas DataFrame and the metadata dictionary parsed from the comment header
+    """
+    table_stream, metadata_stream = _separate_metadata_and_table_from_stream(input)
+
+    try:
+        df = pd.read_csv(table_stream, sep=sep)
+        df.fillna("", inplace=True)
+    except EmptyDataError as e:
+        logging.warning(f"The dataframe appears to be empty: {e}")
+        df = pd.DataFrame(
+            columns=[
+                SUBJECT_ID,
+                SUBJECT_LABEL,
+                PREDICATE_ID,
+                OBJECT_ID,
+                MAPPING_JUSTIFICATION,
+            ]
+        )
+
+    if isinstance(df, pd.DataFrame):
+        sssom_metadata = _read_metadata_from_table(metadata_stream)
+        return df, sssom_metadata
+
+    return None, None
+
+
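An end-to-end sketch of the two helpers combined (the header and rows here are
made up):

    stream = _open_input("#mapping_set_id: https://example.org/ms\nsubject_id\tobject_id\nA:1\tB:2\n")
    df, meta = _read_pandas_and_metadata(stream, sep="\t")
    # df has columns subject_id and object_id with one row;
    # meta == {"mapping_set_id": "https://example.org/ms"}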
+def _get_seperator_symbol_from_file_path(file):
+    r"""
+    Take a filepath as input and return the separator symbol used, for example, by pandas.
+
+    :param file: the file path
+    :return: the separator symbol as a string, e.g. '\t'
+    """
+    if isinstance(file, (Path, str)):
+        extension = get_file_extension(file)
+        if extension == "tsv":
+            return "\t"
+        elif extension == "csv":
+            return ","
+        logging.warning(f"Could not guess file extension for {file}")
+    return None
+
+
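The check is purely name-based, so the file does not need to exist; this
assumes get_file_extension returns the bare extension string:

    _get_seperator_symbol_from_file_path("mappings.sssom.tsv")  # '\t'
    _get_seperator_symbol_from_file_path(Path("mappings.csv"))  # ','
    _get_seperator_symbol_from_file_path("mappings.txt")        # None, with a warning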
 def parse_sssom_table(
-    file_path: Union[str, Path],
+    file_path: Union[str, Path, TextIO],
     prefix_map: Optional[PrefixMap] = None,
     meta: Optional[MetadataType] = None,
-    **kwargs
-    # mapping_predicates: Optional[List[str]] = None,
+    **kwargs,
 ) -> MappingSetDataFrame:
     """Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
-    raise_for_bad_path(file_path)
-    df = read_pandas(file_path)
+    if isinstance(file_path, (Path, str)):
+        raise_for_bad_path(file_path)
+    stream: io.StringIO = _open_input(file_path)
+    sep_new = _get_seperator_symbol_from_file_path(file_path)
+    df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
     # if mapping_predicates:
     #     # Filter rows based on presence of predicate_id list provided.
     #     df = df[df["predicate_id"].isin(mapping_predicates)]

     # If SSSOM external metadata is provided, merge it with the internal metadata
-    sssom_metadata = _read_metadata_from_table(file_path)

     if sssom_metadata:
         if meta:
@@ -733,24 +838,13 @@ def _swap_object_subject(mapping: Mapping) -> Mapping:
     return mapping


-def _read_metadata_from_table(path: Union[str, Path]) -> Dict[str, Any]:
-    if isinstance(path, Path) or not validators.url(path):
-        with open(path) as file:
-            yamlstr = ""
-            for line in file:
-                if line.startswith("#"):
-                    yamlstr += re.sub("^#", "", line)
-                else:
-                    break
-    else:
-        response = urlopen(path)
-        yamlstr = ""
-        for lin in response:
-            line = lin.decode("utf-8")
-            if line.startswith("#"):
-                yamlstr += re.sub("^#", "", line)
-            else:
-                break
+def _read_metadata_from_table(stream: io.StringIO) -> Dict[str, Any]:
+    yamlstr = ""
+    for line in stream:
+        if line.startswith("#"):
+            yamlstr += re.sub("^#", "", line)
+        else:
+            break

     if yamlstr:
         meta = yaml.safe_load(yamlstr)