66import json
77import logging as _logging
88import os .path
9- import re
109import typing
1110from collections import ChainMap , Counter
1211from pathlib import Path
@@ -119,57 +118,40 @@ def _open_input(p: PathOrIO) -> TextIO:
119118 return io .StringIO (file_content )
120119
121120
122- def _separate_metadata_and_table_from_stream (stream : TextIO ):
123- stream .seek (0 )
124-
125- # Create a new StringIO object for filtered data
126- table_component = io .StringIO ()
127- metadata_component = io .StringIO ()
128-
129- header_section = True
130-
131- # Filter out lines starting with '#'
132- for line in stream :
133- if not line .startswith ("#" ):
134- table_component .write (line )
135- if header_section :
136- header_section = False
137- elif header_section :
138- # We strip any trailing tabs. Such tabs may have been left
139- # by a spreadsheet editor who treated the header lines as
140- # if they were normal data lines; they would prevent the
141- # YAML parser from correctly parsing the metadata block.
142- metadata_component .write (line .rstrip ("\t \n " ) + "\n " )
143- else :
144- logging .info (
145- f"Line { line } is starting with hash symbol, but header section is already passed. "
146- f"This line is skipped"
147- )
148-
149- # Reset the cursor to the start of the new StringIO object
150- table_component .seek (0 )
151- metadata_component .seek (0 )
152- return table_component , metadata_component
153-
154-
155- def _read_pandas_and_metadata (file_path : PathOrIO , sep : Optional [str ] = None ):
121+ def _read_pandas_and_metadata (
122+ file_path : Union [str , Path , TextIO ], sep : Optional [str ] = None
123+ ) -> tuple [pd .DataFrame , MetadataType ]:
156124 """Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
157125
158126 :param file_path: The file path or stream to read
159127 :param sep: File separator for pandas
160- :return: A pandas dataframe
128+ :return: A pair of a dataframe and metadata dictionary
161129 """
162130 if sep is None :
163- sep = _infer_separator (file_path )
131+ sep = _infer_separator (file_path ) or " \t "
164132
165133 if isinstance (file_path , (str , Path )):
166134 raise_for_bad_path (file_path )
167135
168136 stream = _open_input (file_path )
169- table_stream , metadata_stream = _separate_metadata_and_table_from_stream (stream )
137+
138+ # consume from the top of the stream until there's no more preceding #
139+ header_yaml = ""
140+ while (line := stream .readline ()).startswith ("#" ):
141+ line = line .lstrip ("#" ).rstrip ()
142+ if not line :
143+ continue
144+ header_yaml += line + "\n "
145+
146+ sssom_metadata = yaml .safe_load (header_yaml ) if header_yaml else {}
147+
148+ # The first line that doesn't start with a # is assumed
149+ # to be the header, so we split it with the inferred separator
150+ names = line .strip ().split (sep )
170151
171152 try :
172- df = pd .read_csv (table_stream , sep = sep , dtype = str , engine = "python" )
153+ # pandas can keep going and read from the same stream that we already have
154+ df = pd .read_csv (stream , sep = sep , dtype = str , engine = "python" , header = None , names = names )
173155 except EmptyDataError as e :
174156 logging .warning (f"Seems like the dataframe is empty: { e } " )
175157 df = pd .DataFrame (
@@ -184,7 +166,6 @@ def _read_pandas_and_metadata(file_path: PathOrIO, sep: Optional[str] = None):
184166 else :
185167 df .fillna ("" , inplace = True )
186168
187- sssom_metadata = _read_metadata_from_table (metadata_stream )
188169 return df , sssom_metadata
189170
190171
@@ -895,21 +876,6 @@ def _swap_object_subject(mapping: Mapping) -> Mapping:
895876 return mapping
896877
897878
898- def _read_metadata_from_table (stream : io .StringIO ) -> Dict [str , Any ]:
899- yamlstr = ""
900- for line in stream :
901- if line .startswith ("#" ):
902- yamlstr += re .sub ("^#" , "" , line )
903- else :
904- break
905-
906- if yamlstr :
907- meta = yaml .safe_load (yamlstr )
908- logging .info (f"Meta={ meta } " )
909- return meta
910- return {}
911-
912-
913879def _set_metadata_in_mapping_set (
914880 mapping_set : MappingSet , metadata : Optional [MetadataType ] = None , overwrite : bool = True
915881) -> None :
0 commit comments