4
4
from .meta import Meta
5
5
from .errors import PinsInsecureReadError
6
6
7
- from typing import Literal , Sequence
7
+ from typing import Literal , Sequence , TypeAlias
8
8
9
9
# TODO: move IFileSystem out of boards, to fix circular import
10
10
# from .boards import IFileSystem
11
11
12
12
13
13
UNSAFE_TYPES = frozenset (["joblib" ])
14
14
REQUIRES_SINGLE_FILE = frozenset (["csv" , "joblib" , "file" ])
15
-
16
-
17
- def _assert_is_pandas_df (x , file_type : str ) -> None :
18
- df_family = _get_df_family (x )
19
-
20
- if df_family != "pandas" :
21
- raise NotImplementedError (
22
- f"Currently only pandas.DataFrame can be saved as type { file_type !r} ."
23
- )
24
-
25
-
26
- def _get_df_family (df ) -> Literal ["pandas" , "polars" ]:
27
- """Return the type of DataFrame, or raise NotImplementedError if we can't decide."""
28
- try :
29
- import polars as pl
30
- except ModuleNotFoundError :
31
- is_polars_df = False
32
- else :
33
- is_polars_df = isinstance (df , pl .DataFrame )
34
-
35
- import pandas as pd
36
-
37
- is_pandas_df = isinstance (df , pd .DataFrame )
38
-
39
- if is_polars_df and is_pandas_df :
40
- raise NotImplementedError (
41
- "Hybrid DataFrames (simultaneously pandas and polars) are not supported."
42
- )
43
- elif is_polars_df :
44
- return "polars"
45
- elif is_pandas_df :
46
- return "pandas"
47
- raise NotImplementedError (f"Unrecognized DataFrame type: { type (df )} " )
15
+ _DFLib : TypeAlias = Literal ["pandas" , "polars" ]
48
16
49
17
50
18
def load_path (meta , path_to_version ):
@@ -177,36 +145,31 @@ def save_data(obj, fname, type=None, apply_suffix: bool = True) -> "str | Sequen
177
145
final_name = f"{ fname } { suffix } "
178
146
179
147
if type == "csv" :
180
- _assert_is_pandas_df (obj , file_type = type )
181
-
148
+ _choose_df_lib (obj , supported_libs = ["pandas" ], file_type = type )
182
149
obj .to_csv (final_name , index = False )
183
150
184
151
elif type == "arrow" :
185
152
# NOTE: R pins accepts the type arrow, and saves it as feather.
186
153
# we allow reading this type, but raise an error for writing.
187
- _assert_is_pandas_df (obj , file_type = type )
188
-
154
+ _choose_df_lib (obj , supported_libs = ["pandas" ], file_type = type )
189
155
obj .to_feather (final_name )
190
156
191
157
elif type == "feather" :
192
- _assert_is_pandas_df (obj , file_type = type )
158
+ _choose_df_lib (obj , supported_libs = [ "pandas" ] , file_type = type )
193
159
194
160
raise NotImplementedError (
195
161
'Saving data as type "feather" no longer supported. Use type "arrow" instead.'
196
162
)
197
163
198
164
elif type == "parquet" :
199
- df_family = _get_df_family (obj )
200
- if df_family == "polars" :
201
- obj .write_parquet (final_name )
202
- elif df_family == "pandas" :
165
+ df_lib = _choose_df_lib (obj , supported_libs = ["pandas" , "polars" ], file_type = type )
166
+
167
+ if df_lib == "pandas" :
203
168
obj .to_parquet (final_name )
169
+ elif df_lib == "polars" :
170
+ obj .write_parquet (final_name )
204
171
else :
205
- msg = (
206
- "Currently only pandas.DataFrame and polars.DataFrame can be saved to "
207
- "a parquet file."
208
- )
209
- raise NotImplementedError (msg )
172
+ raise NotImplementedError
210
173
211
174
elif type == "joblib" :
212
175
import joblib
@@ -234,7 +197,7 @@ def save_data(obj, fname, type=None, apply_suffix: bool = True) -> "str | Sequen
234
197
235
198
def default_title (obj , name ):
236
199
try :
237
- _get_df_family (obj )
200
+ _choose_df_lib (obj )
238
201
except NotImplementedError :
239
202
obj_name = type (obj ).__qualname__
240
203
return f"{ name } : a pinned { obj_name } object"
@@ -243,3 +206,73 @@ def default_title(obj, name):
243
206
# see https://github.com/machow/pins-python/issues/5
244
207
shape_str = " x " .join (map (str , obj .shape ))
245
208
return f"{ name } : a pinned { shape_str } DataFrame"
209
+
210
+
211
+ def _choose_df_lib (
212
+ df ,
213
+ * ,
214
+ supported_libs : list [_DFLib ] = ["pandas" , "polars" ],
215
+ file_type : str | None = None ,
216
+ ) -> _DFLib :
217
+ """Return the type of DataFrame library used in the given DataFrame.
218
+
219
+ Args:
220
+ df:
221
+ The object to check - might not be a DataFrame necessarily.
222
+ supported_libs:
223
+ The DataFrame libraries to accept for this df.
224
+ file_type:
225
+ The file type we're trying to save to - used to give more specific error messages.
226
+
227
+ Raises:
228
+ NotImplementedError: If the DataFrame type is not recognized.
229
+ """
230
+ df_libs : list [_DFLib ] = []
231
+
232
+ # pandas
233
+ import pandas as pd
234
+
235
+ if isinstance (df , pd .DataFrame ):
236
+ df_libs .append ("pandas" )
237
+
238
+ # polars
239
+ try :
240
+ import polars as pl
241
+ except ModuleNotFoundError :
242
+ pass
243
+ else :
244
+ if isinstance (df , pl .DataFrame ):
245
+ df_libs .append ("polars" )
246
+
247
+ if len (df_libs ) == 1 :
248
+ (df_lib ,) = df_libs
249
+ elif len (df_libs ) > 1 :
250
+ msg = (
251
+ f"Hybrid DataFrames are not supported: "
252
+ f"should only be one of { supported_libs !r} , "
253
+ f"but got an object from multiple libraries { df_libs !r} ."
254
+ )
255
+ raise NotImplementedError (msg )
256
+ else :
257
+ raise NotImplementedError (f"Unrecognized DataFrame type: { type (df )} " )
258
+
259
+ if df_lib not in supported_libs :
260
+ if file_type is None :
261
+ ftype_clause = "in pins"
262
+ else :
263
+ ftype_clause = f"for type { file_type !r} "
264
+
265
+ if len (supported_libs ) == 1 :
266
+ msg = (
267
+ f"Currently only { supported_libs [0 ]} DataFrames can be saved "
268
+ f"{ ftype_clause } . { df_lib } DataFrames are not yet supported."
269
+ )
270
+ else :
271
+ msg = (
272
+ f"Currently only DataFrames from the following libraries can be saved "
273
+ f"{ ftype_clause } : { supported_libs !r} ."
274
+ )
275
+
276
+ raise NotImplementedError (msg )
277
+
278
+ return df_lib
0 commit comments