7
7
8
8
"""
9
9
10
+ from typing import List
11
+ from typing import Optional
12
+
10
13
from .cparser_util import parse_string
11
14
from .dialect import SimpleDialect
12
15
from .utils import pairwise
13
16
14
17
15
- def tie_breaker (data , dialects ):
18
+ def tie_breaker (
19
+ data : str , dialects : List [SimpleDialect ]
20
+ ) -> Optional [SimpleDialect ]:
16
21
"""
17
22
Break ties between dialects.
18
23
@@ -42,7 +47,9 @@ def tie_breaker(data, dialects):
42
47
return None
43
48
44
49
45
- def reduce_pairwise (data , dialects ):
50
+ def reduce_pairwise (
51
+ data : str , dialects : List [SimpleDialect ]
52
+ ) -> Optional [List [SimpleDialect ]]:
46
53
"""Reduce the set of dialects by breaking pairwise ties
47
54
48
55
Parameters
@@ -62,7 +69,7 @@ def reduce_pairwise(data, dialects):
62
69
"""
63
70
equal_delim = len (set ([d .delimiter for d in dialects ])) == 1
64
71
if not equal_delim :
65
- return None
72
+ return None # TODO: This might be wrong, it can just return the input!
66
73
67
74
# First, identify dialects that result in the same parsing result.
68
75
equal_dialects = []
@@ -99,7 +106,9 @@ def _dialects_only_differ_in_field(
99
106
)
100
107
101
108
102
- def break_ties_two (data , A , B ):
109
+ def break_ties_two (
110
+ data : str , A : SimpleDialect , B : SimpleDialect
111
+ ) -> Optional [SimpleDialect ]:
103
112
"""Break ties between two dialects.
104
113
105
114
This function breaks ties between two dialects that give the same score. We
@@ -152,7 +161,7 @@ def break_ties_two(data, A, B):
152
161
# quotechar has an effect
153
162
return d_yes
154
163
elif _dialects_only_differ_in_field (A , B , "delimiter" ):
155
- if sorted ([A .delimiter , B .delimiter ]) == sorted (["," , " " ]):
164
+ if set ([A .delimiter , B .delimiter ]) == set (["," , " " ]):
156
165
# Artifact due to type detection (comma as radix point)
157
166
if A .delimiter == "," :
158
167
return A
@@ -175,14 +184,14 @@ def break_ties_two(data, A, B):
175
184
# we can't break this tie (for now)
176
185
if len (X ) != len (Y ):
177
186
return None
178
- for x , y in zip (X , Y ):
179
- if len (x ) != len (y ):
187
+ for row_X , row_Y in zip (X , Y ):
188
+ if len (row_X ) != len (row_Y ):
180
189
return None
181
190
182
191
cells_escaped = []
183
192
cells_unescaped = []
184
- for x , y in zip (X , Y ):
185
- for u , v in zip (x , y ):
193
+ for row_X , row_Y in zip (X , Y ):
194
+ for u , v in zip (row_X , row_Y ):
186
195
if u != v :
187
196
cells_unescaped .append (u )
188
197
cells_escaped .append (v )
@@ -221,16 +230,18 @@ def break_ties_two(data, A, B):
221
230
222
231
if len (X ) != len (Y ):
223
232
return None
224
- for x , y in zip (X , Y ):
225
- if len (x ) != len (y ):
233
+ for row_X , row_Y in zip (X , Y ):
234
+ if len (row_X ) != len (row_Y ):
226
235
return None
227
236
228
237
# if we're here, then there is no effect on structure.
229
238
# we test if the only cells that differ are those that have an
230
239
# escapechar+quotechar combination.
240
+ assert isinstance (d_yes .escapechar , str )
241
+ assert isinstance (d_yes .quotechar , str )
231
242
eq = d_yes .escapechar + d_yes .quotechar
232
- for rX , rY in zip (X , Y ):
233
- for x , y in zip (rX , rY ):
243
+ for row_X , row_Y in zip (X , Y ):
244
+ for x , y in zip (row_X , row_Y ):
234
245
if x != y :
235
246
if eq not in x :
236
247
return None
@@ -243,7 +254,9 @@ def break_ties_two(data, A, B):
243
254
return None
244
255
245
256
246
- def break_ties_three (data , A , B , C ):
257
+ def break_ties_three (
258
+ data : str , A : SimpleDialect , B : SimpleDialect , C : SimpleDialect
259
+ ) -> Optional [SimpleDialect ]:
247
260
"""Break ties between three dialects.
248
261
249
262
If the delimiters and the escape characters are all equal, then we look for
@@ -273,7 +286,7 @@ def break_ties_three(data, A, B, C):
273
286
Returns
274
287
-------
275
288
276
- dialect: SimpleDialect
289
+ dialect: Optional[ SimpleDialect]
277
290
The chosen dialect if the tie can be broken, None otherwise.
278
291
279
292
Notes
@@ -307,6 +320,7 @@ def break_ties_three(data, A, B, C):
307
320
)
308
321
if p_none is None :
309
322
return None
323
+ assert d_none is not None
310
324
311
325
rem = [
312
326
(p , d ) for p , d in zip ([pA , pB , pC ], dialects ) if not p == p_none
@@ -318,6 +332,8 @@ def break_ties_three(data, A, B, C):
318
332
# the CSV paper. When fixing the delimiter to Tab, rem = [].
319
333
# Try to reduce pairwise
320
334
new_dialects = reduce_pairwise (data , dialects )
335
+ if new_dialects is None :
336
+ return None
321
337
if len (new_dialects ) == 1 :
322
338
return new_dialects [0 ]
323
339
return None
@@ -347,7 +363,9 @@ def break_ties_three(data, A, B, C):
347
363
return None
348
364
349
365
350
- def break_ties_four (data , dialects ):
366
+ def break_ties_four (
367
+ data : str , dialects : List [SimpleDialect ]
368
+ ) -> Optional [SimpleDialect ]:
351
369
"""Break ties between four dialects.
352
370
353
371
This function works by breaking the ties between pairs of dialects that
@@ -368,7 +386,7 @@ def break_ties_four(data, dialects):
368
386
369
387
Returns
370
388
-------
371
- dialect: SimpleDialect
389
+ dialect: Optional[ SimpleDialect]
372
390
The chosen dialect if the tie can be broken, None otherwise.
373
391
374
392
Notes
@@ -378,19 +396,22 @@ def break_ties_four(data, dialects):
378
396
examples are found.
379
397
380
398
"""
399
+ # TODO: Check for length 4, handle more than 4 too?
381
400
382
401
equal_delim = len (set ([d .delimiter for d in dialects ])) == 1
383
402
if not equal_delim :
384
403
return None
385
404
386
- dialects = reduce_pairwise (data , dialects )
405
+ reduced_dialects = reduce_pairwise (data , dialects )
406
+ if reduced_dialects is None :
407
+ return None
387
408
388
409
# Defer to other functions if the number of dialects was reduced
389
- if len (dialects ) == 1 :
390
- return dialects [0 ]
391
- elif len (dialects ) == 2 :
392
- return break_ties_two (data , * dialects )
393
- elif len (dialects ) == 3 :
394
- return break_ties_three (data , * dialects )
410
+ if len (reduced_dialects ) == 1 :
411
+ return reduced_dialects [0 ]
412
+ elif len (reduced_dialects ) == 2 :
413
+ return break_ties_two (data , * reduced_dialects )
414
+ elif len (reduced_dialects ) == 3 :
415
+ return break_ties_three (data , * reduced_dialects )
395
416
396
417
return None
0 commit comments