Skip to content

Commit 7098c4f

Browse files
authored
Optimize the normal form detection (#123)
* Optimize normal_form: `even_rows` was always called together with `every_row_has_delim`, which could mean two full scans of all the rows. By joining those two functions, we save one of the scans. In addition, the logic previously implemented by `even_rows` now exits early whenever possible (the previous implementation scanned the whole file no matter what).
* Further optimize normal_form: avoid unnecessary splitting and joining of the rows. Previously, each of the `is_form_x` functions split the file into rows separately, and they all did it the same way. Instead, we now split the file once and pass the lines to the `is_form_x` functions directly. This also lets us avoid re-joining the lines in `is_form_5` when it calls `is_form_2`. The test_normal_forms test inputs were changed accordingly: they are split on `\n` and the trailing newlines were manually removed (the actual code always strips the trailing newlines before calling the `is_form_x` functions).
1 parent 4a7a270 commit 7098c4f

File tree

2 files changed

+74
-79
lines changed

2 files changed

+74
-79
lines changed

clevercsv/normal_form.py

+31-38
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
Detect the dialect with very strict functional tests.
55
66
This module uses so-called "normal forms" to detect the dialect of CSV files.
7-
Normal forms are detected with strict functional tests. The normal forms are
8-
used as a pre-test to check if files are simple enough that computing the data
7+
Normal forms are detected with strict functional tests. The normal forms are
8+
used as a pre-test to check if files are simple enough that computing the data
99
consistency measure is not necessary.
1010
1111
Author: Gertjan van den Burg
@@ -62,7 +62,7 @@ def detect_dialect_normal(
6262
return None
6363

6464
form_and_dialect: List[
65-
Tuple[int, Callable[[str, SimpleDialect], bool], SimpleDialect]
65+
Tuple[int, Callable[[List[str], SimpleDialect], bool], SimpleDialect]
6666
] = []
6767

6868
for delim in delimiters:
@@ -90,8 +90,10 @@ def detect_dialect_normal(
9090
)
9191
)
9292

93+
rows = split_file(data)
94+
9395
for ID, form_func, dialect in form_and_dialect:
94-
if form_func(data, dialect):
96+
if form_func(rows, dialect):
9597
if verbose:
9698
print("Matched normal form %i." % ID)
9799
return dialect
@@ -169,20 +171,29 @@ def every_row_has_delim(rows: List[str], dialect: SimpleDialect) -> bool:
169171
return True
170172

171173

174+
def every_row_has_delim_and_is_the_same_length(
175+
rows: List[str], dialect: SimpleDialect
176+
) -> bool:
177+
assert dialect.delimiter is not None
178+
if len(rows) == 0:
179+
return False
180+
181+
first_len = len(split_row(rows[0], dialect))
182+
for row in rows:
183+
if not has_delimiter(row, dialect.delimiter):
184+
return False
185+
if len(split_row(row, dialect)) != first_len:
186+
return False
187+
return True
188+
189+
172190
def is_elementary(cell: str) -> bool:
173191
return (
174192
regex.fullmatch(r"[a-zA-Z0-9\.\_\&\-\@\+\%\(\)\ \/]+", cell)
175193
is not None
176194
)
177195

178196

179-
def even_rows(rows: List[str], dialect: SimpleDialect) -> bool:
180-
cells_per_row = set()
181-
for row in rows:
182-
cells_per_row.add(len(split_row(row, dialect)))
183-
return len(cells_per_row) == 1
184-
185-
186197
def split_file(data: str) -> List[str]:
187198
data = strip_trailing_crnl(data)
188199
if "\r\n" in data:
@@ -219,16 +230,12 @@ def split_row(row: str, dialect: SimpleDialect) -> List[str]:
219230
return cells
220231

221232

222-
def is_form_1(data: str, dialect: SimpleDialect) -> bool:
233+
def is_form_1(rows: List[str], dialect: SimpleDialect) -> bool:
223234
# All cells quoted, quoted empty allowed, no nested quotes, more than one
224235
# column
225236
assert dialect.quotechar is not None
226237

227-
rows = split_file(data)
228-
229-
if not every_row_has_delim(rows, dialect):
230-
return False
231-
if not even_rows(rows, dialect):
238+
if not every_row_has_delim_and_is_the_same_length(rows, dialect):
232239
return False
233240

234241
for row in rows:
@@ -251,14 +258,9 @@ def is_form_1(data: str, dialect: SimpleDialect) -> bool:
251258
return True
252259

253260

254-
def is_form_2(data: str, dialect: SimpleDialect) -> bool:
261+
def is_form_2(rows: List[str], dialect: SimpleDialect) -> bool:
255262
# All unquoted, empty allowed, all elementary
256-
257-
rows = split_file(data)
258-
259-
if not every_row_has_delim(rows, dialect):
260-
return False
261-
if not even_rows(rows, dialect):
263+
if not every_row_has_delim_and_is_the_same_length(rows, dialect):
262264
return False
263265

264266
for row in rows:
@@ -278,15 +280,11 @@ def is_form_2(data: str, dialect: SimpleDialect) -> bool:
278280
return True
279281

280282

281-
def is_form_3(data: str, dialect: SimpleDialect) -> bool:
283+
def is_form_3(rows: List[str], dialect: SimpleDialect) -> bool:
282284
# some quoted, some not quoted, no empty, no nested quotes
283285
assert dialect.quotechar is not None
284286

285-
rows = split_file(data)
286-
287-
if not every_row_has_delim(rows, dialect):
288-
return False
289-
if not even_rows(rows, dialect):
287+
if not every_row_has_delim_and_is_the_same_length(rows, dialect):
290288
return False
291289
if len(rows) <= 1:
292290
return False
@@ -315,12 +313,10 @@ def is_form_3(data: str, dialect: SimpleDialect) -> bool:
315313
return True
316314

317315

318-
def is_form_4(data: str, dialect: SimpleDialect) -> bool:
316+
def is_form_4(rows: List[str], dialect: SimpleDialect) -> bool:
319317
# no delim, single column (either entirely quoted or entirely unquoted)
320318
assert dialect.quotechar is not None
321319

322-
rows = split_file(data)
323-
324320
if len(rows) <= 1:
325321
return False
326322

@@ -342,12 +338,9 @@ def is_form_4(data: str, dialect: SimpleDialect) -> bool:
342338
return True
343339

344340

345-
def is_form_5(data: str, dialect: SimpleDialect) -> bool:
341+
def is_form_5(rows: List[str], dialect: SimpleDialect) -> bool:
346342
# all rows quoted, no nested quotes
347343
# basically form 2 but with quotes around each row
348-
349-
rows = split_file(data)
350-
351344
if not every_row_has_delim(rows, dialect):
352345
return False
353346
if len(rows) <= 1:
@@ -365,4 +358,4 @@ def is_form_5(data: str, dialect: SimpleDialect) -> bool:
365358
for row in rows:
366359
newrows.append(row[1:-1])
367360

368-
return is_form_2("\n".join(newrows), dialect)
361+
return is_form_2(newrows, dialect)

tests/test_unit/test_normal_forms.py

+43-41
Original file line numberDiff line numberDiff line change
@@ -21,71 +21,73 @@ class NormalFormTestCase(unittest.TestCase):
2121
def test_form_1(self) -> None:
2222
dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
2323

24-
self.assertTrue(is_form_1('"A","B","C"', dialect))
25-
self.assertTrue(is_form_1('"A","B"\n"C","D"\n', dialect))
26-
self.assertTrue(is_form_1('"A","","C"', dialect))
24+
self.assertTrue(is_form_1('"A","B","C"'.split("\n"), dialect))
25+
self.assertTrue(is_form_1('"A","B"\n"C","D"'.split("\n"), dialect))
26+
self.assertTrue(is_form_1('"A","","C"'.split("\n"), dialect))
2727

28-
self.assertFalse(is_form_1('"A","B"\n"A"', dialect))
29-
self.assertFalse(is_form_1('"A"\n"B"', dialect))
30-
self.assertFalse(is_form_1('"A"\n"A","B"', dialect))
31-
self.assertFalse(is_form_1('"A",,"C"', dialect))
32-
self.assertFalse(is_form_1('"A",C', dialect))
33-
self.assertFalse(is_form_1('"A"\n"b""A""c","B"', dialect))
28+
self.assertFalse(is_form_1('"A","B"\n"A"'.split("\n"), dialect))
29+
self.assertFalse(is_form_1('"A"\n"B"'.split("\n"), dialect))
30+
self.assertFalse(is_form_1('"A"\n"A","B"'.split("\n"), dialect))
31+
self.assertFalse(is_form_1('"A",,"C"'.split("\n"), dialect))
32+
self.assertFalse(is_form_1('"A",C'.split("\n"), dialect))
33+
self.assertFalse(is_form_1('"A"\n"b""A""c","B"'.split("\n"), dialect))
3434

3535
def test_form_2(self) -> None:
3636
dialect = SimpleDialect(delimiter=",", quotechar="", escapechar="")
3737

38-
self.assertTrue(is_form_2("1,2,3", dialect))
39-
self.assertTrue(is_form_2("1,2,3\na,b,c\n", dialect))
40-
self.assertTrue(is_form_2("[email protected],3", dialect))
41-
self.assertTrue(is_form_2("a,,3\n1,2,3", dialect))
38+
self.assertTrue(is_form_2("1,2,3".split("\n"), dialect))
39+
self.assertTrue(is_form_2("1,2,3\na,b,c".split("\n"), dialect))
40+
self.assertTrue(is_form_2("[email protected],3".split("\n"), dialect))
41+
self.assertTrue(is_form_2("a,,3\n1,2,3".split("\n"), dialect))
4242

43-
self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6", dialect))
44-
self.assertFalse(is_form_2("1", dialect))
45-
self.assertFalse(is_form_2('1,"a"', dialect))
46-
self.assertFalse(is_form_2("a;b,3", dialect))
47-
self.assertFalse(is_form_2('"a,3,3\n1,2,3', dialect))
48-
self.assertFalse(is_form_2('a,"",3\n1,2,3', dialect))
43+
self.assertFalse(is_form_2("1,2,3\n1,2\n4,5,6".split("\n"), dialect))
44+
self.assertFalse(is_form_2("1".split("\n"), dialect))
45+
self.assertFalse(is_form_2('1,"a"'.split("\n"), dialect))
46+
self.assertFalse(is_form_2("a;b,3".split("\n"), dialect))
47+
self.assertFalse(is_form_2('"a,3,3\n1,2,3'.split("\n"), dialect))
48+
self.assertFalse(is_form_2('a,"",3\n1,2,3'.split("\n"), dialect))
4949

5050
def test_form_3(self) -> None:
5151
A = SimpleDialect(delimiter=",", quotechar="'", escapechar="")
5252
Q = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
5353

54-
self.assertTrue(is_form_3('A,B\nC,"D"', Q))
55-
self.assertTrue(is_form_3('A,B\nC,"d,e"', Q))
54+
self.assertTrue(is_form_3('A,B\nC,"D"'.split("\n"), Q))
55+
self.assertTrue(is_form_3('A,B\nC,"d,e"'.split("\n"), Q))
5656

57-
self.assertFalse(is_form_3('A,\nC,"d,e"', Q))
58-
self.assertFalse(is_form_3("3;4,B\nC,D", Q))
57+
self.assertFalse(is_form_3('A,\nC,"d,e"'.split("\n"), Q))
58+
self.assertFalse(is_form_3("3;4,B\nC,D".split("\n"), Q))
5959

60-
self.assertFalse(is_form_3('A,B\n"C",D\n', A))
61-
self.assertTrue(is_form_3('A,B\n"C",D\n', Q))
60+
self.assertFalse(is_form_3('A,B\n"C",D'.split("\n"), A))
61+
self.assertTrue(is_form_3('A,B\n"C",D'.split("\n"), Q))
6262

6363
def test_form_4(self) -> None:
6464
quoted = SimpleDialect(delimiter="", quotechar='"', escapechar="")
6565
unquoted = SimpleDialect(delimiter="", quotechar="", escapechar="")
6666

67-
self.assertTrue(is_form_4("A\nB\nC", unquoted))
68-
self.assertTrue(is_form_4("1\n2\n3", unquoted))
69-
self.assertTrue(is_form_4("A_B\n1\n2", unquoted))
70-
self.assertTrue(is_form_4("A&B\n1\n2", unquoted))
71-
self.assertTrue(is_form_4("A&B\n-1\n2", unquoted))
72-
self.assertTrue(is_form_4('"A"\n"B"\n"C"\n', quoted))
67+
self.assertTrue(is_form_4("A\nB\nC".split("\n"), unquoted))
68+
self.assertTrue(is_form_4("1\n2\n3".split("\n"), unquoted))
69+
self.assertTrue(is_form_4("A_B\n1\n2".split("\n"), unquoted))
70+
self.assertTrue(is_form_4("A&B\n1\n2".split("\n"), unquoted))
71+
self.assertTrue(is_form_4("A&B\n-1\n2".split("\n"), unquoted))
72+
self.assertTrue(is_form_4('"A"\n"B"\n"C"'.split("\n"), quoted))
7373

74-
self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"\n', quoted))
75-
self.assertFalse(is_form_4('"A","B"\n"B"\n"C"\n', quoted))
76-
self.assertFalse(is_form_4('"A@b"\n"B"\n"C"\n', quoted))
77-
self.assertFalse(is_form_4('A\n"-1"\n2', unquoted))
78-
self.assertFalse(is_form_4("A B\n-1 3\n2 4", unquoted))
74+
self.assertFalse(is_form_4('"A", "B"\n"B"\n"C"'.split("\n"), quoted))
75+
self.assertFalse(is_form_4('"A","B"\n"B"\n"C"'.split("\n"), quoted))
76+
self.assertFalse(is_form_4('"A@b"\n"B"\n"C"'.split("\n"), quoted))
77+
self.assertFalse(is_form_4('A\n"-1"\n2'.split("\n"), unquoted))
78+
self.assertFalse(is_form_4("A B\n-1 3\n2 4".split("\n"), unquoted))
7979

8080
def test_form_5(self) -> None:
8181
dialect = SimpleDialect(delimiter=",", quotechar='"', escapechar="")
8282

83-
self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"', dialect))
84-
self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"', dialect))
83+
self.assertTrue(is_form_5('"A,B"\n"1,2"\n"3,4"'.split("\n"), dialect))
84+
self.assertTrue(is_form_5('"A,B"\n"1,"\n"2,3"'.split("\n"), dialect))
8585

86-
self.assertFalse(is_form_5("A,B\n1,2\n3,4", dialect))
87-
self.assertFalse(is_form_5("A,B\n1,\n2,3", dialect))
88-
self.assertFalse(is_form_5('"A,""B"""\n"1,"\n"2,3"', dialect))
86+
self.assertFalse(is_form_5("A,B\n1,2\n3,4".split("\n"), dialect))
87+
self.assertFalse(is_form_5("A,B\n1,\n2,3".split("\n"), dialect))
88+
self.assertFalse(
89+
is_form_5('"A,""B"""\n"1,"\n"2,3"'.split("\n"), dialect)
90+
)
8991

9092

9193
if __name__ == "__main__":

0 commit comments

Comments
 (0)