Skip to content

Commit 207f4ef

Browse files
authored
Add type hints to CleverCSV (#108)
* Add type hints to CleverCSV

This commit adds type hints to CleverCSV and enables static type checking with MyPy. An attempt was made to keep the changes minimal, in order to change only the code needed to get MyPy to pass. As can be seen, this already required a large number of changes to the code. It is also clear that certain design decisions could be reevaluated to make the type checking smoother (e.g., requiring @overload is not ideal). Such improvements are left for future work.
1 parent 6220562 commit 207f4ef

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1992
-261
lines changed

.github/workflows/build.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ jobs:
4242
strategy:
4343
matrix:
4444
os: [ 'ubuntu-latest', 'macos-latest', 'windows-latest' ]
45-
py: [ '3.7', '3.11' ] # minimal and latest
45+
py: [ '3.8', '3.11' ] # minimal and latest
4646
steps:
4747
- name: Install Python ${{ matrix.py }}
4848
uses: actions/setup-python@v2

.github/workflows/deploy.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ jobs:
3131
env:
3232
CIBW_TEST_COMMAND: "python -VV && python -m unittest discover -f -s {project}/tests/test_unit/"
3333
CIBW_TEST_EXTRAS: "full"
34-
CIBW_SKIP: "pp* cp27-* cp33-* cp34-* cp35-* cp36-* cp310-win32 *-musllinux_* *-manylinux_i686"
34+
CIBW_SKIP: "pp* cp27-* cp33-* cp34-* cp35-* cp36-* cp37-* cp310-win32 *-musllinux_* *-manylinux_i686"
3535
CIBW_ARCHS_MACOS: x86_64 arm64 universal2
3636
CIBW_ARCHS_LINUX: auto aarch64
3737

Makefile

+6-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ MAKEFLAGS += --no-builtin-rules
99

1010
PACKAGE=clevercsv
1111
DOC_DIR=./docs/
12-
VENV_DIR=/tmp/clevercsv_venv/
12+
VENV_DIR=/tmp/clevercsv_venv
1313
PYTHON ?= python
1414

1515
.PHONY: help
@@ -51,14 +51,18 @@ dist: man ## Make Python source distribution
5151

5252
.PHONY: test integration integration_partial
5353

54-
test: green pytest
54+
test: mypy green pytest
5555

5656
green: venv ## Run unit tests
5757
source $(VENV_DIR)/bin/activate && green -a -vv ./tests/test_unit
5858

5959
pytest: venv ## Run unit tests with PyTest
6060
source $(VENV_DIR)/bin/activate && pytest -ra -m 'not network'
6161

62+
mypy: venv ## Run type checks
63+
source $(VENV_DIR)/bin/activate && \
64+
mypy --check-untyped-defs ./stubs $(PACKAGE) ./tests
65+
6266
integration: venv ## Run integration tests
6367
source $(VENV_DIR)/bin/activate && python ./tests/test_integration/test_dialect_detection.py -v
6468

clevercsv/__version__.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# -*- coding: utf-8 -*-
22

3-
VERSION = (0, 8, 0)
3+
from typing import Tuple
44

5-
__version__ = ".".join(map(str, VERSION))
5+
VERSION: Tuple[int, int, int] = (0, 8, 0)
6+
7+
__version__: str = ".".join(map(str, VERSION))

clevercsv/_optional.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,12 @@
1313

1414
import importlib
1515

16+
from types import ModuleType
17+
1618
from typing import Dict
1719
from typing import List
1820
from typing import NamedTuple
21+
from typing import Optional
1922

2023
from packaging.version import Version
2124

@@ -35,7 +38,9 @@ class OptionalDependency(NamedTuple):
3538
]
3639

3740

38-
def import_optional_dependency(name, raise_on_missing=True):
41+
def import_optional_dependency(
42+
name: str, raise_on_missing: bool = True
43+
) -> Optional[ModuleType]:
3944
"""
4045
Import an optional dependency.
4146

clevercsv/_types.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from __future__ import annotations
4+
5+
import csv
6+
import os
7+
import sys
8+
9+
from typing import TYPE_CHECKING
10+
from typing import Any
11+
from typing import Mapping
12+
from typing import Type
13+
from typing import Union
14+
15+
from clevercsv.dialect import SimpleDialect
16+
17+
AnyPath = Union[str, bytes, "os.PathLike[str]", "os.PathLike[bytes]"]
18+
_OpenFile = Union[AnyPath, int]
19+
_DictRow = Mapping[str, Any]
20+
_DialectLike = Union[str, csv.Dialect, Type[csv.Dialect], SimpleDialect]
21+
22+
if sys.version_info >= (3, 8):
23+
from typing import Dict as _DictReadMapping
24+
else:
25+
from collections import OrderedDict as _DictReadMapping
26+
27+
28+
if TYPE_CHECKING:
29+
from _typeshed import FileDescriptorOrPath # NOQA
30+
from _typeshed import SupportsIter # NOQA
31+
from _typeshed import SupportsWrite # NOQA
32+
33+
__all__ = [
34+
"SupportsWrite",
35+
"SupportsIter",
36+
"FileDescriptorOrPath",
37+
"AnyPath",
38+
"_OpenFile",
39+
"_DictRow",
40+
"_DialectLike",
41+
"_DictReadMapping",
42+
]
43+
else:
44+
__all__ = [
45+
"AnyPath",
46+
"_OpenFile",
47+
"_DictRow",
48+
"_DialectLike",
49+
"_DictReadMapping",
50+
]

clevercsv/break_ties.py

+45-24
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,17 @@
77
88
"""
99

10+
from typing import List
11+
from typing import Optional
12+
1013
from .cparser_util import parse_string
1114
from .dialect import SimpleDialect
1215
from .utils import pairwise
1316

1417

15-
def tie_breaker(data, dialects):
18+
def tie_breaker(
19+
data: str, dialects: List[SimpleDialect]
20+
) -> Optional[SimpleDialect]:
1621
"""
1722
Break ties between dialects.
1823
@@ -42,7 +47,9 @@ def tie_breaker(data, dialects):
4247
return None
4348

4449

45-
def reduce_pairwise(data, dialects):
50+
def reduce_pairwise(
51+
data: str, dialects: List[SimpleDialect]
52+
) -> Optional[List[SimpleDialect]]:
4653
"""Reduce the set of dialects by breaking pairwise ties
4754
4855
Parameters
@@ -62,7 +69,7 @@ def reduce_pairwise(data, dialects):
6269
"""
6370
equal_delim = len(set([d.delimiter for d in dialects])) == 1
6471
if not equal_delim:
65-
return None
72+
return None # TODO: This might be wrong, it can just return the input!
6673

6774
# First, identify dialects that result in the same parsing result.
6875
equal_dialects = []
@@ -99,7 +106,9 @@ def _dialects_only_differ_in_field(
99106
)
100107

101108

102-
def break_ties_two(data, A, B):
109+
def break_ties_two(
110+
data: str, A: SimpleDialect, B: SimpleDialect
111+
) -> Optional[SimpleDialect]:
103112
"""Break ties between two dialects.
104113
105114
This function breaks ties between two dialects that give the same score. We
@@ -152,7 +161,7 @@ def break_ties_two(data, A, B):
152161
# quotechar has an effect
153162
return d_yes
154163
elif _dialects_only_differ_in_field(A, B, "delimiter"):
155-
if sorted([A.delimiter, B.delimiter]) == sorted([",", " "]):
164+
if set([A.delimiter, B.delimiter]) == set([",", " "]):
156165
# Artifact due to type detection (comma as radix point)
157166
if A.delimiter == ",":
158167
return A
@@ -175,14 +184,14 @@ def break_ties_two(data, A, B):
175184
# we can't break this tie (for now)
176185
if len(X) != len(Y):
177186
return None
178-
for x, y in zip(X, Y):
179-
if len(x) != len(y):
187+
for row_X, row_Y in zip(X, Y):
188+
if len(row_X) != len(row_Y):
180189
return None
181190

182191
cells_escaped = []
183192
cells_unescaped = []
184-
for x, y in zip(X, Y):
185-
for u, v in zip(x, y):
193+
for row_X, row_Y in zip(X, Y):
194+
for u, v in zip(row_X, row_Y):
186195
if u != v:
187196
cells_unescaped.append(u)
188197
cells_escaped.append(v)
@@ -221,16 +230,18 @@ def break_ties_two(data, A, B):
221230

222231
if len(X) != len(Y):
223232
return None
224-
for x, y in zip(X, Y):
225-
if len(x) != len(y):
233+
for row_X, row_Y in zip(X, Y):
234+
if len(row_X) != len(row_Y):
226235
return None
227236

228237
# if we're here, then there is no effect on structure.
229238
# we test if the only cells that differ are those that have an
230239
# escapechar+quotechar combination.
240+
assert isinstance(d_yes.escapechar, str)
241+
assert isinstance(d_yes.quotechar, str)
231242
eq = d_yes.escapechar + d_yes.quotechar
232-
for rX, rY in zip(X, Y):
233-
for x, y in zip(rX, rY):
243+
for row_X, row_Y in zip(X, Y):
244+
for x, y in zip(row_X, row_Y):
234245
if x != y:
235246
if eq not in x:
236247
return None
@@ -243,7 +254,9 @@ def break_ties_two(data, A, B):
243254
return None
244255

245256

246-
def break_ties_three(data, A, B, C):
257+
def break_ties_three(
258+
data: str, A: SimpleDialect, B: SimpleDialect, C: SimpleDialect
259+
) -> Optional[SimpleDialect]:
247260
"""Break ties between three dialects.
248261
249262
If the delimiters and the escape characters are all equal, then we look for
@@ -273,7 +286,7 @@ def break_ties_three(data, A, B, C):
273286
Returns
274287
-------
275288
276-
dialect: SimpleDialect
289+
dialect: Optional[SimpleDialect]
277290
The chosen dialect if the tie can be broken, None otherwise.
278291
279292
Notes
@@ -307,6 +320,7 @@ def break_ties_three(data, A, B, C):
307320
)
308321
if p_none is None:
309322
return None
323+
assert d_none is not None
310324

311325
rem = [
312326
(p, d) for p, d in zip([pA, pB, pC], dialects) if not p == p_none
@@ -318,6 +332,8 @@ def break_ties_three(data, A, B, C):
318332
# the CSV paper. When fixing the delimiter to Tab, rem = [].
319333
# Try to reduce pairwise
320334
new_dialects = reduce_pairwise(data, dialects)
335+
if new_dialects is None:
336+
return None
321337
if len(new_dialects) == 1:
322338
return new_dialects[0]
323339
return None
@@ -347,7 +363,9 @@ def break_ties_three(data, A, B, C):
347363
return None
348364

349365

350-
def break_ties_four(data, dialects):
366+
def break_ties_four(
367+
data: str, dialects: List[SimpleDialect]
368+
) -> Optional[SimpleDialect]:
351369
"""Break ties between four dialects.
352370
353371
This function works by breaking the ties between pairs of dialects that
@@ -368,7 +386,7 @@ def break_ties_four(data, dialects):
368386
369387
Returns
370388
-------
371-
dialect: SimpleDialect
389+
dialect: Optional[SimpleDialect]
372390
The chosen dialect if the tie can be broken, None otherwise.
373391
374392
Notes
@@ -378,19 +396,22 @@ def break_ties_four(data, dialects):
378396
examples are found.
379397
380398
"""
399+
# TODO: Check for length 4, handle more than 4 too?
381400

382401
equal_delim = len(set([d.delimiter for d in dialects])) == 1
383402
if not equal_delim:
384403
return None
385404

386-
dialects = reduce_pairwise(data, dialects)
405+
reduced_dialects = reduce_pairwise(data, dialects)
406+
if reduced_dialects is None:
407+
return None
387408

388409
# Defer to other functions if the number of dialects was reduced
389-
if len(dialects) == 1:
390-
return dialects[0]
391-
elif len(dialects) == 2:
392-
return break_ties_two(data, *dialects)
393-
elif len(dialects) == 3:
394-
return break_ties_three(data, *dialects)
410+
if len(reduced_dialects) == 1:
411+
return reduced_dialects[0]
412+
elif len(reduced_dialects) == 2:
413+
return break_ties_two(data, *reduced_dialects)
414+
elif len(reduced_dialects) == 3:
415+
return break_ties_three(data, *reduced_dialects)
395416

396417
return None

clevercsv/cabstraction.pyi

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from typing import Optional
4+
5+
def base_abstraction(
6+
data: str,
7+
delimiter: Optional[str],
8+
quotechar: Optional[str],
9+
escapechar: Optional[str],
10+
) -> str: ...
11+
def c_merge_with_quotechar(data: str) -> str: ...

clevercsv/consistency.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def cached_is_known_type(cell: str, is_quoted: bool) -> bool:
8989

9090
def detect(
9191
self, data: str, delimiters: Optional[Iterable[str]] = None
92-
) -> None:
92+
) -> Optional[SimpleDialect]:
9393
"""Detect the dialect using the consistency measure
9494
9595
Parameters
@@ -184,8 +184,11 @@ def get_best_dialects(
184184
) -> List[SimpleDialect]:
185185
"""Identify the dialects with the highest consistency score"""
186186
Qscores = [score.Q for score in scores.values()]
187-
Qscores = list(filter(lambda q: q is not None, Qscores))
188-
Qmax = max(Qscores)
187+
Qmax = -float("inf")
188+
for q in Qscores:
189+
if q is None:
190+
continue
191+
Qmax = max(Qmax, q)
189192
return [d for d, score in scores.items() if score.Q == Qmax]
190193

191194
def compute_type_score(
@@ -194,6 +197,7 @@ def compute_type_score(
194197
"""Compute the type score"""
195198
total = known = 0
196199
for row in parse_string(data, dialect, return_quoted=True):
200+
assert all(isinstance(cell, tuple) for cell in row)
197201
for cell, is_quoted in row:
198202
total += 1
199203
known += self._cached_is_known_type(cell, is_quoted=is_quoted)
@@ -203,7 +207,10 @@ def compute_type_score(
203207

204208

205209
def detect_dialect_consistency(
206-
data, delimiters=None, skip=True, verbose=False
210+
data: str,
211+
delimiters: Optional[Iterable[str]] = None,
212+
skip: bool = True,
213+
verbose: bool = False,
207214
):
208215
"""Helper function that wraps ConsistencyDetector"""
209216
# Mostly kept for backwards compatibility

clevercsv/console/commands/detect.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
import sys
55
import time
66

7+
from typing import Any
8+
from typing import Dict
9+
710
from wilderness import Command
811

912
from clevercsv.wrappers import detect_dialect
@@ -125,7 +128,7 @@ def handle(self):
125128
if self.args.add_runtime:
126129
print(f"runtime = {runtime}")
127130
elif self.args.json:
128-
dialect_dict = dialect.to_dict()
131+
dialect_dict: Dict[str, Any] = dialect.to_dict()
129132
if self.args.add_runtime:
130133
dialect_dict["runtime"] = runtime
131134
print(json.dumps(dialect_dict))

0 commit comments

Comments
 (0)