
Commit 3c9980b

Update zipped_files to return name as well as contents; update json_records and csv_records to take the new structure; bump to 0.2 (#9)

* Add more logging; add a py.typed file to mark the library as typed
* Change zipped_files to return OpenedFileRef instead of just an open file; update to version 0.2
* Update json_records and csv_records to take either IO or OpenedFileRef

1 parent 146285e · commit 3c9980b
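
For orientation, a minimal sketch of how the reworked pieces fit together after this commit. The zip path and output path are placeholders, and the import paths simply follow the module layout of the files changed below; zipped_files now yields OpenedFileRef values, which json_records and csv_records accept directly.

```python
from pipedata.core import StreamStart
from pipedata.ops.files import zipped_files
from pipedata.ops.records import json_records
from pipedata.ops.storage import parquet_writer

# "data.zip" and "records.parquet" are placeholder paths for illustration.
written_files = (
    StreamStart(["data.zip"])
    .flat_map(zipped_files)        # yields OpenedFileRef(name=..., contents=...)
    .flat_map(json_records())      # accepts OpenedFileRef or plain IO[bytes] as of 0.2
    .flat_map(parquet_writer("records.parquet"))
    .to_list()
)
print(written_files)  # e.g. ["records.parquet"]
```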

File tree: 10 files changed, +130 −26 lines


README.md

Lines changed: 7 additions & 0 deletions
@@ -140,3 +140,10 @@ print(json.dumps(chain.get_counts(), indent=4))
 print(StreamStart(range(10)).flat_map(chain).to_list())
 #> [2, 10, 10]
 ```
+
+## Similar Functionality
+
+- Python has built in functionality for building iterators
+
+- [LangChain](https://www.langchain.com/) implements chained operations using its
+  [Runnable protocol](https://python.langchain.com/docs/expression_language/interface)
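
As a point of comparison for the first bullet (an illustration written for this note, not part of the repository), the standard library already supports building lazy, chained iterators:

```python
import itertools

# Chained, lazy iterator operations using only built-ins and itertools.
numbers = range(10)
evens = (x for x in numbers if x % 2 == 0)   # generator expression as a filter
doubled = map(lambda x: x * 2, evens)        # lazy map
first_three = itertools.islice(doubled, 3)   # lazy "take 3"
print(list(first_three))  # [0, 4, 8]
```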

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "pipedata"
-version = "0.1.1"
+version = "0.2"
 description = "Framework for building pipelines for data processing"
 authors = ["Simon Wicks <[email protected]>"]
 readme = "README.md"

src/pipedata/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = "0.1.1"
+__version__ = "0.2"
 
 __all__ = [
     "__version__",

src/pipedata/ops/files.py

Lines changed: 19 additions & 3 deletions
@@ -1,16 +1,32 @@
 import logging
 import zipfile
+from dataclasses import dataclass
 from typing import IO, Iterator
 
 import fsspec  # type: ignore
 
 logger = logging.getLogger(__name__)
 
 
-def zipped_files(file_refs: Iterator[str]) -> Iterator[IO[bytes]]:
+@dataclass
+class OpenedFileRef:
+    name: str
+    contents: IO[bytes]
+
+
+def zipped_files(file_refs: Iterator[str]) -> Iterator[OpenedFileRef]:
+    logger.info("Initializing zipped files reader")
     for file_ref in file_refs:
+        logger.info(f"Opening zip file at {file_ref}")
         with fsspec.open(file_ref, "rb") as file:
             with zipfile.ZipFile(file) as zip_file:
-                for name in zip_file.namelist():
+                infos = zip_file.infolist()
+                logger.info(f"Found {len(infos)} files in zip file")
+                for i, info in enumerate(infos):
+                    name = info.filename
+                    logger.info(f"Reading file {i} ({name}) from zip file")
                     with zip_file.open(name) as inner_file:
-                        yield inner_file
+                        yield OpenedFileRef(
+                            name=name,
+                            contents=inner_file,
+                        )
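
A short sketch of consuming the new return type (the archive path is hypothetical): each yielded OpenedFileRef is a small dataclass pairing the zip member's name with its open byte stream, so callers can log or branch on the name before reading.

```python
from pipedata.ops.files import zipped_files

# "archive.zip" is a placeholder path; zipped_files expects an iterator of file refs.
for file_ref in zipped_files(iter(["archive.zip"])):
    print(file_ref.name)                 # member name inside the zip
    header = file_ref.contents.read(64)  # read from the still-open member
    print(header)
```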

src/pipedata/ops/records.py

Lines changed: 30 additions & 10 deletions
@@ -1,33 +1,53 @@
 import csv
 import io
 import logging
-from typing import IO, Any, Callable, Dict, Iterator, Optional
+from typing import IO, Any, Callable, Dict, Iterator, Optional, Union
 
 import ijson  # type: ignore
 
+from .files import OpenedFileRef
+
 logger = logging.getLogger(__name__)
 
 
 def json_records(
     json_path: str = "item", multiple_values: Optional[bool] = False
-) -> Callable[[Iterator[IO[bytes]]], Iterator[Dict[str, Any]]]:
+) -> Callable[[Iterator[Union[IO[bytes], OpenedFileRef]]], Iterator[Dict[str, Any]]]:
     logger.info(f"Initializing json reader for {json_path}")
 
-    def json_records_func(json_files: Iterator[IO[bytes]]) -> Iterator[Dict[str, Any]]:
+    def json_records_func(
+        json_files: Iterator[Union[IO[bytes], OpenedFileRef]]
+    ) -> Iterator[Dict[str, Any]]:
         for json_file in json_files:
-            logger.info(f"Reading json file {json_file}")
-            records = ijson.items(json_file, json_path, multiple_values=multiple_values)
+            if isinstance(json_file, OpenedFileRef):
+                contents = json_file.contents
+                logger.info(f"Reading json file {json_file.name}")
+            else:
+                contents = json_file
+                logger.info(f"Reading json file {json_file}")
+            records = ijson.items(contents, json_path, multiple_values=multiple_values)
             yield from records
 
     return json_records_func
 
 
-def csv_records() -> Callable[[Iterator[IO[bytes]]], Iterator[Dict[str, Any]]]:
-    def csv_records_func(csv_paths: Iterator[IO[bytes]]) -> Iterator[Dict[str, Any]]:
-        for csv_path in csv_paths:
-            logger.info(f"Reading csv file {csv_path}")
+def csv_records() -> (
+    Callable[[Iterator[Union[IO[bytes], OpenedFileRef]]], Iterator[Dict[str, Any]]]
+):
+    logger.info("Initializing csv reader")
+
+    def csv_records_func(
+        csv_files: Iterator[Union[IO[bytes], OpenedFileRef]]
+    ) -> Iterator[Dict[str, Any]]:
+        for csv_file in csv_files:
+            if isinstance(csv_file, OpenedFileRef):
+                contents = csv_file.contents
+                logger.info(f"Reading csv file {csv_file.name}")
+            else:
+                contents = csv_file
+                logger.info(f"Reading csv file {csv_file}")
             csv_reader = csv.DictReader(
-                io.TextIOWrapper(csv_path, "utf-8"), delimiter=","
+                io.TextIOWrapper(contents, "utf-8"), delimiter=","
             )
             yield from csv_reader

src/pipedata/ops/storage.py

Lines changed: 14 additions & 10 deletions
@@ -1,18 +1,12 @@
+import logging
 from typing import Any, Callable, Dict, Iterator, Optional
 
 import pyarrow as pa  # type: ignore
 import pyarrow.parquet as pq  # type: ignore
 
 from pipedata.core.chain import batched
 
-# Option to accumulate the pyarrow table more frequently
-# so that doesn't need whole list(dict) and pyarrow table
-# in memory at the same time
-
-# Option to hae row_group_length and max_file_length dpendent
-# on size of data, as opposed to number of just numbers of rows.
-# Can combine this with the existing settings, so runs
-# at the smaller of the two.
+logger = logging.getLogger(__name__)
 
 
 def parquet_writer(
@@ -24,13 +18,16 @@ def parquet_writer(
     if row_group_length is None and max_file_length is not None:
         row_group_length = max_file_length
 
-    if max_file_length is not None:
+    multi_file = max_file_length is not None
+    if multi_file:
         if file_path.format(i=1) == file_path:
             msg = "When (possibly) writing to multiple files (as the file_length"
             msg += " argument is not None), the file_path argument must be a"
             msg += " format string that contains a format specifier for the file."
             raise ValueError(msg)
 
+    logger.info(f"Initializing parquet writer with {file_path=}")
+
     def parquet_writer_func(records: Iterator[Dict[str, Any]]) -> Iterator[str]:
         writer = None
         file_number = 1
@@ -39,22 +36,29 @@ def parquet_writer_func(records: Iterator[Dict[str, Any]]) -> Iterator[str]:
             table = pa.Table.from_pylist(batch, schema=schema)
             if writer is None:
                 formated_file_path = file_path
-                if max_file_length is not None:
+                if multi_file:
                     formated_file_path = file_path.format(i=file_number)
+                logger.info(f"Writing to {formated_file_path=}")
                 writer = pq.ParquetWriter(formated_file_path, table.schema)
 
             writer.write_table(table)
             file_length += len(batch)
+            logger.info(
+                f"Written {len(batch)} ({file_length} total) rows "
+                f"to {formated_file_path}"
+            )
 
             if max_file_length is not None and file_length >= max_file_length:
                 writer.close()
                 writer = None
                 file_length = 0
                 file_number += 1
+                logger.info(f"Finished writing to {formated_file_path}")
                 yield formated_file_path
 
         if writer is not None:
             writer.close()
+            logger.info(f"Final file closed at {formated_file_path}")
             yield formated_file_path
 
     return parquet_writer_func
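
A usage sketch for the multi-file branch guarded by multi_file above (record values and the output pattern are made up; the keyword names max_file_length and row_group_length are taken from the function body): when max_file_length is set, file_path must contain an {i} placeholder, otherwise the ValueError above is raised.

```python
from pipedata.ops.storage import parquet_writer

records = ({"n": i} for i in range(250))

# Roughly 100 rows per output file; "part_{i}.parquet" is a placeholder pattern.
write = parquet_writer("part_{i}.parquet", max_file_length=100)
for path in write(records):
    print(path)  # part_1.parquet, part_2.parquet, part_3.parquet

# parquet_writer("parts.parquet", max_file_length=100) would raise ValueError,
# since a single fixed name cannot hold multiple output files.
```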

src/pipedata/py.typed

Whitespace-only changes.

tests/ops/test_files.py

Lines changed: 17 additions & 1 deletion
@@ -7,6 +7,22 @@
 
 
 def test_zipped_files() -> None:
+    with tempfile.TemporaryDirectory() as temp_dir:
+        zip_path = Path(temp_dir) / "test.zip"
+
+        with zipfile.ZipFile(zip_path, "w") as zip_file:
+            zip_file.writestr("test.txt", "Hello, world 1!")
+            zip_file.writestr("test2.txt", "Hello, world 2!")
+            zip_file.writestr("test3.txt", "Hello, world 3!")
+
+        result = StreamStart([str(zip_path)]).flat_map(zipped_files).to_list()
+
+        assert result[0].name == "test.txt"
+        assert result[1].name == "test2.txt"
+        assert result[2].name == "test3.txt"
+
+
+def test_zipped_file_contents() -> None:
     with tempfile.TemporaryDirectory() as temp_dir:
         zip_path = Path(temp_dir) / "test.zip"
 
@@ -18,7 +34,7 @@ def test_zipped_files() -> None:
         result = (
             StreamStart([str(zip_path)])
            .flat_map(zipped_files)
-            .map(lambda x: x.read().decode("utf-8"))
+            .map(lambda x: x.contents.read().decode("utf-8"))
            .to_list()
        )
 

tests/ops/test_pipeline.py

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ def test_zipped_files() -> None:
         result = (
             StreamStart([str(zip_path)])
             .flat_map(zipped_files)
+            .map(lambda x: x.contents)
             .flat_map(json_records())
             .flat_map(parquet_writer(str(output_path)))
             .to_list()

tests/ops/test_records.py

Lines changed: 40 additions & 0 deletions
@@ -2,6 +2,7 @@
 import json
 
 from pipedata.core import StreamStart
+from pipedata.ops.files import OpenedFileRef
 from pipedata.ops.records import csv_records, json_records
 
 
@@ -17,6 +18,23 @@ def test_json_records() -> None:
     assert result == expected
 
 
+def test_json_records_from_file_ref() -> None:
+    json1 = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
+    json2 = [{"a": 5, "b": 6}, {"a": 7, "b": 8}]
+
+    file1 = io.BytesIO(json.dumps(json1).encode("utf-8"))
+    file2 = io.BytesIO(json.dumps(json2).encode("utf-8"))
+
+    file_refs = [
+        OpenedFileRef(name="test1.json", contents=file1),
+        OpenedFileRef(name="test2.json", contents=file2),
+    ]
+
+    result = StreamStart(file_refs).flat_map(json_records()).to_list()
+    expected = json1 + json2
+    assert result == expected
+
+
 def test_csv_records() -> None:
     csv1 = "a,b\n1,2\n3,4"
     csv2 = "a,b\n5,6\n7,8"
@@ -32,3 +50,25 @@ def test_csv_records() -> None:
         {"a": "7", "b": "8"},
     ]
     assert result == expected
+
+
+def test_csv_records_from_file_ref() -> None:
+    csv1 = "a,b\n1,2\n3,4"
+    csv2 = "a,b\n5,6\n7,8"
+
+    file1 = io.BytesIO(csv1.encode("utf-8"))
+    file2 = io.BytesIO(csv2.encode("utf-8"))
+
+    file_refs = [
+        OpenedFileRef(name="test1.csv", contents=file1),
+        OpenedFileRef(name="test2.csv", contents=file2),
+    ]
+
+    result = StreamStart(file_refs).flat_map(csv_records()).to_list()
+    expected = [
+        {"a": "1", "b": "2"},
+        {"a": "3", "b": "4"},
+        {"a": "5", "b": "6"},
+        {"a": "7", "b": "8"},
+    ]
+    assert result == expected
