Skip to content

Commit 8ae76d2

Browse files
authored
Refactor in order to be able to get more data about the operations run in the pipeline (#4)
* Add function to get the counts of all stages of the pipeline
* Update README, fix tuple -> Tuple, add checks that zero values don't end the stream
1 parent 4b2bea4 commit 8ae76d2

File tree

6 files changed

+402
-95
lines changed

6 files changed

+402
-95
lines changed

README.md

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,72 @@
11
# pipedata
22

3-
A framework for building pipelines for data processing
3+
Chained operations in Python, applied to data processing.
4+
5+
## Installation
6+
7+
```bash
8+
pip install pipedata
9+
```
10+
11+
## An Example
12+
13+
### Core Framework
14+
15+
The core framework provides the building blocks for chaining operations.
16+
17+
Running a stream:
18+
```py
19+
from pipedata.core import StreamStart
20+
21+
22+
result = (
23+
StreamStart(range(10))
24+
.filter(lambda x: x % 2 == 0)
25+
.map(lambda x: x ^ 2)  # note: ^ is bitwise XOR (not a power), so 0, 2, 4, 6, 8 -> 2, 0, 6, 4, 10
26+
.map_tuple(lambda x: x, 2)
27+
.to_list()
28+
)
29+
print(result)
30+
#> [(2, 0), (6, 4), (10,)]
31+
```
32+
33+
Creating a chain and then using it:
34+
```py
35+
import json
36+
from pipedata.core import ChainStart, Stream, StreamStart
37+
38+
39+
chain = (
40+
ChainStart()
41+
.filter(lambda x: x % 2 == 0)
42+
.map(lambda x: x ^ 2)  # note: ^ is bitwise XOR (not a power), so 0, 2, 4, 6, 8 -> 2, 0, 6, 4, 10
43+
.map_tuple(lambda x: sum(x), 2)
44+
)
45+
print(Stream(range(10), chain).to_list())
46+
#> [2, 10, 10]
47+
print(json.dumps(chain.get_counts(), indent=4))
48+
#> [
49+
#> {
50+
#> "name": "_identity",
51+
#> "inputs": 10,
52+
#> "outputs": 10
53+
#> },
54+
#> {
55+
#> "name": "<lambda>",
56+
#> "inputs": 10,
57+
#> "outputs": 5
58+
#> },
59+
#> {
60+
#> "name": "<lambda>",
61+
#> "inputs": 5,
62+
#> "outputs": 5
63+
#> },
64+
#> {
65+
#> "name": "<lambda>",
66+
#> "inputs": 5,
67+
#> "outputs": 3
68+
#> }
69+
#> ]
70+
print(StreamStart(range(10)).flat_map(chain).to_list())
71+
#> [2, 10, 10]
72+
```

src/pipedata/core/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
from .chain import Chain
2-
from .stream import Stream, start_stream
1+
from .chain import Chain, ChainStart
2+
from .stream import Stream, StreamStart
33

44
__all__ = [
55
"Chain",
6+
"ChainStart",
67
"Stream",
7-
"start_stream",
8+
"StreamStart",
89
]

src/pipedata/core/chain.py

Lines changed: 135 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,83 +1,192 @@
11
from __future__ import annotations
22

3-
from typing import Callable, Generic, Iterator, Optional, Tuple, TypeVar
3+
from dataclasses import dataclass
4+
from typing import (
5+
Any,
6+
Callable,
7+
Dict,
8+
Generic,
9+
Iterator,
10+
List,
11+
Optional,
12+
Tuple,
13+
TypeVar,
14+
Union,
15+
cast,
16+
overload,
17+
)
418

519
from pipedata.core.itertools import take_next, take_up_to_n
620

721
TStart = TypeVar("TStart")
822
TEnd = TypeVar("TEnd")
9-
TNewEnd = TypeVar("TNewEnd")
23+
TOther = TypeVar("TOther")
1024

1125

1226
def _identity(input_iterator: Iterator[TEnd]) -> Iterator[TEnd]:
13-
while element := take_next(input_iterator):
27+
while (element := take_next(input_iterator)) is not None:
1428
yield element
1529

1630

17-
class Chain(Generic[TStart, TEnd]):
18-
def __init__(self, action: Callable[[Iterator[TStart]], Iterator[TEnd]]) -> None:
19-
self._action = action
31+
class CountingIterator(Iterator[TStart]):
32+
def __init__(self, iterator: Iterator[TStart]) -> None:
33+
self._iterator = iterator
34+
self._count = 0
35+
36+
def __iter__(self) -> Iterator[TStart]:
37+
return self
38+
39+
def __next__(self) -> TStart:
40+
self._count += 1
41+
try:
42+
return next(self._iterator)
43+
except StopIteration as err:
44+
self._count -= 1
45+
raise StopIteration from err
46+
47+
def get_count(self) -> int:
48+
return self._count
49+
50+
51+
class CountedFunc(Generic[TStart, TEnd]):
52+
def __init__(
53+
self,
54+
func: Callable[[Iterator[TStart]], Iterator[TEnd]],
55+
) -> None:
56+
self._func = func
57+
self._counting_input: Optional[CountingIterator[TStart]] = None
58+
self._counting_output: Optional[CountingIterator[TEnd]] = None
59+
60+
@property
61+
def __name__(self) -> str: # noqa: A003
62+
return self._func.__name__
63+
64+
def __call__(self, input_iterator: Iterator[TStart]) -> Iterator[TEnd]:
65+
self._counting_input = CountingIterator(input_iterator)
66+
self._counting_output = CountingIterator(self._func(self._counting_input))
67+
return self._counting_output
2068

21-
@classmethod
22-
def start(cls) -> Chain[TEnd, TEnd]:
23-
return Chain(_identity)
69+
def get_counts(self) -> Tuple[int, int]:
70+
return (
71+
0 if self._counting_input is None else self._counting_input.get_count(),
72+
0 if self._counting_output is None else self._counting_output.get_count(),
73+
)
2474

25-
def __call__(self, previous_step: Iterator[TStart]) -> Iterator[TEnd]:
26-
return self._action(previous_step)
75+
76+
@dataclass
77+
class StepCount:
78+
name: str
79+
inputs: int
80+
outputs: int
81+
82+
83+
class Chain(Generic[TStart, TEnd]):
84+
@overload
85+
def __init__(
86+
self,
87+
previous_steps: Chain[TStart, TOther],
88+
func: Callable[[Iterator[TOther]], Iterator[TEnd]],
89+
):
90+
...
91+
92+
@overload
93+
def __init__(
94+
self,
95+
previous_steps: None,
96+
func: Callable[[Iterator[TStart]], Iterator[TEnd]],
97+
):
98+
...
99+
100+
def __init__(
101+
self,
102+
previous_steps: Optional[Chain[TStart, TOther]],
103+
func: Union[
104+
Callable[[Iterator[TOther]], Iterator[TEnd]],
105+
Callable[[Iterator[TStart]], Iterator[TEnd]],
106+
],
107+
) -> None:
108+
self._previous_steps = previous_steps
109+
self._func = CountedFunc(func)
110+
111+
def __call__(self, input_iterator: Iterator[TStart]) -> Iterator[TEnd]:
112+
if self._previous_steps is None:
113+
func = cast(CountedFunc[TStart, TEnd], self._func)
114+
return func(input_iterator)
115+
116+
return self._func(self._previous_steps(input_iterator)) # type: ignore
27117

28118
def flat_map(
29-
self, func: Callable[[Iterator[TEnd]], Iterator[TNewEnd]]
30-
) -> Chain[TStart, TNewEnd]:
119+
self, func: Callable[[Iterator[TEnd]], Iterator[TOther]]
120+
) -> Chain[TStart, TOther]:
31121
"""
32122
Output zero or more elements from one or more input elements.
33123
34124
This is a fully general operation that can arbitrarily transform the
35125
stream of elements. It is the most powerful operation, and all the
36126
other operations are implemented in terms of it.
37127
"""
38-
39-
def new_action(previous_step: Iterator[TStart]) -> Iterator[TNewEnd]:
40-
return func(self._action(previous_step))
41-
42-
return Chain(new_action)
128+
return Chain(self, func)
43129

44130
def filter(self, func: Callable[[TEnd], bool]) -> Chain[TStart, TEnd]: # noqa: A003
45131
"""
46132
Remove elements from the stream that do not pass the filter function.
47133
"""
48134

49135
def new_action(previous_step: Iterator[TEnd]) -> Iterator[TEnd]:
50-
while element := take_next(previous_step):
136+
while (element := take_next(previous_step)) is not None:
51137
if func(element) is True:
52138
yield element
53139

140+
new_action.__name__ = func.__name__
54141
return self.flat_map(new_action)
55142

56143
def map( # noqa: A003
57-
self, func: Callable[[TEnd], TNewEnd]
58-
) -> Chain[TStart, TNewEnd]:
144+
self, func: Callable[[TEnd], TOther]
145+
) -> Chain[TStart, TOther]:
59146
"""
60147
Return a single transformed element from each input element.
61148
"""
62149

63-
def new_action(previous_step: Iterator[TEnd]) -> Iterator[TNewEnd]:
64-
while element := take_next(previous_step):
150+
def new_action(previous_step: Iterator[TEnd]) -> Iterator[TOther]:
151+
while (element := take_next(previous_step)) is not None:
65152
yield func(element)
66153

154+
new_action.__name__ = func.__name__
67155
return self.flat_map(new_action)
68156

69157
def map_tuple(
70-
self, func: Callable[[Tuple[TEnd, ...]], TNewEnd], n: Optional[int] = None
71-
) -> Chain[TStart, TNewEnd]:
158+
self, func: Callable[[Tuple[TEnd, ...]], TOther], n: Optional[int] = None
159+
) -> Chain[TStart, TOther]:
72160
"""
73161
Return a single transformed element from (up to) n input elements.
74162
75163
If n is None, then apply the function to all the elements, and return
76164
an iterator of 1 element.
77165
"""
78166

79-
def new_action(previous_step: Iterator[TEnd]) -> Iterator[TNewEnd]:
167+
def new_action(previous_step: Iterator[TEnd]) -> Iterator[TOther]:
80168
while elements := take_up_to_n(previous_step, n):
81169
yield func(elements)
82170

171+
new_action.__name__ = func.__name__
83172
return self.flat_map(new_action)
173+
174+
def get_counts(self) -> List[Dict[str, Any]]:
175+
step_counts = []
176+
if self._previous_steps is not None:
177+
step_counts = self._previous_steps.get_counts()
178+
179+
inputs, outputs = self._func.get_counts()
180+
step_counts.append(
181+
{
182+
"name": self._func.__name__,
183+
"inputs": inputs,
184+
"outputs": outputs,
185+
}
186+
)
187+
return step_counts
188+
189+
190+
class ChainStart(Chain[TOther, TOther]):
191+
def __init__(self) -> None:
192+
super().__init__(None, _identity)

src/pipedata/core/stream.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import functools
44
import itertools
55
from typing import (
6+
Any,
67
Callable,
8+
Dict,
79
Iterable,
810
Iterator,
911
List,
@@ -13,7 +15,7 @@
1315
overload,
1416
)
1517

16-
from .chain import Chain
18+
from .chain import Chain, ChainStart
1719

1820
TStart = TypeVar("TStart")
1921
TEnd = TypeVar("TEnd")
@@ -69,7 +71,10 @@ def to_list(
6971
) -> List[TEnd]:
7072
return list(itertools.islice(self, stop))
7173

74+
def get_counts(self) -> List[Dict[str, Any]]:
75+
return self._chain.get_counts()
7276

73-
def start_stream(items: Iterable[TStart]) -> Stream[TStart]:
74-
chain = Chain[TStart, TStart].start()
75-
return Stream(items, chain)
77+
78+
class StreamStart(Stream[TEnd]):
79+
def __init__(self, items: Iterable[TEnd]) -> None:
80+
super().__init__(items, ChainStart[TEnd]())

0 commit comments

Comments
 (0)