Skip to content

Commit 0738794

Browse files
committed
v2.1.15 dataflows>=0.0.65 with deduplicate processor
1 parent c218677 commit 0738794

File tree

5 files changed

+148
-2
lines changed

5 files changed

+148
-2
lines changed

README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,27 @@ Filtering just American and European countries, leaving out countries whose main
740740
sort-by: "{country_name}"
741741
```
742742

743+
### ***`deduplicate`***
744+
745+
Deduplicates rows in resources based on the resources' primary key
746+
747+
`deduplicate` accepts a resource specifier - for each matching resource, it will output only unique rows (based on the values in the primary key fields). When several rows share the same primary key, only the first one is kept; the later ones are dropped.
748+
749+
_Parameters_:
750+
751+
- `resources` - Which resources to deduplicate. Same semantics as `resources` in `stream_remote_resources`.
752+
753+
*Examples*:
754+
755+
Deduplicating rows in the `world_population` resource.
756+
757+
```yaml
758+
- run: deduplicate
759+
parameters:
760+
resources: world_population
761+
```
762+
763+
743764
### ***`duplicate`***
744765

745766
Duplicate a resource.

datapackage_pipelines/VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.1.14
1+
2.1.15
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from dataflows import Flow, deduplicate
2+
from datapackage_pipelines.wrapper import ingest
3+
from datapackage_pipelines.utilities.flow_utils import spew_flow
4+
5+
6+
def flow(parameters):
    """Build the dataflows ``Flow`` for the ``deduplicate`` processor.

    ``parameters`` is the step's parameter mapping; its optional
    ``resources`` entry is forwarded unchanged to ``deduplicate`` as the
    resource specifier.
    """
    selected_resources = parameters.get('resources')
    dedup_step = deduplicate(resources=selected_resources)
    return Flow(dedup_step)
12+
13+
14+
if __name__ == '__main__':
    # Script entry point: take the step context from ingest(), build the
    # flow from its parameters, and hand both to spew_flow.
    with ingest() as context:
        dedup_flow = flow(context.parameters)
        spew_flow(dedup_flow, context)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def read(*paths):
3939
'cachetools',
4040
'tabulator>=1.17.0',
4141
'globster>=0.1.0',
42-
'dataflows>=0.0.57',
42+
'dataflows>=0.0.65',
4343
'python-dateutil<2.8.1',
4444
]
4545
SPEEDUP_REQUIRES = [
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
deduplicate
2+
--
3+
{
4+
"resources": ["concat-a1", "concat-a2"]
5+
}
6+
--
7+
{
8+
"name": "test",
9+
"resources": [
10+
{
11+
"name": "concat-a1",
12+
"dpp:streaming": true,
13+
"path": "concat-a1.csv",
14+
"schema": { "fields": [
15+
{"name": "a1", "type": "string"},
16+
{"name": "a2", "type": "string"},
17+
{"name": "a3", "type": "string"}
18+
], "primaryKey": ["a1", "a2"]}
19+
},
20+
{
21+
"name": "concat-a2",
22+
"dpp:streaming": true,
23+
"path": "concat-a2.csv",
24+
"schema": { "fields": [
25+
{"name": "a1", "type": "string"},
26+
{"name": "a2", "type": "string"},
27+
{"name": "a3", "type": "string"}
28+
]}
29+
},
30+
{
31+
"name": "concat-c",
32+
"dpp:streaming": true,
33+
"path": "concat-c.csv",
34+
"schema": { "fields": [
35+
{"name": "c1", "type": "string"},
36+
{"name": "c2", "type": "string"},
37+
{"name": "c3", "type": "string"}
38+
]}
39+
}
40+
]
41+
}
42+
--
43+
{"a1":"a1","a2":"a1","a3":"a2"}
44+
{"a1":"a2","a2":"a1","a3":"a1"}
45+
{"a1":"a1","a2":"a1","a3":"a2"}
46+
{"a1":"a2","a2":"a1","a3":"a1"}
47+
48+
{"a1":"a1","a2":"a3","a3":"a2"}
49+
{"a1":"a2","a2":"a3","a3":"a1"}
50+
{"a1":"a3","a2":"a4","a3":"a2"}
51+
{"a1":"a4","a2":"a4","a3":"a1"}
52+
53+
{"c1":"c11","c2":"c21","c3":"c31"}
54+
{"c1":"c12","c2":"c22","c3":"c32"}
55+
{"c1":"c13","c2":"c23","c3":"c33"}
56+
--
57+
{
58+
"name": "test",
59+
"profile": "data-package",
60+
"resources": [
61+
{
62+
"name": "concat-a1",
63+
"dpp:streaming": true,
64+
"path": "concat-a1.csv",
65+
"profile": "data-resource",
66+
"schema": { "fields": [
67+
{"name": "a1", "type": "string"},
68+
{"name": "a2", "type": "string"},
69+
{"name": "a3", "type": "string"}
70+
], "primaryKey": ["a1", "a2"]}
71+
},
72+
{
73+
"name": "concat-a2",
74+
"dpp:streaming": true,
75+
"path": "concat-a2.csv",
76+
"profile": "data-resource",
77+
"schema": { "fields": [
78+
{"name": "a1", "type": "string"},
79+
{"name": "a2", "type": "string"},
80+
{"name": "a3", "type": "string"}
81+
]}
82+
},
83+
{
84+
"name": "concat-c",
85+
"dpp:streaming": true,
86+
"path": "concat-c.csv",
87+
"profile": "data-resource",
88+
"schema": { "fields": [
89+
{"name": "c1", "type": "string"},
90+
{"name": "c2", "type": "string"},
91+
{"name": "c3", "type": "string"}
92+
]}
93+
}
94+
]
95+
}
96+
--
97+
{"a1":"a1","a2":"a1","a3":"a2"}
98+
{"a1":"a2","a2":"a1","a3":"a1"}
99+
100+
{"a1":"a1","a2":"a3","a3":"a2"}
101+
{"a1":"a2","a2":"a3","a3":"a1"}
102+
{"a1":"a3","a2":"a4","a3":"a2"}
103+
{"a1":"a4","a2":"a4","a3":"a1"}
104+
105+
{"c1":"c11","c2":"c21","c3":"c31"}
106+
{"c1":"c12","c2":"c22","c3":"c32"}
107+
{"c1":"c13","c2":"c23","c3":"c33"}
108+
109+
{}

0 commit comments

Comments
 (0)