Skip to content
This repository was archived by the owner on May 5, 2019. It is now read-only.

Commit 20c71d6

Browse files
cjprybol authored and nalimilan committed
Enhance joining and grouping (#17)
Using a hashing approach rather than converting all columns to categorical arrays. Based on work by @alyst in DataFrames.
1 parent 9902e6e commit 20c71d6

File tree

20 files changed

+640
-453
lines changed

20 files changed

+640
-453
lines changed

REQUIRE

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
julia 0.5
22
NullableArrays 0.1.0
3-
CategoricalArrays 0.0.6
3+
CategoricalArrays 0.1.2
44
StatsBase 0.11.0
55
GZip
66
SortingAlgorithms

docs/src/man/joins.md

+5-5
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@ Cross joins are the only kind of join that does not use a key:
5151
join(a, b, kind = :cross)
5252
```
5353

54-
In order to join data frames on keys which have different names, you must first rename them so that they match. This can be done using rename!:
54+
In order to join data tables on keys which have different names, you must first rename them so that they match. This can be done using rename!:
5555

5656
```julia
5757
a = DataTable(ID = [1, 2], Name = ["A", "B"])
@@ -63,11 +63,11 @@ join(a, b, on = :ID, kind = :inner)
6363
Or renaming multiple columns at a time:
6464

6565
```julia
66-
a = DataTable(City = ["Amsterdam", "London", "London", "New York", "New York"],
67-
Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
66+
a = DataTable(City = ["Amsterdam", "London", "London", "New York", "New York"],
67+
Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
6868
Category = [1, 2, 3, 4, 5])
69-
b = DataTable(Location = ["Amsterdam", "London", "London", "New York", "New York"],
70-
Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
69+
b = DataTable(Location = ["Amsterdam", "London", "London", "New York", "New York"],
70+
Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"],
7171
Name = ["a", "b", "c", "d", "e"])
7272
rename!(b, [:Location => :City, :Work => :Job])
7373
join(a, b, on = [:City, :Job])

src/DataTables.jl

+1
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,7 @@ for (dir, filename) in [
104104
("subdatatable", "subdatatable.jl"),
105105
("groupeddatatable", "grouping.jl"),
106106
("datatablerow", "datatablerow.jl"),
107+
("datatablerow", "utils.jl"),
107108

108109
("abstractdatatable", "iteration.jl"),
109110
("abstractdatatable", "join.jl"),

src/abstractdatatable/abstractdatatable.jl

+7-10
Original file line number | Diff line number | Diff line change
@@ -602,17 +602,14 @@ nonunique(dt, 1)
602602
603603
"""
604604
function nonunique(dt::AbstractDataTable)
605-
res = fill(false, nrow(dt))
606-
rows = Set{DataTableRow}()
607-
for i in 1:nrow(dt)
608-
arow = DataTableRow(dt, i)
609-
if in(arow, rows)
610-
res[i] = true
611-
else
612-
push!(rows, arow)
613-
end
605+
gslots = row_group_slots(dt)[3]
606+
# unique rows are the first encountered group representatives,
607+
# nonunique are everything else
608+
res = fill(true, nrow(dt))
609+
@inbounds for g_row in gslots
610+
(g_row > 0) && (res[g_row] = false)
614611
end
615-
res
612+
return res
616613
end
617614

618615
nonunique(dt::AbstractDataTable, cols::Union{Real, Symbol}) = nonunique(dt[[cols]])

0 commit comments

Comments
 (0)