From fc42eb95feb617e11d3372fd6fc12358410b4611 Mon Sep 17 00:00:00 2001 From: MaciekLeks Date: Sun, 17 Jan 2016 13:33:59 +0100 Subject: [PATCH 1/4] `reorder` and `reorder!` functions implementation --- src/DataArrays.jl | 1 + src/pooleddataarray.jl | 79 ++++++++++++++++++++++++++++++++++++++++- test/pooleddataarray.jl | 20 +++++++++++ 3 files changed, 99 insertions(+), 1 deletion(-) diff --git a/src/DataArrays.jl b/src/DataArrays.jl index 69d6687..73298c1 100644 --- a/src/DataArrays.jl +++ b/src/DataArrays.jl @@ -46,6 +46,7 @@ module DataArrays PooledDataVector, reldiff, reorder, + reorder!, rep, replace!, setlevels!, diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index fe86867..84dce60 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -433,9 +433,86 @@ function setlevels!{T,R}(x::PooledDataArray{T,R}, d::Dict{T,Any}) # this version setlevels!(x, newpool) end +############################################################################## +## +## reorder() +## +############################################################################## + + reorder(x::PooledDataArray) = PooledDataArray(x, sort(levels(x))) # just re-sort the pool -reorder(x::PooledDataArray, y::AbstractVector...) = reorder(mean, x, y...) +""" + `tmp_reorder(pda,newpool)` reorders the current pool and references related to that pool. A new pool must be a subset of the + old one. If you want to change pool identifiers, use `setlevels` first, before using `reorder`. + + Pre-Condition: + - `newpoll` ⊆ pda.pool + + Input: + - `pda` reference object to be used to contruct a new one + - `newpool` to replace the current one + + Output: + A new PooledDataObject object +""" +reorder{T,R<:Integer,N}(pda::PooledDataArray{T,R,N}, newpool::Vector{T}) = begin + newpda = copy(pda) + reorder!(newpda, newpool) +end + +""" + `tmp_reorder!(pda,newpool)` reorders the current pool and references related to that pool. A new pool must be a subset of the + old one.
If you want to change pool identifiers, use `setlevels` first, before using `reorder!`. + + Pre-Condition: + - `newpoll` ⊆ pda.pool + + Input: + - `pda` PooledDataArray to be changed + - `newpool` to replace the current one + + Output: + Current `pda` object +""" +reorder!{T,R<:Integer,N}(pda::PooledDataArray{T,R,N}, newpool::Vector{T}) = begin + #pre-condition + if !issubset(newpool, pda.pool) + println("is not subset newpool:$newpool") + throw(ArgumentError("A new pool must be a subset of the current one.")) + end + + # rebuild old poolref + oldpoolref = Dict{T, R}() + + # loop through oldpool to fill the oldpoolref dict + for i = 1:length(pda.pool) + oldpoolref[pda.pool[i]] = i + end + + newpoolref = Dict{T, R}() + for i=1:length(newpool) + newpoolref[newpool[i]] = i + end + + # map old to new refs for the pre-condition: new pool ⊆ old pool + refmap = Dict{R,R}() + for oldref in keys(oldpoolref) + refmap[oldpoolref[oldref]] = get(newpoolref, oldref, 0) + end + + # fill-in newrefs + for i = 1:length(pda.refs) + pda.refs[i] = get(refmap, pda.refs[i], 0) + end + + pda.pool = newpool + + return pda +end + +# commented due to #167 issue +#reorder(x::PooledDataArray, y::AbstractVector...) = reorder(mean, x, y...) ### FIXME: this can't work because we don't know about DataFrames # reorder(fun::Function, x::PooledDataArray, y::AbstractVector...) 
= diff --git a/test/pooleddataarray.jl b/test/pooleddataarray.jl index f747b76..10a081a 100644 --- a/test/pooleddataarray.jl +++ b/test/pooleddataarray.jl @@ -51,6 +51,7 @@ module TestPDA pim = @pdata [1 + im, 2 + im, 3 + im, 2 + im, 1 + im] @assert levels(pim) == [1 + im, 2 + im, 3 + im] + # Test explicitly setting refs type testarray = [1, 1, 2, 2, 0, 0, 3, 3] testdata = @data [1, 1, 2, 2, 0, 0, 3, 3] @@ -107,4 +108,23 @@ module TestPDA pda = @pdata([NA, "A", "B", "C", "A", "B"]) @test isequal(Base.permute!!(copy(pda), [2, 5, 3, 6, 4, 1]), @pdata(["A", "A", "B", "B", "C", NA])) @test isequal(Base.ipermute!!(copy(pda), [6, 1, 3, 5, 2, 4]), @pdata(["A", "A", "B", "B", "C", NA])) + + #1. reordering levels + pda = @pdata(["high" , "medium" , "low" , "high" , NA, "medium"]) + #1.1 positive scenarios + @test isequal(pda.pool, Vector{eltype(pda.pool)}(["high", "low", "medium"])) #alphabetically + @test isequal(pda.refs, Vector{eltype(pda.refs)}([1,3,2,1,0,3])) #high is 1, medium is 3, low is 2 according to alphabetical order + + reorder!(pda, ["low","medium","high"]) #reorder according to e.g.
visual plot needs + @test isequal(pda.pool, Vector{eltype(pda.pool)}(["low", "medium", "high"])) #semantic order + @test isequal(pda.refs, Vector{eltype(pda.refs)}([3,2,1,3,0,2])) + + reorder!(pda, ["low","medium"]) + @test isequal(pda.pool, Vector{eltype(pda.pool)}(["low", "medium"])) #semantic order + @test isequal(pda.refs, Vector{eltype(pda.refs)}([0,2,1,0,0,2])) + + newpda = reorder(pda, ["low"]) + @test newpda !== pda + #1.2 negative scenarios + @test_throws ArgumentError reorder!(pda, ["very low","very high"]) #new levels must be a subset of the original one end From ef469fb70c82e56993d3c00de8496808a93e8493 Mon Sep 17 00:00:00 2001 From: MaciekLeks Date: Sun, 17 Jan 2016 13:59:11 +0100 Subject: [PATCH 2/4] `reorder` and `reorder!` functions implementation --- src/pooleddataarray.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index 84dce60..3723f5a 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -443,7 +443,7 @@ end reorder(x::PooledDataArray) = PooledDataArray(x, sort(levels(x))) # just re-sort the pool """ - `tmp_reorder(pda,newpool)` reorders the current pool and references related to that pool. A new pool must be a subset of the + `reorder(pda,newpool)` reorders the current pool and references related to that pool. A new pool must be a subset of the old one. If you want to change pool identifiers, use `setlevels` first, before using `reorder`. Pre-Condition: @@ -462,7 +462,7 @@ reorder{T,R<:Integer,N}(pda::PooledDataArray{T,R,N}, newpool::Vector{T}) = begin end """ - `tmp_reorder!(pda,newpool)` reorders the current pool and references related to that pool. A new pool must be a subset of the + `reorder!(pda,newpool)` reorders the current pool and references related to that pool. A new pool must be a subset of the old one. If you want to change pool identifiers, use `setlevels` first, before using `reorder!`.
Pre-Condition: From 86f514b309b959f44c7120b017bad4d4ad56ffd5 Mon Sep 17 00:00:00 2001 From: MaciekLeks Date: Sun, 17 Jan 2016 14:46:03 +0100 Subject: [PATCH 3/4] Update pooleddataarray.jl removal of an excess `println` in `reorder!` --- src/pooleddataarray.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index 3723f5a..4fdf108 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -478,7 +478,6 @@ end reorder!{T,R<:Integer,N}(pda::PooledDataArray{T,R,N}, newpool::Vector{T}) = begin #pre-condition if !issubset(newpool, pda.pool) - println("is not subset newpool:$newpool") throw(ArgumentError("A new pool must be a subset of the current one.")) end From 24425c8e01820cc609c2223c677a5e91dd0a21f5 Mon Sep 17 00:00:00 2001 From: MaciekLeks Date: Mon, 18 Jan 2016 10:05:47 +0100 Subject: [PATCH 4/4] Changing `reorder` impl to be compatible with existing methods (i.e. PooledDataArray{S,R,N}(x::PooledDataArray{S,R,N}, newpool::Vector{S})) --- src/pooleddataarray.jl | 72 ++++++++++++++++------------------------- test/pooleddataarray.jl | 5 ++- 2 files changed, 32 insertions(+), 45 deletions(-) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index 3723f5a..ebee5de 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -439,75 +439,59 @@ end ## ############################################################################## +""" + `reorder(x)` re-sorts the pool of `x` and remaps the references related to that pool, using the sorted (e.g. alphabetical) order of its current levels. -reorder(x::PooledDataArray) = PooledDataArray(x, sort(levels(x))) # just re-sort the pool + Input: + - `x` reference object to be used to construct a new one + - (no `newpool` argument: the new pool is `sort(levels(x))`) + Output: + A new PooledDataArray object """ - `reorder(pda,newpool)` reorders the current pool and references related to that pool. A new pool must be a subset of the - old one.
If you want to change pool identifiers, use `setlevels` first, before using `reorder`. +reorder(x::PooledDataArray) = PooledDataArray(x, sort(levels(x))) # just re-sort the pool - Pre-Condition: - - `newpoll` ⊆ pda.pool +""" + `reorder(pda,newpool)` reorders the current pool and references related to that pool. A new pool should be a subset of the + old one(see `inclusioncheck` argument). If you want to change pool identifiers, use `setlevels` first, before using `reorder`. Input: - `pda` reference object to be used to contruct a new one - `newpool` to replace the current one + - `inclusioncheck` (default true) checks whether `newpoll` ⊆ `pda.pool` Output: A new PooledDataObject object """ -reorder{T,R<:Integer,N}(pda::PooledDataArray{T,R,N}, newpool::Vector{T}) = begin - newpda = copy(pda) - reorder!(newpda, newpool) +reorder(pda::PooledDataArray, newpool::AbstractVector, inclusioncheck=true) = begin + inclusioncheck && !issubset(newpool, pda.pool) && throw(ArgumentError("A new pool must be a subset of the current one.")) + + PooledDataArray(pda, newpool) end """ - `reorder!(pda,newpool)` reorders the current pool and references related to that pool. A new pool must be a subset of the - old one. If you want to change pool identifiers, use `setlevels` first, before using `reorder!`. - - Pre-Condition: - - `newpoll` ⊆ pda.pool + `reorder!(pda,newpool)` reorders the current pool and references related to that pool. A new pool should be a subset of the + old one(see `inclusioncheck` argument). If you want to change pool identifiers, use `setlevels` first, before using `reorder!`. 
Input: - `pda` PooledDataArray to be changed - `newpool` to replace the current one + - `inclusioncheck` (default true) checks whether `newpoll` ⊆ `pda.pool` Output: Current `pda` object """ -reorder!{T,R<:Integer,N}(pda::PooledDataArray{T,R,N}, newpool::Vector{T}) = begin - #pre-condition - if !issubset(newpool, pda.pool) - println("is not subset newpool:$newpool") - throw(ArgumentError("A new pool must be a subset of the current one.")) - end - - # rebuild old poolref - oldpoolref = Dict{T, R}() - - # loop through oldpool to fill the oldpoolref dict - for i = 1:length(pda.pool) - oldpoolref[pda.pool[i]] = i - end - - newpoolref = Dict{T, R}() - for i=1:length(newpool) - newpoolref[newpool[i]] = i - end - - # map old to new refs for the pre-condition: new pool ⊆ old pool - refmap = Dict{R,R}() - for oldref in keys(oldpoolref) - refmap[oldpoolref[oldref]] = get(newpoolref, oldref, 0) - end +reorder!{T,R<:Integer,N}(pda::PooledDataArray{T,R,N}, newpool::Vector{T}, inclusioncheck=true) = begin + inclusioncheck && !issubset(newpool, pda.pool) && throw(ArgumentError("A new pool must be a subset of the current one.")) - # fill-in newrefs - for i = 1:length(pda.refs) - pda.refs[i] = get(refmap, pda.refs[i], 0) + tidx::Array{R} = findat(newpool, pda.pool) + oldrefs = pda.refs + for i in 1:length(oldrefs) + if oldrefs[i] != 0 + oldrefs[i] = tidx[oldrefs[i]] + end end - - pda.pool = newpool - + pda.pool = newpool return pda end diff --git a/test/pooleddataarray.jl b/test/pooleddataarray.jl index 10a081a..6f48572 100644 --- a/test/pooleddataarray.jl +++ b/test/pooleddataarray.jl @@ -126,5 +126,8 @@ module TestPDA newpda = reorder(pda, ["low"]) @test newpda !== pda #1.2 negative scenarios - @test_throws ArgumentError reorder!(pda, ["very low","very high"]) #new levels must be a subset of the original one + pda = @pdata(["high" , "medium" , "low" , "high" , NA, "medium"]) + @test_throws ArgumentError reorder(pda, ["very low","very high"]) #new levels must be a subset of the 
original one + reorder!(pda, ["new low","new medium"], false) #don't check inclusion and change level names + @test isequal(pda.refs, Vector{eltype(pda.refs)}([0,0,0,0,0,0])) #we have a mess, it's not reordering end