Add unique! (#20619)

JackDevine · quinnj · commit 886cace4e802 · 2017-06-19T16:33:55.000-06:00
diff --git a/NEWS.md b/NEWS.md
@@ -77,6 +77,8 @@ Library improvements
   * `logspace` now accepts a `base` keyword argument to specify the base of the logarithmic
     range. The base defaults to 10 ([#22310]).
 
+  * Added `unique!`, an in-place version of `unique` ([#20549]).
+
 Compiler/Runtime improvements
 -----------------------------
 
diff --git a/base/exports.jl b/base/exports.jl
@@ -719,6 +719,7 @@ export
     symdiff,
     union!,
     union,
+    unique!,
     unique,
     values,
     valtype,
diff --git a/base/set.jl b/base/set.jl
@@ -202,6 +202,94 @@ function unique(f::Callable, C)
     out
 end
 
+# Dedup for ungrouped `A`: all elements seen so far are tracked in a `Set`, and each first
+# occurrence is moved to the front. Empty `A` is returned as is (nothing was kept).
+function _unique!(A::AbstractVector)
+    isempty(A) && return A  # nothing to compact; avoids the final `resize!`
+    seen = Set{eltype(A)}()
+    idxs = eachindex(A)
+    i = state = start(idxs)
+    for x in A
+        x in seen && continue
+        push!(seen, x)
+        i, state = next(idxs, state)  # write cursor; advances once per kept element
+        A[i] = x
+    end
+    resize!(A, i - first(idxs) + 1)
+end
+
+# Dedup for grouped `A`, i.e. when all copies of a value are contiguous. Only the value
+# kept last has to be remembered: an element that differs from it starts a new group and
+# is written to the next free slot at the front of `A`. Once the pass is over, `A` is
+# resized to the number of groups that were found.
+function _groupedunique!(A::AbstractVector)
+    if isempty(A)
+        return A
+    end
+    slots = eachindex(A)
+    dest, cursor = next(slots, start(slots))
+    last_kept = first(A)
+    for elem in A
+        isequal(elem, last_kept) && continue
+        dest, cursor = next(slots, cursor)
+        last_kept = A[dest] = elem
+    end
+    resize!(A, dest - first(slots) + 1)
+end
+
+"""
+    unique!(A::AbstractVector)
+
+Remove duplicate items from `A` in place, comparing with [`isequal`](@ref), and return the
+modified `A`. Each item is kept at its first occurrence, so the original order survives.
+When the order of the result is irrelevant and the elements can be sorted, calling
+`(sort!(A); unique!(A))` instead will be much more efficient.
+
+```jldoctest
+julia> unique!([1, 1, 1])
+1-element Array{Int64,1}:
+ 1
+
+julia> A = [7, 3, 2, 3, 7, 5];
+
+julia> unique!(A)
+4-element Array{Int64,1}:
+ 7
+ 3
+ 2
+ 5
+
+julia> B = [7, 6, 42, 6, 7, 42];
+
+julia> sort!(B);  # unique! is able to process sorted data much more efficiently.
+
+julia> unique!(B)
+3-element Array{Int64,1}:
+ 6
+ 7
+ 42
+```
+"""
+function unique!(A::Union{AbstractVector{<:Real}, AbstractVector{<:AbstractString},
+                          AbstractVector{<:Symbol}})
+    isempty(A) && return A
+    # Monotone data keeps equal elements adjacent, so no `Set` is needed for it.
+    monotone = issorted(A) || issorted(A, rev=true)
+    if monotone
+        return _groupedunique!(A)
+    end
+    return _unique!(A)
+end
+# Fallback for element types that the method above does not cover: `issorted` may fail
+# for them because it needs `isless`/`<`, so the sortedness check is skipped entirely.
+function unique!(A)
+    # An empty collection is returned untouched; anything else gets the `Set`-based pass.
+    if !isempty(A)
+        return _unique!(A)
+    end
+    return A
+end
+
 """
     allunique(itr) -> Bool
 
diff --git a/doc/src/stdlib/collections.md b/doc/src/stdlib/collections.md
@@ -79,6 +79,7 @@ Base.eltype
 Base.indexin
 Base.findin
 Base.unique
+Base.unique!
 Base.allunique
 Base.reduce(::Any, ::Any, ::Any)
 Base.reduce(::Any, ::Any)
diff --git a/test/TestHelpers.jl b/test/TestHelpers.jl
@@ -224,6 +224,8 @@ _offset(out, ::Tuple{}, ::Tuple{}) = out
 indsoffset(r::Range) = first(r) - 1
 indsoffset(i::Integer) = 0
 
+Base.resize!(A::OffsetVector, nl::Integer) = (resize!(A.parent, nl); A)  # used by unique!
+
 end
 
 end
diff --git a/test/sets.jl b/test/sets.jl
@@ -1,6 +1,8 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
 # Set tests
+isdefined(Main, :TestHelpers) || @eval Main include("TestHelpers.jl")
+using TestHelpers.OAs
 
 # Construction, collect
 @test ===(typeof(Set([1,2,3])), Set{Int})
@@ -221,6 +223,42 @@ u = unique([1,1,2])
 @test @inferred(unique(x for x in 1:1)) == [1]
 @test unique(x for x in Any[1,1.0])::Vector{Real} == [1]
 
+# unique!: duplicates are dropped in place, first occurrences keep their order
+@testset "unique!" begin
+    u = [1,1,3,2,1]
+    unique!(u)
+    @test u == [1,3,2]
+    @test unique!([]) == []
+    @test unique!(Float64[]) == Float64[]
+    u = [1,2,2,3,5,5]  # sorted ascending, so the grouped path runs
+    @test unique!(u) === u
+    @test u == [1,2,3,5]
+    u = [6,5,5,3,3,2,1]  # sorted descending, so the grouped path runs
+    @test unique!(u) === u
+    @test u == [6,5,3,2,1]
+    u = OffsetArray([1,2,2,3,5,5], -1)
+    @test unique!(u) === u
+    @test u == OffsetArray([1,2,3,5], -1)
+    u = OffsetArray([5,5,4,4,2,2,0,-1,-1], -1)
+    @test unique!(u) === u
+    @test u == OffsetArray([5,4,2,0,-1], -1)
+    u = OffsetArray(["w","we","w",5,"r",5,5], -1)
+    @test unique!(u) === u
+    @test u == OffsetArray(["w","we",5,"r"], -1)
+    u = [0.0,-0.0,1.0,2]  # `isequal` tells 0.0 and -0.0 apart, so both must survive
+    @test unique!(u) === u
+    @test u == [0.0,-0.0,1.0,2.0]
+    u = [1,NaN,NaN,3]  # `isequal(NaN, NaN)` holds, so the NaNs collapse to one
+    @test unique!(u) === u
+    @test u[1] == 1
+    @test isnan(u[2])
+    @test u[3] == 3
+    u = [5,"w","we","w","r",5,"w"]
+    unique!(u)
+    @test u == [5,"w","we","r"]
+    @test unique!([1,2,5,1,3,2]) == [1,2,5,3]  # unsorted numbers with repeats
+end
+
 # allunique
 @test allunique([])
 @test allunique(Set())