Skip to content

Commit c2c7bd8

Browse files
authored
Merge pull request #46 from Arkoniak/elkan_algorithm
Full Elkan implementation and refactoring of MLJ Interface
2 parents a44f676 + 9f8f5b4 commit c2c7bd8

14 files changed

+514
-443
lines changed

Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "ParallelKMeans"
22
uuid = "42b8e9d4-006b-409a-8472-7f34b3fb58af"
33
authors = ["Bernard Brenyah", "Andrey Oskin"]
4-
version = "0.1.1"
4+
version = "0.1.2"
55

66
[deps]
77
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"

docs/src/index.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# ParallelKMeans.jl Package
1+
# [ParallelKMeans.jl Package](https://github.com/PyDataBlog/ParallelKMeans.jl)
22

33
```@contents
44
Depth = 4
@@ -59,7 +59,7 @@ git checkout experimental
5959

6060
- [X] Implementation of [Hamerly implementation](https://www.researchgate.net/publication/220906984_Making_k-means_Even_Faster).
6161
- [X] Interface for inclusion in Alan Turing Institute's [MLJModels](https://github.com/alan-turing-institute/MLJModels.jl#who-is-this-repo-for).
62-
- [ ] Full Implementation of Triangle inequality based on [Elkan - 2003 Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
62+
- [X] Full Implementation of Triangle inequality based on [Elkan - 2003 Using the Triangle Inequality to Accelerate K-Means"](https://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf).
6363
- [ ] Implementation of [Geometric methods to accelerate k-means algorithm](http://cs.baylor.edu/~hamerly/papers/sdm2016_rysavy_hamerly.pdf).
6464
- [ ] Native support for tabular data inputs outside of MLJModels' interface.
6565
- [ ] Refactoring and finalizaiton of API desgin.
@@ -177,6 +177,7 @@ ________________________________________________________________________________
177177

178178
- 0.1.0 Initial release
179179
- 0.1.1 Added interface for MLJ
180+
- 0.1.2 Added Elkan algorithm
180181

181182
## Contributing
182183

src/ParallelKMeans.jl

+5-3
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,20 @@
11
module ParallelKMeans
22

33
using StatsBase
4-
using MLJModelInterface
4+
import MLJModelInterface
55
import Base.Threads: @spawn
66
import Distances
77

8+
const MMI = MLJModelInterface
9+
810
include("seeding.jl")
911
include("kmeans.jl")
1012
include("lloyd.jl")
11-
include("light_elkan.jl")
1213
include("hamerly.jl")
14+
include("elkan.jl")
1315
include("mlj_interface.jl")
1416

1517
export kmeans
16-
export Lloyd, LightElkan, Hamerly
18+
export Lloyd, Hamerly, Elkan
1719

1820
end # module

src/elkan.jl

+278
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
1+
"""
2+
Elkan()
3+
4+
Elkan algorithm implementation, based on "Charles Elkan. 2003.
5+
Using the triangle inequality to accelerate k-means.
6+
In Proceedings of the Twentieth International Conference on
7+
International Conference on Machine Learning (ICML’03). AAAI Press, 147–153."
8+
9+
This algorithm provides much faster convergence than Lloyd algorithm especially
10+
for high dimensional data.
11+
It can be used directly in `kmeans` function
12+
13+
```julia
14+
X = rand(30, 100_000) # 100_000 random points in 30 dimensions
15+
16+
kmeans(Elkan(), X, 3) # 3 clusters, Elkan algorithm
17+
```
18+
"""
19+
struct Elkan <: AbstractKMeansAlg end
20+
21+
function kmeans!(alg::Elkan, containers, X, k;
22+
n_threads = Threads.nthreads(),
23+
k_init = "k-means++", max_iters = 300,
24+
tol = 1e-6, verbose = false, init = nothing)
25+
nrow, ncol = size(X)
26+
centroids = init == nothing ? smart_init(X, k, n_threads, init=k_init).centroids : deepcopy(init)
27+
28+
update_containers(alg, containers, centroids, n_threads)
29+
@parallelize n_threads ncol chunk_initialize(alg, containers, centroids, X)
30+
31+
converged = false
32+
niters = 0
33+
J_previous = 0.0
34+
35+
# Update centroids & labels with closest members until convergence
36+
while niters < max_iters
37+
niters += 1
38+
# Core iteration
39+
@parallelize n_threads ncol chunk_update_centroids(alg, containers, centroids, X)
40+
41+
# Collect distributed containers (such as centroids_new, centroids_cnt)
42+
# in paper it is step 4
43+
collect_containers(alg, containers, n_threads)
44+
45+
J = sum(containers.ub)
46+
47+
# auxiliary calculation, in paper it's d(c, m(c))
48+
calculate_centroids_movement(alg, containers, centroids)
49+
50+
# lower and ounds update, in paper it's steps 5 and 6
51+
@parallelize n_threads ncol chunk_update_bounds(alg, containers, centroids)
52+
53+
# Step 7, final assignment of new centroids
54+
centroids .= containers.centroids_new[end]
55+
56+
if verbose
57+
# Show progress and terminate if J stopped decreasing.
58+
println("Iteration $niters: Jclust = $J")
59+
end
60+
61+
# Check for convergence
62+
if (niters > 1) & (abs(J - J_previous) < (tol * J))
63+
converged = true
64+
break
65+
end
66+
67+
# Step 1 in original paper, calulation of distance d(c, c')
68+
update_containers(alg, containers, centroids, n_threads)
69+
J_previous = J
70+
end
71+
72+
@parallelize n_threads ncol sum_of_squares(containers, X, containers.labels, centroids)
73+
totalcost = sum(containers.sum_of_squares)
74+
75+
# Terminate algorithm with the assumption that K-means has converged
76+
if verbose & converged
77+
println("Successfully terminated with convergence.")
78+
end
79+
80+
# TODO empty placeholder vectors should be calculated
81+
# TODO Float64 type definitions is too restrictive, should be relaxed
82+
# especially during GPU related development
83+
return KmeansResult(centroids, containers.labels, Float64[], Int[], Float64[], totalcost, niters, converged)
84+
end
85+
86+
function create_containers(::Elkan, k, nrow, ncol, n_threads)
87+
lng = n_threads + 1
88+
centroids_new = Vector{Array{Float64,2}}(undef, lng)
89+
centroids_cnt = Vector{Vector{Int}}(undef, lng)
90+
91+
for i = 1:lng
92+
centroids_new[i] = zeros(nrow, k)
93+
centroids_cnt[i] = zeros(k)
94+
end
95+
96+
centroids_dist = Matrix{Float64}(undef, k, k)
97+
98+
# lower bounds
99+
lb = Matrix{Float64}(undef, k, ncol)
100+
101+
# upper bounds
102+
ub = Vector{Float64}(undef, ncol)
103+
104+
# r(x) in original paper, shows whether point distance should be updated
105+
stale = ones(Bool, ncol)
106+
107+
# distance that centroid moved
108+
p = Vector{Float64}(undef, k)
109+
110+
labels = zeros(Int, ncol)
111+
112+
# total_sum_calculation
113+
sum_of_squares = Vector{Float64}(undef, n_threads)
114+
115+
return (
116+
centroids_new = centroids_new,
117+
centroids_cnt = centroids_cnt,
118+
labels = labels,
119+
centroids_dist = centroids_dist,
120+
lb = lb,
121+
ub = ub,
122+
stale = stale,
123+
p = p,
124+
sum_of_squares = sum_of_squares
125+
)
126+
end
127+
128+
function chunk_initialize(::Elkan, containers, centroids, X, r, idx)
129+
ub = containers.ub
130+
lb = containers.lb
131+
centroids_dist = containers.centroids_dist
132+
labels = containers.labels
133+
centroids_new = containers.centroids_new[idx]
134+
centroids_cnt = containers.centroids_cnt[idx]
135+
136+
@inbounds for i in r
137+
min_dist = distance(X, centroids, i, 1)
138+
label = 1
139+
lb[label, i] = min_dist
140+
for j in 2:size(centroids, 2)
141+
# triangular inequality
142+
if centroids_dist[j, label] > min_dist
143+
lb[j, i] = min_dist
144+
else
145+
dist = distance(X, centroids, i, j)
146+
label = dist < min_dist ? j : label
147+
min_dist = dist < min_dist ? dist : min_dist
148+
lb[j, i] = dist
149+
end
150+
end
151+
ub[i] = min_dist
152+
labels[i] = label
153+
centroids_cnt[label] += 1
154+
for j in axes(X, 1)
155+
centroids_new[j, label] += X[j, i]
156+
end
157+
end
158+
end
159+
160+
function update_containers(::Elkan, containers, centroids, n_threads)
161+
# unpack containers for easier manipulations
162+
centroids_dist = containers.centroids_dist
163+
164+
k = size(centroids_dist, 1) # number of clusters
165+
@inbounds for j in axes(centroids_dist, 2)
166+
min_dist = Inf
167+
for i in j + 1:k
168+
d = distance(centroids, centroids, i, j)
169+
centroids_dist[i, j] = d
170+
centroids_dist[j, i] = d
171+
min_dist = min_dist < d ? min_dist : d
172+
end
173+
for i in 1:j - 1
174+
min_dist = min_dist < centroids_dist[j, i] ? min_dist : centroids_dist[j, i]
175+
end
176+
centroids_dist[j, j] = min_dist
177+
end
178+
179+
# TODO: oh, one should be careful here. inequality holds for eucledian metrics
180+
# not square eucledian. So, for Lp norm it should be something like
181+
# centroids_dist = 0.5^p. Should check one more time original paper
182+
centroids_dist .*= 0.25
183+
184+
return centroids_dist
185+
end
186+
187+
function chunk_update_centroids(::Elkan, containers, centroids, X, r, idx)
188+
# unpack
189+
ub = containers.ub
190+
lb = containers.lb
191+
centroids_dist = containers.centroids_dist
192+
labels = containers.labels
193+
stale = containers.stale
194+
centroids_new = containers.centroids_new[idx]
195+
centroids_cnt = containers.centroids_cnt[idx]
196+
197+
@inbounds for i in r
198+
label_old = labels[i]
199+
label = label_old
200+
min_dist = ub[i]
201+
# tighten the loop, exclude points that very close to center
202+
min_dist <= centroids_dist[label, label] && continue
203+
for j in axes(centroids, 2)
204+
# tighten the loop once more, exclude far away centers
205+
j == label && continue
206+
min_dist <= lb[j, i] && continue
207+
min_dist <= centroids_dist[j, label] && continue
208+
209+
# one calculation per iteration is enough
210+
if stale[i]
211+
min_dist = distance(X, centroids, i, label)
212+
lb[label, i] = min_dist
213+
ub[i] = min_dist
214+
stale[i] = false
215+
end
216+
217+
if (min_dist > lb[j, i]) | (min_dist > centroids_dist[j, label])
218+
dist = distance(X, centroids, i, j)
219+
lb[j, i] = dist
220+
if dist < min_dist
221+
min_dist = dist
222+
label = j
223+
end
224+
end
225+
end
226+
227+
if label != label_old
228+
labels[i] = label
229+
centroids_cnt[label_old] -= 1
230+
centroids_cnt[label] += 1
231+
for j in axes(X, 1)
232+
centroids_new[j, label_old] -= X[j, i]
233+
centroids_new[j, label] += X[j, i]
234+
end
235+
end
236+
end
237+
end
238+
239+
function collect_containers(alg::Elkan, containers, n_threads)
240+
if n_threads == 1
241+
@inbounds containers.centroids_new[end] .= containers.centroids_new[1] ./ containers.centroids_cnt[1]'
242+
else
243+
@inbounds containers.centroids_new[end] .= containers.centroids_new[1]
244+
@inbounds containers.centroids_cnt[end] .= containers.centroids_cnt[1]
245+
@inbounds for i in 2:n_threads
246+
containers.centroids_new[end] .+= containers.centroids_new[i]
247+
containers.centroids_cnt[end] .+= containers.centroids_cnt[i]
248+
end
249+
250+
@inbounds containers.centroids_new[end] .= containers.centroids_new[end] ./ containers.centroids_cnt[end]'
251+
end
252+
end
253+
254+
function calculate_centroids_movement(alg::Elkan, containers, centroids)
255+
p = containers.p
256+
centroids_new = containers.centroids_new[end]
257+
258+
for i in axes(centroids, 2)
259+
p[i] = distance(centroids, centroids_new, i, i)
260+
end
261+
end
262+
263+
264+
function chunk_update_bounds(alg, containers, centroids, r, idx)
265+
p = containers.p
266+
lb = containers.lb
267+
ub = containers.ub
268+
stale = containers.stale
269+
labels = containers.labels
270+
271+
@inbounds for i in r
272+
for j in axes(centroids, 2)
273+
lb[j, i] = lb[j, i] > p[j] ? lb[j, i] + p[j] - 2*sqrt(abs(lb[j, i]*p[j])) : 0.0
274+
end
275+
stale[i] = true
276+
ub[i] += p[labels[i]] + 2*sqrt(abs(ub[i]*p[labels[i]]))
277+
end
278+
end

0 commit comments

Comments
 (0)