Skip to content

Commit 97bf157

Browse files
authored
Add files via upload
Updated
1 parent d381ef4 commit 97bf157

File tree

1 file changed

+205
-0
lines changed

1 file changed

+205
-0
lines changed

Kmeans_plus_plus.ipynb

+205
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 9,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stdout",
10+
"output_type": "stream",
11+
"text": [
12+
"[[1.000e+00 1.386e+01 1.350e+00 2.270e+00 1.600e+01 9.800e+01 2.980e+00\n",
13+
" 3.150e+00 2.200e-01 1.850e+00 7.220e+00 1.010e+00 3.550e+00 1.045e+03]\n",
14+
" [2.000e+00 1.184e+01 2.890e+00 2.230e+00 1.800e+01 1.120e+02 1.720e+00\n",
15+
" 1.320e+00 4.300e-01 9.500e-01 2.650e+00 9.600e-01 2.520e+00 5.000e+02]]\n"
16+
]
17+
}
18+
],
19+
"source": [
20+
"# Write a kmeans plus plus algorithm to initialize the centroids to be used in the kmeans algorithm\n",
21+
"# Initialize centroids based on a probability index set by their distance from each other\n",
22+
"# Dataset is a numpy array and k is the number of centroids\n",
23+
"\n",
24+
"\n",
25+
"import numpy as np\n",
26+
"np.set_printoptions(precision=3)\n",
27+
"import matplotlib.pyplot as plt\n",
28+
"%matplotlib inline\n",
29+
"\n",
30+
"\n",
def kmeans_plus_plus(dataset, k):
    """Choose k initial centroids from ``dataset`` with k-means++ seeding.

    The first centroid is a uniformly random sample. Every further centroid
    is drawn with probability proportional to the squared distance between a
    sample and the nearest centroid picked so far, so a sample that already
    coincides with a centroid has zero probability of being picked again.

    Sampling goes through ``np.random.choice``, so calling ``np.random.seed``
    beforehand makes the result reproducible.

    Args:
        dataset: 2-D array-like of shape (N, D), one sample per row.
        k: Number of centroids to pick (non-negative integer).

    Returns:
        Float array of shape (k, D) whose rows are copies of dataset rows.

    Raises:
        ValueError: If dataset is not 2-D, has no rows, or k is negative.
    """
    dataset = np.asarray(dataset, dtype=float)
    if dataset.ndim != 2:
        raise ValueError("dataset must be a 2-D array of shape (N, D)")
    n_samples, n_features = dataset.shape
    if n_samples == 0:
        raise ValueError("dataset must contain at least one sample")
    if k < 0:
        raise ValueError("k must be non-negative")

    centroids = np.zeros((k, n_features))
    if k == 0:
        return centroids

    # The first centroid is a plain uniform draw.
    centroids[0] = dataset[np.random.choice(n_samples)]
    # Squared distance from every sample to its nearest chosen centroid.
    nearest_sq = np.sum((dataset - centroids[0]) ** 2, axis=1)

    for i in range(1, k):
        total = nearest_sq.sum()
        if total > 0:
            chosen_index = np.random.choice(n_samples, p=nearest_sq / total)
        else:
            # Every sample coincides with a chosen centroid, so the weights
            # cannot be normalized; fall back to a uniform draw.
            chosen_index = np.random.choice(n_samples)
        centroids[i] = dataset[chosen_index]
        # Track the nearest of ALL chosen centroids, not just the latest one.
        nearest_sq = np.minimum(
            nearest_sq, np.sum((dataset - centroids[i]) ** 2, axis=1)
        )
    return centroids
52+
"\n",
# quick manual check of the centroid initializer
if __name__ == '__main__':
    # Expects a local copy of the UCI wine data saved as 'wine.data.txt'; it can be
    # downloaded from https://archive.ics.uci.edu/ml/machine-learning-databases/wine/
    # Re-save the file as .txt or .csv if the downloaded copy fails to parse.
    dataset = np.array(np.loadtxt('wine.data.txt', dtype=float, delimiter=','))

    # show two initial centroids picked from the loaded rows
    print(kmeans_plus_plus(dataset, 2))

    # The data is used unscaled here. If standardization were needed, something
    # like the following could be applied first:
    # standardized_data = (dataset - np.mean(dataset)) / np.std(dataset)
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": 10,
71+
"metadata": {},
72+
"outputs": [
73+
{
74+
"name": "stderr",
75+
"output_type": "stream",
76+
"text": [
77+
"/Users/volkansonmez/miniconda/lib/python3.7/site-packages/numpy/core/fromnumeric.py:3335: RuntimeWarning: Mean of empty slice.\n",
78+
" out=out, **kwargs)\n",
79+
"/Users/volkansonmez/miniconda/lib/python3.7/site-packages/numpy/core/_methods.py:161: RuntimeWarning: invalid value encountered in double_scalars\n",
80+
" ret = ret.dtype.type(ret / rcount)\n"
81+
]
82+
},
83+
{
84+
"name": "stdout",
85+
"output_type": "stream",
86+
"text": [
87+
"[ 0. 17592402.704 4545800.928 2633614.463 1341434.153\n",
88+
" 916424.194 684095.537 415042.347 674861.281 353365.048\n",
89+
" 657438.809 187113.672 492970.971 565012.338 138874.119]\n"
90+
]
91+
},
92+
{
93+
"data": {
94+
"image/png": "\n",
95+
"text/plain": [
96+
"<Figure size 432x288 with 1 Axes>"
97+
]
98+
},
99+
"metadata": {
100+
"needs_background": "light"
101+
},
102+
"output_type": "display_data"
103+
}
104+
],
105+
"source": [
106+
"# the Within-Cluster-Sum-of-Squares amount normally goes down as the number of clusters increases\n",
107+
"# the ideal number of clusters can be chosen if the WCSS graph achieves a clear elbow shape\n",
108+
"\n",
def k_means(dataset, k, tol=1e-4, max_iter=300):
    """Cluster ``dataset`` into k groups and report the resulting WCSS.

    Centroids are seeded with ``kmeans_plus_plus`` and then refined by
    alternating two steps: assign every sample to its nearest centroid, then
    move every centroid to the mean of the samples assigned to it. The loop
    stops once the centroids move by at most ``tol`` (Frobenius norm of the
    change) or after ``max_iter`` passes.

    A cluster that ends up with no samples is re-seeded at the sample that is
    currently farthest from its own centroid, instead of taking the mean of
    an empty set (which would turn the centroid into NaN).

    Args:
        dataset: 2-D array-like of shape (N, D), one sample per row.
        k: Number of clusters (at least 1).
        tol: Convergence threshold on the centroid movement.
        max_iter: Upper bound on the number of assign/update passes.

    Returns:
        Tuple ``(epoch, C, final_array, WCSS)``:
        epoch -- number of assign/update passes performed,
        C -- float array of shape (k, D) with the final centroids,
        final_array -- float array of length N holding each sample's
            cluster index,
        WCSS -- within-cluster sum of squared distances to the centroids.

    Raises:
        ValueError: If k or max_iter is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    dataset = np.asarray(dataset, dtype=float)
    n_samples = len(dataset)
    sample_ids = np.arange(n_samples)

    C = np.array(kmeans_plus_plus(dataset, k), dtype=float)
    labels = np.zeros(n_samples, dtype=int)
    epoch = 0
    while epoch < max_iter:
        # Assignment step: (N, k) table of squared sample-to-centroid distances.
        sq_dist = ((dataset[:, np.newaxis, :] - C[np.newaxis, :, :]) ** 2).sum(axis=2)
        labels = sq_dist.argmin(axis=1)
        nearest_sq = sq_dist[sample_ids, labels]

        # Update step: move each centroid to the mean of its members.
        C_old = C.copy()
        for j in range(k):
            members = dataset[labels == j]
            if len(members) > 0:
                C[j] = members.mean(axis=0)
            else:
                # Empty cluster: re-seed at the worst-fitted sample and zero
                # its distance so a second empty cluster picks another one.
                farthest = nearest_sq.argmax()
                C[j] = dataset[farthest]
                nearest_sq[farthest] = 0.0

        epoch += 1
        if np.linalg.norm(C - C_old) <= tol:
            break

    # Every non-empty cluster's centroid is the mean of its members at this
    # point, so this is the sum of squared deviations from the group means.
    WCSS = float(np.sum((dataset - C[labels]) ** 2))

    # Cluster indices are returned as a float array, as before.
    return epoch, C, labels.astype(float), WCSS
155+
"\n",
156+
"\n",
# elbow plot: WCSS for 1 .. k-1 clusters on the wine data
if __name__ == '__main__':
    dataset = np.array(np.loadtxt('wine.data.txt', dtype=float, delimiter=','))
    # fixed seed so the random centroid initialization is repeatable
    np.random.seed(1)
    k = 15

    def run_k_means():
        """Return an array whose entry i is the WCSS for i clusters.

        Entry 0 is never filled and stays 0; cluster counts 1 .. k-1 are run.
        """
        wcss_per_count = np.zeros(k)
        for n_clusters in range(1, k):
            # k_means returns (epoch, C, final_array, WCSS); only WCSS is kept
            wcss_per_count[n_clusters] = k_means(dataset, n_clusters)[3]
        return wcss_per_count

    result = run_k_means()
    print(result)
    # skip the unused slot 0 so the x axis starts at one cluster
    plt.plot(np.arange(1, k), result[1:])
174+
]
175+
},
176+
{
177+
"cell_type": "code",
178+
"execution_count": null,
179+
"metadata": {},
180+
"outputs": [],
181+
"source": []
182+
}
183+
],
184+
"metadata": {
185+
"kernelspec": {
186+
"display_name": "Python 3",
187+
"language": "python",
188+
"name": "python3"
189+
},
190+
"language_info": {
191+
"codemirror_mode": {
192+
"name": "ipython",
193+
"version": 3
194+
},
195+
"file_extension": ".py",
196+
"mimetype": "text/x-python",
197+
"name": "python",
198+
"nbconvert_exporter": "python",
199+
"pygments_lexer": "ipython3",
200+
"version": "3.7.7"
201+
}
202+
},
203+
"nbformat": 4,
204+
"nbformat_minor": 2
205+
}

0 commit comments

Comments
 (0)