|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 9, |
| 6 | + "metadata": {}, |
| 7 | + "outputs": [ |
| 8 | + { |
| 9 | + "name": "stdout", |
| 10 | + "output_type": "stream", |
| 11 | + "text": [ |
| 12 | + "[[1.000e+00 1.386e+01 1.350e+00 2.270e+00 1.600e+01 9.800e+01 2.980e+00\n", |
| 13 | + " 3.150e+00 2.200e-01 1.850e+00 7.220e+00 1.010e+00 3.550e+00 1.045e+03]\n", |
| 14 | + " [2.000e+00 1.184e+01 2.890e+00 2.230e+00 1.800e+01 1.120e+02 1.720e+00\n", |
| 15 | + " 1.320e+00 4.300e-01 9.500e-01 2.650e+00 9.600e-01 2.520e+00 5.000e+02]]\n" |
| 16 | + ] |
| 17 | + } |
| 18 | + ], |
| 19 | + "source": [ |
| 20 | + "# Write a kmeans plus plus algorithm to initialize the centroids to be used in the kmeans algorithm\n", |
| 21 | + "# Initialize centroids based on a probability index set by their distance from each other\n", |
| 22 | + "# Dataset is a numpy array and k is the number of centroids\n", |
| 23 | + "\n", |
| 24 | + "\n", |
| 25 | + "import numpy as np\n", |
| 26 | + "np.set_printoptions(precision=3)\n", |
| 27 | + "import matplotlib.pyplot as plt\n", |
| 28 | + "%matplotlib inline\n", |
| 29 | + "\n", |
| 30 | + "\n", |
def kmeans_plus_plus(dataset, k):
    """Choose k initial centroids from the rows of ``dataset`` (k-means++ seeding).

    The first centroid is a uniformly random row. Every further centroid is
    drawn with probability proportional to the squared distance between a row
    and its nearest already-chosen centroid, so rows that are already used as
    centroids (distance 0) are never drawn again while other rows remain.

    Parameters
    ----------
    dataset : array-like of shape (N, D)
        One instance per row; converted to a float ndarray.
    k : int
        Number of centroids to return; must be at least 1.

    Returns
    -------
    numpy.ndarray of shape (k, D)
        The chosen centroids, each one a copy of a row of ``dataset``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, if ``dataset`` is not 2-D, or (raised by
        ``np.random.choice``) if ``dataset`` has no rows.

    Notes
    -----
    Randomness comes from the global ``np.random`` state, so calling
    ``np.random.seed`` beforehand makes the result reproducible.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    dataset = np.asarray(dataset, dtype=float)
    # define the shape of the dataset
    N, D = dataset.shape
    # define the centroid size
    C = np.zeros([k, D])
    # the first centroid is a uniformly random instance of the dataset
    C[0] = dataset[np.random.choice(N)]
    # squared distance from every instance to its nearest chosen centroid
    closest_sq_dist = np.sum((dataset - C[0]) ** 2, axis=1)
    for i in range(1, k):
        total = np.sum(closest_sq_dist)
        if total > 0:
            # farther instances are proportionally more likely to be chosen
            chosen_index = np.random.choice(N, p=closest_sq_dist / total)
        else:
            # every instance coincides with a chosen centroid (fewer distinct
            # rows than k): dividing by zero would give NaN probabilities,
            # so fall back to a uniform draw
            chosen_index = np.random.choice(N)
        C[i] = dataset[chosen_index]
        # keep, for each instance, the distance to the closest of ALL centroids
        # chosen so far, not just the most recent one
        closest_sq_dist = np.minimum(
            closest_sq_dist, np.sum((dataset - C[i]) ** 2, axis=1)
        )
    return C
| 52 | + "\n", |
| 53 | + "# test the function \n", |
if __name__ == '__main__':
    # Download your copy of the wine dataset from:
    # https://archive.ics.uci.edu/ml/machine-learning-databases/wine/
    # Rename the file to .txt or .csv if the original extension causes a parsing error.
    #
    # NOTE(review): the printed centroids start with 1.0 / 2.0, which suggests the
    # first column is the class label rather than a feature -- confirm whether it
    # should be excluded (e.g. dataset[:, 1:]) before clustering.

    # np.loadtxt already returns an ndarray, so no extra np.array() copy is needed
    dataset = np.loadtxt('wine.data.txt', dtype=float, delimiter=',')
    # seed the global RNG so the printed centroids are the same on every run
    np.random.seed(1)
    print(kmeans_plus_plus(dataset, 2))

    # If standardization is needed, scale every column by its own mean and std
    # (a single global mean/std would mix features with very different ranges):
    # standardized_data = (dataset - dataset.mean(axis=0)) / dataset.std(axis=0)
| 65 | + " " |
| 66 | + ] |
| 67 | + }, |
| 68 | + { |
| 69 | + "cell_type": "code", |
| 70 | + "execution_count": 10, |
| 71 | + "metadata": {}, |
| 72 | + "outputs": [ |
| 73 | + { |
| 74 | + "name": "stderr", |
| 75 | + "output_type": "stream", |
| 76 | + "text": [ |
| 77 | + "/Users/volkansonmez/miniconda/lib/python3.7/site-packages/numpy/core/fromnumeric.py:3335: RuntimeWarning: Mean of empty slice.\n", |
| 78 | + " out=out, **kwargs)\n", |
| 79 | + "/Users/volkansonmez/miniconda/lib/python3.7/site-packages/numpy/core/_methods.py:161: RuntimeWarning: invalid value encountered in double_scalars\n", |
| 80 | + " ret = ret.dtype.type(ret / rcount)\n" |
| 81 | + ] |
| 82 | + }, |
| 83 | + { |
| 84 | + "name": "stdout", |
| 85 | + "output_type": "stream", |
| 86 | + "text": [ |
| 87 | + "[ 0. 17592402.704 4545800.928 2633614.463 1341434.153\n", |
| 88 | + " 916424.194 684095.537 415042.347 674861.281 353365.048\n", |
| 89 | + " 657438.809 187113.672 492970.971 565012.338 138874.119]\n" |
| 90 | + ] |
| 91 | + }, |
| 92 | + { |
| 93 | + "data": { |
| 94 | + "image/png": "\n", |
| 95 | + "text/plain": [ |
| 96 | + "<Figure size 432x288 with 1 Axes>" |
| 97 | + ] |
| 98 | + }, |
| 99 | + "metadata": { |
| 100 | + "needs_background": "light" |
| 101 | + }, |
| 102 | + "output_type": "display_data" |
| 103 | + } |
| 104 | + ], |
| 105 | + "source": [ |
| 106 | + "# the Within-Cluster-Sum-of-Squares amount normally goes down as the number of clusters increases\n", |
| 107 | + "# the ideal number of clusters can be chosen if the WCSS graph achieves a clear elbow shape\n", |
| 108 | + "\n", |
def k_means(dataset, k, tol=1e-4):
    """Cluster ``dataset`` into k groups with Lloyd's algorithm and report the WCSS.

    Centroids are initialised with ``kmeans_plus_plus``. Each epoch assigns
    every instance to its nearest centroid and then moves each centroid to the
    mean of its members. Iteration stops once the centroids move by no more
    than ``tol`` (Frobenius norm of the change) in an epoch.

    Parameters
    ----------
    dataset : array-like of shape (N, D)
        One instance per row; converted to a float ndarray.
    k : int
        Number of clusters; must be at least 1.
    tol : float, optional
        Convergence threshold on the centroid movement. Defaults to 1e-4.

    Returns
    -------
    epoch : int
        Number of assignment/update rounds performed (at least 1).
    C : numpy.ndarray of shape (k, D)
        Final centroids.
    final_array : numpy.ndarray of shape (N,)
        Cluster index of every instance, stored as floats.
    WCSS : float
        Within-Cluster-Sum-of-Squares: the summed squared distance from every
        instance to the mean of its cluster.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    dataset = np.asarray(dataset, dtype=float)
    # initialize the centroids based on the kmeansplusplus function
    C = kmeans_plus_plus(dataset, k)
    # count the number of epochs performed until the centroids stop moving
    epoch = 0
    # Always run at least one assignment round: gating the first round on
    # norm(C - zeros) would skip clustering entirely for data near the origin.
    while True:
        # column j holds the distance from every instance to centroid j
        distances = np.stack(
            [np.linalg.norm(dataset - centroid, axis=1) for centroid in C], axis=1
        )
        # group number of each instance = index of its closest centroid
        final_array = np.argmin(distances, axis=1).astype(float)
        # remember the current C so the movement can be measured
        C_old = np.copy(C)
        for i in range(k):
            sub_group = dataset[final_array == i]
            # An empty cluster has no mean (np.mean would give NaN, and the NaN
            # would silently end the loop), so its centroid stays where it is.
            if len(sub_group) > 0:
                C[i] = np.mean(sub_group, axis=0)
        # when C does not change, the movement reaches zero
        parameter_gradient = np.linalg.norm(C - C_old)
        epoch += 1
        # written as "not >" so that a NaN movement also ends the loop
        if not parameter_gradient > tol:
            break

    # To measure WCSS, sum up the squared distances from all group data to their mean
    WCSS = 0.0
    for j in range(k):
        grouped_data = dataset[final_array == j]
        if len(grouped_data) > 0:  # empty groups contribute nothing
            WCSS += np.sum((grouped_data - np.mean(grouped_data, axis=0)) ** 2)

    # return the epoch count, the final centroids, the group index of every
    # instance, and the WCSS for the given k
    return epoch, C, final_array, WCSS
| 155 | + "\n", |
| 156 | + "\n", |
if __name__ == '__main__':
    # np.loadtxt already returns an ndarray, so no extra np.array() copy is needed
    dataset = np.loadtxt('wine.data.txt', dtype=float, delimiter=',')
    # fix the seed so the centroid initialisation (and the plot) is reproducible
    np.random.seed(1)
    # exclusive upper bound: k_means is evaluated for 1 .. k-1 clusters
    k = 15

    def run_k_means():
        """Return an array whose entry i is the WCSS for i clusters (entry 0 stays 0)."""
        WCSS_list = np.zeros(k)
        for i in range(1, k):
            epoch, C, final_array, WCSS = k_means(dataset, i)
            WCSS_list[i] = WCSS

        return WCSS_list

    result = run_k_means()
    print(result)
    # skip the unused entry 0 so the x axis starts at one cluster
    plt.plot(np.arange(1, k), result[1:])
    plt.xlabel('number of clusters')
    plt.ylabel('WCSS')
    plt.title('Elbow plot: WCSS vs. number of clusters')
    plt.show()
| 173 | + " " |
| 174 | + ] |
| 175 | + }, |
| 176 | + { |
| 177 | + "cell_type": "code", |
| 178 | + "execution_count": null, |
| 179 | + "metadata": {}, |
| 180 | + "outputs": [], |
| 181 | + "source": [] |
| 182 | + } |
| 183 | + ], |
| 184 | + "metadata": { |
| 185 | + "kernelspec": { |
| 186 | + "display_name": "Python 3", |
| 187 | + "language": "python", |
| 188 | + "name": "python3" |
| 189 | + }, |
| 190 | + "language_info": { |
| 191 | + "codemirror_mode": { |
| 192 | + "name": "ipython", |
| 193 | + "version": 3 |
| 194 | + }, |
| 195 | + "file_extension": ".py", |
| 196 | + "mimetype": "text/x-python", |
| 197 | + "name": "python", |
| 198 | + "nbconvert_exporter": "python", |
| 199 | + "pygments_lexer": "ipython3", |
| 200 | + "version": "3.7.7" |
| 201 | + } |
| 202 | + }, |
| 203 | + "nbformat": 4, |
| 204 | + "nbformat_minor": 2 |
| 205 | +} |
0 commit comments