Skip to content

Commit 97bf157

Browse files
authored
Add files via upload
Updated
1 parent d381ef4 commit 97bf157

File tree

1 file changed

+205
-0
lines changed

1 file changed

+205
-0
lines changed

Kmeans_plus_plus.ipynb

+205
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 9,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stdout",
10+
"output_type": "stream",
11+
"text": [
12+
"[[1.000e+00 1.386e+01 1.350e+00 2.270e+00 1.600e+01 9.800e+01 2.980e+00\n",
13+
" 3.150e+00 2.200e-01 1.850e+00 7.220e+00 1.010e+00 3.550e+00 1.045e+03]\n",
14+
" [2.000e+00 1.184e+01 2.890e+00 2.230e+00 1.800e+01 1.120e+02 1.720e+00\n",
15+
" 1.320e+00 4.300e-01 9.500e-01 2.650e+00 9.600e-01 2.520e+00 5.000e+02]]\n"
16+
]
17+
}
18+
],
19+
"source": [
20+
"# Write a kmeans plus plus algorithm to initialize the centroids to be used in the kmeans algorithm\n",
21+
"# Initialize centroids based on a probability index set by their distance from each other\n",
22+
"# Dataset is a numpy array and k is the number of centroids\n",
23+
"\n",
24+
"\n",
25+
"import numpy as np\n",
26+
"np.set_printoptions(precision=3)\n",
27+
"import matplotlib.pyplot as plt\n",
28+
"%matplotlib inline\n",
29+
"\n",
30+
"\n",
31+
"def kmeans_plus_plus(dataset, k):\n",
32+
" # define the shape of the dataset\n",
33+
" N, D = dataset.shape\n",
34+
" # define the centroid size\n",
35+
" C = np.zeros([k, D])\n",
36+
" # pick a random instance in the dataset\n",
37+
" random_index = np.random.choice(len(dataset))\n",
38+
" # choose a random instance\n",
39+
" random_instance = dataset[random_index]\n",
40+
" for i in range(k):\n",
41+
" # define the distance array to store all items in the dataset from the randomly chosen instance\n",
42+
" distance_array = np.linalg.norm(random_instance - dataset , axis = 1)\n",
43+
" # each distance proportionally increase the chances for the next data to be chosen\n",
44+
" prob_array = distance_array / np.sum(distance_array)\n",
45+
" # based on the distances, chose a random instance index\n",
46+
" chosen_index = np.random.choice(len(dataset) , p = prob_array)\n",
47+
" # assing the next instance to the centroid[i]\n",
48+
" C[i] = dataset[chosen_index]\n",
49+
" # last chosen C[i] will be the new reference to chose next instance for C[i+1]\n",
50+
" random_instance = C[i] \n",
51+
" return C\n",
52+
"\n",
53+
"# test the function \n",
54+
"if __name__ == '__main__':\n",
55+
" \n",
56+
" # download your copy of wine dataset from: https://archive.ics.uci.edu/ml/machine-learning-databases/wine/\n",
57+
" # change the format to .txt or .cvs if current data is causing any parsing error.\n",
58+
" \n",
59+
" dataset = np.loadtxt('wine.data.txt' , dtype = float, delimiter = ',')\n",
60+
" dataset = np.array(dataset)\n",
61+
" print(kmeans_plus_plus(dataset, 2))\n",
62+
" \n",
63+
" # if there was a need for standardization, this code below could be applied:\n",
64+
" # standardized_data = [(i - np.mean(dataset)) / (np.std(dataset)) for i in dataset]\n",
65+
" "
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": 10,
71+
"metadata": {},
72+
"outputs": [
73+
{
74+
"name": "stderr",
75+
"output_type": "stream",
76+
"text": [
77+
"/Users/volkansonmez/miniconda/lib/python3.7/site-packages/numpy/core/fromnumeric.py:3335: RuntimeWarning: Mean of empty slice.\n",
78+
" out=out, **kwargs)\n",
79+
"/Users/volkansonmez/miniconda/lib/python3.7/site-packages/numpy/core/_methods.py:161: RuntimeWarning: invalid value encountered in double_scalars\n",
80+
" ret = ret.dtype.type(ret / rcount)\n"
81+
]
82+
},
83+
{
84+
"name": "stdout",
85+
"output_type": "stream",
86+
"text": [
87+
"[ 0. 17592402.704 4545800.928 2633614.463 1341434.153\n",
88+
" 916424.194 684095.537 415042.347 674861.281 353365.048\n",
89+
" 657438.809 187113.672 492970.971 565012.338 138874.119]\n"
90+
]
91+
},
92+
{
93+
"data": {
94+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEDCAYAAAA7jc+ZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3de3Scd33n8fd3RneNrYs18k3yTQqBJNhOoubGKSS0ZA2leKG0tU8WKJe6oQm97G5bOOwJZ7enLafslqVASL0hNTlA0nJJyZaQhAJLUpK0kUPiS0ISR3Ys2U4k2ZJs3W/f/WMeyWNZl5E18qN55vM6Z45mnsvMVzr25/nN7/k9z8/cHRERia5Y2AWIiMjiUtCLiEScgl5EJOIU9CIiEaegFxGJOAW9iEjELdmgN7N7zKzdzA5ksO3nzezZ4PGSmXVfjBpFRHKBLdVx9Gb2VqAXuNfdr5jHfp8ArnT3jyxacSIiOWTJtujd/THgVPoyM2sws4fNbK+ZPW5mb5xm153AfRelSBGRHFAQdgHztBu41d1fNrNrgTuBt0+sNLP1wEbgxyHVJyKy5ORM0JtZArgB+JaZTSwunrLZDuDb7j52MWsTEVnKciboSXUzdbv71lm22QHcdpHqERHJCUu2j34qdz8NHDaz3wSwlC0T683sUqAKeDKkEkVElqQlG/Rmdh+p0L7UzNrM7KPALcBHzew54CCwPW2XncD9vlSHEYmIhGTJDq8UEZHsWLItehERyY4leTK2pqbGN2zYEHYZIiI5Y+/evZ3unpxu3ZIM+g0bNtDc3Bx2GSIiOcPMXp1pnbpuREQiTkEvIhJxCnoRkYhT0IuIRJyCXkQk4hT0IiIRN+fwSjO7B3g30D7dBCBm9iekbk0w8X5vApLufsrMjgBngDFg1N2bslW4iIhkJpMW/R5g20wr3f1z7r41uKvkp4Cfunv6hCE3BesXNeRHx8b58k8O8dhLHYv5MSIiOWfOoJ9upqdZhDa7Uzxm7H6shYcPvhbGx4uILFlZ66M3szJSLf/vpC124NFg6r9dc+y/y8yazay5o2P+rXIzoyFZzivtvfPeV0QkyrJ5MvbXgZ9N6bZ5i7tfBbwTuC2Y8Hta7r7b3ZvcvSmZnPZ2DXNqrE3wSoeCXkQkXTaDfgdTum3c/Xjwsx14ALgmi593nsbaBJ29w3T3Dy/mx4iI5JSsBL2ZVQBvA76XtqzczJZNPAduBg5k4/Nm0pBMAKhVLyKSJpPhlfcBNwI1ZtYGfAYoBHD3u4LN3gs86u59abuuBB4IJvIuAL7p7g9nr/TzNdamgv5Qey9Xr69ezI8SEckZcwa9u+/MYJs9pIZhpi9rAbZMt/1iqasqo6ggxisdfXNvLCKSJyJ1ZWw8ZmyqKeeQRt6IiEyKVNBDqp9effQiImdFL+hrE7Se6mdwZCzsUkREloTIBX1jbYJxh8Od6qcXEYEIBn1DshzQEEsRkQmRC/pNNQnM0AlZEZFA5IK+tCjO2spSDbEUEQlELugh1U+vFr2ISEokg74hmaClo5fxcQ+7FBGR0EUy6BtrEwyNjnOseyDsUkREQhfJoJ+4udkhjbwREYlm0E/c3EyTkIiIRDToq8uLqC4v0glZEREiGvSQunBKF02JiEQ46DXEUkQkJbJB35BM0NU/wqk+TSsoIvktukGfNtuUiEg+i2zQN2r+WBERIMJBv7aylJLCmFr0IpL35gx6M7vHzNrN7MAM6280sx4zezZ43JG2bpuZvWhmh8zsk9ksfC6xmLGpRidkRUQyadHvAbbNsc3j7r41ePwPADOLA18G3glcBuw0s8sWUux8NdRqWkERkTmD3t0fA05dwHtfAxxy9xZ3HwbuB7ZfwPtcsMZkgmPdAwwMa1pBEclf2eqjv97MnjOzH5jZ5cGytUBr2jZtwbKLpqG2HHdo6VSrXkTyVzaC/hlgvbtvAb4I/FOw3KbZdsb7BpvZLjNrNrPmjo6OLJR19p436qcXkXy24KB399Pu3hs8fwgoNLMaUi34+rRN64Djs7zPbndvcvemZDK50LIA2LCinJih2aZEJK8tOOjNbJWZWfD8muA9TwJPA5eY2UYzKwJ2AA8u9PPmo6QwTn11me5iKSJ5rWCuDczsPuBGoMbM2oDPAIUA7n4X8H7g42Y2CgwAO9zdgVEzux14BIgD97j7wUX5LWbRmNTIGxHJb3MGvbvvnGP9l4AvzbDuIeChCystOxpqEzx+qJOxcScem+60gYhItEX2ytgJjckEw6PjtJ7qD7sUEZFQRD7oG2rLAd3zRkTyV/SDPqkhliKS3yIf9JVlRdQkitSiF5G8Ffmgh1SrXi16EclX+RH0tQle6egjNepTRCS/5EXQNyYT9AyM0NmraQVFJP/kR9DXarYpEclfeRH0mj9WRPJZXgT96uUllBXFFfQikpfyIuhjMWNTslxdNyKSl/Ii6CG4uZla9CKSh/Im6BuSCY73DNI3NBp2KSIiF1XeBP3EyJsWTUIiInkm74Je/fQikm/yJujXrygnHjONvBGRvJM3QV9UEGN9dZmCXkTyTt4EPcAmTSsoInkor4K+sTbBkZN9jI6Nh12KiMhFk1dB35AsZ2TMOappBUUkj8wZ9GZ2j5m1m9mBGdbfYmb7gscTZrYlbd0RM9tvZs+aWXM2C78QjbrnjYjkoUxa9HuAbbOsPwy8zd03A38O7J6y/iZ33+ruTRdWYvY0TA6x1Fh6EckfBXNt4O6PmdmGWdY/kfbyKaBu4WUtjuUlhdQuK1aLXkTySrb76D8K/CDttQOPmtleM9s1245mtsvMms2suaOjI8tlndVYq5E3IpJfshb0ZnYTqaD/s7TFb3H3q4B3AreZ2Vtn2t/dd7t7k7s3JZPJbJV1nobg5maaVlBE8kVWgt7MNgN3A9vd/eTEcnc/HvxsBx4ArsnG5y1EY22CM0OjtJ8ZCrsUEZGLYsFBb2brgO8CH3D3l9KWl5vZsonnwM3AtCN3LqaGZHBCVv30IpIn5jwZa2b3ATcCNWbWBnwGKARw97uAO4AVwJ1mBjAajLBZCTwQLCsAvunuDy/C7zAvk0MsO3q5obEm5GpERBZfJqNuds6x/mPAx6ZZ3gJsOX+PcK1cXkyiuEAtehHJG3l1ZSyAmdGQLOeQRt6ISJ7Iu6CH1IVTr7TroikRyQ/5GfTJBK+dHuTM4EjYpYiILLq8DPpG3QpBRPJIXga9hliKSD7Jy6Bfv6KMgpjphKyI5IW8DPrCeIz1K8rUoheRvJCXQQ+pfnq16EUkH+R10B892c+IphUUkYjL26BvSCYYHXdePamRNyISbXkb9GenFVTQi0i05W3Qb5oYYql+ehGJuLwN+kRxAasrSjStoIhEXt4GPQSzTalFLyIRl9dB31iraQVFJPryOugbahP0DY/x2unBsEsREVk0+R30yXIA9dOLSKTlddBP3sVSQS8iEZbXQZ9MFLOspEC3QhCRSJsz6M3sHjNrN7MDM6w3M/tbMztkZvvM7Kq0ddvM7MVg3SezWXg2mFlwQlYXTYlIdGXSot8DbJtl/TuBS4LHLuArAGYWB74crL8M2Glmly2k2MXQkNTNzUQk2uYMend/DDg1yybbgXs95Smg0sxWA9cAh9y9xd2HgfuDbZeUxtoEHWeG6BnQtIIiEk3Z6KNfC7SmvW4Lls20fFpmtsvMms2suaOjIwtlZaZRt0IQkYjLRtDbNMt8luXTcvfd7t7k7k3JZDILZWWmYfLmZgp6EYmmgiy8RxtQn/a6DjgOFM2wfEmpryqlKB5Ti15EIisbLfoHgQ8Go2+uA3rc/QTwNHCJmW00syJgR7DtklIQj7GhRtMKikh0zdmiN7P7gBuBGjNrAz4DFAK4+13AQ8C7gENAP/DhYN2omd0OPALEgXvc/eAi/A4L1lib4IUTZ8IuQ0RkUcwZ9O6+c471Dtw2w7qHSB0IlrSGZIKHD7zG0OgYxQXxsMsREcmqvL4ydkJjbYJxhyOd/WGXIiKSdQp6Ui160BBLEYkmBT2wSXexFJEIU9ADZUUFrK0sVYteRCJJQR9oqE2oRS8ikaSgDzQmE7R09DE+rmkFRSRaFPSBhtpyBkbGON4zEHYpIiJZpaAPnL25me5NLyLRoqAP6OZmIhJVCvrAivIiKssKFfQiEjkK+oCZ0ZhMaIiliESOgj5NQzKhu1iKSOQo6NM01iY42TdMV99w2KWIiGSNgj5NQ23qVgjqvhGRKFHQp2lMLgMU9CISLQr6NGurSikqiGnkjYhEioI+TTxmbKopV9CLSKQo6KdorE3o6lgRiRQF/RQNyQStXf0MjoyFXYqISFZkFPRmts3MXjSzQ2b2yWnW/4mZPRs8DpjZmJlVB+uOmNn+YF1ztn+BbGusTeAOhzvVqheRaJgz6M0sDnwZeCdwGbDTzC5L38bdP+fuW919K/Ap4Kfufiptk5uC9U1ZrH1RTEwrqH56EYmKTFr01wCH3L3F3YeB+4Hts2y/E7gvG8WFYVOyHDMNsRSR6Mgk6NcCrWmv24Jl5zGzMmAb8J20xQ48amZ7zWzXhRZ6sZQUxqmrKlWLXkQioyCDbWyaZTNNw/TrwM+mdNu8xd2Pm1kt8EMz+4W7P3beh6QOArsA1q1bl0FZiyd1czP10YtINGTSom8D6tNe1wHHZ9h2B1O6bdz9ePCzHXiAVFfQedx9t7s3uXtTMpnMoKzF05BM0NLRy5imFRSRCMgk6J8GLjGzjWZWRCrMH5y6kZlVAG8Dvpe2rNzMlk08B24GDmSj8MXUWJtgaHScY12aVlBEct+cXTfuPmpmtwOPAHHgHnc/aGa3BuvvCjZ9L/Cou6f3eawEHjCzic/6prs/nM1fYDE01k5MK9jLuhVlIVcjIrIwmfTR4+4PAQ9NWXbXlNd7gD1TlrUAWxZUYQjSh1je9MbakKsREVkYXRk7jaryIlaUF2mIpYhEgoJ+Bg3JhIZYikgkKOhn0FCr+WNFJBoU9DNoSJbT1T/Cyd6hsEsREVkQBf0Mzo680YVTIpLbFPQzmAh69dOLSK5T0M9gTUUppYVxBb2I5DwF/QxiMWNTslwnZEUk5ynoZ6EhliISBQr6WTTWJjjWPcDAsKYVFJHcpaCfxcStENR9IyK5TEE/i/Sbm4mI5CoF/Sw21JQRM3hF/fQiksMU9LMoLoizrrqMQ2rRi0gOU9DPobE2wSvtujpWRHKXgn4ODckEhzv7GB0bD7sUEZELoqCfQ0NtguGxcdo0raCI5CgF/RzSZ5sSEclFCvo5NGosvYjkOAX9HCrKCqlJFKtFLyI5K6OgN7NtZvaimR0ys09Os/5GM+sxs2eDxx2Z7psLGmt1czMRyV0Fc21gZnHgy8A7gDbgaTN70N2fn7Lp4+7+7gvcd0lrrE3w4LPHcXfMLOxyRETmJZMW/TXAIXdvcfdh4H5ge4bvv5B9l4yGZILTg6N0aFpBEclBmQT9WqA17XVbsGyq683sOTP7gZldPs99MbNdZtZsZs0dHR0ZlHXxTN7zRhdOiUgOyiTop+ur8CmvnwHWu/sW4IvAP81j39RC993u3uTuTclkMoOyLp7JIZbqpxeRHJRJ0LcB9Wmv64Dj6Ru4+2l37w2ePwQUmllNJvvmgtUVJZQVxXVzMxHJSZkE/dPAJWa20cyKgB3Ag+kbmNkqC85Smtk1wfuezGTfXGBmNCQTGnkjIjlpzlE37j5qZrcDjwBx4B53P2hmtwbr7wLeD3zczEaBAWCHuzsw7b6L9LssqsbaBI+/3En/8ChlRXP+2URElgxL5fHS0tTU5M3NzWGXcY4f/+J1Pva1ZrbUV/L3v/NLVJYVhV2SiMgkM9vr7k3TrdOVsRl6+xtXcuctV3Pw2Gl++++e4vXTg2GXJCKSEQX9PGy7YhV7PvxLtHX18xtfeYIjnRpuKSJLn4J+nm5orOGbv3sdfUOjvP+uJ3n++OmwSxIRmZWC/gJsqa/kW7feQGHc+O3dT/L0kVNhlyQiMiMF/QVqrE3w7Y/fQHJZMR/46r/xk1+0h12SiMi0FPQLsLaylG/93vU01ib43Xub+d6zx8IuSUTkPAr6BVqRKOa+372Opg1V/NE/PMu9Tx4JuyQRkXMo6LNgWUkhez58Db/6ppXc8b2DfOFfXmYpXp8gIvlJQZ8lJYVxvnLLVbz/6jo+/y8v8d//7/OMjyvsRSR8upY/iwriMf76NzZTWVrI3f96mO7+YT73m1sojOt4KiLhUdBnWSxmfPrX3kRVeRGfe+RFTg+OcuctV1FSGA+7NBHJU2pqLgIz47abGvmL917BT15s54Nf/Xd6BkbCLktE8pSCfhHdcu16vrjzSn7e2sWO3U/RcUZTEYrIxaegX2Tv3ryGuz/0Sxzp7OM373qC1lP9YZckInlGQX8RvO0NSb7+sWvp6h/h/Xc9wUuvnwm7JBHJIwr6i+Tq9VX84+9djzv81t89yc+PdoVdkojkCQX9RXTpqmV85+M3UFFayC13/xuPv9wRdkkikgcU9BdZfXUZ37r1etavKOcje57m+/tOhF2SiEScgj4EtctKuH/XdWytr+T2+57hvn8/GnZJIhJhGQW9mW0zsxfN7JCZfXKa9beY2b7g8YSZbUlbd8TM9pvZs2a2tCaCDVFFaSH3fuRabnxDkk99dz93/r9Duj+OiCyKOYPezOLAl4F3ApcBO83ssimbHQbe5u6bgT8Hdk9Zf5O7b51p4tp8VVoUZ/cHm9i+dQ1//fCLfPzrz/Baj+aiFZHsyqRFfw1wyN1b3H0YuB/Ynr6Buz/h7hPDSJ4C6rJbZnQVxmN8/re28qfbLuUnL7bzq3/zU772xBHGdEM0EcmSTIJ+LdCa9rotWDaTjwI/SHvtwKNmttfMds20k5ntMrNmM2vu6Miv0SixmPH7Nzby6B+/lSvXVfKZBw/yvjt/xsHjPWGXJiIRkEnQ2zTLpm1umtlNpIL+z9IWv8XdryLV9XObmb11un3dfbe7N7l7UzKZzKCs6Fm/opx7P3INf7vzSo51D/CeL/2Mv/j+8/QNjYZdmojksEyCvg2oT3tdBxyfupGZbQbuBra7+8mJ5e5+PPjZDjxAqitIZmBmvGfLGn70n2/kt5rq+T+PH+bmzz/Gj154PezSRCRHZRL0TwOXmNlGMysCdgAPpm9gZuuA7wIfcPeX0paXm9myiefAzcCBbBUfZRVlhfzV+97Mt2+9nvLiOB/9WjMf//penawVkXmbM+jdfRS4HXgEeAH4R3c/aGa3mtmtwWZ3ACuAO6cMo1wJ/KuZPQf8O/B9d384679FhDVtqOafP/HL/Ml/uJQf/0Ina0Vk/mwpjt1uamry5mYNuZ/q1ZN9/Ld/OsDjL3eypa6Cv3zfm7l8TUXYZYnIEmBme2cawq4rY3PIxMnaL+zYes7J2v5hnawVkZkp6HOMmbF969pzTta+4290slZEZqagz1HpJ2vLilIna3//G3t5/bRO1orIuRT0Oa5pQzXf/4PUydofvdDOr/yvn3LvkzpZKyJnKegjoKggxm03nb2y9o7vHeR9X3lCV9aKCKCgj5RzTtZ29fOeL/2Mv3zoBZ2sFclzBWEXINk1cbL2xjfU8tmHX2D3Yy18f98Jbn97I1evr6IhmSAem+6uFiISVRpHH3FPHznFpx/Yz0uv9wJQVhTnijUVbK6r4M11FWypq2T9ijLMFP4iuWy2cfQK+jwwPu60dPayr60neHRz8PhphkbHAVheUsDmuko211UEj0pWV5Qo/EVyiIJezjMyNs5Lr59hf1sPz7X1sP9YN784cYbRYLROTaI4LfhT4V+TKA65ahGZyWxBrz76PFUYj3H5mgouX1PBjuB+ooMjY7xw4jT7j/XwXGuq5f+TF9uZaAusqShJtfzrK9i8tpI311VQUVoY3i8hIhlR0MukksI4V66r4sp1VXB9alnf0CgHjvWkwj/o9nn44GuT+2xYUcYVayuoqypjTWUJqytKWV1RwuqKEqrLi9T9I7IEKOhlVuXFBVy7aQXXbloxuay7f5j9x8729z/X1s0jB19jZOzcbsDiglgQ+kH4BweC9ANCRWmhDgYii0xBL/NWWVbEL1+S5JcvOTsT2Pi409k3xInuQU70DHA8+HmiZ5ATPYM81XKS188MnXfFbmlhnNWVJaypKGVVRQlrKkpYXZk6CKypTC1bXqLuIZGFUNBLVsRiRu2yEmqXlbClvnLabcbGnY4zQxzvGZg8IKQOBKkDw+Mvd9B+Zoip4wOWlxRQX11GfVUZ61aUUV9VSl11Geuqy1hbWUpJYfwi/IbTGx4d50TPAG1dA7R19dPWNUDrqdTPEz2D1CSKaEgmaKhN0JAspyGZYP2KcooKls61imPjzvHuAVo6+zjS2cfhzj5ePz3IG1Yu4+r1VWxdV6mDbY7TqBtZUkbGxmk/M8SJ7gGO9wxyonuAY90DHD3VPxmgE8NCJ6xcXsy64EBQV506EKyrLqO+uoyVy0sWdIHYyNg4J7oHJ0P87M8BWrv6ee304DkHpnjMWF1RQl1VKauWl9DZO0xLRy/H02YGi8eMddVlbKopP+cA0JBMUFVedMG1zsY9dZBND/OW4OfRk/0Mj539myaKC0guK+bVk32MO5jBpUHoTzzWVefOtRfuzvDYOIMj4wyNjDE4Ms7g6BiDE89HguejqedDI2MMjY6fu3409Xx4dJy6qlI211Wypb6CVcuXzjBkDa+UyBgfdzp6h2g91U9rVz9HT6YCtzU4EJyYEryFcaOuqoy6qlLqg28B9VVl1FenDgaJ4gJO9AxOBnd6mB/rGuBEzwDpvU0xg9UVpaytKk0dWKpKg0fq+eqKEgri57fW+4ZGOdzZxysdvbzS3ssrHannLZ19DKcduKrKCidDv6E2dQDYlExQX1U67ftO1dM/QktnL4eDQJ8I8yOdffQNj01uV1QQY311GRtrytmYLGdTTTkbVqSeJxPFmBm9Q6M8e7Sbva92sfdoFz9/tYszwUT1NYlirl5fORn8V6ytoLjg4n6zGh0bp61rgMPB7zjxONY9wMBwKpyHglC/0JiLWWqQQklhnJKCGAXxGMe7ByaHISeXFbN5bUXaaLQKVoQ0DFlBL3ljeHSc4xPfALr6aT01MHlQaD3VT1f/yKz7m8Hq5SWTwT0Z4tWpYF9VUUJhBoGbqYluk0PBAaCls2/yQNDZOzS5XWHc2LCi/JwDQHFBnMOdvRzu7A9+9p3z+8UM6qvLUgFeU86mZPnk8zWVpfP+pjM27rzcfiYV/K928cyrXRw52Q9AUTzGm+squHp9FVetS4V/ctnCA8/daT8zREvHRJD3Tn4baT3Vf84AgOUlBWxMJqirKqW8KH5OQBcXxikuiJ1dVhijpCDtedr64uB1SUGcwrid12IfHBnj+ROn2dfazb5gUMIrHb2TB5O1laVsqQ/Cf20FV9RVXJSuLwW9SODM4Egq/IPgPz04ytrKkrQWeemS6T/v6R/hlc6pB4BeXj3ZP9miBFi1vISNNeVsqEm1zCeer6suW/TfpePMEM8cTYX+3le72NfWM9kNtH5FGVevq+Kq9VU0bajiktplMx5c0r+JTHYrdfRx5GQf/WnfRIoLYqlvIWmPTclyNtYkqCoLbwTXmcERDhw7zf5j3ZPDkFtPDUyu35QsZ0tdJW9eW8GW+tT1K9k+t7TgoDezbcAXgDhwt7t/dsp6C9a/C+gHfsfdn8lk3+ko6EVmNjI2ztFT/QyNjLOhpoyyoqUzpmJodIwDx05PBn/zq12T30yWFRewdV2qu6cwHjunu+VU3/Dke8RjRn1VaRDkCTbWlKV+JstZvbyEWI7clO9UXzAMufVs+LefSf0t4jHjDSuXsSXtnlOXrlq2oG+LCwp6M4sDLwHvANqAp4Gd7v582jbvAj5BKuivBb7g7tdmsu90FPQi0eDutJ4aYO/RU6ngP9LFi6+fwT11En0izCe+iWxMllNftfjfRMLy+ulBnmvtZl9bD8+1dbP/WA/dQXdbUUGMLXUV/MOu6y/oYLbQWyBcAxxy95bgze4HtgPpYb0duNdTR42nzKzSzFYDGzLYV0QiysxYtyI1LPa9V9YB0Ds0ipG6GC/frFxews2Xr+Lmy1cBZw+Ez7V1s6+tm96h0UX5xpLJX3ot0Jr2uo1Uq32ubdZmuC8AZrYL2AWwbt26DMoSkVyUyMOAn0n6gfDXt6xZtM/J5PvRdIeXqf09M22Tyb6phe673b3J3ZuSyeR0m4iIyAXI5NDaBtSnva4Djme4TVEG+4qIyCLKpEX/NHCJmW00syJgB/DglG0eBD5oKdcBPe5+IsN9RURkEc3Zonf3UTO7HXiE1BDJe9z9oJndGqy/C3iI1IibQ6SGV354tn0X5TcREZFp6YIpEZEImG14ZTQHq4qIyCQFvYhIxCnoRUQibkn20ZtZB/Bq2HVMowboDLuIC6Taw6HaL75crRsWVvt6d5/2IqQlGfRLlZk1z3SyY6lT7eFQ7RdfrtYNi1e7um5ERCJOQS8iEnEK+vnZHXYBC6Daw6HaL75crRsWqXb10YuIRJxa9CIiEaegFxGJOAV9Bsys3sx+YmYvmNlBM/vDsGuaDzOLm9nPzeyfw65lPoKZyr5tZr8I/vbXh11Tpszsj4N/KwfM7D4zKwm7ppmY2T1m1m5mB9KWVZvZD83s5eBnVZg1zmSG2j8X/JvZZ2YPmFllmDXOZLra09b9VzNzM6vJxmcp6DMzCvwXd38TcB1wm5ldFnJN8/GHwAthF3EBvgA87O5vBLaQI7+Dma0F/gBocvcrSN25dUe4Vc1qD7BtyrJPAj9y90uAHwWvl6I9nF/7D4Er3H0zqTmrP3Wxi8rQHs6vHTOrJzXP9tFsfZCCPgPufsLdnwmenyEVOGvDrSozZlYH/Bpwd9i1zIeZLQfeCnwVwN2H3b073KrmpQAoNbMCoIwlPOGOuz8GnJqyeDvwteD514D/eFGLytB0tbv7o+4+Grx8itSER0vODH93gM8Df8oMs/FdCAX9PJnZBuBK4N/CrSRj/5vUP5rxsAuZp01AB/D3QbfT3WZWHnZRmXD3Y8D/JNUiO0FqIp5Hw61q3lYGkwcR/KwNuZ4L9RHgB2EXkSkzew9wzN2fy+b7KujnwZBG3zkAAAG1SURBVMwSwHeAP3L302HXMxczezfQ7u57w67lAhQAVwFfcfcrgT6WbvfBOYL+7O3ARmANUG5m/yncqvKPmX2aVLfrN8KuJRNmVgZ8Grgj2++toM+QmRWSCvlvuPt3w64nQ28B3mNmR4D7gbeb2dfDLSljbUCbu098c/o2qeDPBb8KHHb3DncfAb4L3BByTfP1upmtBgh+todcz7yY2YeAdwO3eO5cLNRAqnHwXPB/tg54xsxWLfSNFfQZMDMj1Vf8grv/Tdj1ZMrdP+Xude6+gdTJwB+7e060LN39NaDVzC4NFv0K8HyIJc3HUeA6MysL/u38CjlyIjnNg8CHgucfAr4XYi3zYmbbgD8D3uPu/WHXkyl33+/ute6+Ifg/2wZcFfxfWBAFfWbeAnyAVIv42eDxrrCLygOfAL5hZvuArcBfhlxPRoJvId8GngH2k/p/tmQvyzez+4AngUvNrM3MPgp8FniHmb1MagTIZ8OscSYz1P4lYBnww+D/6l2hFjmDGWpfnM/KnW81IiJyIdSiFxGJOAW9iEjEKehFRCJOQS8iEnEKehGRiFPQi4hEnIJeRCTi/j8a85GdhDD61wAAAABJRU5ErkJggg==\n",
95+
"text/plain": [
96+
"<Figure size 432x288 with 1 Axes>"
97+
]
98+
},
99+
"metadata": {
100+
"needs_background": "light"
101+
},
102+
"output_type": "display_data"
103+
}
104+
],
105+
"source": [
106+
"# the Within-Cluster-Sum-of-Squares amount normally goes down as the number of clusters increase\n",
107+
"# the ideal number of clusters can be chosen if the WCSS graph achieves a clear elbow shape\n",
108+
"\n",
109+
"def k_means(dataset, k): # return the WCSS value for chosen k clusters\n",
110+
" # Initialize the WCSS value to 0\n",
111+
" WCSS = 0\n",
112+
" # create an array to collect the indices of the groups\n",
113+
" final_array = np.zeros(len(dataset))\n",
114+
" # initialize the centroids based on the kmeansplusplus function. If data is too big, just choose k random instances\n",
115+
" C = kmeans_plus_plus(dataset, k)\n",
116+
" # create an old C initialized with zeros\n",
117+
" C_old = np.zeros(C.shape)\n",
118+
" # define the error function\n",
119+
" parameter_gradient = np.linalg.norm(C-C_old)\n",
120+
" # count the number of epochs performed until the error is zero, put an iterator\n",
121+
" epoch = 0\n",
122+
" # continue to assign new C until the distance btw C_new and C_old is very small\n",
123+
" while parameter_gradient > 1e-4: \n",
124+
" # loop over the dataset to measure the instance distances with C\n",
125+
" # assign the group number for each instance based on the shortest distance from the instance to the C\n",
126+
" for i in range(len(dataset)):\n",
127+
" distance = np.linalg.norm((dataset[i] - C), axis = 1) # axis 1 will output distance array with size k\n",
128+
" centroid_index = np.argmin(distance) # the closest centroid_index from the instance \n",
129+
" final_array[i] = centroid_index # assign the centroid index as group number to an array \n",
130+
" # assign the current C as C_old to update the new C\n",
131+
" C_old = np.copy(C)\n",
132+
" # loop over each index of the centroid\n",
133+
" for i in range(k):\n",
134+
" # list of items with the same group numbers are assigned to sub_groups\n",
135+
" sub_group = [dataset[j] for j in range(len(dataset)) if final_array[j] == i]\n",
136+
" # take the mean of the sub-group and \n",
137+
" mean_of_sub_group = np.mean(sub_group, axis = 0)\n",
138+
" # assign the new values to C\n",
139+
" C[i] = mean_of_sub_group\n",
140+
" # update the parameter gradient after assigning the new C \n",
141+
" parameter_gradient = np.linalg.norm(C - C_old) # when C does not change, we reach to zero error \n",
142+
" # update epoch value\n",
143+
" epoch += 1\n",
144+
"\n",
145+
" # To measure WCSS, sum up the square of the \"distances from all group data to their mean\"\n",
146+
" for j in range(k):\n",
147+
" grouped_data = [dataset[i] for i in range(len(dataset)) if final_array[i] == j]\n",
148+
" mean_of_grouped_data = np.mean(grouped_data, axis = 0) # outputs number of attributes\n",
149+
" for i in range(len(grouped_data)):\n",
150+
" WCSS += np.sum((grouped_data[i] - mean_of_grouped_data)**2) \n",
151+
" \n",
152+
" \n",
153+
" # return the final cluster coordinates, the array carrying indices of all data points, and the WCSS with given k\n",
154+
" return epoch, C, final_array, WCSS\n",
155+
"\n",
156+
"\n",
157+
"if __name__ == '__main__':\n",
158+
" dataset = np.loadtxt('wine.data.txt' , dtype = float, delimiter = ',')\n",
159+
" dataset = np.array(dataset)\n",
160+
" np.random.seed(1)\n",
161+
" k = 15\n",
162+
" def run_k_means():\n",
163+
" WCSS_list = np.zeros(k)\n",
164+
" for i in range(1, k):\n",
165+
" epoch, C, final_array, WCSS = k_means(dataset, i)\n",
166+
" WCSS_list[i] = WCSS\n",
167+
"\n",
168+
" return WCSS_list\n",
169+
"\n",
170+
" result = run_k_means()\n",
171+
" print(result)\n",
172+
" plt.plot(np.arange(1, k), result[1:])\n",
173+
" "
174+
]
175+
},
176+
{
177+
"cell_type": "code",
178+
"execution_count": null,
179+
"metadata": {},
180+
"outputs": [],
181+
"source": []
182+
}
183+
],
184+
"metadata": {
185+
"kernelspec": {
186+
"display_name": "Python 3",
187+
"language": "python",
188+
"name": "python3"
189+
},
190+
"language_info": {
191+
"codemirror_mode": {
192+
"name": "ipython",
193+
"version": 3
194+
},
195+
"file_extension": ".py",
196+
"mimetype": "text/x-python",
197+
"name": "python",
198+
"nbconvert_exporter": "python",
199+
"pygments_lexer": "ipython3",
200+
"version": "3.7.7"
201+
}
202+
},
203+
"nbformat": 4,
204+
"nbformat_minor": 2
205+
}

0 commit comments

Comments
 (0)