
Commit 8efbbef

Add files via upload

1 parent 70e9d72 commit 8efbbef

12 files changed: +211 −122 lines

Collaborative_Filter.ipynb (+17 −15)
@@ -16,6 +16,7 @@
 }
 ],
 "source": [
+"#this notebook applies a collaborative filtering model to movie recommendation\n",
 "import scipy.io as sio\n",
 "import numpy as np\n",
 "import matplotlib.pyplot as plt\n",
@@ -77,7 +78,7 @@
 ],
 "source": [
 "#compute the cost\n",
-"#note:no x0 = 1, no 1/m !!!!!!!!\n",
+"#note:no x0 = 1, no 1/m !!!!!!!! (there is no x0 = 1 term and no division by m)\n",
 "def computeCost(x,theta,y,r,reg=0):\n",
 " j = 0\n",
 " x_grad = np.zeros_like(x)\n",
@@ -93,6 +94,7 @@
 " return j,x_grad,theta_grad\n",
 " \n",
 "#Reduce the data set size so that this runs faster\n",
+"#use a small model first to test whether the algorithm is correct\n",
 "nu, nm, nf = 4, 5, 3\n",
 "X1 = X[0:nm,0:nf]\n",
 "Theta1 = Theta[0:nu,0:nf]\n",
@@ -113,7 +115,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"#compute numerical_gradient\n",
+"#compute numerical_gradient (numerically estimate the gradient)\n",
 "def eval_numerical_gradient(f, x, verbose=True, h=0.00001):\n",
 " \"\"\" \n",
 " a naive implementation of numerical gradient of f at x \n",
@@ -162,7 +164,7 @@
 }
 ],
 "source": [
-"#check the gradient \n",
+"#check the gradient: gradient checking is usually done with small numbers and a small model, because the numerical gradient is too expensive to compute\n",
 "#create the small data\n",
 "\n",
 "x_t = np.random.rand(4,3)\n",
@@ -179,12 +181,12 @@
 "nm_t = 4\n",
 "nf_t = 3\n",
 "\n",
-"#j,x_grad,theta_grad = computeCost(X1,Theta1,Y1,R1,0)\n",
-"#f = lambda W:computeCost(X1,Theta1,Y1,R1,0)[0]\n",
+"\n",
 "j,x_grad,theta_grad = computeCost(X_t,Theta_t,y_t,r_t,1.5)\n",
 "f = lambda W:computeCost(X_t,Theta_t,y_t,r_t,1.5)[0] #f = J(x)\n",
 "x_grad1 = eval_numerical_gradient(f,X_t , verbose=False, h=0.00001)\n",
 "theta_grad1 = eval_numerical_gradient(f, Theta_t, verbose=False, h=0.00001)\n",
+"#stack them side by side to compare: the first three columns are x_grad, the last three are x_grad1\n",
 "display_X = np.hstack((x_grad,x_grad1))\n",
 "print display_X\n",
 "display_Theta = np.hstack((theta_grad,theta_grad1))\n",
@@ -210,7 +212,7 @@
 }
 ],
 "source": [
-"def load_data(filename):\n",
+"def load_data(filename): #load the movie data\n",
 " movieList = []\n",
 " file = open(filename)\n",
 " for line in file.readlines():\n",
@@ -279,13 +281,13 @@
 ],
 "source": [
 "#Learning Movie Ratings\n",
-"\n",
+"#add the new user's ratings to the data set before training\n",
 "#Add our own ratings to the data matrix\n",
 "YY = np.hstack((my_ratings,Y))\n",
 "RR = np.hstack((my_ratings!=0,R))\n",
 "print YY.shape,RR.shape\n",
 "#Normalize Ratings\n",
-"\n",
+"#mean-normalize the ratings\n",
 "def normalizeRatings(y,r):\n",
 " m,n = y.shape\n",
 " ymean = np.zeros((m,1))\n",
@@ -331,13 +333,12 @@
 ],
 "source": [
 "from scipy import optimize\n",
-"\n",
-"#we use the scipy.optimize.fmin_cg,so we need to change the \n",
-"#function computeCost(),beacause x must be 1-D\n",
+"#again use an optimization routine to train\n",
 "\n",
 "args = (Ynorm,RR,num_users1,num_movies1,num_features1,1.5)\n",
 "params = np.hstack((XX.ravel(),TTheta.ravel())).ravel()\n",
-"\n",
+"#we use scipy.optimize.fmin_cg, so we need to change the\n",
+"#function computeCost(), because x must be 1-D\n",
 "def Cost(params,*args):\n",
 " '''now params is 1-D,include [x,theta]'''\n",
 " y,r,nu,nm,nf,reg = args\n",
@@ -367,18 +368,19 @@
 " \n",
 "res = optimize.fmin_cg(Cost,x0=params,fprime=grad,args=args,maxiter=100)\n",
 "#get the bestX,bestTheta\n",
-"\n",
+"#reshape the optimized parameters back into matrices\n",
 "bestX = res[0:num_movies1*num_features1].reshape(num_movies1,num_features1)\n",
 "bestTheta = res[num_movies1*num_features1:].reshape(num_users1,num_features1)\n",
 "\n",
 "\n",
 "print bestX.shape,bestTheta.shape\n",
-"\n",
+"#predict the ratings\n",
 "score = bestX.dot(bestTheta.T) + Ymean\n",
-"\n",
+"#only the first column holds the new user's scores\n",
 "my_score = score[:,0] #line 1 is my scoce\n",
 "print score.shape\n",
 "print my_score[:5]\n",
+"#sort and recommend the highest-scoring movies to the new user\n",
 "sort_index = my_score.argsort()\n",
 "favorite = 10\n",
 "for i in xrange(favorite):\n",

anomaly_detection.ipynb (+7 −6)
@@ -26,6 +26,7 @@
 }
 ],
 "source": [
+"#this notebook uses the Gaussian distribution for anomaly detection\n",
 "#load data set \n",
 "import scipy.io as sio\n",
 "import numpy as np\n",
@@ -72,7 +73,7 @@
 "source": [
 "#Gaussian distribution\n",
 "from scipy import stats\n",
-"\n",
+"#estimate the mean and variance\n",
 "def estimateGaussian(x):\n",
 " #train set x fit the mu,sigma2\n",
 " m,n = x.shape\n",
@@ -81,7 +82,7 @@
 " sigma2 = np.var(x,axis=0).reshape(1,-1)\n",
 " \n",
 " return mu,sigma2\n",
-" \n",
+"#compute the Gaussian probability \n",
 "def p(x,mu,sigma2):\n",
 " #x is a new example:[m*n]\n",
 " m,n = x.shape\n",
@@ -117,7 +118,7 @@
 "outputs": [],
 "source": [
 "#cross validation for select threshold\n",
-"\n",
+"#use cross-validation to select the threshold, with the F1 score as the evaluation metric\n",
 "def selectThreshold(y,pval):\n",
 " bestEpsilon = 0 \n",
 " bestF1 = 0\n",
@@ -200,8 +201,8 @@
 }
 ],
 "source": [
-"#xx = np.linspace(0,35,100)\n",
-"#x1,x2 = np.meshgrid(xx,xx)\n",
+"\n",
+"#visualize the detected outliers\n",
 "print \"Outliers found: %d\"%(np.sum(p_train < epsilon))\n",
 "\n",
 "#visualization:Draw a red circle around those outliers\n",
@@ -232,7 +233,7 @@
 ],
 "source": [
 "# load the data2\n",
-"\n",
+"#next, use the multivariate Gaussian model\n",
 "import scipy.io as sio\n",
 "import numpy as np\n",
 "import matplotlib.pyplot as plt\n",

kmeans.ipynb (+12 −10)

Large diffs are not rendered by default.
