
Commit 8efbbef

Add files via upload

1 parent 70e9d72 commit 8efbbef

12 files changed: +211 −122 lines

Collaborative_Filter.ipynb (+17 −15)
@@ -16,6 +16,7 @@
 }
 ],
 "source": [
+"#this notebook applies a collaborative filtering model to movie recommendation\n",
 "import scipy.io as sio\n",
 "import numpy as np\n",
 "import matplotlib.pyplot as plt\n",
@@ -77,7 +78,7 @@
 ],
 "source": [
 "#compute the cost\n",
-"#note:no x0 = 1, no 1/m !!!!!!!!\n",
+"#note:no x0 = 1, no 1/m !!!!!!!! (there is no x0 = 1 term and no division by m)\n",
 "def computeCost(x,theta,y,r,reg=0):\n",
 " j = 0\n",
 " x_grad = np.zeros_like(x)\n",
@@ -93,6 +94,7 @@
 " return j,x_grad,theta_grad\n",
 " \n",
 "#Reduce the data set size so that this runs faster\n",
+"#use a small model first to test whether the algorithm is correct\n",
 "nu, nm, nf = 4, 5, 3\n",
 "X1 = X[0:nm,0:nf]\n",
 "Theta1 = Theta[0:nu,0:nf]\n",
@@ -113,7 +115,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"#compute numerical_gradient\n",
+"#compute numerical_gradient (numerically estimate the gradient)\n",
 "def eval_numerical_gradient(f, x, verbose=True, h=0.00001):\n",
 " \"\"\" \n",
 " a naive implementation of numerical gradient of f at x \n",
@@ -162,7 +164,7 @@
 }
 ],
 "source": [
-"#check the gradient \n",
+"#check the gradient: gradient checking is usually done with small numbers and a small model, because the numerical gradient is too expensive to compute\n",
 "#create the small data\n",
 "\n",
 "x_t = np.random.rand(4,3)\n",
@@ -179,12 +181,12 @@
 "nm_t = 4\n",
 "nf_t = 3\n",
 "\n",
-"#j,x_grad,theta_grad = computeCost(X1,Theta1,Y1,R1,0)\n",
-"#f = lambda W:computeCost(X1,Theta1,Y1,R1,0)[0]\n",
+"\n",
 "j,x_grad,theta_grad = computeCost(X_t,Theta_t,y_t,r_t,1.5)\n",
 "f = lambda W:computeCost(X_t,Theta_t,y_t,r_t,1.5)[0] #f = J(x)\n",
 "x_grad1 = eval_numerical_gradient(f,X_t , verbose=False, h=0.00001)\n",
 "theta_grad1 = eval_numerical_gradient(f, Theta_t, verbose=False, h=0.00001)\n",
+"#stack them side by side to compare: the first three columns are x_grad, the last three are x_grad1\n",
 "display_X = np.hstack((x_grad,x_grad1))\n",
 "print display_X\n",
 "display_Theta = np.hstack((theta_grad,theta_grad1))\n",
@@ -210,7 +212,7 @@
 }
 ],
 "source": [
-"def load_data(filename):\n",
+"def load_data(filename): #load the movie data\n",
 " movieList = []\n",
 " file = open(filename)\n",
 " for line in file.readlines():\n",
@@ -279,13 +281,13 @@
 ],
 "source": [
 "#Learning Movie Ratings\n",
-"\n",
+"#add the new user's ratings to the data set before training\n",
 "#Add our own ratings to the data matrix\n",
 "YY = np.hstack((my_ratings,Y))\n",
 "RR = np.hstack((my_ratings!=0,R))\n",
 "print YY.shape,RR.shape\n",
 "#Normalize Ratings\n",
-"\n",
+"#mean-normalize the ratings\n",
 "def normalizeRatings(y,r):\n",
 " m,n = y.shape\n",
 " ymean = np.zeros((m,1))\n",
@@ -331,13 +333,12 @@
 ],
 "source": [
 "from scipy import optimize\n",
-"\n",
-"#we use the scipy.optimize.fmin_cg,so we need to change the \n",
-"#function computeCost(),beacause x must be 1-D\n",
+"#again use an optimization routine to train\n",
 "\n",
 "args = (Ynorm,RR,num_users1,num_movies1,num_features1,1.5)\n",
 "params = np.hstack((XX.ravel(),TTheta.ravel())).ravel()\n",
-"\n",
+"#we use scipy.optimize.fmin_cg, so we need to change the\n",
+"#function computeCost(), because x must be 1-D\n",
 "def Cost(params,*args):\n",
 " '''now params is 1-D,include [x,theta]'''\n",
 " y,r,nu,nm,nf,reg = args\n",
@@ -367,18 +368,19 @@
 " \n",
 "res = optimize.fmin_cg(Cost,x0=params,fprime=grad,args=args,maxiter=100)\n",
 "#get the bestX,bestTheta\n",
-"\n",
+"#reshape the optimized parameters back into matrices\n",
 "bestX = res[0:num_movies1*num_features1].reshape(num_movies1,num_features1)\n",
 "bestTheta = res[num_movies1*num_features1:].reshape(num_users1,num_features1)\n",
 "\n",
 "\n",
 "print bestX.shape,bestTheta.shape\n",
-"\n",
+"#predict the ratings\n",
 "score = bestX.dot(bestTheta.T) + Ymean\n",
-"\n",
+"#only the first column holds the new user's scores\n",
 "my_score = score[:,0] #line 1 is my scoce\n",
 "print score.shape\n",
 "print my_score[:5]\n",
+"#sort and recommend the highest-scoring movies to the new user\n",
 "sort_index = my_score.argsort()\n",
 "favorite = 10\n",
 "for i in xrange(favorite):\n",

anomaly_detection.ipynb (+7 −6)
@@ -26,6 +26,7 @@
 }
 ],
 "source": [
+"#this notebook uses the Gaussian distribution for anomaly detection\n",
 "#load data set \n",
 "import scipy.io as sio\n",
 "import numpy as np\n",
@@ -72,7 +73,7 @@
 "source": [
 "#Gaussian distribution\n",
 "from scipy import stats\n",
-"\n",
+"#estimate the mean and variance\n",
 "def estimateGaussian(x):\n",
 " #train set x fit the mu,sigma2\n",
 " m,n = x.shape\n",
@@ -81,7 +82,7 @@
 " sigma2 = np.var(x,axis=0).reshape(1,-1)\n",
 " \n",
 " return mu,sigma2\n",
-" \n",
+"#compute the Gaussian probability \n",
 "def p(x,mu,sigma2):\n",
 " #x is a new example:[m*n]\n",
 " m,n = x.shape\n",
@@ -117,7 +118,7 @@
 "outputs": [],
 "source": [
 "#cross validation for select threshold\n",
-"\n",
+"#use cross-validation to select the threshold, with the F1 score as the evaluation metric\n",
 "def selectThreshold(y,pval):\n",
 " bestEpsilon = 0 \n",
 " bestF1 = 0\n",
@@ -200,8 +201,8 @@
 }
 ],
 "source": [
-"#xx = np.linspace(0,35,100)\n",
-"#x1,x2 = np.meshgrid(xx,xx)\n",
+"\n",
+"#visualize the detected outliers\n",
 "print \"Outliers found: %d\"%(np.sum(p_train < epsilon))\n",
 "\n",
 "#visualization:Draw a red circle around those outliers\n",
@@ -232,7 +233,7 @@
 ],
 "source": [
 "# load the data2\n",
-"\n",
+"#next, use the multivariate Gaussian model\n",
 "import scipy.io as sio\n",
 "import numpy as np\n",
 "import matplotlib.pyplot as plt\n",

kmeans.ipynb (+12 −10)

Large diffs are not rendered by default.
