-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspi_for_poi_1.py
76 lines (66 loc) · 2.24 KB
/
spi_for_poi_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors.kde import KernelDensity
# to estimate probability density for each feature
def density_estimate(data):
    """Fit a Gaussian kernel density estimate to one feature's values.

    The bandwidth is selected by cross-validated grid search over 100
    log-spaced candidates between 0.1 and 10.

    Args:
        data: Iterable of numeric observations for a single feature.

    Returns:
        The ``KernelDensity`` estimator fitted on all of ``data`` with the
        best bandwidth found by the search.

    Raises:
        ValueError: If fewer than two observations are supplied, since
            cross-validation needs at least one train/test split.
    """
    # KernelDensity expects a 2-D (n_samples, n_features) array.
    x = np.asarray(list(data), dtype=float).reshape(-1, 1)
    if len(x) < 2:
        raise ValueError("density_estimate needs at least two observations")

    bandwidth = 10 ** np.linspace(-1, 1, 100)
    # Cap the fold count so samples with fewer than 10 points still work.
    folds = min(10, len(x))
    grid = GridSearchCV(
        KernelDensity(kernel='gaussian'),
        {'bandwidth': bandwidth},
        cv=folds,
    ).fit(x)
    # With the default refit=True the search already refits the winning
    # configuration on all of x, so a second manual fit is redundant.
    return grid.best_estimator_
# to train the model
def train_model(df, n_train=100, n_features=10):
    """Fit one density estimator per feature column on the leading rows.

    Args:
        df: DataFrame whose columns at positions 1..n_features hold the
            feature values; the column at position 0 is skipped.
        n_train: Number of leading rows used for fitting.
        n_features: Number of feature columns to model.

    Returns:
        A list of ``n_features`` fitted estimators as returned by
        ``density_estimate``; element ``i`` models the column at
        position ``i + 1``.
    """
    # Select rows and columns by position. Integer keys on a Series whose
    # labels are column names depend on a deprecated positional fallback,
    # and comparing index labels to a row count only works for a default
    # 0..n-1 index.
    train = df.iloc[:n_train]
    return [
        density_estimate(train.iloc[:, col + 1].tolist())
        for col in range(n_features)
    ]
#to calculate rank
def btmodel(probability):
    """Convert a sequence of probabilities into cumulative log-ratio scores.

    The first item scores 0. Each later item adds the natural log of the
    ratio between its probability and the previous item's probability to
    the running score.

    Args:
        probability: Sequence of positive numbers.

    Returns:
        A list with one score per input probability (empty for empty
        input).

    Raises:
        ZeroDivisionError: If a float probability used as a divisor is zero.
        ValueError: If a ratio is not positive, so its logarithm is
            undefined.
    """
    # Keep the output the same length as the input, including when empty.
    if len(probability) == 0:
        return []
    score = [0]
    for previous, current in zip(probability, probability[1:]):
        score.append(math.log(current / previous) + score[-1])
    return score
def point_of_interest(df, func):
    """Score every row of ``df`` from the product of its feature densities.

    For each row, the densities that the estimators in ``func`` assign to
    the row's feature values are multiplied together; the resulting
    per-row probabilities are then passed to ``btmodel``.

    Args:
        df: DataFrame whose column at position ``i + 1`` holds the values
            scored by ``func[i]``.
        func: Sequence of fitted estimators, one per feature column, each
            exposing ``score_samples`` that returns log-densities.

    Returns:
        The list of scores produced by ``btmodel``, one per row of ``df``.
    """
    if len(df) == 0:
        return btmodel([])

    # Accumulate log-densities and exponentiate once; this equals the
    # product of the individual densities.
    log_density = np.zeros(len(df))
    for i, kde in enumerate(func):
        # score_samples needs a 2-D (n_samples, n_features) array, so the
        # whole column is scored in one call rather than scalar by scalar.
        column = df.iloc[:, i + 1].to_numpy(dtype=float).reshape(-1, 1)
        log_density += kde.score_samples(column)

    probability = np.exp(log_density).tolist()
    return btmodel(probability)
#reading csv file
def extractfromcsv(fname):
    """Load the CSV at ``fname`` and return it as a pandas DataFrame."""
    return pd.read_csv(fname)
def get_ranked_data(fname):
    """Score the records of a CSV file and print the ranking error.

    Reads the file, fits the per-feature density models, scores every row
    and stores the scores in a new ``bt_score`` column. The values of the
    ``rank`` column are then ordered by descending score, and the mean
    absolute difference between each 1-based position and the rank found
    at that position is printed for positions past the first 100.

    Args:
        fname: CSV file to read; it must contain a ``rank`` column.

    Returns:
        The DataFrame with the added ``bt_score`` column.
    """
    df = extractfromcsv(fname)
    func = train_model(df)
    score = point_of_interest(df, func)
    df['bt_score'] = pd.Series(score)

    # Reference ranks ordered by descending model score.
    rank = [r for _, r in sorted(zip(score, list(df['rank'])), reverse=True)]

    # Mean absolute ranking error over positions 101 onward.
    evaluated = rank[100:]
    if evaluated:
        error = sum(
            abs(position - r)
            for position, r in enumerate(evaluated, start=101)
        )
        print(error / len(evaluated))
    else:
        # Nothing to evaluate with 100 rows or fewer; report NaN instead
        # of dividing by zero or by a negative count.
        print(float('nan'))
    return df
# Script entry point: prompt for the CSV file name, then print the table
# returned by get_ranked_data (the ranking error is printed inside it).
fname = input("enter file name")
print(get_ranked_data(fname))