-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathboostit.py
202 lines (166 loc) · 7.18 KB
/
boostit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import numpy as np
class BoostingClassifier:
    """Boosted nearest-centroid classifier for binary labels in {-1, +1}.

    Each boosting round fits weighted class centroids (via ``centroids``),
    classifies the training set with ``basic_linear_classifier`` and then
    re-weights the samples. The final model is the confidence-weighted
    average of the per-round centroids.

    Parameters
    ----------
    T : int, default=5
        Maximum number of boosting rounds (model count; hyper-parameter).
    """

    # A round with zero weighted error would yield an infinite confidence
    # (log of a division by zero); the error is floored at this value instead
    # so the perfect model dominates the average with a finite weight.
    _MIN_ERROR = 1e-10

    def __init__(self, T=5):
        self.M_x = []  # weighted average of the ensemble of centroids; set by fit()
        self.T = T  # model count; hyper-parameter

    def fit(self, X, y):
        """ Fit the boosting model.

        Parameters
        ----------
        X : { numpy.ndarray } of shape (n_samples, n_features)
            The input samples with dtype=np.float32.
        y : { numpy.ndarray } of shape (n_samples,)
            Target values. By default, the labels will be in {-1, +1}.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If X is empty or ``self.T`` is smaller than 1.

        Notes
        -----
        Boosting stops early when a round's weighted error reaches 0.5 or
        when a round classifies every sample correctly. If no round was
        accepted at all (the very first error is >= 0.5), the centroids of
        that round are kept as the model so that predict() stays usable.
        """
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("cannot fit BoostingClassifier on an empty training set")
        if self.T < 1:
            raise ValueError("T must be at least 1, got %r" % (self.T,))

        # flat view of the labels so the comparison below is element-wise
        labels = np.asarray(y).reshape(-1)
        w = np.full(n_samples, 1 / n_samples)  # initialize weights uniformly
        ensemble = []  # confidence-scaled centroids from each model
        confidence_factors = []
        means = None

        for t in range(self.T):
            print('Iteration %i:' % (t + 1))

            # run base learner algorithm
            means = centroids(X, y, w)
            predictions_t = basic_linear_classifier(means, X)

            # weighted error = total weight of the misclassified instances
            misclassified = np.asarray(predictions_t) != labels
            error_rate = np.sum(w[misclassified])
            print('Error rate = %s' % round(error_rate, 4))
            if error_rate >= 0.5:
                break

            # a perfect round has nothing left to up-weight: record it with
            # a large finite confidence and stop
            perfect = error_rate <= 0
            if perfect:
                error_rate = self._MIN_ERROR

            confidence = 0.5 * np.log((1 - error_rate) / error_rate)
            print('Alpha = %s' % round(confidence, 4))
            confidence_factors.append(confidence)
            ensemble.append(confidence * np.array(means))
            if perfect:
                break

            # weight adjustments: misclassified instances gain weight,
            # correctly classified ones lose weight
            mclf_weight = 1 / (2 * error_rate)
            cclf_weight = 1 / (2 * (1 - error_rate))
            w = np.where(misclassified, w * mclf_weight, w * cclf_weight)
            print('Factor to increase weights = %s' % round(mclf_weight, 4))
            print('Factor to decrease weights = %s' % round(cclf_weight, 4))

        if ensemble:
            # weighted average of the model ensemble becomes the final model
            self.M_x = np.sum(ensemble, axis=0) / sum(confidence_factors)
        else:
            # no round was accepted; fall back to the last fitted centroids
            # instead of dividing an empty sum by zero
            self.M_x = np.array(means)
        return self

    def predict(self, X):
        """ Predict binary class for X.

        Parameters
        ----------
        X : { numpy.ndarray } of shape (n_samples, n_features)

        Returns
        -------
        y_pred : { numpy.ndarray } of shape (n_samples)
            -1 or 1 per sample, chosen by the nearest of the two averaged
            centroids stored in ``self.M_x`` by fit().
        """
        return basic_linear_classifier(self.M_x, X)
#--------------------------------------------------------------------------------------------------
# Helper functions for fit()
# Function: Splits training data into two groups according to class label
# Helper for centroids()
# Output: 2 lists of [point, weight] pairs, one per class (label -1 first, every other label second)
# Note: train_data and train_labels are numpy arrays, passed in from fit()
# IMPORTANT: Altered to pair each point with its weight so centroids() can calculate weighted centroids
def class_arr(train_data, train_labels, weights):
    """Partition weighted training points into the two label groups.

    Parameters
    ----------
    train_data : sequence of points
    train_labels : sequence of labels, indexed in step with train_data
    weights : sequence of per-point weights, indexed in step with train_data

    Returns
    -------
    (negatives, others) : tuple of two lists
        Every entry is a two-element list ``[point, weight]``. Points whose
        label equals -1 land in the first list; any other label lands in
        the second.
    """
    negatives = []  # pairs labelled -1
    others = []  # pairs with any other label
    for idx, point in enumerate(train_data):
        bucket = negatives if train_labels[idx] == -1 else others
        bucket.append([point, weights[idx]])
    return negatives, others
# Input: 2 numpy arrays plus the per-point weights (the weights are applied inside this function)
# X = data points, Y = training labels, weights = one weight per data point
# Output: list of 2 centroids: [centroid of class -1, centroid of the other class]
# Function: calculates the WEIGHTED centroids for 2 classes
def centroids(X, Y, weights):
    """Compute the weighted centroid of each of the two classes.

    Parameters
    ----------
    X : data points
    Y : training labels, aligned with X
    weights : one weight per data point, aligned with X

    Returns
    -------
    centroid_arr : list of length 2
        ``[centroid of the label -1 group, centroid of the other group]``,
        each being sum(point * weight) / sum(weight) over its group.

    Raises
    ------
    ZeroDivisionError
        If a group has no points or its weights sum to zero, because its
        weighted mean is undefined.
    """
    classes = class_arr(X, Y, weights)  # point-weight pairs by class
    group_names = ("label -1", "other label")
    centroid_arr = []
    for group_name, members in zip(group_names, classes):
        sum_points = 0  # sum of weighted points in class
        sum_weights = 0  # sum of weights of points in class
        for p, w in members:
            sum_points += p * w
            sum_weights += w
        if sum_weights == 0:
            # fail loudly rather than returning a NaN centroid that would
            # silently poison every later distance computation
            raise ZeroDivisionError(
                "cannot compute weighted centroid for the %s class: "
                "it has no samples or zero total weight" % group_name
            )
        # weighted class mean/centroid
        centroid_arr.append(sum_points / sum_weights)
    return centroid_arr
# Input: centroids, test data set
# Output: a numpy array of the predicted class (-1 or 1) for every data point
# Function: calculates distance of point from each class centroid;
# minimum distance is predicted class (ties go to -1)
# - base learner for fit(), also used by predict()
def basic_linear_classifier(centroid_arr, test_data):
    """Assign every point to the class of its nearest centroid.

    Parameters
    ----------
    centroid_arr : sequence of two centroids
        ``centroid_arr[0]`` belongs to label -1, ``centroid_arr[1]`` to
        label 1.
    test_data : array-like of points, one point per entry along axis 0

    Returns
    -------
    predictions : numpy.ndarray of floats, one entry per point
        -1 where the point is at least as close to the first centroid as to
        the second (ties go to -1), otherwise 1. Empty input gives an empty
        array.
    """
    points = np.asarray(test_data)
    n_points = len(points)
    if n_points == 0:
        return np.array([])

    # one flattened feature row per point, so scalar and multi-dimensional
    # points are handled by the same row-wise norm
    points = points.reshape(n_points, -1)
    # centroid0, dist0 are for label -1
    centroid0 = np.asarray(centroid_arr[0]).reshape(-1)
    centroid1 = np.asarray(centroid_arr[1]).reshape(-1)

    # all distances in one vectorized pass instead of growing the result
    # with np.append, which copies the whole array on every call
    dist0 = np.linalg.norm(points - centroid0, axis=1)
    dist1 = np.linalg.norm(points - centroid1, axis=1)

    # min. distance = predicted class
    return np.where(dist0 <= dist1, -1.0, 1.0)
def evaluation_score(y_pred, y_test):
    """Score binary predictions against ground-truth labels.

    A prediction whose integer value is -1 counts as negative; anything
    else counts as positive. The confusion-matrix counts and the error rate
    are printed as a side effect.

    Parameters
    ----------
    y_pred : array-like of predicted labels
    y_test : array-like of ground-truth labels, same number of entries

    Returns
    -------
    (accuracy, precision, recall, f1, final_score) : tuple
        Each value is rounded to 4 decimals. f1 is computed from the already
        rounded precision and recall, and
        final_score = 50 * accuracy + 50 * f1. A ratio whose denominator is
        zero (including every ratio for empty input) is reported as 0.
    """
    # flatten both sides so column vectors and single-sample inputs compare
    # element by element
    y_pred = np.ravel(y_pred)
    y_test = np.ravel(y_test)
    assert y_pred.shape == y_test.shape, "Error: the shape of your prediction doesn't match the shape of ground truth label."

    predicted_negative = y_pred.astype(int) == -1
    correct = y_pred == y_test
    TP = int(np.count_nonzero(~predicted_negative & correct))  # true positive
    FN = int(np.count_nonzero(predicted_negative & ~correct))  # false negative
    TN = int(np.count_nonzero(predicted_negative & correct))  # true negative
    FP = int(np.count_nonzero(~predicted_negative & ~correct))  # false positive

    total = TP + FN + FP + TN
    # same zero-denominator convention for every ratio
    accuracy = round((TP + TN) / total if total > 0 else 0, 4)
    precision = round(TP / (TP + FP) if (TP + FP) > 0 else 0, 4)
    recall = round(TP / (TP + FN) if (TP + FN) > 0 else 0, 4)
    f1 = round(2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0, 4)
    final_score = round(50 * accuracy + 50 * f1, 4)
    error_rate = (FP + FN) / total if total > 0 else 0

    print("\nTesting:")
    print("TP: {}\nFP: {}\nTN: {}\nFN: {}\nError rate: {}".format(TP, FP, TN, FN, error_rate))
    return accuracy, precision, recall, f1, final_score


if __name__ == "__main__":
    import sys
    import os
    import numpy as np
    import time

    # evaluation on your local machine only
    dataset_dir = 'dataset1'
    train_set = os.path.join(dataset_dir, 'train.npy')
    test_set = os.path.join(dataset_dir, 'test.npy')

    # load dataset: each file stores the samples first, then the labels
    with open(train_set, 'rb') as f:
        X_train = np.load(f)
        y_train = np.load(f)
    with open(test_set, 'rb') as f:
        X_test = np.load(f)
        y_test = np.load(f)

    clf = BoostingClassifier().fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc, precision, recall, f1, final_score = evaluation_score(y_pred, y_test)
    print("Accuracy: {}, F-measure: {}, Precision: {}, Recall: {}, Final_Score: {}".format(acc, f1, precision, recall, final_score))