forked from BoulderDataScience/kaggle-santander
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubmission_knn.py
78 lines (63 loc) · 2.37 KB
/
submission_knn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
from statsmodels.distributions import ECDF
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.covariance import EmpiricalCovariance
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from santander.preprocessing import ColumnDropper
from santander.preprocessing import ZERO_VARIANCE_COLUMNS, CORRELATED_COLUMNS
# --- run configuration ---
# Path the submission CSV is written to.
filename = 'submission_knn.csv'
# When true, predictions for test rows with var15 < 23 are forced to 0.
heuristic_correction = True
# When true, the k-NN classifier is wrapped in a BaggingClassifier.
bag = True

# --- preprocessing ---
# Steps run in order: drop the listed columns, standardise the remaining
# features, then project with PCA.
_dropped_columns = ZERO_VARIANCE_COLUMNS + CORRELATED_COLUMNS
_preprocessing_steps = [
    ('cd', ColumnDropper(drop=_dropped_columns)),
    ('std', StandardScaler()),
    # param from cv experiments; per the scikit-learn docs a fractional
    # n_components selects components by explained-variance ratio.
    ('pca', PCA(n_components=0.6)),
]
pipeline = Pipeline(_preprocessing_steps)
# Training data: split off the label column, then discard the row id.
df_train = pd.read_csv('data/train.csv')
df_target = df_train.pop('TARGET')
df_train = df_train.drop(['ID'], axis=1)

# Test data: the ids are kept aside because the submission file needs them.
df_test = pd.read_csv('data/test.csv')
df_id = df_test.pop('ID')

# save for heuristic correction (raw var15 of the test rows, captured
# before any preprocessing is applied)
age = df_test['var15']

# Disabled experiment: replace var15 by its empirical CDF value, with the
# ECDF estimated on the training frame.
# age_ecdf = ECDF(df_train['var15'])
# df_train['var15'] = age_ecdf(df_train['var15'])
# df_test['var15'] = age_ecdf(df_test['var15'])
# feature engineering
# The same two steps are applied to the train and the test frame:
#   1. var3 == -999999 (presumably a missing-value placeholder -- confirm
#      against the data dictionary) is replaced by 2.0;
#   2. num_zeros is added: the number of zero-valued cells in each row,
#      counted after step 1 and before the new column exists.
# Each frame builds the replacement mask from its OWN var3 column.  The
# previous code masked df_test with df_train['var3']; pandas aligns a
# boolean Series indexer on index labels, so that overwrote the test rows
# whose labels matched flagged train rows and could leave the test frame's
# own -999999 values in place.
for _frame in (df_train, df_test):
    _frame.loc[_frame['var3'] == -999999.000000, 'var3'] = 2.0
    _frame['num_zeros'] = (_frame == 0).sum(axis=1)
# outliers
# Fit an empirical covariance estimate on the training features and score
# every training row with ec.mahalanobis (squared Mahalanobis distances per
# the scikit-learn docs).  Rows scoring 40000 or more are dropped from the
# features and the labels with the same mask, so the two stay aligned.
ec = EmpiricalCovariance().fit(df_train)
m2 = ec.mahalanobis(df_train)
_inlier_mask = m2 < 40000
df_train = df_train.loc[_inlier_mask]
df_target = df_target.loc[_inlier_mask]
# clip (disabled experiment: bound each test column by the train min/max)
# df_test = df_test.clip(df_train.min(), df_train.max(), axis=1)

# standard pipeline
# The preprocessing is fitted on the training frame only and then applied
# unchanged to both frames.  Fit and transform stay separate calls so the
# training matrix goes through exactly the same transform path as the
# test matrix.
pipeline.fit(df_train)
y_train, ID_test = df_target, df_id
X_train = pipeline.transform(df_train)
X_test = pipeline.transform(df_test)
# params from cv experiments
# Either a bagged ensemble of k-NN classifiers (each estimator fitted on 1%
# of the rows and 90% of the features) or a single k-NN classifier.
if bag:
    knn = BaggingClassifier(
        KNeighborsClassifier(n_jobs=-1),
        max_samples=0.01,
        max_features=0.9,
        n_estimators=250,
        random_state=0,
    )
else:
    knn = KNeighborsClassifier(n_jobs=-1)
knn = knn.fit(X_train, y_train)

# NOTE: this AUC is computed on the same rows the model was fitted on, so
# it is an optimistic figure, not a hold-out estimate.
# print is written in the parenthesised single-argument form so the script
# parses on Python 3 and still behaves identically on Python 2.
train_auc = roc_auc_score(y_train, knn.predict_proba(X_train)[:, -1])
print('Final AUC: %f' % train_auc)

# Last predict_proba column = probability of the last class in
# knn.classes_ (presumably TARGET == 1 -- confirm the label encoding).
y_pred = knn.predict_proba(X_test)[:, -1]
if heuristic_correction:
    # Force predictions to 0 where the raw test var15 is below 23.  The
    # mask is converted to a plain boolean array so the NumPy assignment
    # is purely positional and does not depend on the Series index.
    y_pred[(age < 23).values] = 0

submission = pd.DataFrame({'ID': ID_test, 'TARGET': y_pred})
submission.to_csv(filename, index=False)
print('Wrote %s' % filename)