Skip to content

Commit 0843e36

Browse files
committed
logistic regression
1 parent 3c170b6 commit 0843e36

File tree

3 files changed

+554
-0
lines changed

3 files changed

+554
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
"""
2+
Created on Wed Sep 09 12:38:16 2015
3+
@author: ujjwal.karn
4+
"""
5+
6+
import pandas as pd #for handling datasets
7+
import statsmodels.api as sm #for statistical modeling
8+
import pylab as pl #for plotting
9+
import numpy as np #for numerical computation
10+
11+
# read the data in
12+
dfTrain = pd.read_csv("C:\\Users\\ujjwal.karn\\Desktop\\Python\\train.csv")
13+
dfTest = pd.read_csv("C:\\Users\\ujjwal.karn\\Desktop\\Python\\test.csv")
14+
15+
# take a look at the dataset
16+
print dfTrain.head()
17+
# admit gre gpa prestige
18+
#0 0 380 3.61 good
19+
#1 1 660 3.67 good
20+
#2 1 800 4.00 best
21+
#3 1 640 3.19 ok
22+
#4 0 520 2.93 ok
23+
24+
print dfTest.head()
25+
# gre gpa prestige
26+
#0 640 3.30 veryGood
27+
#1 660 3.60 good
28+
#2 400 3.15 veryGood
29+
#3 680 3.98 veryGood
30+
#4 220 2.83 good
31+
32+
33+
# summarize the data
34+
print dfTrain.describe()
35+
# admit gre gpa
36+
#count 300.000000 300.000000 300.000000
37+
#mean 0.306667 590.866667 3.386233
38+
#std 0.461880 117.717630 0.374880
39+
#min 0.000000 300.000000 2.260000
40+
#25% 0.000000 515.000000 3.130000
41+
#50% 0.000000 600.000000 3.390000
42+
#75% 1.000000 680.000000 3.642500
43+
#max 1.000000 800.000000 4.000000
44+
45+
# take a look at the standard deviation of each column
46+
print dfTrain.std()
47+
#admit 0.46188
48+
#gre 117.71763
49+
#gpa 0.37488
50+
51+
# frequency table cross-tabulating prestige against whether or not someone was admitted
52+
# label the row axis 'admit' (was misspelled 'dmit') so the printed table
# header matches the column being tabulated
print(pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit']))
53+
#prestige best good ok veryGood
54+
#admit
55+
#0 20 73 47 68
56+
#1 25 19 9 39
57+
58+
#explore data
59+
# average GRE and GPA per admission outcome; the numeric columns are selected
# explicitly so the non-numeric 'prestige' column is never averaged, and the
# result is printed so it is visible when the file is run as a script
print(dfTrain.groupby('admit')[['gre', 'gpa']].mean())
60+
# gre gpa
61+
#admit
62+
#0 573.461538 3.336587
63+
#1 630.217391 3.498478
64+
65+
# plot one column
66+
dfTrain['gpa'].hist()
67+
pl.title('Histogram of GPA')
68+
pl.xlabel('GPA')
69+
pl.ylabel('Frequency')
70+
pl.show()
71+
72+
# barplot of gre score grouped by admission status (True or False)
73+
pd.crosstab(dfTrain.gre, dfTrain.admit.astype(bool)).plot(kind='bar')
74+
pl.title('GRE score by Admission Status')
75+
pl.xlabel('GRE score')
76+
pl.ylabel('Frequency')
77+
pl.show()
78+
79+
# dummify prestige
80+
dummy_ranks = pd.get_dummies(dfTrain['prestige'], prefix='prestige')
81+
print dummy_ranks.head()
82+
# prestige_best prestige_good prestige_ok prestige_veryGood
83+
#0 0 1 0 0
84+
#1 0 1 0 0
85+
#2 1 0 0 0
86+
#3 0 0 1 0
87+
#4 0 0 1 0
88+
89+
# create a clean data frame for the regression
90+
cols_to_keep = ['admit', 'gre', 'gpa']
91+
# keep the dummy columns from 'prestige_good' onward, i.e. drop the first one
# (prestige_best) so it acts as the baseline category; .loc replaces the
# deprecated .ix indexer for this label-based slice
data = dfTrain[cols_to_keep].join(dummy_ranks.loc[:, 'prestige_good':])
92+
print data.head()
93+
# admit gre gpa prestige_good prestige_ok prestige_veryGood
94+
#0 0 380 3.61 1 0 0
95+
#1 1 660 3.67 1 0 0
96+
#2 1 800 4.00 0 0 0
97+
#3 1 640 3.19 0 1 0
98+
#4 0 520 2.93 0 1 0
99+
100+
# manually add the intercept
101+
data['intercept'] = 1.0
102+
103+
print data.head()
104+
105+
train_cols = data.columns[1:]
106+
# show the predictor columns (every column after 'admit') that feed the model
print(train_cols)
107+
# Index([u'gre', u'gpa', u'prestige_good', u'prestige_ok', u'prestige_veryGood', u'intercept'], dtype='object')
108+
109+
#Logistic Regression
110+
logit = sm.Logit(data['admit'], data[train_cols])
111+
112+
# fit the model
113+
result = logit.fit()
114+
print result.summary()
115+
116+
# recreate the dummy variables
117+
dummy_ranks_test = pd.get_dummies(dfTest['prestige'], prefix='prestige')
118+
print dummy_ranks_test
119+
120+
#create intercept column
121+
dfTest['intercept'] = 1.0
122+
123+
# keep only what we need for making predictions
124+
cols_to_keep = ['gre', 'gpa', 'prestige', 'intercept']
125+
# attach the same dummy columns that were used for training (prestige_best
# dropped); .loc replaces the deprecated .ix indexer for this label-based slice
dfTest = dfTest[cols_to_keep].join(dummy_ranks_test.loc[:, 'prestige_good':])
126+
127+
# preview the prepared test frame; a bare expression shows nothing when the
# file is run as a script, so print it explicitly
print(dfTest.head())
128+
# make predictions on the test dataset
129+
dfTest['admit_pred'] = result.predict(dfTest[train_cols])
130+
131+
#see probabilities
132+
print dfTest.head()
133+
134+
#convert probabilities to 'yes' 'no'
135+
dfTest['admit_yn']= np.where(dfTest['admit_pred'] > 0.5,'yes','no')
136+
print dfTest.head()
137+
138+
# average GRE and GPA of the test rows, grouped by the predicted yes/no label.
# The summary is computed once and printed so it is visible when the file is
# run as a script (it was previously computed twice and discarded both times).
cols = ['gre', 'gpa', 'admit_yn']
print(dfTest[cols].groupby('admit_yn').mean())
#                 gre       gpa
#admit_yn
#no        556.585366  3.324268
#yes       676.666667  3.750000
151+
152+
dfTest.to_csv('C:\\Users\\ujjwal.karn\\Desktop\\Python\\output.csv', sep=',')
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
gre,gpa,prestige
2+
640,3.3,veryGood
3+
660,3.6,good
4+
400,3.15,veryGood
5+
680,3.98,veryGood
6+
220,2.83,good
7+
580,3.46,ok
8+
540,3.17,best
9+
580,3.51,veryGood
10+
540,3.13,veryGood
11+
440,2.98,good
12+
560,4,good
13+
660,3.67,veryGood
14+
660,3.77,good
15+
520,3.65,ok
16+
540,3.46,ok
17+
300,2.84,veryGood
18+
340,3,veryGood
19+
780,3.63,ok
20+
480,3.71,ok
21+
540,3.28,best
22+
460,3.14,good
23+
460,3.58,veryGood
24+
500,3.01,ok
25+
420,2.69,veryGood
26+
520,2.7,good
27+
680,3.9,best
28+
680,3.31,veryGood
29+
560,3.48,veryGood
30+
580,3.34,veryGood
31+
500,2.93,ok
32+
740,4,good
33+
660,3.59,good
34+
420,2.96,best
35+
560,3.43,good
36+
460,3.64,good
37+
620,3.71,best
38+
520,3.15,good
39+
620,3.09,ok
40+
540,3.2,best
41+
660,3.47,good
42+
500,3.23,ok
43+
560,2.65,good
44+
500,3.95,ok
45+
580,3.06,veryGood
46+
520,3.35,good
47+
500,3.03,good
48+
600,3.35,veryGood
49+
580,3.8,veryGood
50+
400,3.36,veryGood
51+
620,2.85,veryGood
52+
780,4,veryGood
53+
620,3.43,good
54+
580,3.12,good
55+
700,3.52,veryGood
56+
540,3.78,veryGood
57+
760,2.81,best
58+
700,3.27,veryGood
59+
720,3.31,best
60+
560,3.69,good
61+
720,3.94,good
62+
520,4,best
63+
540,3.49,best
64+
680,3.14,veryGood
65+
460,3.44,veryGood
66+
560,3.36,best
67+
480,2.78,good
68+
460,2.93,good
69+
620,3.63,good
70+
580,4,best
71+
800,3.89,veryGood
72+
540,3.77,veryGood
73+
680,3.76,good
74+
680,2.42,best
75+
620,3.37,best
76+
560,3.78,veryGood
77+
560,3.49,ok
78+
620,3.63,veryGood
79+
800,4,veryGood
80+
640,3.12,good
81+
540,2.7,veryGood
82+
700,3.65,veryGood
83+
540,3.49,veryGood
84+
540,3.51,veryGood
85+
660,4,best
86+
480,2.62,veryGood
87+
420,3.02,best
88+
740,3.86,veryGood
89+
580,3.36,veryGood
90+
640,3.17,veryGood
91+
640,3.51,veryGood
92+
800,3.05,veryGood
93+
660,3.88,veryGood
94+
600,3.38,good
95+
620,3.75,veryGood
96+
460,3.99,good
97+
620,4,veryGood
98+
560,3.04,good
99+
460,2.63,veryGood
100+
700,3.65,veryGood
101+
600,3.89,good

0 commit comments

Comments
 (0)