|
| 1 | +""" |
| 2 | +Created on Wed Sep 09 12:38:16 2015 |
| 3 | +@author: ujjwal.karn |
| 4 | +""" |
| 5 | + |
| 6 | +import pandas as pd #for handling datasets |
| 7 | +import statsmodels.api as sm #for statistical modeling |
| 8 | +import pylab as pl #for plotting |
| 9 | +import numpy as np #for numerical computation |
| 10 | + |
# Read the training and test sets.
# NOTE: these are absolute, machine-specific paths; adjust them before
# running the script anywhere else.
dfTrain = pd.read_csv("C:\\Users\\ujjwal.karn\\Desktop\\Python\\train.csv")
dfTest = pd.read_csv("C:\\Users\\ujjwal.karn\\Desktop\\Python\\test.csv")

# Take a look at the first rows of the training set.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a syntax error on Python 3.
print(dfTrain.head())
# admit gre gpa prestige
#0 0 380 3.61 good
#1 1 660 3.67 good
#2 1 800 4.00 best
#3 1 640 3.19 ok
#4 0 520 2.93 ok

# The test set has the same predictors but no `admit` column.
print(dfTest.head())
# gre gpa prestige
#0 640 3.30 veryGood
#1 660 3.60 good
#2 400 3.15 veryGood
#3 680 3.98 veryGood
#4 220 2.83 good
| 31 | + |
| 32 | + |
# Summarize the numeric columns of the training data.
print(dfTrain.describe())
# admit gre gpa
#count 300.000000 300.000000 300.000000
#mean 0.306667 590.866667 3.386233
#std 0.461880 117.717630 0.374880
#min 0.000000 300.000000 2.260000
#25% 0.000000 515.000000 3.130000
#50% 0.000000 600.000000 3.390000
#75% 1.000000 680.000000 3.642500
#max 1.000000 800.000000 4.000000

# Standard deviation of each numeric column. The text column `prestige` is
# excluded explicitly: recent pandas releases raise on non-numeric columns
# instead of silently dropping them.
numeric_cols = ['admit', 'gre', 'gpa']
print(dfTrain[numeric_cols].std())
#admit 0.46188
#gre 117.71763
#gpa 0.37488

# Frequency table of prestige against whether or not someone was admitted.
# The row label is 'admit', matching the output shown below.
print(pd.crosstab(dfTrain['admit'], dfTrain['prestige'], rownames=['admit']))
#prestige best good ok veryGood
#admit
#0 20 73 47 68
#1 25 19 9 39

# Mean GRE and GPA per admission outcome. The result is printed so that it
# is actually shown when the file runs as a script.
print(dfTrain[numeric_cols].groupby('admit').mean())
# gre gpa
#admit
#0 573.461538 3.336587
#1 630.217391 3.498478
| 64 | + |
# Histogram of the GPA column.
gpa_values = dfTrain['gpa']
gpa_values.hist()
pl.title('Histogram of GPA')
pl.xlabel('GPA')
pl.ylabel('Frequency')
pl.show()

# Bar chart of GRE score counts, split by admission status (True or False).
admitted_flag = dfTrain.admit.astype(bool)
gre_by_admission = pd.crosstab(dfTrain.gre, admitted_flag)
gre_by_admission.plot(kind='bar')
pl.title('GRE score by Admission Status')
pl.xlabel('GRE score')
pl.ylabel('Frequency')
pl.show()
| 78 | + |
# Dummy-encode prestige: one indicator column per prestige level.
# The cast to int keeps the indicators numeric whatever dtype the installed
# pandas uses for dummies (newer releases return booleans, which would turn
# the mixed design matrix into an object array).
dummy_ranks = pd.get_dummies(dfTrain['prestige'], prefix='prestige').astype(int)
print(dummy_ranks.head())
# prestige_best prestige_good prestige_ok prestige_veryGood
#0 0 1 0 0
#1 0 1 0 0
#2 1 0 0 0
#3 0 0 1 0
#4 0 0 1 0

# Create a clean data frame for the regression. The label slice starts at
# 'prestige_good', so the first dummy column (prestige_best) is left out and
# serves as the baseline level.
# .loc is used for the label slice; the older .ix indexer no longer exists
# in current pandas.
cols_to_keep = ['admit', 'gre', 'gpa']
data = dfTrain[cols_to_keep].join(dummy_ranks.loc[:, 'prestige_good':])
print(data.head())
# admit gre gpa prestige_good prestige_ok prestige_veryGood
#0 0 380 3.61 1 0 0
#1 1 660 3.67 1 0 0
#2 1 800 4.00 0 0 0
#3 1 640 3.19 0 1 0
#4 0 520 2.93 0 1 0

# Manually add the intercept column.
data['intercept'] = 1.0

print(data.head())

# Every column except the first one (`admit`, the response) is a predictor.
train_cols = data.columns[1:]
print(train_cols)
# Index([u'gre', u'gpa', u'prestige_good', u'prestige_ok', u'prestige_veryGood', u'intercept'], dtype='object')
| 108 | + |
# Logistic regression of `admit` on the predictor columns.
logit = sm.Logit(data['admit'], data[train_cols])

# Fit the model and print the summary table.
# print(...) with a single argument works on both Python 2 and Python 3.
result = logit.fit()
print(result.summary())
| 115 | + |
# Recreate the dummy variables for the test set, cast to int so they stay
# numeric whatever dtype the installed pandas uses for dummies.
dummy_ranks_test = pd.get_dummies(dfTest['prestige'], prefix='prestige').astype(int)
print(dummy_ranks_test)

# Create the intercept column.
dfTest['intercept'] = 1.0

# Keep only what we need for making predictions, plus the dummy columns from
# 'prestige_good' onward.
# NOTE(review): this assumes the test set contains every prestige level that
# train_cols refers to; a missing level would make the column selection
# below fail.
cols_to_keep = ['gre', 'gpa', 'prestige', 'intercept']
dfTest = dfTest[cols_to_keep].join(dummy_ranks_test.loc[:, 'prestige_good':])

# Predict on the test set. Selecting train_cols passes exactly the columns
# named there, in that order.
dfTest['admit_pred'] = result.predict(dfTest[train_cols])

# See the predicted probabilities.
print(dfTest.head())

# Convert probabilities to 'yes' / 'no' using a 0.5 cut-off.
dfTest['admit_yn'] = np.where(dfTest['admit_pred'] > 0.5, 'yes', 'no')
print(dfTest.head())
| 137 | + |
# Mean GRE and GPA for each predicted outcome. The result is printed so that
# it is actually shown when the file runs as a script.
cols = ['gre', 'gpa', 'admit_yn']
print(dfTest[cols].groupby('admit_yn').mean())
# gre gpa
#admit_yn
#no 556.585366 3.324268
#yes 676.666667 3.750000

# Write the test set with its predictions to disk.
# NOTE: absolute, machine-specific path; adjust before running elsewhere.
dfTest.to_csv('C:\\Users\\ujjwal.karn\\Desktop\\Python\\output.csv', sep=',')
0 commit comments