
Commit eb74ee4

Authored Mar 16, 2023
Add files via upload
1 parent 379f7db commit eb74ee4


2 files changed, +184 -0 lines changed

 
+105
@@ -0,0 +1,105 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, confusion_matrix, precision_score
import time

data = pd.read_csv(r'E:\Datasets\pima_indians_diabetes.csv')  # raw string so the backslashes in the Windows path are not treated as escape sequences
print('The original dataset is: \n', data)

columns = np.array(['No_of_times_Pregnant', 'Plasma_Glucose_Conc.', 'Lower_Blood_Pressure', 'Triceps_Skinfold_Thickness', 'Serum_Insulin', 'BMI', 'DPF', 'Age', 'Diabetic'])
data.columns = columns
# columns = list(data.columns)
time.sleep(2)
print('\n Number of Zeroes in Each Column')
print('-------------------------------------------------')
for i in columns:
    print('{:25s} \t: {:6s}'.format(i, str(data[data[i] == 0].shape[0])))
print()

time.sleep(1)
for i in columns:
    data[i].plot.density()
    plt.title(f'{i} (Original) Density Plot')
    plt.xlabel(f'{i}')
    plt.show()
time.sleep(1.9)
print("Those were a few density plots for each column of the data.\n")
# data1 = data.copy()
# columns1 = np.array(data1.columns)
# for rows in range(data1.shape[0]):
#     for cols in columns1[1:6]:
#         if (data1[cols][rows] == 0):
#             data1[cols][rows] = np.median(data1[cols])

# The commented-out snippet above replaces all the 0 values of a column with that column's overall median, regardless of whether a patient is diabetic or not,
# so it does not take into account that diabetic patients may have different feature values (for the chosen columns 2, 3, 4, and 5) than non-diabetic patients.
# Instead, the medians of the diabetic and non-diabetic groups are computed separately below, and the zeroes are imputed with the corresponding group median.
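
# --- Editor's aside (illustrative only, not part of the original assignment) ---
# The same per-class median imputation can also be written without explicit loops using pandas groupby/transform.
# The sketch below assumes the same 'Diabetic' label and the same columns 1-5 chosen above; it works on a separate
# copy (data_alt), so it does not interfere with the loop-based version that follows.
cols_to_impute = list(columns[1:6])
data_alt = data.copy()
masked = data_alt[cols_to_impute].replace(0, np.nan)                       # treat zeroes as missing values
group_medians = masked.groupby(data_alt['Diabetic']).transform('median')   # per-class medians, aligned row by row
data_alt[cols_to_impute] = masked.fillna(group_medians)                    # fill each missing cell with its class median
# -------------------------------------------------------------------------------
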
imputation_matrix = {}
for i in columns:
    # In the line below, using bitwise AND (i.e. &) gives the correct answer, whereas using logical AND (i.e. 'and') raises a ValueError saying that the truth value of a Series is ambiguous
    imputation_matrix.update({i: [data[(data['Diabetic'] == 0) & (data[i] != 0)][i].median(), data[(data['Diabetic'] == 1) & (data[i] != 0)][i].median()]})  # map each column to its [non-diabetic, diabetic] medians, computed with the zeroes excluded
time.sleep(1)
# print(f"The medians of the Plasma_Glucose_Conc. column for non-diabetic persons is {imputation_matrix['Plasma_Glucose_Conc.'][0]} and for diabetic persons is {imputation_matrix['Plasma_Glucose_Conc.'][1]}")
time.sleep(1)
print(" Median values to be imputed for each column's zeroes")
print('---------------------------------------------------------')
print('Column\t\t\t Non-Diabetic Diabetic')
for i in imputation_matrix:
    print('{:27s}'.format(i), ' {:10s}\t {:10s}'.format(str(imputation_matrix[i][0]), str(imputation_matrix[i][1])))

for cols in columns[1:6]:
    temp_arr = []
    for i in range(data.shape[0]):
        if (data['Diabetic'][i] == 0) and (data[cols][i] == 0):
            temp_arr.append(imputation_matrix[cols][0])
        elif (data['Diabetic'][i] == 1) and (data[cols][i] == 0):
            temp_arr.append(imputation_matrix[cols][1])
        else:
            temp_arr.append(data[cols][i])
    data[cols] = np.array(temp_arr)
time.sleep(2.12)
print()
print('The modified (median-filled) dataset is: \n', data)
time.sleep(2.31)
for i in columns:
    data[i].plot.density()
    plt.title(f'{i} (Modified) Density Plot')
    plt.xlabel(f'{i}')
    plt.show()
time.sleep(2.21)
# Phew! Now onto actually fitting the model to this modified data...
X_train, X_test, y_train, y_test = train_test_split(data[columns[0:7]], data['Diabetic'], test_size=0.30, random_state=1748, stratify=data['Diabetic'])  # random_state=1748 gives the max accuracy for this particular split; see the cross-validation aside at the end of this file
print('X_train shape is: ', X_train.shape)
print('X_test shape is: ', X_test.shape)
print('y_train shape is: ', y_train.shape)
print('y_test shape is: ', y_test.shape)

print("\nTraining the model...\n")
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

time.sleep(3.45)
print(f"The accuracy of the model is: {100*accuracy_score(y_test, y_pred)}%")
print(f"The precision of the model is: {100*precision_score(y_test, y_pred)}%")
print(f"The recall of the model is: {100*recall_score(y_test, y_pred)}%")
print(f"The f1-score of the model is: {100*f1_score(y_test, y_pred)}%"); time.sleep(2.38)
print('\nOur model has misclassified a total of %d examples out of %d in the test-set.\n' % ((y_pred != y_test).sum(), X_test.shape[0]))

time.sleep(2)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(data=cm, cmap='Greys_r', annot=True, linewidths=4.12, linecolor='g')  # linewidths (plural) is the documented seaborn parameter
plt.title("Confusion Matrix for Evaluation of our Model's Performance")
plt.show()
time.sleep(0.92)
print('The confusion matrix is: \n', cm)

time.sleep(2.1)
print('-------------------- CLASSIFICATION REPORT ------------------------------')
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr, end="")
print('-------------------------------------------------------------------------')
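
# --- Editor's aside (illustrative only, not part of the original assignment) ---
# random_state=1748 was chosen above because it maximises the test accuracy, so the single train/test split may give an
# optimistic estimate. A k-fold cross-validated score, sketched here with scikit-learn's cross_val_score on the same
# features and label, is less dependent on any one particular split.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(GaussianNB(), data[columns[0:7]], data['Diabetic'], cv=5, scoring='accuracy')
print(f"5-fold cross-validated accuracy: {100*cv_scores.mean():.2f}% (+/- {100*cv_scores.std():.2f}%)")
# -------------------------------------------------------------------------------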

ML_Assignment_on_KNN_120AD0015.py

+79
@@ -0,0 +1,79 @@
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier

dataset = pd.read_csv(r'E:\Datasets\iris.csv')  # raw string so the backslashes in the Windows path are not treated as escape sequences
print("The first few rows of the original dataset are: \n", dataset.head())
X = dataset.iloc[:, [0, 1, 2, 3]]
Y = dataset.iloc[:, 4]

min_max_scaler = MinMaxScaler()
X = min_max_scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.6, shuffle=True, stratify=dataset['species'], random_state=10)
X_cross_val, X_val, y_cross_val, y_val = train_test_split(X_test, y_test, test_size=0.5, shuffle=True, stratify=y_test, random_state=15)
print(pd.DataFrame(y_val).value_counts('species'))
print(pd.DataFrame(y_train).value_counts('species'))
print(pd.DataFrame(y_cross_val).value_counts('species'))
print(X_train.shape)
print(X_test.shape)
print(X_cross_val.shape)
print(X_val.shape)

print("The first few rows of the scaled x-training set are: \n", pd.DataFrame(X_train).head())
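
# --- Editor's aside (illustrative only, not part of the original assignment) ---
# MinMaxScaler was fitted on the full dataset before splitting, so the test rows influence the scaling. A leakage-free
# variant, sketched here with new variable names so the flow above stays untouched, fits the scaler on the training
# portion only and reuses it for the held-out portions.
X_raw = dataset.iloc[:, [0, 1, 2, 3]]
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X_raw, Y, train_size=0.6, shuffle=True, stratify=dataset['species'], random_state=10)
scaler_no_leak = MinMaxScaler().fit(X_train_raw)        # scaling statistics come from the training rows only
X_train_noleak = scaler_no_leak.transform(X_train_raw)
X_test_noleak = scaler_no_leak.transform(X_test_raw)
# -------------------------------------------------------------------------------
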
# Building a kNN Model and making predictions...
knn_classifier = KNeighborsClassifier(n_neighbors=10, metric='minkowski', p=2, weights='distance')  # metric='euclidean' gives the same result
knn_classifier.fit(X_train, y_train)
y_pred_train = knn_classifier.predict(X_train)
y_pred_cross_val = knn_classifier.predict(X_cross_val)
y_pred_test = knn_classifier.predict(X_val)

print(y_pred_train.shape)
print(y_pred_cross_val.shape)
print(y_pred_test.shape)

# Printing the Confusion Matrices and Accuracies for Training, Cross-validation, and Testing respectively...
print(f'The confusion matrix for the training set is: \n{confusion_matrix(y_pred=y_pred_train, y_true=y_train)}')
print(f"Training accuracy: {100*accuracy_score(y_true=y_train, y_pred=y_pred_train)}%")
print(f'The confusion matrix for the cross-validation set is: \n{confusion_matrix(y_pred=y_pred_cross_val, y_true=y_cross_val)}')
print(f"Cross-validation accuracy: {100*accuracy_score(y_true=y_cross_val, y_pred=y_pred_cross_val)}%")
print(f'The confusion matrix for the testing set is: \n{confusion_matrix(y_pred=y_pred_test, y_true=y_val)}')
print(f"Testing accuracy: {100*accuracy_score(y_true=y_val, y_pred=y_pred_test)}%")

# Printing the Precisions for Training, Cross-validation, and Testing respectively...
print(f"Training Precision is: {100*precision_score(y_true=y_train, y_pred=y_pred_train, average=None)}")
print(f"Cross-validation Precision is: {100*precision_score(y_true=y_cross_val, y_pred=y_pred_cross_val, average=None)}")
print(f"Testing Precision is: {100*precision_score(y_true=y_val, y_pred=y_pred_test, average=None)}")

# Printing the Recalls for Training, Cross-validation, and Testing respectively...
print(f"Training Recall is: {100*recall_score(y_true=y_train, y_pred=y_pred_train, average=None)}")
print(f"Cross-validation Recall is: {100*recall_score(y_true=y_cross_val, y_pred=y_pred_cross_val, average=None)}")
print(f"Testing Recall is: {100*recall_score(y_true=y_val, y_pred=y_pred_test, average=None)}")

# Printing the F1-scores for Training, Cross-validation, and Testing respectively...
print(f"Training F1-score is: {100*f1_score(y_true=y_train, y_pred=y_pred_train, average=None)}")
print(f"Cross-validation F1-score is: {100*f1_score(y_true=y_cross_val, y_pred=y_pred_cross_val, average=None)}")
print(f"Testing F1-score is: {100*f1_score(y_true=y_val, y_pred=y_pred_test, average=None)}")
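
# --- Editor's aside (illustrative only, not part of the original assignment) ---
# n_neighbors was fixed at 10 above. The held-out cross-validation split (X_cross_val, y_cross_val) can be used to
# compare a few candidate values of k before settling on one; this sketch does not alter the classifier used elsewhere.
for k in [1, 3, 5, 7, 10, 15]:
    knn_k = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2, weights='distance')
    knn_k.fit(X_train, y_train)
    acc_k = accuracy_score(y_true=y_cross_val, y_pred=knn_k.predict(X_cross_val))
    print(f"k = {k:2d} -> cross-validation accuracy: {100*acc_k:.2f}%")
# -------------------------------------------------------------------------------
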
# Visualizing our model's performance...
# (keyword arguments ensure the heatmap rows are the true classes and the columns are the predictions)
plt.title('Training data confusion matrix')
sns.heatmap(confusion_matrix(y_true=y_train, y_pred=knn_classifier.predict(X_train)), cmap='Greys_r', annot=True, xticklabels=dataset['species'].unique(), yticklabels=dataset['species'].unique(), cbar=False)
plt.show()

plt.title('Testing data confusion matrix')
sns.heatmap(confusion_matrix(y_true=y_val, y_pred=knn_classifier.predict(X_val)), cmap='Greys_r', annot=True, xticklabels=dataset['species'].unique(), yticklabels=dataset['species'].unique(), cbar=False)
plt.show()

plt.title('Cross-validation data confusion matrix')
sns.heatmap(confusion_matrix(y_true=y_cross_val, y_pred=knn_classifier.predict(X_cross_val)), cmap='Greys_r', annot=True, xticklabels=dataset['species'].unique(), yticklabels=dataset['species'].unique(), cbar=False)
plt.show()
# from sklearn.metrics import plot_confusion_matrix
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y1 = float(le.fit_transform(y_cross_val))
# plot_confusion_matrix(knn_classifier, knn_classifier.predict(X_cross_val), y1)
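
# --- Editor's aside (illustrative only, not part of the original assignment) ---
# The commented-out block above appears to aim at scikit-learn's built-in confusion-matrix plot. plot_confusion_matrix
# has been removed from recent scikit-learn releases; assuming scikit-learn >= 1.0, ConfusionMatrixDisplay draws the
# same plot directly from the fitted estimator and the raw string labels, with no LabelEncoder needed.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(knn_classifier, X_cross_val, y_cross_val, cmap='Greys_r')
plt.title('Cross-validation data confusion matrix (ConfusionMatrixDisplay)')
plt.show()
# -------------------------------------------------------------------------------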

0 commit comments
