-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
130 lines (91 loc) · 3.54 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Start with testing code from https://thecleverprogrammer.com/2021/02/19/text-emotions-detection-with-machine-learning/
import re
import pickle
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
import joblib
def read_data(file):
    """Read labelled instances from *file*.

    Each non-empty line is expected to look like:
        [ 1. 0. 0. 0. 0. 0. 0.]  some text here
    i.e. a bracketed label vector followed by the instance text.

    Returns a list of [label, text] pairs, where label is the
    whitespace-normalized contents of the leading [...] vector.
    """
    data = []
    with open(file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Robustness: a blank line would otherwise yield a bogus
                # ["", ""] row (find("]") returns -1 on it).
                continue
            close = line.find("]")  # hoisted: used for both label and text
            label = ' '.join(line[1:close].strip().split())
            text = line[close + 1:].strip()
            data.append([label, text])
    return data
print("\n\n***** Reading Training Data *****\n\n")
file = 'text.txt'
data = read_data(file)
print("Number of instances: {}".format(len(data)))
def ngram(token, n):
    """Return all n-grams of the token list as space-joined strings.

    Slides a window of width *n* across *token*; each window becomes one
    feature string.  Returns [] when len(token) < n.
    """
    # The original accumulated into a local also named `ngram`, shadowing
    # this function; a comprehension avoids both the shadowing and the
    # manual append loop.
    return [' '.join(token[j:j + n]) for j in range(len(token) - n + 1)]
def create_feature(text, nrange=(1, 1)):
    """Build a Counter of textual features for *text*.

    Features are word n-grams (orders nrange[0]..nrange[1] inclusive)
    over the lower-cased alphanumeric/'#' portion of the text, plus the
    punctuation runs as unigram features.
    """
    text = text.lower()
    features = []
    # Word n-grams: everything outside [a-z0-9#] becomes a separator.
    words = re.sub('[^a-z0-9#]', ' ', text).split()
    for size in range(nrange[0], nrange[1] + 1):
        for start in range(len(words) - size + 1):
            features.append(' '.join(words[start:start + size]))
    # Punctuation features: strip the alphanumerics, keep what remains.
    features.extend(re.sub('[a-z0-9]', ' ', text).split())
    return Counter(features)
def convert_label(item, name):
    """Translate a label vector string (e.g. '1. 0. 0. ...') into the
    space-separated names whose positions hold a 1."""
    values = [float(tok) for tok in item.split()]
    chosen = [name[pos] for pos, val in enumerate(values) if val == 1]
    return ' '.join(chosen)
emotions = ["joy", 'fear', "anger", "sadness", "disgust", "shame", "guilt"]
X_all = []
y_all = []
for label, text in data:
y_all.append(convert_label(label, emotions))
X_all.append(create_feature(text, nrange=(1, 4)))
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, random_state = 123)
def train_test(clf, X_train, X_test, y_train, y_test):
    """Fit *clf* on the training split and return its
    (train accuracy, test accuracy) pair."""
    clf.fit(X_train, y_train)
    return (
        accuracy_score(y_train, clf.predict(X_train)),
        accuracy_score(y_test, clf.predict(X_test)),
    )
# Candidate classifiers to compare (fixed seeds where supported).
svc = SVC()
lsvc = LinearSVC(random_state=123)
rforest = RandomForestClassifier(random_state=123)
dtree = DecisionTreeClassifier()
clifs = [svc, lsvc, rforest, dtree]

# Vectorize the per-instance feature Counters into a sparse matrix;
# fit the vocabulary on the training split only.
vectorizer = DictVectorizer(sparse=True)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
# Fit every candidate classifier and print a markdown accuracy table.
print("| {:25} | {} | {} |".format("Classifier", "Training Accuracy", "Test Accuracy"))
print("| {} | {} | {} |".format("-"*25, "-"*17, "-"*13))
for clf in clifs:
    accuracies = train_test(clf, X_train, X_test, y_train, y_test)
    print("| {:25} | {:17.7f} | {:13.7f} |".format(clf.__class__.__name__, *accuracies))
# NOTE(review): `clf` still refers to the last classifier in `clifs`
# (the decision tree), so only that model is persisted here.
with open('saved_model', 'wb') as f:
    pickle.dump(clf, f)
# Round-trip the pickle so `clf` below is the deserialized model.
with open('saved_model', 'rb') as f:
    clf = pickle.load(f)
# Can also use joblib above as:
# joblib.dump(clf, 'saved_model')
# clf = joblib.load('saved_model')
l = ["joy", 'fear', "anger", "sadness", "disgust", "shame", "guilt"]
l.sort()
label_freq = {}
for label, _ in data:
label_freq[label] = label_freq.get(label, 0) + 1
# print the labels and their counts in sorted order
for l in sorted(label_freq, key=label_freq.get, reverse=True):
print("{:10}({}) {}".format(convert_label(l, emotions), l, label_freq[l]))
print("Model Trained and Saved\a")
# quit()