
Commit 2b93e26: add files (0 parents)

6 files changed: +1675 additions, -0 deletions

.gitignore (+7 lines)

frikk_eirik_dataset.csv
andrea_martine_dataset_final.csv
venv/
dataset.csv
.vscode/
training_ckpt/
.idea/

DistilBERT_predictions.ipynb (+473 lines)

Large diffs are not rendered by default.
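
Since this notebook's diff is not rendered, the following is a hypothetical sketch only of the prediction step it presumably performs: restoring the fine-tuned weights written by distilbert-finetune.py (checkpoint directory training_ckpt2 and the label mapping are taken from that script below; everything else is an illustrative assumption, not the notebook's actual code).

# Hypothetical sketch; the notebook's actual contents are not shown above.
import numpy as np
import tensorflow as tf
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3
)
# Restore the newest checkpoint written by distilbert-finetune.py.
model.load_weights(tf.train.latest_checkpoint("training_ckpt2"))

# Inverse of the labels_dict used during fine-tuning.
id_to_label = {0: "unrelated", 1: "pro_ed", 2: "pro_recovery"}

texts = ["an example document to classify"]  # placeholder input
enc = tokenizer(texts, truncation=True, padding=True, return_tensors="tf")
logits = model(dict(enc)).logits
print([id_to_label[int(i)] for i in np.argmax(logits, axis=-1)])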

SVM_classifier.ipynb (+427 lines)

Large diffs are not rendered by default.
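
The SVM notebook is likewise not rendered, so the features, kernel, and hyperparameters it actually uses are not visible here. As a hedged illustration only, a common scikit-learn baseline for this three-class text task pairs TF-IDF features with a linear SVM on the same train/test split as the DistilBERT script:

# Illustrative baseline only; not necessarily what SVM_classifier.ipynb does.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

df = pd.read_csv("frikk_eirik_dataset.csv")
X = df["text_document"].astype(str)
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y
)

# TF-IDF features feeding a linear-kernel SVM.
clf = make_pipeline(TfidfVectorizer(), LinearSVC())
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test), digits=3))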

distilbert-finetune.py (+87 lines)
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification

start = time.time()

df = pd.read_csv("frikk_eirik_dataset.csv")

X, y = df["text_document"], df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y
)

# Cast every document to str so the tokenizer never receives NaN/float values.
X_train = [str(x) for x in X_train.tolist()]
X_test = [str(x) for x in X_test.tolist()]

# Map the string labels to integer class ids.
labels_dict = {"unrelated": 0, "pro_ed": 1, "pro_recovery": 2}
y_train = [labels_dict[label] for label in y_train.tolist()]
y_test = [labels_dict[label] for label in y_test.tolist()]

prep_done = time.time()

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(X_train, truncation=True, padding=True)
train_done = time.time()
print(f"Created train encodings, time used {train_done - prep_done:.1f}s")

test_encodings = tokenizer(X_test, truncation=True, padding=True)
test_done = time.time()
print(f"Created test encodings, time used {test_done - train_done:.1f}s")

# Keep both input_ids and attention_mask so padded positions are masked out.
train_dataset = {key: np.array(val) for key, val in train_encodings.items()}
test_dataset = {key: np.array(val) for key, val in test_encodings.items()}

BATCH_SIZE = 16

# Callback that saves the model's weights after every epoch.
checkpoint_path = "training_ckpt2/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
)

save_model = True  # True: fine-tune from scratch; False: restore the latest checkpoint.

model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, return_dict=True
)

if save_model:
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=["accuracy"])

    model.fit(
        train_dataset,
        np.array(y_train),
        epochs=5,
        batch_size=BATCH_SIZE,
        callbacks=[cp_callback],
    )
else:
    latest = tf.train.latest_checkpoint(checkpoint_dir)
    model.load_weights(latest)

preds = model.predict(test_dataset)["logits"]
classes = np.argmax(preds, axis=-1)

score = classification_report(y_test, classes, digits=3)
print(score)

total = time.time() - start
print(f"Done in: {total:.1f}s")
