curiousily
diff --git a/‎08.sentiment-analysis-with-bert.ipynb
+128-1,221 b/‎08.sentiment-analysis-with-bert.ipynb
+128-1,221
diff --git a/‎manuscript/08.sentiment-analysis-with-bert.md
+67-37 b/‎manuscript/08.sentiment-analysis-with-bert.md
+67-37
diff --git a/‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_105_0.png
61.1 KB b/‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_105_0.png
61.1 KB
diff --git a/‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_110_0.png
26.8 KB b/‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_110_0.png
26.8 KB
diff --git a/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_16_0.png ‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_15_0.png b/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_16_0.png ‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_15_0.png
diff --git a/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_20_0.png ‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_19_0.png b/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_20_0.png ‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_19_0.png
diff --git a/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_50_0.png ‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_49_0.png b/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_50_0.png ‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_49_0.png
diff --git a/‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_93_0.png
48.9 KB b/‎manuscript/images/pytorch-07/08.sentiment-analysis-with-bert_93_0.png
48.9 KB
diff --git a/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_106_0.png
-61.1 KB b/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_106_0.png
-61.1 KB
diff --git a/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_111_0.png
-26.7 KB b/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_111_0.png
-26.7 KB
diff --git a/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_94_0.png
-53.2 KB b/‎manuscript/images/pytorch-07/08_sentiment_analysis_with_bert_94_0.png
-53.2 KB
@@ -158,7 +158,7 @@ sns.countplot(df.score)
 plt.xlabel('review score');
 ```
 
-![png](images/pytorch-07/08_sentiment_analysis_with_bert_16_0.png)
+![png](images/pytorch-07/08.sentiment-analysis-with-bert_15_0.png)
 
 That's hugely imbalanced, but it's okay. We're going to convert the dataset into negative, neutral and positive sentiment:
 
@@ -185,7 +185,7 @@ plt.xlabel('review sentiment')
 ax.set_xticklabels(class_names);
 ```
 
-![png](images/pytorch-07/08_sentiment_analysis_with_bert_20_0.png)
+![png](images/pytorch-07/08.sentiment-analysis-with-bert_19_0.png)
 
 The balance was (mostly) restored.
 
@@ -366,7 +366,7 @@ plt.xlim([0, 256]);
 plt.xlabel('Token count');
 ```
 
-![png](images/pytorch-07/08_sentiment_analysis_with_bert_50_0.png)
+![png](images/pytorch-07/08.sentiment-analysis-with-bert_49_0.png)
 
 Most of the reviews seem to contain fewer than 128 tokens, but we'll be on the safe side and choose a maximum length of 160.
 
@@ -529,19 +529,19 @@ class SentimentClassifier(nn.Module):
     self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
     self.drop = nn.Dropout(p=0.3)
     self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
-    self.softmax = nn.Softmax(dim=1)
 
   def forward(self, input_ids, attention_mask):
     _, pooled_output = self.bert(
       input_ids=input_ids,
       attention_mask=attention_mask
     )
     output = self.drop(pooled_output)
-    output = self.out(output)
-    return self.softmax(output)
+    return self.out(output)
 ```
 
-Our classifier delegates most of the heavy lifting to the BertModel. We use a dropout layer for some regularization and a fully-connected layer for our output. This should work like any other PyTorch model. Let's create an instance and move it to the GPU:
+Our classifier delegates most of the heavy lifting to the BertModel. We use a dropout layer for some regularization and a fully-connected layer for our output. Note that we're returning the raw output of the last layer (the logits) since the cross-entropy loss function in PyTorch expects unnormalized scores and applies the softmax itself.
+
+This should work like any other PyTorch model. Let's create an instance and move it to the GPU:
 
 ```py
 model = SentimentClassifier(len(class_names))
@@ -561,10 +561,10 @@ print(attention_mask.shape) # batch size x seq length
     torch.Size([16, 160])
     torch.Size([16, 160])
 
-And get predictions from our (untrained) model:
+To get the predicted probabilities from our (still untrained) model, we'll apply the softmax function to the outputs:
 
 ```py
-model(input_ids, attention_mask)
+F.softmax(model(input_ids, attention_mask), dim=1)
 ```
 
     tensor([[0.5879, 0.0842, 0.3279],
@@ -589,7 +589,7 @@ model(input_ids, attention_mask)
 To reproduce the training procedure from the BERT paper, we'll use the [AdamW](https://huggingface.co/transformers/main_classes/optimizer_schedules.html#adamw) optimizer provided by Hugging Face. It corrects weight decay, so it's similar to the original paper. We'll also use a linear scheduler with no warmup steps:
 
 ```py
-EPOCHS = 50
+EPOCHS = 10
 
 optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
 total_steps = len(train_data_loader) * EPOCHS
@@ -730,30 +730,58 @@ for epoch in range(EPOCHS):
     best_accuracy = val_acc
 ```
 
-    Epoch 1/50
+    Epoch 1/10
+    ----------
+    Train loss 0.7330631300571541 accuracy 0.6653729447463129
+    Val   loss 0.5767546480894089 accuracy 0.7776365946632783
+
+    Epoch 2/10
     ----------
-    Train loss 0.9025589391151885 accuracy 0.6324183191023922
-    Val   loss 0.8391157329082489 accuracy 0.7115628970775095
+    Train loss 0.4158683338330777 accuracy 0.8420012701997036
+    Val   loss 0.5365073362737894 accuracy 0.832274459974587
 
-    Epoch 2/50
+    Epoch 3/10
     ----------
-    Train loss 0.8013420265765007 accuracy 0.7453955260743773
-    Val   loss 0.8175631034374237 accuracy 0.7357052096569251
+    Train loss 0.24015077009679367 accuracy 0.922023851527768
+    Val   loss 0.5074492372572422 accuracy 0.8716645489199493
 
-    .....
+    Epoch 4/10
+    ----------
+    Train loss 0.16012676668187295 accuracy 0.9546962105708843
+    Val   loss 0.6009970247745514 accuracy 0.8703939008894537
 
-    Epoch 49/50
+    Epoch 5/10
     ----------
-    Train loss 0.6315805039475788 accuracy 0.9197657187213323
-    Val   loss 0.7163282692432403 accuracy 0.8424396442185516
+    Train loss 0.11209654617575301 accuracy 0.9675393409074872
+    Val   loss 0.7367783848941326 accuracy 0.8742058449809403
 
-    Epoch 50/50
+    Epoch 6/10
     ----------
-    Train loss 0.631561377785814 accuracy 0.9199068520217346
-    Val   loss 0.7175787663459778 accuracy 0.841168996188056
+    Train loss 0.08572274737026433 accuracy 0.9764307388328276
+    Val   loss 0.7251267762482166 accuracy 0.8843710292249047
 
-    CPU times: user 2h 27min 31s, sys: 1h 7min, total: 3h 34min 32s
-    Wall time: 3h 35min 51s
+    Epoch 7/10
+    ----------
+    Train loss 0.06132202987342602 accuracy 0.9833462705525369
+    Val   loss 0.7083295831084251 accuracy 0.889453621346887
+
+    Epoch 8/10
+    ----------
+    Train loss 0.050604159273123096 accuracy 0.9849693035071626
+    Val   loss 0.753860274553299 accuracy 0.8907242693773825
+
+    Epoch 9/10
+    ----------
+    Train loss 0.04373276197092931 accuracy 0.9862395032107826
+    Val   loss 0.7506809896230697 accuracy 0.8919949174078781
+
+    Epoch 10/10
+    ----------
+    Train loss 0.03768671146314381 accuracy 0.9880036694658105
+    Val   loss 0.7431786182522774 accuracy 0.8932655654383737
+
+    CPU times: user 29min 54s, sys: 13min 28s, total: 43min 23s
+    Wall time: 43min 43s
 
 Note that we're storing the state of the best model, indicated by the highest validation accuracy.
 
@@ -770,12 +798,14 @@ plt.legend()
 plt.ylim([0, 1]);
 ```
 
-![png](images/pytorch-07/08_sentiment_analysis_with_bert_94_0.png)
+![png](images/pytorch-07/08.sentiment-analysis-with-bert_93_0.png)
+
+The training accuracy starts to approach 100% after 10 epochs or so. You might try to fine-tune the parameters a bit more, but this will be good enough for us.
 
 Don't want to wait? Uncomment the next cell to download my pre-trained model:
 
 ```py
-# !gdown --id 1ZZFaHiJjsftT2fc4vUZbXZfVkYVDV5-y
+# !gdown --id 1V8itWtowCYnb2Bc9KlK9SxGff9WwmogA
 
 # model = SentimentClassifier(len(class_names))
 # model.load_state_dict(torch.load('best_model_state.bin'))
@@ -798,9 +828,9 @@ test_acc, _ = eval_model(
 test_acc.item()
 ```
 
-    0.8223350253807106
+    0.883248730964467
 
-The accuracy is about 2% lower on the test set. Our model seems to generalize well.
+The accuracy is about 1% lower on the test set. Our model seems to generalize well.
 
 We'll define a helper function to get the predictions from our model:
 
@@ -855,13 +885,13 @@ print(classification_report(y_test, y_pred, target_names=class_names))
 
                   precision    recall  f1-score   support
 
-        negative       0.81      0.81      0.81       245
-         neutral       0.78      0.75      0.77       254
-        positive       0.87      0.89      0.88       289
+        negative       0.89      0.87      0.88       245
+         neutral       0.83      0.85      0.84       254
+        positive       0.92      0.93      0.92       289
 
-        accuracy                           0.82       788
-       macro avg       0.82      0.82      0.82       788
-    weighted avg       0.82      0.82      0.82       788
+        accuracy                           0.88       788
+       macro avg       0.88      0.88      0.88       788
+    weighted avg       0.88      0.88      0.88       788
 
 Looks like it is really hard to classify neutral (3 stars) reviews. And I can tell you from experience, looking at many reviews, those are hard to classify.
 
@@ -880,7 +910,7 @@ df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
 show_confusion_matrix(df_cm)
 ```
 
-![png](images/pytorch-07/08_sentiment_analysis_with_bert_106_0.png)
+![png](images/pytorch-07/08.sentiment-analysis-with-bert_105_0.png)
 
 This confirms that our model is having difficulty classifying neutral reviews. It mistakes those for negative and positive at a roughly equal frequency.
 
@@ -923,7 +953,7 @@ plt.xlabel('probability')
 plt.xlim([0, 1]);
 ```
 
-![png](images/pytorch-07/08_sentiment_analysis_with_bert_111_0.png)
+![png](images/pytorch-07/08.sentiment-analysis-with-bert_110_0.png)
 
 ### Predicting on Raw Text