diff --git a/PyTorchEx1.ipynb b/PyTorchEx1.ipynb
new file mode 100644
index 0000000..6424553
--- /dev/null
+++ b/PyTorchEx1.ipynb
@@ -0,0 +1,478 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1e710a9-aa1c-4959-ad65-9669d5d65208",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# transformers v3 returns tuples from the model forward pass, which the model code below relies on\n",
+    "!pip install transformers==3"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "65de73f4-2ec1-4da5-a472-0daba3da260d",
+   "metadata": {},
+   "source": [
+    "### Import Packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a358d7ae-c5da-464f-8f0e-2151709f975b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import transformers\n",
+    "from transformers import AutoModel\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "\n",
+    "import cudf\n",
+    "from cudf.utils.hash_vocab_utils import hash_vocab\n",
+    "\n",
+    "# the hashed vocabulary must match the uncased checkpoint we load later\n",
+    "hash_vocab('bert-base-uncased-vocab.txt', 'voc_hash.txt')\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import classification_report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "993e0d57-441f-4856-884b-e447175672d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "db9e98be-101d-4b75-8ee1-d6bf9c12f197",
+   "metadata": {},
+   "source": [
+    "### Loading Dataset\n",
+    "We're using a dataset of Amazon customer reviews of books. We'll be evaluating the reviews for sentiment."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15ff6909-f9c2-4534-8f07-a4084e6359ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gzip\n",
+    "import json\n",
+    "\n",
+    "def parse(path):\n",
+    "    # each line of the gzipped file is one JSON review record;\n",
+    "    # json.loads is safer than eval on data we didn't write ourselves\n",
+    "    with gzip.open(path, 'rb') as g:\n",
+    "        for line in g:\n",
+    "            yield json.loads(line)\n",
+    "\n",
+    "def getDF(path):\n",
+    "    records = {i: d for i, d in enumerate(parse(path))}\n",
+    "    return cudf.DataFrame.from_dict(records, orient='index')\n",
+    "\n",
+    "df = getDF('/nvme/1/ssayyah/nv-wip/amazon_bookreview.json.gz')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b885bbc-e1b5-4bb3-ae6d-a89f2111f3e0",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "Let's take a look at what our data looks like."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0404089d-202d-487b-90fe-7e85068650bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e83857b5-2617-481e-b182-1bdb8c28ffde",
+   "metadata": {},
+   "source": [
+    "### Data Cleaning\n",
+    "Before we get started, let's drop rows with null ratings or review text using cuDF."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7d3b8ba-096f-47d5-a7de-764e7b88d6b5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "df = df[df['reviews.rating'].notnull() & df['reviews.text'].notnull()]\n",
+    "df.isnull().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8aba1f0-19d4-42e6-9aeb-57c12db4b6f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(df['reviews.text'])"
+   ]
+  },
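+  {
+   "cell_type": "markdown",
+   "id": "f0a1b2c3-d4e5-4f6a-8b7c-9d0e1f2a3b4c",
+   "metadata": {},
+   "source": [
+    "The classifier we define below has two output classes, so we first collapse the 1-5 star ratings into binary sentiment labels. Treating 4-5 stars as positive is an assumption made for this sketch; adjust the threshold to suit your data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1b2c3d4-e5f6-4a7b-8c9d-0e1f2a3b4c5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# assumption: 4-5 stars = positive (1), 1-3 stars = negative (0)\n",
+    "df['label'] = (df['reviews.rating'] >= 4).astype('int8')\n",
+    "df['label'].value_counts()"
+   ]
+  },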
+  {
+   "cell_type": "markdown",
+   "id": "244a1176-5931-4b84-9bea-726d08ae84a5",
+   "metadata": {},
+   "source": [
+    "Now let's split the data into training, validation, and testing sets. sklearn's splitter works on host (CPU) data, so we move the text and labels to pandas first."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71645f12-70de-44ee-8a10-9a0a7d3904ab",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "text_host = df['reviews.text'].to_pandas()\n",
+    "label_host = df['label'].to_pandas()\n",
+    "\n",
+    "train_text, temp_text, train_labels, temp_labels = train_test_split(text_host, label_host,\n",
+    "                                                                    random_state=2018,\n",
+    "                                                                    test_size=0.3,\n",
+    "                                                                    stratify=label_host)\n",
+    "\n",
+    "val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,\n",
+    "                                                                random_state=2018,\n",
+    "                                                                test_size=0.5,\n",
+    "                                                                stratify=temp_labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "59feaffe-c353-4a92-9233-271e45694e46",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "### Import BERT Tokenizer and BERT Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a85a33d9-5e39-4337-a2a9-c7260bf77e14",
+   "metadata": {},
+   "source": [
+    "In order to feed the model our texts, we need to tokenize and format the inputs. This is done by the cuDF subword tokenizer, which tokenizes the inputs on the GPU and converts the tokens to their corresponding IDs in the pretrained vocabulary (the hashed vocabulary file we built above).\n",
+    "\n",
+    "We also load the pretrained BERT model that corresponds to that vocabulary. The checkpoint is cached after the first download, so it won't download again if we run the cell more than once."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a07bc0d-3fed-4ce2-ad95-f97ad3bee02c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cudf.core.subword_tokenizer import SubwordTokenizer\n",
+    "\n",
+    "tokenizer = SubwordTokenizer('voc_hash.txt', do_lower_case=True)\n",
+    "\n",
+    "bert = AutoModel.from_pretrained('bert-base-uncased')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6ea1ce80-55a1-4410-b2fc-f96a0f80535c",
+   "metadata": {},
+   "source": [
+    "We can call the tokenizer directly on a cuDF Series of text:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34d51ed1-68f2-4325-9842-a1faf776abab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample = cudf.Series(['Hello, new learner!', 'And how about a second sentence?'])\n",
+    "tokenizer(sample,\n",
+    "          max_length=16,\n",
+    "          max_num_rows=len(sample),\n",
+    "          padding='max_length',\n",
+    "          return_tensors='pt',\n",
+    "          truncation=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ceb1aff0-cb90-4bcc-b7ae-dd3a08c3e4e0",
+   "metadata": {},
+   "source": [
+    "### Tokenize Reviews"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "340a8dc9-eca6-4815-8e64-77f4157c7e59",
+   "metadata": {},
+   "source": [
+    "Since the reviews are of varying lengths, we'll pad them all to a fixed maximum sequence length. First let's look at the training set to find the right padding length."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "90a427d9-7ade-4aad-86be-b1162722116c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# train_text is a pandas Series after the split, so pandas string ops apply\n",
+    "seq_len = train_text.str.split().str.len()\n",
+    "\n",
+    "seq_len.hist(bins=30)"
+   ]
+  },
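+  {
+   "cell_type": "markdown",
+   "id": "b2c3d4e5-f6a7-4b8c-9d0e-1f2a3b4c5d6e",
+   "metadata": {},
+   "source": [
+    "Most reviews fall well under a few dozen tokens, so we'll fix the padded length at 25. The exact cutoff is a judgment call based on the histogram above; longer reviews simply get truncated."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3d4e5f6-a7b8-4c9d-8e1f-2a3b4c5d6e7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# chosen from the sequence-length histogram above; an assumption, not a hard rule\n",
+    "max_seq_len = 25"
+   ]
+  },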
+  {
+   "cell_type": "markdown",
+   "id": "c5e7cb54-8b61-4fea-8d6d-7b6a6eff7840",
+   "metadata": {},
+   "source": [
+    "Now that our tokenizer is ready, let's encode our datasets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "792ca680-6037-4d83-8283-d466cf87a56a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# the subword tokenizer consumes cuDF Series and can return PyTorch tensors directly\n",
+    "tokens_train = tokenizer(cudf.Series(train_text),\n",
+    "                         max_length=max_seq_len,\n",
+    "                         max_num_rows=len(train_text),\n",
+    "                         padding='max_length',\n",
+    "                         return_tensors='pt',\n",
+    "                         truncation=True)\n",
+    "\n",
+    "tokens_val = tokenizer(cudf.Series(val_text),\n",
+    "                       max_length=max_seq_len,\n",
+    "                       max_num_rows=len(val_text),\n",
+    "                       padding='max_length',\n",
+    "                       return_tensors='pt',\n",
+    "                       truncation=True)\n",
+    "\n",
+    "tokens_test = tokenizer(cudf.Series(test_text),\n",
+    "                        max_length=max_seq_len,\n",
+    "                        max_num_rows=len(test_text),\n",
+    "                        padding='max_length',\n",
+    "                        return_tensors='pt',\n",
+    "                        truncation=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "be60a24e-64ca-4da4-b093-21f2ba1f21f2",
+   "metadata": {},
+   "source": [
+    "The tokenizer already returned PyTorch (GPU) tensors, so we just cast the IDs and masks to the dtypes the model expects and build matching label tensors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab1cbffc-f004-47e2-9d6f-c61d79f3c840",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# token IDs must be int64 for the embedding layer\n",
+    "train_seq = tokens_train['input_ids'].long()\n",
+    "train_mask = tokens_train['attention_mask'].long()\n",
+    "train_y = torch.tensor(train_labels.tolist())\n",
+    "\n",
+    "val_seq = tokens_val['input_ids'].long()\n",
+    "val_mask = tokens_val['attention_mask'].long()\n",
+    "val_y = torch.tensor(val_labels.tolist())\n",
+    "\n",
+    "test_seq = tokens_test['input_ids'].long()\n",
+    "test_mask = tokens_test['attention_mask'].long()\n",
+    "test_y = torch.tensor(test_labels.tolist())"
+   ]
+  },
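+  {
+   "cell_type": "markdown",
+   "id": "d4e5f6a7-b8c9-4d0e-9f2a-3b4c5d6e7f8a",
+   "metadata": {},
+   "source": [
+    "The original training setup isn't shown in this notebook, so we'll assume the common choice of freezing the pretrained encoder and training only the classification head defined below. Drop this cell if you'd rather fine-tune all of BERT."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7b8c9d0-e1f2-4a3b-9c5d-6e7f8a9b0c1d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# freeze the pretrained encoder; only the classification head will receive gradient updates\n",
+    "for param in bert.parameters():\n",
+    "    param.requires_grad = False"
+   ]
+  },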
+  {
+   "cell_type": "markdown",
+   "id": "d4f4d078-2b3c-476d-a8ac-8279c30170f9",
+   "metadata": {},
+   "source": [
+    "Here we'll create DataLoaders for the training and validation sets; they'll pass batches of data to the model during the training phase."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9519e1d-b98b-4c78-b562-24f02f63209d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n",
+    "\n",
+    "batch_size = 32\n",
+    "\n",
+    "# wrap tensors\n",
+    "train_data = TensorDataset(train_seq, train_mask, train_y)\n",
+    "\n",
+    "# sampler for sampling the data during training\n",
+    "train_sampler = RandomSampler(train_data)\n",
+    "\n",
+    "# dataLoader for train set\n",
+    "train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)\n",
+    "\n",
+    "# and again for validation set\n",
+    "val_data = TensorDataset(val_seq, val_mask, val_y)\n",
+    "val_sampler = SequentialSampler(val_data)\n",
+    "val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "91795094-8de7-4419-9023-8f5a0b99ba2a",
+   "metadata": {},
+   "source": [
+    "### Define Model Architecture"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b7e7b6f-b8ec-4ec2-bf0d-8c20c44aac42",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class BERT_Arch(nn.Module):\n",
+    "\n",
+    "    def __init__(self, bert):\n",
+    "        super(BERT_Arch, self).__init__()\n",
+    "\n",
+    "        self.bert = bert\n",
+    "        self.dropout = nn.Dropout(0.1)\n",
+    "        self.relu = nn.ReLU()\n",
+    "        self.fc1 = nn.Linear(768, 512)\n",
+    "        self.fc2 = nn.Linear(512, 2)\n",
+    "        self.softmax = nn.LogSoftmax(dim=1)\n",
+    "\n",
+    "    # define the forward pass\n",
+    "    def forward(self, sent_id, mask):\n",
+    "        # transformers v3 returns (sequence_output, pooled_output);\n",
+    "        # we classify from the pooled [CLS] representation\n",
+    "        _, cls_hs = self.bert(sent_id, attention_mask=mask)\n",
+    "\n",
+    "        x = self.fc1(cls_hs)\n",
+    "        x = self.relu(x)\n",
+    "        x = self.dropout(x)\n",
+    "\n",
+    "        # output layer\n",
+    "        x = self.fc2(x)\n",
+    "        # log-softmax activation, which pairs with NLLLoss during training\n",
+    "        x = self.softmax(x)\n",
+    "\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2fda194-4cec-4b70-a1df-0da083f18183",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = BERT_Arch(bert)\n",
+    "\n",
+    "model = model.to(device)"
+   ]
+  },
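+  {
+   "cell_type": "markdown",
+   "id": "e5f6a7b8-c9d0-4e1f-8a3b-4c5d6e7f8a9b",
+   "metadata": {},
+   "source": [
+    "### Train the Model\n",
+    "The prediction step below loads weights saved during training, but the original notebook doesn't include the loop itself, so here is a minimal sketch. The optimizer, learning rate, epoch count, and gradient clipping value are assumptions; tune them for your data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f6a7b8c9-d0e1-4f2a-8b4c-5d6e7f8a9b0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from torch.optim import AdamW\n",
+    "\n",
+    "optimizer = AdamW(model.parameters(), lr=1e-5)  # assumed learning rate\n",
+    "loss_fn = nn.NLLLoss()  # pairs with the model's LogSoftmax output\n",
+    "epochs = 3  # assumed; raise it if validation loss is still falling\n",
+    "\n",
+    "best_val_loss = float('inf')\n",
+    "\n",
+    "for epoch in range(epochs):\n",
+    "    model.train()\n",
+    "    for seq, mask, labels in train_dataloader:\n",
+    "        seq, mask, labels = seq.to(device), mask.to(device), labels.to(device)\n",
+    "        optimizer.zero_grad()\n",
+    "        loss = loss_fn(model(seq, mask), labels)\n",
+    "        loss.backward()\n",
+    "        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
+    "        optimizer.step()\n",
+    "\n",
+    "    # evaluate on the validation set and keep the best weights\n",
+    "    model.eval()\n",
+    "    val_loss = 0.0\n",
+    "    with torch.no_grad():\n",
+    "        for seq, mask, labels in val_dataloader:\n",
+    "            seq, mask, labels = seq.to(device), mask.to(device), labels.to(device)\n",
+    "            val_loss += loss_fn(model(seq, mask), labels).item()\n",
+    "    val_loss /= len(val_dataloader)\n",
+    "    print(f'Epoch {epoch + 1}: validation loss {val_loss:.4f}')\n",
+    "\n",
+    "    if val_loss < best_val_loss:\n",
+    "        best_val_loss = val_loss\n",
+    "        torch.save(model.state_dict(), 'saved_weights.pt')"
+   ]
+  },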
+  {
+   "cell_type": "markdown",
+   "id": "d6271b80-466f-4a44-9462-66edb284819d",
+   "metadata": {},
+   "source": [
+    "### Make Predictions\n",
+    "First we'll load the best model weights, which were saved during training, and then we can make predictions on the test set."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "18d65792-c951-482a-a0de-1da815c71c83",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = 'saved_weights.pt'\n",
+    "model.load_state_dict(torch.load(path))\n",
+    "model.eval()  # disable dropout for inference\n",
+    "\n",
+    "with torch.no_grad():\n",
+    "    preds = model(test_seq.to(device), test_mask.to(device))\n",
+    "    preds = preds.detach().cpu().numpy()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4fefa3e-0f01-48b3-8ee0-431690f1b8bf",
+   "metadata": {},
+   "source": [
+    "Now let's see how it performed!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53a3df9a-6116-420d-9d4f-aae22503820f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preds = np.argmax(preds, axis=1)\n",
+    "print(classification_report(test_y, preds))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}