From 7b40e5c979ac9dde4389fe4c0b4a48ed7259b023 Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 10:21:08 +0000 Subject: [PATCH 1/7] refactor: add disconnected mnist training script --- .../guided-demos/mnist_disconnected.py | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 demo-notebooks/guided-demos/mnist_disconnected.py diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py new file mode 100644 index 000000000..758c27790 --- /dev/null +++ b/demo-notebooks/guided-demos/mnist_disconnected.py @@ -0,0 +1,162 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# In[] +import os + +import torch +from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.callbacks.progress import TQDMProgressBar +from pytorch_lightning.loggers import CSVLogger +from torch import nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchmetrics import Accuracy +from torchvision import transforms +from torchvision.datasets import MNIST + +PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") +BATCH_SIZE = 256 if torch.cuda.is_available() else 64 +# %% + +local_minst_path = os.path.join(PATH_DATASETS, "mnist") + +print("prior to running the trainer") +print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) +print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) + + +class LitMNIST(LightningModule): + def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): + super().__init__() + + # Set our init args as class attributes + self.data_dir = data_dir + self.hidden_size = hidden_size + self.learning_rate = learning_rate + + # Hardcode some dataset specific attributes + self.num_classes = 10 + self.dims = (1, 28, 28) + channels, width, height = self.dims + self.transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), + ] + ) + + # Define PyTorch model + self.model = nn.Sequential( + nn.Flatten(), + nn.Linear(channels * width * height, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, self.num_classes), + ) + + self.val_accuracy = Accuracy() + self.test_accuracy = Accuracy() + + def forward(self, x): + x = self.model(x) + return F.log_softmax(x, dim=1) + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + self.val_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("val_loss", loss, prog_bar=True) + self.log("val_acc", self.val_accuracy, prog_bar=True) + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = 
torch.argmax(logits, dim=1) + self.test_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("test_loss", loss, prog_bar=True) + self.log("test_acc", self.test_accuracy, prog_bar=True) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) + return optimizer + + #################### + # DATA RELATED HOOKS + #################### + + def prepare_data(self): + # download + print("Downloading MNIST dataset...") + MNIST(self.data_dir, train=True, download=False) + MNIST(self.data_dir, train=False, download=False) + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + mnist_full = MNIST(self.data_dir, train=True, transform=self.transform, download=False) + self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.mnist_test = MNIST( + self.data_dir, train=False, transform=self.transform, download=False + ) + + def train_dataloader(self): + return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) + + def val_dataloader(self): + return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=BATCH_SIZE) + + +# Init DataLoader from MNIST Dataset + +model = LitMNIST(data_dir=local_minst_path) + +print("GROUP: ", int(os.environ.get("GROUP_WORLD_SIZE", 1))) +print("LOCAL: ", int(os.environ.get("LOCAL_WORLD_SIZE", 1))) + +# Initialize a trainer +trainer = Trainer( + accelerator="auto", + # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + max_epochs=5, + callbacks=[TQDMProgressBar(refresh_rate=20)], + num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)), + devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)), + strategy="ddp", +) + +# Train the model ⚡ +trainer.fit(model) From 37bb9a0925dea1d62510e52e35cfb75d7dae269b Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 10:23:59 +0000 Subject: [PATCH 2/7] refactor: addition of the mnist download script for use with disconnected env --- .../guided-demos/download_mnist_datasets.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 demo-notebooks/guided-demos/download_mnist_datasets.py diff --git a/demo-notebooks/guided-demos/download_mnist_datasets.py b/demo-notebooks/guided-demos/download_mnist_datasets.py new file mode 100644 index 000000000..00b2bd32e --- /dev/null +++ b/demo-notebooks/guided-demos/download_mnist_datasets.py @@ -0,0 +1,42 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from torchvision.datasets import MNIST +from torchvision import transforms + +def download_mnist_dataset(destination_dir): + # Ensure the destination directory exists + if not os.path.exists(destination_dir): + os.makedirs(destination_dir) + + # Define transformations + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + + # Download the training data + train_set = MNIST(root=destination_dir, train=True, download=True, transform=transform) + + # Download the test data + test_set = MNIST(root=destination_dir, train=False, download=True, transform=transform) + + print(f"MNIST dataset downloaded in {destination_dir}") + +# Specify the directory where you +script_dir = os.path.dirname(os.path.abspath(__file__)) +destination_dir = script_dir + "/mnist_datasets" + +download_mnist_dataset(destination_dir) \ No newline at end of file From fcf49967113c03a7628e59313fdacef4084350d3 Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 10:38:45 +0000 Subject: [PATCH 3/7] refactor: addition of note to test in disconnected env --- demo-notebooks/guided-demos/2_basic_jobs.ipynb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/demo-notebooks/guided-demos/2_basic_jobs.ipynb b/demo-notebooks/guided-demos/2_basic_jobs.ipynb index 5d862c03a..72accf926 100644 --- a/demo-notebooks/guided-demos/2_basic_jobs.ipynb +++ b/demo-notebooks/guided-demos/2_basic_jobs.ipynb @@ -203,7 +203,9 @@ "id": "31096641", "metadata": {}, "source": [ - "Now, an alternative option for job submission is to submit directly to MCAD, which will schedule pods to run the job with requested resources:" + "Now, an alternative option for job submission is to submit directly to MCAD, which will schedule pods to run the job with requested resources:\n", + "\n", + "NOTE: To test this demo in an air-gapped/ disconnected environment alter the training script to use a local dataset." 
] }, { @@ -216,6 +218,7 @@ "jobdef = DDPJobDefinition(\n", " name=\"mnistjob\",\n", " script=\"mnist.py\",\n", + " # script=\"mnist_disconnected.py\", # training script for disconnected environment\n", " scheduler_args={\"namespace\": \"default\"},\n", " j=\"1x1\",\n", " gpu=0,\n", From cf6c9f32ccf8e15c4f0e182b08e57bed97132689 Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 10:41:25 +0000 Subject: [PATCH 4/7] style: black formatting for precommit --- .../guided-demos/download_mnist_datasets.py | 19 ++++++++++++------- .../guided-demos/mnist_disconnected.py | 4 +++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/demo-notebooks/guided-demos/download_mnist_datasets.py b/demo-notebooks/guided-demos/download_mnist_datasets.py index 00b2bd32e..5df8be0fd 100644 --- a/demo-notebooks/guided-demos/download_mnist_datasets.py +++ b/demo-notebooks/guided-demos/download_mnist_datasets.py @@ -16,27 +16,32 @@ from torchvision.datasets import MNIST from torchvision import transforms + def download_mnist_dataset(destination_dir): # Ensure the destination directory exists if not os.path.exists(destination_dir): os.makedirs(destination_dir) # Define transformations - transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) # Download the training data - train_set = MNIST(root=destination_dir, train=True, download=True, transform=transform) + train_set = MNIST( + root=destination_dir, train=True, download=True, transform=transform + ) # Download the test data - test_set = MNIST(root=destination_dir, train=False, download=True, transform=transform) + test_set = MNIST( + root=destination_dir, train=False, download=True, transform=transform + ) print(f"MNIST dataset downloaded in {destination_dir}") + # Specify the directory where you script_dir = os.path.dirname(os.path.abspath(__file__)) destination_dir = script_dir + "/mnist_datasets" -download_mnist_dataset(destination_dir) \ No newline at end of file +download_mnist_dataset(destination_dir) diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py index 758c27790..609f93aab 100644 --- a/demo-notebooks/guided-demos/mnist_disconnected.py +++ b/demo-notebooks/guided-demos/mnist_disconnected.py @@ -121,7 +121,9 @@ def prepare_data(self): def setup(self, stage=None): # Assign train/val datasets for use in dataloaders if stage == "fit" or stage is None: - mnist_full = MNIST(self.data_dir, train=True, transform=self.transform, download=False) + mnist_full = MNIST( + self.data_dir, train=True, transform=self.transform, download=False + ) self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) # Assign test dataset for use in dataloader(s) From 30160bf18a0d86863612712c94cdce22091dccdb Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 10:46:00 +0000 Subject: [PATCH 5/7] refactor: correct path to datasets --- demo-notebooks/guided-demos/mnist_disconnected.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py index 609f93aab..ffc71b418 100644 --- a/demo-notebooks/guided-demos/mnist_disconnected.py +++ b/demo-notebooks/guided-demos/mnist_disconnected.py @@ -30,7 +30,7 @@ BATCH_SIZE = 256 if torch.cuda.is_available() else 64 # %% -local_minst_path = 
os.path.join(PATH_DATASETS, "mnist") +local_minst_path = os.path.dirname(os.path.abspath(__file__) + "/mnist_datasets") print("prior to running the trainer") print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) @@ -114,7 +114,7 @@ def configure_optimizers(self): def prepare_data(self): # download - print("Downloading MNIST dataset...") + print("Preparing MNIST dataset...") MNIST(self.data_dir, train=True, download=False) MNIST(self.data_dir, train=False, download=False) From 32a8a8bc82027b6f9f5c6d87c87551855461f906 Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 11:35:47 +0000 Subject: [PATCH 6/7] fix: fix paths to datasets --- demo-notebooks/guided-demos/download_mnist_datasets.py | 3 +-- demo-notebooks/guided-demos/mnist_disconnected.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/demo-notebooks/guided-demos/download_mnist_datasets.py b/demo-notebooks/guided-demos/download_mnist_datasets.py index 5df8be0fd..6493296fb 100644 --- a/demo-notebooks/guided-demos/download_mnist_datasets.py +++ b/demo-notebooks/guided-demos/download_mnist_datasets.py @@ -41,7 +41,6 @@ def download_mnist_dataset(destination_dir): # Specify the directory where you -script_dir = os.path.dirname(os.path.abspath(__file__)) -destination_dir = script_dir + "/mnist_datasets" +destination_dir = os.path.dirname(os.path.abspath(__file__)) download_mnist_dataset(destination_dir) diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py index ffc71b418..d6cff250c 100644 --- a/demo-notebooks/guided-demos/mnist_disconnected.py +++ b/demo-notebooks/guided-demos/mnist_disconnected.py @@ -30,7 +30,7 @@ BATCH_SIZE = 256 if torch.cuda.is_available() else 64 # %% -local_minst_path = os.path.dirname(os.path.abspath(__file__) + "/mnist_datasets") +local_minst_path = os.path.dirname(os.path.abspath(__file__) + "/MNIST/raw") print("prior to running the trainer") print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) From 12533714a8aee08c9bd83223928ac416cc5ac543 Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 11:36:18 +0000 Subject: [PATCH 7/7] feat: make it easier to download the datasets --- demo-notebooks/guided-demos/2_basic_jobs.ipynb | 11 +++++++++-- demo-notebooks/guided-demos/mnist_disconnected.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/demo-notebooks/guided-demos/2_basic_jobs.ipynb b/demo-notebooks/guided-demos/2_basic_jobs.ipynb index 72accf926..4b3cee910 100644 --- a/demo-notebooks/guided-demos/2_basic_jobs.ipynb +++ b/demo-notebooks/guided-demos/2_basic_jobs.ipynb @@ -116,7 +116,13 @@ "id": "83d77b74", "metadata": {}, "source": [ - "First, let's begin by submitting to Ray, training a basic NN on the MNIST dataset:" + "First, let's begin by submitting to Ray, training a basic NN on the MNIST dataset:\n", + "\n", + "NOTE: To test this demo in an air-gapped/ disconnected environment alter the training script to use a local dataset.\n", + "First we must download the MNIST dataset. We've included a helper script to do this for you. \n", + "\n", + "You can run the python script (`python download_mnist_datasets.py`) directly and then place the dataset in the same directory as this notebook. 
\n", + "The path to the dataset would be: `..guided-demos/MNIST/raw/` " ] }, { @@ -129,6 +135,7 @@ "jobdef = DDPJobDefinition(\n", " name=\"mnisttest\",\n", " script=\"mnist.py\",\n", + " # script=\"mnist_disconnected.py\", # training script for disconnected environment\n", " scheduler_args={\"requirements\": \"requirements.txt\"}\n", ")\n", "job = jobdef.submit(cluster)" @@ -302,7 +309,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.18" }, "vscode": { "interpreter": { diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py index d6cff250c..9fc72130b 100644 --- a/demo-notebooks/guided-demos/mnist_disconnected.py +++ b/demo-notebooks/guided-demos/mnist_disconnected.py @@ -30,7 +30,7 @@ BATCH_SIZE = 256 if torch.cuda.is_available() else 64 # %% -local_minst_path = os.path.dirname(os.path.abspath(__file__) + "/MNIST/raw") +local_minst_path = os.path.dirname(os.path.abspath(__file__)) print("prior to running the trainer") print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))