diff --git a/demo-notebooks/guided-demos/2_basic_jobs.ipynb b/demo-notebooks/guided-demos/2_basic_jobs.ipynb
index 5d862c03a..4b3cee910 100644
--- a/demo-notebooks/guided-demos/2_basic_jobs.ipynb
+++ b/demo-notebooks/guided-demos/2_basic_jobs.ipynb
@@ -116,7 +116,13 @@
    "id": "83d77b74",
    "metadata": {},
    "source": [
-    "First, let's begin by submitting to Ray, training a basic NN on the MNIST dataset:"
+    "First, let's begin by submitting to Ray, training a basic NN on the MNIST dataset:\n",
+    "\n",
+    "NOTE: To test this demo in an air-gapped/disconnected environment, alter the training script to use a local copy of the dataset.\n",
+    "First, we must download the MNIST dataset; we've included a helper script that does this for you.\n",
+    "\n",
+    "Run the Python script directly (`python download_mnist_datasets.py`) and it will download the dataset into the same directory as this notebook.\n",
+    "The path to the dataset will be: `..guided-demos/MNIST/raw/`"
   ]
  },
  {
@@ -129,6 +135,7 @@
    "jobdef = DDPJobDefinition(\n",
    "    name=\"mnisttest\",\n",
    "    script=\"mnist.py\",\n",
+    "    # script=\"mnist_disconnected.py\",  # training script for disconnected environments\n",
    "    scheduler_args={\"requirements\": \"requirements.txt\"}\n",
    ")\n",
    "job = jobdef.submit(cluster)"
@@ -203,7 +210,9 @@
    "id": "31096641",
    "metadata": {},
    "source": [
-    "Now, an alternative option for job submission is to submit directly to MCAD, which will schedule pods to run the job with requested resources:"
+    "Now, an alternative option for job submission is to submit directly to MCAD, which will schedule pods to run the job with requested resources:\n",
+    "\n",
+    "NOTE: To test this demo in an air-gapped/disconnected environment, alter the training script to use a local copy of the dataset."
   ]
  },
  {
@@ -216,6 +225,7 @@
    "jobdef = DDPJobDefinition(\n",
    "    name=\"mnistjob\",\n",
    "    script=\"mnist.py\",\n",
+    "    # script=\"mnist_disconnected.py\",  # training script for disconnected environments\n",
    "    scheduler_args={\"namespace\": \"default\"},\n",
    "    j=\"1x1\",\n",
    "    gpu=0,\n",
@@ -299,7 +309,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.13"
+   "version": "3.9.18"
  },
  "vscode": {
   "interpreter": {
diff --git a/demo-notebooks/guided-demos/download_mnist_datasets.py b/demo-notebooks/guided-demos/download_mnist_datasets.py
new file mode 100644
index 000000000..6493296fb
--- /dev/null
+++ b/demo-notebooks/guided-demos/download_mnist_datasets.py
@@ -0,0 +1,46 @@
+# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from torchvision.datasets import MNIST
+from torchvision import transforms
+
+
+def download_mnist_dataset(destination_dir):
+    # Ensure the destination directory exists
+    if not os.path.exists(destination_dir):
+        os.makedirs(destination_dir)
+
+    # Define transformations
+    transform = transforms.Compose(
+        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
+    )
+
+    # Download the training data
+    train_set = MNIST(
+        root=destination_dir, train=True, download=True, transform=transform
+    )
+
+    # Download the test data
+    test_set = MNIST(
+        root=destination_dir, train=False, download=True, transform=transform
+    )
+
+    print(f"MNIST dataset downloaded to {destination_dir}")
+
+
+# Download the dataset into the directory containing this script
+destination_dir = os.path.dirname(os.path.abspath(__file__))
+
+download_mnist_dataset(destination_dir)
diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py
new file mode 100644
index 000000000..9fc72130b
--- /dev/null
+++ b/demo-notebooks/guided-demos/mnist_disconnected.py
@@ -0,0 +1,164 @@
+# Copyright 2022 IBM, Red Hat
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# In[]
+import os
+
+import torch
+from pytorch_lightning import LightningModule, Trainer
+from pytorch_lightning.callbacks.progress import TQDMProgressBar
+from pytorch_lightning.loggers import CSVLogger
+from torch import nn
+from torch.nn import functional as F
+from torch.utils.data import DataLoader, random_split
+from torchmetrics import Accuracy
+from torchvision import transforms
+from torchvision.datasets import MNIST
+
+PATH_DATASETS = os.environ.get("PATH_DATASETS", ".")
+BATCH_SIZE = 256 if torch.cuda.is_available() else 64
+# %%
+
+local_mnist_path = os.path.dirname(os.path.abspath(__file__))  # MNIST data staged next to this script
+
+print("prior to running the trainer")
+print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
+print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))
+
+
+class LitMNIST(LightningModule):
+    def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
+        super().__init__()
+
+        # Set our init args as class attributes
+        self.data_dir = data_dir
+        self.hidden_size = hidden_size
+        self.learning_rate = learning_rate
+
+        # Hardcode some dataset specific attributes
+        self.num_classes = 10
+        self.dims = (1, 28, 28)
+        channels, width, height = self.dims
+        self.transform = transforms.Compose(
+            [
+                transforms.ToTensor(),
+                transforms.Normalize((0.1307,), (0.3081,)),
+            ]
+        )
+
+        # Define PyTorch model
+        self.model = nn.Sequential(
+            nn.Flatten(),
+            nn.Linear(channels * width * height, hidden_size),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(hidden_size, hidden_size),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(hidden_size, self.num_classes),
+        )
+
+        self.val_accuracy = Accuracy()
+        self.test_accuracy = Accuracy()
+
+    def forward(self, x):
+        x = self.model(x)
+        return F.log_softmax(x, dim=1)
+
+    def training_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        loss = F.nll_loss(logits, y)
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        loss = F.nll_loss(logits, y)
+        preds = torch.argmax(logits, dim=1)
+        self.val_accuracy.update(preds, y)
+
+        # Calling self.log will surface up scalars for you in TensorBoard
+        self.log("val_loss", loss, prog_bar=True)
+        self.log("val_acc", self.val_accuracy, prog_bar=True)
+
+    def test_step(self, batch, batch_idx):
+        x, y = batch
+        logits = self(x)
+        loss = F.nll_loss(logits, y)
+        preds = torch.argmax(logits, dim=1)
+        self.test_accuracy.update(preds, y)
+
+        # Calling self.log will surface up scalars for you in TensorBoard
+        self.log("test_loss", loss, prog_bar=True)
+        self.log("test_acc", self.test_accuracy, prog_bar=True)
+
+    def configure_optimizers(self):
+        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
+        return optimizer
+
+    ####################
+    # DATA RELATED HOOKS
+    ####################
+
+    def prepare_data(self):
+        # Verify the locally staged dataset; download=False keeps this air-gap safe
+        print("Preparing MNIST dataset...")
+        MNIST(self.data_dir, train=True, download=False)
+        MNIST(self.data_dir, train=False, download=False)
+
+    def setup(self, stage=None):
+        # Assign train/val datasets for use in dataloaders
+        if stage == "fit" or stage is None:
+            mnist_full = MNIST(
+                self.data_dir, train=True, transform=self.transform, download=False
+            )
+            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
+
+        # Assign test dataset for use in dataloader(s)
+        if stage == "test" or stage is None:
+            self.mnist_test = MNIST(
+                self.data_dir, train=False, transform=self.transform, download=False
+            )
+
+    def train_dataloader(self):
+        return DataLoader(self.mnist_train, batch_size=BATCH_SIZE)
+
+    def val_dataloader(self):
+        return DataLoader(self.mnist_val, batch_size=BATCH_SIZE)
+
+    def test_dataloader(self):
+        return DataLoader(self.mnist_test, batch_size=BATCH_SIZE)
+
+
+# Init the model, pointing it at the locally staged MNIST data
+
+model = LitMNIST(data_dir=local_mnist_path)
+
+print("GROUP: ", int(os.environ.get("GROUP_WORLD_SIZE", 1)))
+print("LOCAL: ", int(os.environ.get("LOCAL_WORLD_SIZE", 1)))
+
+# Initialize a trainer
+trainer = Trainer(
+    accelerator="auto",
+    # devices=1 if torch.cuda.is_available() else None,  # limit devices for iPython runs
+    max_epochs=5,
+    callbacks=[TQDMProgressBar(refresh_rate=20)],
+    num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)),
+    devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)),
+    strategy="ddp",
+)
+
+# Train the model ⚡
+trainer.fit(model)
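
A minimal sketch of the end-to-end disconnected flow this change enables. It assumes the notebook's earlier cells have already imported DDPJobDefinition and created the `cluster` object, and that `download_mnist_datasets.py` has been run once while network access was available, so `MNIST/raw/` sits next to the notebook:

    # Stage the dataset once, before entering the disconnected environment:
    #     python download_mnist_datasets.py
    jobdef = DDPJobDefinition(
        name="mnisttest",
        script="mnist_disconnected.py",  # trains from the local MNIST/raw/ copy
        scheduler_args={"requirements": "requirements.txt"},
    )
    job = jobdef.submit(cluster)
    job.status()  # monitor progress, as in the notebook's existing cells

Because mnist_disconnected.py passes download=False everywhere, a missing local dataset fails fast in prepare_data() rather than attempting a network fetch mid-training.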