From 7b40e5c979ac9dde4389fe4c0b4a48ed7259b023 Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 10:21:08 +0000 Subject: [PATCH 1/7] refactor: add disconnected mnist training script --- .../guided-demos/mnist_disconnected.py | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 demo-notebooks/guided-demos/mnist_disconnected.py diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py new file mode 100644 index 000000000..758c27790 --- /dev/null +++ b/demo-notebooks/guided-demos/mnist_disconnected.py @@ -0,0 +1,162 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# In[] +import os + +import torch +from pytorch_lightning import LightningModule, Trainer +from pytorch_lightning.callbacks.progress import TQDMProgressBar +from pytorch_lightning.loggers import CSVLogger +from torch import nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchmetrics import Accuracy +from torchvision import transforms +from torchvision.datasets import MNIST + +PATH_DATASETS = os.environ.get("PATH_DATASETS", ".") +BATCH_SIZE = 256 if torch.cuda.is_available() else 64 +# %% + +local_minst_path = os.path.join(PATH_DATASETS, "mnist") + +print("prior to running the trainer") +print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) +print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) + + +class LitMNIST(LightningModule): + def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4): + super().__init__() + + # Set our init args as class attributes + self.data_dir = data_dir + self.hidden_size = hidden_size + self.learning_rate = learning_rate + + # Hardcode some dataset specific attributes + self.num_classes = 10 + self.dims = (1, 28, 28) + channels, width, height = self.dims + self.transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), + ] + ) + + # Define PyTorch model + self.model = nn.Sequential( + nn.Flatten(), + nn.Linear(channels * width * height, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, hidden_size), + nn.ReLU(), + nn.Dropout(0.1), + nn.Linear(hidden_size, self.num_classes), + ) + + self.val_accuracy = Accuracy() + self.test_accuracy = Accuracy() + + def forward(self, x): + x = self.model(x) + return F.log_softmax(x, dim=1) + + def training_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + return loss + + def validation_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = torch.argmax(logits, dim=1) + self.val_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("val_loss", loss, prog_bar=True) + self.log("val_acc", self.val_accuracy, prog_bar=True) + + def test_step(self, batch, batch_idx): + x, y = batch + logits = self(x) + loss = F.nll_loss(logits, y) + preds = 
torch.argmax(logits, dim=1) + self.test_accuracy.update(preds, y) + + # Calling self.log will surface up scalars for you in TensorBoard + self.log("test_loss", loss, prog_bar=True) + self.log("test_acc", self.test_accuracy, prog_bar=True) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) + return optimizer + + #################### + # DATA RELATED HOOKS + #################### + + def prepare_data(self): + # download + print("Downloading MNIST dataset...") + MNIST(self.data_dir, train=True, download=False) + MNIST(self.data_dir, train=False, download=False) + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == "fit" or stage is None: + mnist_full = MNIST(self.data_dir, train=True, transform=self.transform, download=False) + self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) + + # Assign test dataset for use in dataloader(s) + if stage == "test" or stage is None: + self.mnist_test = MNIST( + self.data_dir, train=False, transform=self.transform, download=False + ) + + def train_dataloader(self): + return DataLoader(self.mnist_train, batch_size=BATCH_SIZE) + + def val_dataloader(self): + return DataLoader(self.mnist_val, batch_size=BATCH_SIZE) + + def test_dataloader(self): + return DataLoader(self.mnist_test, batch_size=BATCH_SIZE) + + +# Init DataLoader from MNIST Dataset + +model = LitMNIST(data_dir=local_minst_path) + +print("GROUP: ", int(os.environ.get("GROUP_WORLD_SIZE", 1))) +print("LOCAL: ", int(os.environ.get("LOCAL_WORLD_SIZE", 1))) + +# Initialize a trainer +trainer = Trainer( + accelerator="auto", + # devices=1 if torch.cuda.is_available() else None, # limiting got iPython runs + max_epochs=5, + callbacks=[TQDMProgressBar(refresh_rate=20)], + num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)), + devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)), + strategy="ddp", +) + +# Train the model ⚡ +trainer.fit(model) From 37bb9a0925dea1d62510e52e35cfb75d7dae269b Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 10:23:59 +0000 Subject: [PATCH 2/7] refactor: addition of the mnist download script for use with disconnected env --- .../guided-demos/download_mnist_datasets.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 demo-notebooks/guided-demos/download_mnist_datasets.py diff --git a/demo-notebooks/guided-demos/download_mnist_datasets.py b/demo-notebooks/guided-demos/download_mnist_datasets.py new file mode 100644 index 000000000..00b2bd32e --- /dev/null +++ b/demo-notebooks/guided-demos/download_mnist_datasets.py @@ -0,0 +1,42 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from torchvision.datasets import MNIST +from torchvision import transforms + +def download_mnist_dataset(destination_dir): + # Ensure the destination directory exists + if not os.path.exists(destination_dir): + os.makedirs(destination_dir) + + # Define transformations + transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + + # Download the training data + train_set = MNIST(root=destination_dir, train=True, download=True, transform=transform) + + # Download the test data + test_set = MNIST(root=destination_dir, train=False, download=True, transform=transform) + + print(f"MNIST dataset downloaded in {destination_dir}") + +# Specify the directory where you +script_dir = os.path.dirname(os.path.abspath(__file__)) +destination_dir = script_dir + "/mnist_datasets" + +download_mnist_dataset(destination_dir) \ No newline at end of file From fcf49967113c03a7628e59313fdacef4084350d3 Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 10:38:45 +0000 Subject: [PATCH 3/7] refactor: addition of note to test in disconnected env --- demo-notebooks/guided-demos/2_basic_jobs.ipynb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/demo-notebooks/guided-demos/2_basic_jobs.ipynb b/demo-notebooks/guided-demos/2_basic_jobs.ipynb index 5d862c03a..72accf926 100644 --- a/demo-notebooks/guided-demos/2_basic_jobs.ipynb +++ b/demo-notebooks/guided-demos/2_basic_jobs.ipynb @@ -203,7 +203,9 @@ "id": "31096641", "metadata": {}, "source": [ - "Now, an alternative option for job submission is to submit directly to MCAD, which will schedule pods to run the job with requested resources:" + "Now, an alternative option for job submission is to submit directly to MCAD, which will schedule pods to run the job with requested resources:\n", + "\n", + "NOTE: To test this demo in an air-gapped/ disconnected environment alter the training script to use a local dataset." 
] }, { @@ -216,6 +218,7 @@ "jobdef = DDPJobDefinition(\n", " name=\"mnistjob\",\n", " script=\"mnist.py\",\n", + " # script=\"mnist_disconnected.py\", # training script for disconnected environment\n", " scheduler_args={\"namespace\": \"default\"},\n", " j=\"1x1\",\n", " gpu=0,\n", From cf6c9f32ccf8e15c4f0e182b08e57bed97132689 Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 10:41:25 +0000 Subject: [PATCH 4/7] style: black formatting for precommit --- .../guided-demos/download_mnist_datasets.py | 19 ++++++++++++------- .../guided-demos/mnist_disconnected.py | 4 +++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/demo-notebooks/guided-demos/download_mnist_datasets.py b/demo-notebooks/guided-demos/download_mnist_datasets.py index 00b2bd32e..5df8be0fd 100644 --- a/demo-notebooks/guided-demos/download_mnist_datasets.py +++ b/demo-notebooks/guided-demos/download_mnist_datasets.py @@ -16,27 +16,32 @@ from torchvision.datasets import MNIST from torchvision import transforms + def download_mnist_dataset(destination_dir): # Ensure the destination directory exists if not os.path.exists(destination_dir): os.makedirs(destination_dir) # Define transformations - transform = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ]) + transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + ) # Download the training data - train_set = MNIST(root=destination_dir, train=True, download=True, transform=transform) + train_set = MNIST( + root=destination_dir, train=True, download=True, transform=transform + ) # Download the test data - test_set = MNIST(root=destination_dir, train=False, download=True, transform=transform) + test_set = MNIST( + root=destination_dir, train=False, download=True, transform=transform + ) print(f"MNIST dataset downloaded in {destination_dir}") + # Specify the directory where you script_dir = os.path.dirname(os.path.abspath(__file__)) destination_dir = script_dir + "/mnist_datasets" -download_mnist_dataset(destination_dir) \ No newline at end of file +download_mnist_dataset(destination_dir) diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py index 758c27790..609f93aab 100644 --- a/demo-notebooks/guided-demos/mnist_disconnected.py +++ b/demo-notebooks/guided-demos/mnist_disconnected.py @@ -121,7 +121,9 @@ def prepare_data(self): def setup(self, stage=None): # Assign train/val datasets for use in dataloaders if stage == "fit" or stage is None: - mnist_full = MNIST(self.data_dir, train=True, transform=self.transform, download=False) + mnist_full = MNIST( + self.data_dir, train=True, transform=self.transform, download=False + ) self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000]) # Assign test dataset for use in dataloader(s) From 30160bf18a0d86863612712c94cdce22091dccdb Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 10:46:00 +0000 Subject: [PATCH 5/7] refactor: correct path to datasets --- demo-notebooks/guided-demos/mnist_disconnected.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py index 609f93aab..ffc71b418 100644 --- a/demo-notebooks/guided-demos/mnist_disconnected.py +++ b/demo-notebooks/guided-demos/mnist_disconnected.py @@ -30,7 +30,7 @@ BATCH_SIZE = 256 if torch.cuda.is_available() else 64 # %% -local_minst_path = 
os.path.join(PATH_DATASETS, "mnist") +local_minst_path = os.path.dirname(os.path.abspath(__file__) + "/mnist_datasets") print("prior to running the trainer") print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) @@ -114,7 +114,7 @@ def configure_optimizers(self): def prepare_data(self): # download - print("Downloading MNIST dataset...") + print("Preparing MNIST dataset...") MNIST(self.data_dir, train=True, download=False) MNIST(self.data_dir, train=False, download=False) From 32a8a8bc82027b6f9f5c6d87c87551855461f906 Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 11:35:47 +0000 Subject: [PATCH 6/7] fix: fix paths to datasets --- demo-notebooks/guided-demos/download_mnist_datasets.py | 3 +-- demo-notebooks/guided-demos/mnist_disconnected.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/demo-notebooks/guided-demos/download_mnist_datasets.py b/demo-notebooks/guided-demos/download_mnist_datasets.py index 5df8be0fd..6493296fb 100644 --- a/demo-notebooks/guided-demos/download_mnist_datasets.py +++ b/demo-notebooks/guided-demos/download_mnist_datasets.py @@ -41,7 +41,6 @@ def download_mnist_dataset(destination_dir): # Specify the directory where you -script_dir = os.path.dirname(os.path.abspath(__file__)) -destination_dir = script_dir + "/mnist_datasets" +destination_dir = os.path.dirname(os.path.abspath(__file__)) download_mnist_dataset(destination_dir) diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py index ffc71b418..d6cff250c 100644 --- a/demo-notebooks/guided-demos/mnist_disconnected.py +++ b/demo-notebooks/guided-demos/mnist_disconnected.py @@ -30,7 +30,7 @@ BATCH_SIZE = 256 if torch.cuda.is_available() else 64 # %% -local_minst_path = os.path.dirname(os.path.abspath(__file__) + "/mnist_datasets") +local_minst_path = os.path.dirname(os.path.abspath(__file__) + "/MNIST/raw") print("prior to running the trainer") print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) From 12533714a8aee08c9bd83223928ac416cc5ac543 Mon Sep 17 00:00:00 2001 From: Dimitri Saridakis Date: Tue, 14 Nov 2023 11:36:18 +0000 Subject: [PATCH 7/7] feat: make it easier to download the datasets --- demo-notebooks/guided-demos/2_basic_jobs.ipynb | 11 +++++++++-- demo-notebooks/guided-demos/mnist_disconnected.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/demo-notebooks/guided-demos/2_basic_jobs.ipynb b/demo-notebooks/guided-demos/2_basic_jobs.ipynb index 72accf926..4b3cee910 100644 --- a/demo-notebooks/guided-demos/2_basic_jobs.ipynb +++ b/demo-notebooks/guided-demos/2_basic_jobs.ipynb @@ -116,7 +116,13 @@ "id": "83d77b74", "metadata": {}, "source": [ - "First, let's begin by submitting to Ray, training a basic NN on the MNIST dataset:" + "First, let's begin by submitting to Ray, training a basic NN on the MNIST dataset:\n", + "\n", + "NOTE: To test this demo in an air-gapped/ disconnected environment alter the training script to use a local dataset.\n", + "First we must download the MNIST dataset. We've included a helper script to do this for you. \n", + "\n", + "You can run the python script (`python download_mnist_datasets.py`) directly and then place the dataset in the same directory as this notebook. 
\n", + "The path to the dataset would be: `..guided-demos/MNIST/raw/` " ] }, { @@ -129,6 +135,7 @@ "jobdef = DDPJobDefinition(\n", " name=\"mnisttest\",\n", " script=\"mnist.py\",\n", + " # script=\"mnist_disconnected.py\", # training script for disconnected environment\n", " scheduler_args={\"requirements\": \"requirements.txt\"}\n", ")\n", "job = jobdef.submit(cluster)" @@ -302,7 +309,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.18" }, "vscode": { "interpreter": { diff --git a/demo-notebooks/guided-demos/mnist_disconnected.py b/demo-notebooks/guided-demos/mnist_disconnected.py index d6cff250c..9fc72130b 100644 --- a/demo-notebooks/guided-demos/mnist_disconnected.py +++ b/demo-notebooks/guided-demos/mnist_disconnected.py @@ -30,7 +30,7 @@ BATCH_SIZE = 256 if torch.cuda.is_available() else 64 # %% -local_minst_path = os.path.dirname(os.path.abspath(__file__) + "/MNIST/raw") +local_minst_path = os.path.dirname(os.path.abspath(__file__)) print("prior to running the trainer") print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))