
Commit b7b9edd

Update e2e tests to run on lambda (#1653)

1 parent 279e9ba

3 files changed: +55 -8 lines

tests/e2e/test_train_e2e.py (38 additions, 4 deletions)

```diff
@@ -427,32 +427,34 @@ def test_train_text_1gpu_24gb(
     _test_train_impl(test_config=test_config, tmp_path=tmp_path, use_distributed=False)


-@requires_gpus(count=1, min_gb=24.0)
+@requires_gpus(count=4, min_gb=39.0)
 @pytest.mark.parametrize(
     "test_config",
     [
         TrainTestConfig(
-            test_name="train_mm_qwen2_vl_2b_trl_sft",
+            test_name="train_mm_qwen2_vl_2b_trl_sft_fft",
             config_path=(
                 get_configs_dir()
                 / "recipes"
                 / "vision"
                 / "qwen2_vl_2b"
                 / "sft"
+                / "full"
                 / "train.yaml"
             ),
             trainer_type=TrainerType.TRL_SFT,
             max_steps=5,
             save_steps=5,
         ),
         TrainTestConfig(
-            test_name="train_mm_qwen2_vl_2b_oumi",
+            test_name="train_mm_qwen2_vl_2b_oumi_fft",
             config_path=(
                 get_configs_dir()
                 / "recipes"
                 / "vision"
                 / "qwen2_vl_2b"
                 / "sft"
+                / "full"
                 / "train.yaml"
             ),
             trainer_type=TrainerType.OUMI,
@@ -464,8 +466,40 @@ def test_train_text_1gpu_24gb(
     ids=get_train_test_id_fn,
 )
 @pytest.mark.e2e
+@pytest.mark.multi_gpu
+def test_train_multimodal_4gpu_40gb(test_config: TrainTestConfig, tmp_path: Path):
+    _test_train_impl(
+        test_config=test_config,
+        tmp_path=tmp_path,
+        use_distributed=True,
+    )
+
+
+@requires_gpus(count=1, min_gb=39.0)
+@pytest.mark.parametrize(
+    "test_config",
+    [
+        TrainTestConfig(
+            test_name="train_mm_qwen2_vl_2b_trl_sft_lora",
+            config_path=(
+                get_configs_dir()
+                / "recipes"
+                / "vision"
+                / "qwen2_vl_2b"
+                / "sft"
+                / "lora"
+                / "train.yaml"
+            ),
+            trainer_type=TrainerType.TRL_SFT,
+            max_steps=5,
+            save_steps=5,
+        ),
+    ],
+    ids=get_train_test_id_fn,
+)
+@pytest.mark.e2e
 @pytest.mark.single_gpu
-def test_train_multimodal_1gpu_24gb(test_config: TrainTestConfig, tmp_path: Path):
+def test_train_multimodal_lora_1gpu_40gb(test_config: TrainTestConfig, tmp_path: Path):
     _test_train_impl(
         test_config=test_config,
         tmp_path=tmp_path,
```
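For context, the new markers let the suites be selected independently. A hypothetical local invocation, assuming the e2e, multi_gpu, and single_gpu markers are registered in the project's pytest configuration:

```bash
# Hypothetical invocations; assumes the e2e/multi_gpu/single_gpu markers
# used above are registered with pytest.

# Run the new 4-GPU multimodal full-finetune tests:
pytest tests/e2e/test_train_e2e.py -m "e2e and multi_gpu" -v

# Run only the new single-GPU LoRA test; -k filters on the test id,
# which get_train_test_id_fn presumably derives from test_name
# ("train_mm_qwen2_vl_2b_trl_sft_lora" here).
pytest tests/e2e/test_train_e2e.py -m "e2e and single_gpu" -k lora -v
```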

tests/scripts/gcp_e2e_tests_job.yaml renamed to tests/scripts/lambda_e2e_tests_job.yaml (2 additions, 2 deletions)

```diff
@@ -2,11 +2,11 @@
 # https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/job_config.py

 # Sample command:
-# oumi launch up --config tests/scripts/gcp_e2e_tests_job.yaml --cluster oumi-e2e-tests-cluster
+# oumi launch up --config tests/scripts/lambda_e2e_tests_job.yaml --cluster oumi-e2e-tests-cluster
 name: oumi-e2e-tests

 resources:
-  cloud: gcp
+  cloud: lambda
   accelerators: "A100:4" # "A100:1", "A100-80GB:1", "A100-80GB:4"
   use_spot: false
   disk_size: 1000 # Disk size in GBs
```
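The renamed config launches with the sample command embedded in the file header, and the cloud and accelerators can be overridden per launch with the same flags launch_tests.sh passes below. A sketch, with illustrative override values:

```bash
# Launch on Lambda with the config's defaults (command from the file header):
oumi launch up --config tests/scripts/lambda_e2e_tests_job.yaml \
    --cluster oumi-e2e-tests-cluster

# Override cloud and accelerators at launch time; both flags are the ones
# launch_tests.sh uses below (the values here are illustrative):
oumi launch up --config tests/scripts/lambda_e2e_tests_job.yaml \
    --resources.cloud=gcp \
    --resources.accelerators="A100-80GB:1" \
    --cluster oumi-e2e-tests-cluster
```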

tests/scripts/launch_tests.sh (15 additions, 2 deletions)

```diff
@@ -2,13 +2,16 @@
 set -e

 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-E2E_TEST_CONFIG="${SCRIPT_DIR}/gcp_e2e_tests_job.yaml"
+E2E_TEST_CONFIG="${SCRIPT_DIR}/lambda_e2e_tests_job.yaml"
 echo "Using test config: ${E2E_TEST_CONFIG}"

 export E2E_CLUSTER_PREFIX="oumi-${USER}-e2e-tests"
 export E2E_USE_SPOT_VM=0 # Whether to use Spot VMs.
+export E2E_CLUSTER="" # Cloud provider to use (e.g., "lambda", "aws", etc.)

-declare -a accelerators_arr=("A100:1" "A100:4" "A100-80GB:4")
+# An alternative to H100 is A100-80GB, if they are available.
+# However, A100-80GB:4 isn't available in Lambda.
+declare -a accelerators_arr=("A100:1" "A100:4" "H100:4")

 # Reset the variable to make sure that CLI `--resources.use_spot` arg is not ignored.
 OUMI_USE_SPOT_VM=""
@@ -25,10 +28,20 @@ do
         CLUSTER_SUFFIX="${CLUSTER_SUFFIX}-spot"
     fi
     CLUSTER_NAME="${E2E_CLUSTER_PREFIX}-${CLUSTER_SUFFIX}"
+
+    CLOUD_ARG=""
+    if [ -n "$E2E_CLUSTER" ]; then
+        CLOUD_ARG="--resources.cloud=${E2E_CLUSTER}"
+    else
+        CLOUD_ARG="--resources.cloud=lambda"
+    fi
+
+    set -x
     oumi launch up \
        --config "${E2E_TEST_CONFIG}" \
        --resources.accelerators="${CURR_GPU_NAME}" \
        "${USE_SPOT_ARG}" \
+       "${CLOUD_ARG}" \
        --cluster "${CLUSTER_NAME}" \
        --detach
 done
```
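Worth noting: the script exports E2E_CLUSTER="" unconditionally before the check, so the [ -n "$E2E_CLUSTER" ] branch never fires as written and CLOUD_ARG always resolves to --resources.cloud=lambda; targeting another cloud means editing that export. Typical usage, as a sketch:

```bash
# Launch one detached e2e cluster per entry in accelerators_arr
# (A100:1, A100:4, H100:4), all on Lambda:
bash tests/scripts/launch_tests.sh

# To aim at a different cloud, change the export inside the script, e.g.:
#   export E2E_CLUSTER="aws"   # CLOUD_ARG then becomes --resources.cloud=aws
```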
