
Commit b7b9edd

Update e2e tests to run on lambda (#1653)

1 parent 279e9ba

3 files changed: +55 -8 lines

tests/e2e/test_train_e2e.py (38 additions, 4 deletions)

```diff
@@ -427,32 +427,34 @@ def test_train_text_1gpu_24gb(
     _test_train_impl(test_config=test_config, tmp_path=tmp_path, use_distributed=False)


-@requires_gpus(count=1, min_gb=24.0)
+@requires_gpus(count=4, min_gb=39.0)
 @pytest.mark.parametrize(
     "test_config",
     [
         TrainTestConfig(
-            test_name="train_mm_qwen2_vl_2b_trl_sft",
+            test_name="train_mm_qwen2_vl_2b_trl_sft_fft",
             config_path=(
                 get_configs_dir()
                 / "recipes"
                 / "vision"
                 / "qwen2_vl_2b"
                 / "sft"
+                / "full"
                 / "train.yaml"
             ),
             trainer_type=TrainerType.TRL_SFT,
             max_steps=5,
             save_steps=5,
         ),
         TrainTestConfig(
-            test_name="train_mm_qwen2_vl_2b_oumi",
+            test_name="train_mm_qwen2_vl_2b_oumi_fft",
             config_path=(
                 get_configs_dir()
                 / "recipes"
                 / "vision"
                 / "qwen2_vl_2b"
                 / "sft"
+                / "full"
                 / "train.yaml"
             ),
             trainer_type=TrainerType.OUMI,
@@ -464,8 +466,40 @@ def test_train_text_1gpu_24gb(
     ids=get_train_test_id_fn,
 )
 @pytest.mark.e2e
+@pytest.mark.multi_gpu
+def test_train_multimodal_4gpu_40gb(test_config: TrainTestConfig, tmp_path: Path):
+    _test_train_impl(
+        test_config=test_config,
+        tmp_path=tmp_path,
+        use_distributed=True,
+    )
+
+
+@requires_gpus(count=1, min_gb=39.0)
+@pytest.mark.parametrize(
+    "test_config",
+    [
+        TrainTestConfig(
+            test_name="train_mm_qwen2_vl_2b_trl_sft_lora",
+            config_path=(
+                get_configs_dir()
+                / "recipes"
+                / "vision"
+                / "qwen2_vl_2b"
+                / "sft"
+                / "lora"
+                / "train.yaml"
+            ),
+            trainer_type=TrainerType.TRL_SFT,
+            max_steps=5,
+            save_steps=5,
+        ),
+    ],
+    ids=get_train_test_id_fn,
+)
+@pytest.mark.e2e
 @pytest.mark.single_gpu
-def test_train_multimodal_1gpu_24gb(test_config: TrainTestConfig, tmp_path: Path):
+def test_train_multimodal_lora_1gpu_40gb(test_config: TrainTestConfig, tmp_path: Path):
     _test_train_impl(
         test_config=test_config,
         tmp_path=tmp_path,
```
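For context, the new markers let the suites be selected independently. A hypothetical local invocation, assuming the e2e, multi_gpu, and single_gpu markers are registered in the project's pytest configuration:

```bash
# Hypothetical invocations; assumes the e2e/multi_gpu/single_gpu markers
# used above are registered with pytest.

# Run the new 4-GPU multimodal full-finetune tests:
pytest tests/e2e/test_train_e2e.py -m "e2e and multi_gpu" -v

# Run only the new single-GPU LoRA test; -k filters on the test id,
# which get_train_test_id_fn presumably derives from test_name
# ("train_mm_qwen2_vl_2b_trl_sft_lora" here).
pytest tests/e2e/test_train_e2e.py -m "e2e and single_gpu" -k lora -v
```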

tests/scripts/gcp_e2e_tests_job.yaml renamed to tests/scripts/lambda_e2e_tests_job.yaml (2 additions, 2 deletions)

```diff
@@ -2,11 +2,11 @@
 # https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/job_config.py

 # Sample command:
-# oumi launch up --config tests/scripts/gcp_e2e_tests_job.yaml --cluster oumi-e2e-tests-cluster
+# oumi launch up --config tests/scripts/lambda_e2e_tests_job.yaml --cluster oumi-e2e-tests-cluster
 name: oumi-e2e-tests

 resources:
-  cloud: gcp
+  cloud: lambda
   accelerators: "A100:4" # "A100:1", "A100-80GB:1", "A100-80GB:4"
   use_spot: false
   disk_size: 1000 # Disk size in GBs
```
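The renamed config launches with the sample command embedded in the file header, and the cloud and accelerators can be overridden per launch with the same flags launch_tests.sh passes below. A sketch, with illustrative override values:

```bash
# Launch on Lambda with the config's defaults (command from the file header):
oumi launch up --config tests/scripts/lambda_e2e_tests_job.yaml \
    --cluster oumi-e2e-tests-cluster

# Override cloud and accelerators at launch time; both flags are the ones
# launch_tests.sh uses below (the values here are illustrative):
oumi launch up --config tests/scripts/lambda_e2e_tests_job.yaml \
    --resources.cloud=gcp \
    --resources.accelerators="A100-80GB:1" \
    --cluster oumi-e2e-tests-cluster
```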

tests/scripts/launch_tests.sh (15 additions, 2 deletions)

```diff
@@ -2,13 +2,16 @@
 set -e

 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-E2E_TEST_CONFIG="${SCRIPT_DIR}/gcp_e2e_tests_job.yaml"
+E2E_TEST_CONFIG="${SCRIPT_DIR}/lambda_e2e_tests_job.yaml"
 echo "Using test config: ${E2E_TEST_CONFIG}"

 export E2E_CLUSTER_PREFIX="oumi-${USER}-e2e-tests"
 export E2E_USE_SPOT_VM=0 # Whether to use Spot VMs.
+export E2E_CLUSTER="" # Cloud provider to use (e.g., "lambda", "aws", etc.)

-declare -a accelerators_arr=("A100:1" "A100:4" "A100-80GB:4")
+# An alternative to H100 is A100-80GB, if they are available.
+# However, A100-80GB:4 isn't available in Lambda.
+declare -a accelerators_arr=("A100:1" "A100:4" "H100:4")

 # Reset the variable to make sure that CLI `--resources.use_spot` arg is not ignored.
 OUMI_USE_SPOT_VM=""
@@ -25,10 +28,20 @@ do
         CLUSTER_SUFFIX="${CLUSTER_SUFFIX}-spot"
     fi
     CLUSTER_NAME="${E2E_CLUSTER_PREFIX}-${CLUSTER_SUFFIX}"
+
+    CLOUD_ARG=""
+    if [ -n "$E2E_CLUSTER" ]; then
+        CLOUD_ARG="--resources.cloud=${E2E_CLUSTER}"
+    else
+        CLOUD_ARG="--resources.cloud=lambda"
+    fi
+
+    set -x
     oumi launch up \
        --config "${E2E_TEST_CONFIG}" \
        --resources.accelerators="${CURR_GPU_NAME}" \
        "${USE_SPOT_ARG}" \
+       "${CLOUD_ARG}" \
        --cluster "${CLUSTER_NAME}" \
        --detach
 done
```
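Worth noting: the script exports E2E_CLUSTER="" unconditionally before the check, so the [ -n "$E2E_CLUSTER" ] branch never fires as written and CLOUD_ARG always resolves to --resources.cloud=lambda; targeting another cloud means editing that export. Typical usage, as a sketch:

```bash
# Launch one detached e2e cluster per entry in accelerators_arr
# (A100:1, A100:4, H100:4), all on Lambda:
bash tests/scripts/launch_tests.sh

# To aim at a different cloud, change the export inside the script, e.g.:
#   export E2E_CLUSTER="aws"   # CLOUD_ARG then becomes --resources.cloud=aws
```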
