[skyrl-train] Update Megatron + LoRA to use PEFT only bridge #885

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft

erictang000 wants to merge 9 commits into NovaSky-AI:main from erictang000:megatron_lora_bridge

skyrl-train/pyproject.toml

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -79,7 +79,7 @@ override-dependencies = [
  
        "nvidia-resiliency-ext; sys_platform == 'never'",

        "mamba-ssm; sys_platform == 'never'",

        "causal-conv1d; sys_platform == 'never'",

        "transformer-engine[pytorch]==2.9.0",

        "transformer-engine[pytorch]==2.10.0",

        "megatron-core==0.15.0"

    ]

    [tool.uv.extra-build-dependencies]

    @@ -100,7 +100,8 @@ flashinfer-jit-cache = { index = "flashinfer-cu128", marker = "extra == 'vllm' o
  
    flashinfer-python = [

        { url = "https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl", marker = "extra == 'sglang' and extra != 'mcore' and extra != 'vllm'" }

    ]

    megatron-bridge = {git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge", rev = "953aabf75c0500180dc14a6a76cf9e7e7c4baec7"}

    megatron-bridge = {git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge", rev = "04e370eedf8cc44a812189a19f2171d90555c07a"}

    vllm = { index = "vllm-nightly-4829148" }

    [project.optional-dependencies]

    @@ -125,9 +126,9 @@ sandboxes = [
  
        "litellm[proxy]>=1.67.5",

    ]

    vllm = [

        "vllm==0.11.0",

        "vllm",  # Version resolved from vllm-nightly index (commit 4829148)

        "flash-attn==2.8.3",

        "torch==2.8.0",

        "torch==2.9.1",  # Required by vllm nightly

        "flashinfer-python",

        "flashinfer-jit-cache",

        "torchvision"

    @@ -140,15 +141,15 @@ sglang = [
  
        "torchvision",

    ]

    mcore = [

      "transformer-engine[pytorch]==2.9.0",

      "transformer-engine[pytorch]==2.10.0",

      "flash-attn==2.8.1",

      "vllm==0.11.0",

      "torch==2.8.0",

      "flashinfer-python==0.5.2",

      "vllm",  # Version resolved from vllm-nightly index (commit 4829148)

      "torch==2.9.1",  # Required by vllm nightly

      "flashinfer-python==0.5.3",

      "torchvision",

      "megatron-bridge @ git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@v0.2.0",

      "megatron-core==0.15.0",

      "flashinfer-jit-cache==0.5.2",

      "flashinfer-jit-cache==0.5.3",

      "nvidia-modelopt",

    ]

    flashrl = [

    @@ -176,6 +177,12 @@ name = "flashinfer-cu128"
  
    url = "https://flashinfer.ai/whl/cu128"

    explicit = true

    [[tool.uv.index]]

    name = "vllm-nightly-4829148"

    # Replace cu130 with whatever variant you need (cu129, cu130, etc).

    url = "https://wheels.vllm.ai/482914849cf9ce61d3e0dffaa35096bb34de58f5"

    explicit = true

    [tool.setuptools]

    include-package-data = true

skyrl-train/skyrl_train/inference_engines/vllm/vllm_engine.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -364,27 +364,36 @@ def _create_engine(self, *args, **kwargs): @@
             model_name = model_path
             base_model_paths = [BaseModelPath(name=model_name, model_path=model_path)]
-            models = OpenAIServingModels(engine, model_config, base_model_paths)
+            # vllm >= 0.11.2 removed model_config from OpenAI serving APIs
+            is_new_api = version.parse(vllm.__version__) >= version.parse("0.11.2")
+            legacy_kwargs = {}
+            if is_new_api:
+                models = OpenAIServingModels(engine, base_model_paths)
+            else:
+                models = OpenAIServingModels(engine, model_config, base_model_paths)
+                legacy_kwargs["model_config"] = model_config
             # TODO(Charlie): revisit kwargs `enable_auto_tools` and `tool_parser` when we need to
             # support OAI-style tool calling; and `request_logger` for better debugging.
             self.openai_serving_chat = OpenAIServingChat(
                 engine_client=engine,
-                model_config=model_config,
                 models=models,
                 response_role="assistant",
                 request_logger=None,
                 chat_template=None,
                 chat_template_content_format="auto",
+                **legacy_kwargs,
                 **openai_kwargs,
             )
             # TODO(Charlie): revisit kwargs `return_tokens_as_token_ids`,
             # `enable_prompt_tokens_details`, `enable_force_include_usage`.
             self.openai_serving_completion = OpenAIServingCompletion(
                 engine_client=engine,
-                model_config=model_config,
                 models=models,
                 request_logger=None,
+                **legacy_kwargs,
             )
             return engine
@@ Expand Down @@

skyrl-train/skyrl_train/inference_engines/vllm/vllm_server.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -2,7 +2,8 @@ @@
     import signal
     import uvloop
     from vllm import AsyncLLMEngine
-    from vllm.utils import FlexibleArgumentParser, set_ulimit
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
+    from vllm.utils.system_utils import set_ulimit
     from vllm.entrypoints.openai.cli_args import (
         make_arg_parser,
         validate_parsed_serve_args,
@@ Expand Down Expand Up / @@ -121,8 +122,7 @@ async def _destroy_weights_update_group(request: Request): @@
                 )
                 return {"status": "ok"}
-            vllm_config = await engine.get_vllm_config()
-            await init_app_state(engine, vllm_config, app.state, args)
+            await init_app_state(engine, app.state, args)
             shutdown_task = await serve_http(
                 app,
@@ Expand Down @@

skyrl-train/skyrl_train/workers/megatron/megatron_worker.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -137,6 +137,7 @@ def extract_weights(self, dtype: torch.dtype): @@
                     self.actor_module,
                     show_progress=False,
                     conversion_tasks=None,
+                    merge_weights=False,
                 )
                 for name, tensor in hf_params_generator:
@@ Expand All / @@ -156,7 +157,9 @@ def extract_weights(self, dtype: torch.dtype): @@
                         self.actor_module,
                         show_progress=False,
                         conversion_tasks=bucket,
+                        merge_weights=False,
                     )
+                    breakpoint(0)
                     # Collect all parameters in this bucket into one chunk
                     names = []
@@ Expand Down Expand Up @@
             torch.cuda.empty_cache()
             # Extract and send weights using the sender created at init time
+            if self._is_lora:
+                # extract weights
+                #
+                return
             await self._weight_transfer_sender.send_chunks(self.weight_extractor.extract_weights(generator_dtype))
             if cache_reset_task is not None:
@@ Expand Down @@

skyrl-train/tests/gpu/gpu_ci/test_megatron_worker.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -111,19 +111,19 @@ def get_test_training_batch(batch_size=4) -> TrainingInputBatch:
  
    @pytest.mark.parametrize(

        ("colocate_all", "inference_tp", "megatron_tp", "megatron_pp", "megatron_ep", "megatron_etp", "lora"),

        [(True, 4, 2, 2, 1, None, False), (False, 2, 2, 1, 1, None, False), (True, 4, 2, 2, 1, None, True)],

        ids=["colocate_all", "non_colocated", "colocate_all_lora"],

        ("colocate_all", "inference_tp", "megatron_tp", "megatron_pp", "megatron_ep", "megatron_etp", "lora", "model_name"),

        [(True, 4, 2, 2, 1, None, False, MODEL_NAME), (False, 2, 2, 1, 1, None, False, MODEL_NAME), (True, 4, 2, 2, 1, None, True, MODEL_NAME), (True, 4, 4, 1, 4, 1, True, MOE_MODEL_NAME)],

        ids=["colocate_all_qwen3_0.6b", "non_colocated_qwen3_0.6b", "colocate_all_lora_qwen3_0.6b", "colocate_all_moe_lora_qwen3_30b_a3b"],

    )

    @pytest.mark.megatron

    def test_megatron_policy_weight_sync(

        colocate_all, inference_tp, megatron_tp, megatron_pp, megatron_ep, megatron_etp, lora

        colocate_all, inference_tp, megatron_tp, megatron_pp, megatron_ep, megatron_etp, lora, model_name

    ):

        """

        Test that we can sync weights between policy and inference for megatron then run inference

        """

        try:

            cfg = get_test_actor_config(model_name=MODEL_NAME)

            cfg = get_test_actor_config(model_name=model_name)

            if lora:

                cfg.trainer.policy.model.lora.rank = 16

                cfg.trainer.policy.model.lora.alpha = 16

    @@ -141,7 +141,7 @@ def test_megatron_policy_weight_sync(
  
            # If colocate is True, this will load the engine, sleep, and wake up the engine

            client, pg = init_inference_engines(

                model=MODEL_NAME,

                model=model_name,

                cfg=cfg,

                use_local=True,

                async_engine=cfg.generator.async_engine,

    @@ -172,7 +172,7 @@ def test_megatron_policy_weight_sync(
  
            policy.offload_to_cpu()

            asyncio.run(client.wake_up(tags=["kv_cache"]))

            sampling_params = get_sampling_params_for_backend(cfg.generator.backend, cfg.generator.sampling_params)

            outputs = asyncio.run(run_inference(client, get_test_prompts(MODEL_NAME), sampling_params))

            outputs = asyncio.run(run_inference(client, get_test_prompts(model_name), sampling_params))

            print(f"Example output: {outputs['responses'][0]}, {outputs['stop_reasons'][0]}")

        finally:

skyrl-train/tests/gpu/utils.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -445,6 +445,9 @@ def get_free_port(): @@
                 # For standalone server, we use mp for now.
                 "--distributed-executor-backend",
                 "mp",
+                # vLLM 0.13+ V1 engine spawns worker processes that can't inherit CUDA context
+                # when CUDA_VISIBLE_DEVICES is set. Disable frontend multiprocessing to fix this.
+                "--disable-frontend-multiprocessing",
                 "--dtype",
                 "bfloat16",
                 "--host",
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[skyrl-train] Update Megatron + LoRA to use PEFT only bridge #885

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!

[skyrl-train] Update Megatron + LoRA to use PEFT only bridge #885

Are you sure you want to change the base?

Uh oh!

[skyrl-train] Update Megatron + LoRA to use PEFT only bridge #885

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Uh oh!