Skip to content

[P/D] Prepare For Upstreaming #75

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 31 commits into
base: disagg_pd_dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
7196adf
update to use separate scheduler
robertgshaw2-redhat May 5, 2025
493cda1
update to use separate scheduler
robertgshaw2-redhat May 5, 2025
2427918
update to use separate scheduler
robertgshaw2-redhat May 5, 2025
50cecf6
updated
robertgshaw2-redhat May 5, 2025
ba3e759
updated
robertgshaw2-redhat May 5, 2025
096de02
updated
robertgshaw2-redhat May 5, 2025
0fefd4a
updated
robertgshaw2-redhat May 5, 2025
eba6982
updated
robertgshaw2-redhat May 5, 2025
d97cbf9
updated
robertgshaw2-redhat May 5, 2025
590c213
updated
robertgshaw2-redhat May 5, 2025
be0407a
updated
robertgshaw2-redhat May 5, 2025
c905a48
updated
robertgshaw2-redhat May 5, 2025
0b2cc61
updated
robertgshaw2-redhat May 5, 2025
23c3a6f
updated
robertgshaw2-redhat May 5, 2025
000715d
updated
robertgshaw2-redhat May 5, 2025
37ae9ad
updated
robertgshaw2-redhat May 5, 2025
59280ea
updated
robertgshaw2-redhat May 5, 2025
70c766f
updated
robertgshaw2-redhat May 5, 2025
089b1d7
updated
robertgshaw2-redhat May 5, 2025
17e9085
updated
robertgshaw2-redhat May 5, 2025
e554759
updated
robertgshaw2-redhat May 5, 2025
1c24c66
updated
robertgshaw2-redhat May 5, 2025
8ac138e
updated
robertgshaw2-redhat May 5, 2025
f8239cf
updated
robertgshaw2-redhat May 6, 2025
4a39108
updated
robertgshaw2-redhat May 6, 2025
6f328a2
remove multi-connector
robertgshaw2-redhat May 6, 2025
da291ce
remove multi-connector
robertgshaw2-redhat May 6, 2025
b00157b
cleanup
robertgshaw2-redhat May 6, 2025
5af868e
cherry pick nixl
robertgshaw2-redhat May 6, 2025
baf2700
updated
robertgshaw2-redhat May 6, 2025
e22d44b
updated
robertgshaw2-redhat May 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ steps:
- pytest -v -s v1/worker
- pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode
- pytest -v -s v1/kv_transfer
- pytest -v -s v1/kv_connector/unit
- pytest -v -s v1/test_serial_utils.py
- pytest -v -s v1/test_stats.py
- pytest -v -s v1/test_utils.py
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ run_tests_for_model() {
done

# Build the command for the proxy server with all the hosts and ports
PROXY_CMD="python ${GIT_ROOT}/tests/v1/kv_connector/toy_proxy_server.py --port 8192"
PROXY_CMD="python ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8192"

# Add all prefill hosts and ports
PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}"
Expand All @@ -157,7 +157,7 @@ run_tests_for_model() {

# Run lm eval for this model
echo "Running tests for $model_name"
TEST_MODEL=$model_name python -m pytest -s -x ${GIT_ROOT}/tests/v1/kv_connector/test_accuracy.py
TEST_MODEL=$model_name python -m pytest -s -x ${GIT_ROOT}/tests/v1/kv_connector/nixl_integration/test_accuracy.py

# Clean up before running next model
cleanup_instances
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,3 +272,37 @@ def test_no_spurious_prefix_caching():
for block in remote_blocks:
assert block.ref_cnt == 1
assert block._block_hash is None


def test_short_prompt_lifecycle():
    """Lifecycle test: Remote Decode request whose prompt is under one block."""

    config = create_vllm_config()
    scheduler = create_scheduler(config)

    # Prompt shorter than a full KV block.
    num_prompt_tokens = config.cache_config.block_size // 2
    req = create_request(request_id=1,
                         num_tokens=num_prompt_tokens,
                         do_remote_decode=True)

    scheduler.add_request(req)

    # --- Step 1: prefill ---
    # schedule()
    sched_out = scheduler.schedule()
    assert len(scheduler.running) == 1
    assert len(sched_out.scheduled_new_reqs) == 1

    # execute_model()
    runner_out = create_model_runner_output(reqs=[req])

    # update_from_output(): with fewer tokens than block_size there is no
    # KV transfer, so the request should be torn down immediately.
    _ = scheduler.update_from_output(sched_out, runner_out)

    # One extra schedule() call flushes the persistent-batch bookkeeping;
    # after that the scheduler must hold no leftover request state (no leaks).
    _ = scheduler.schedule()
    assert_scheduler_empty(scheduler)
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
ModelConfig, SchedulerConfig, VllmConfig)
from vllm.sampling_params import KVTransferParams, SamplingParams
from vllm.v1.core.sched.scheduler import Scheduler
from vllm.v1.core.sched.scheduler_disagg import DisaggregatedScheduler
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec)
from vllm.v1.outputs import ModelRunnerOutput
Expand All @@ -16,7 +16,7 @@
EOS_TOKEN_ID = 50256


def assert_scheduler_empty(scheduler: Scheduler):
def assert_scheduler_empty(scheduler: DisaggregatedScheduler):
"""Confirm the scheduler is "empty" - i.e. no leaks."""
# Scheduler Metadata.
assert len(scheduler.requests) == 0
Expand Down Expand Up @@ -88,7 +88,7 @@ def create_vllm_config(
def create_scheduler(
vllm_config: VllmConfig,
num_blocks: int = 10000,
) -> Scheduler:
) -> DisaggregatedScheduler:
"""Initialize Scheduler For Testing."""
block_size = vllm_config.cache_config.block_size
kv_cache_config = KVCacheConfig(
Expand All @@ -101,7 +101,7 @@ def create_scheduler(
],
)
vllm_config.cache_config.num_gpu_blocks = num_blocks
return Scheduler(
return DisaggregatedScheduler(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
log_stats=True,
Expand Down
239 changes: 0 additions & 239 deletions tests/v1/kv_transfer/test_multi_connector.py

This file was deleted.

2 changes: 0 additions & 2 deletions vllm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3402,8 +3402,6 @@ class KVTransferConfig(BaseModel):
kv_connector: Optional[str] = None

# Engine ID for the KV transfers.
# Note(tms): sticking this here so the engine_id is consistent between
# scheduler-side and worker-side of the KVConnector
engine_id: str = str(uuid.uuid4())

# The device used by kv connector to buffer the KV cache.
Expand Down
5 changes: 0 additions & 5 deletions vllm/distributed/kv_transfer/kv_connector/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,3 @@ def create_connector_v1(
"NixlConnector",
"vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector",
"NixlConnector")

KVConnectorFactory.register_connector(
"MultiConnector",
"vllm.distributed.kv_transfer.kv_connector.v1.multi_connector",
"MultiConnector")
2 changes: 2 additions & 0 deletions vllm/distributed/kv_transfer/kv_connector/v1/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import enum
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING

import torch
Expand All @@ -46,6 +47,7 @@ class KVConnectorRole(enum.Enum):
WORKER = 1


@dataclass
class KVConnectorMetadata:
    """Empty base dataclass for KV-connector metadata.

    Declares no fields of its own; concrete connectors presumably subclass
    it to carry their transfer-specific state — NOTE(review): confirm
    against the connector implementations, not visible from this diff.
    """
    pass

Expand Down
Loading