Commit 2eb2ab0

support py_volcano platform
1 parent b8b0b8a commit 2eb2ab0

File tree: 8 files changed, +224 -2 lines changed

dlrover/python/common/constants.py

Lines changed: 3 additions & 0 deletions
@@ -26,6 +26,7 @@ class PlatformType(object):
     RAY = "ray"
     PY_KUBERNETES = "pyk8s"
     LOCAL = "local"
+    PY_VOLCANO = "py_volcano"


 class CommunicationType(object):
@@ -293,6 +294,8 @@ class NodeEnv(object):
     NODE_ID = "NODE_ID"
     NODE_NUM = "NODE_NUM"
     NODE_RANK = "NODE_RANK"
+    MAX_NODE = "MAX_NODE"
+    MIN_NODE = "MIN_NODE"

     # Deprecated env vars.
     WORKER_TYPE = "WORKER_TYPE"
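
Taken together with the factory changes below, wiring the new platform through DLRover might look like the following sketch; the job name and namespace are hypothetical, only the factory functions and the PY_VOLCANO constant come from this commit:

    from dlrover.python.common.constants import PlatformType
    from dlrover.python.master.scaler.factory import new_job_scaler
    from dlrover.python.master.watcher.factory import new_node_watcher

    # Hypothetical job name and namespace, for illustration only.
    scaler = new_job_scaler(PlatformType.PY_VOLCANO, "demo-job", "default")
    watcher = new_node_watcher(PlatformType.PY_VOLCANO, "demo-job", "default")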

dlrover/python/master/scaler/factory.py

Lines changed: 4 additions & 0 deletions
@@ -29,3 +29,7 @@ def new_job_scaler(platform, job_name, namespace):
         return ActorScaler(job_name, namespace)
     elif platform == PlatformType.LOCAL:
         return None
+    elif platform == PlatformType.PY_VOLCANO:
+        from dlrover.python.master.scaler.volcano_scaler import VolcanoScaler
+
+        return VolcanoScaler(job_name, namespace)
dlrover/python/master/scaler/volcano_scaler.py

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+# Copyright 2022 The DLRover Authors. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dlrover.python.master.scaler.base_scaler import ScalePlan, Scaler
+
+
+# The VolcanoScaler does nothing; Volcano itself handles scaling.
+class VolcanoScaler(Scaler):
+    def __init__(self, job_name, namespace):
+        super(VolcanoScaler, self).__init__(job_name)
+        self._namespace = namespace
+
+    def start(self):
+        pass
+
+    def scale(self, plan: ScalePlan):
+        pass
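
Because Volcano itself owns the pod lifecycle, the scaler deliberately ignores every plan. A minimal usage sketch, with hypothetical job names and assuming ScalePlan can be constructed with defaults:

    from dlrover.python.master.scaler.base_scaler import ScalePlan
    from dlrover.python.master.scaler.volcano_scaler import VolcanoScaler

    # Hypothetical names; assumes ScalePlan() builds with default fields.
    scaler = VolcanoScaler("demo-job", "default")
    scaler.start()
    scaler.scale(ScalePlan())  # intentionally a no-op; Volcano scales the job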

dlrover/python/master/watcher/factory.py

Lines changed: 11 additions & 1 deletion
@@ -17,7 +17,11 @@

 def new_node_watcher(platform, job_name, namespace):
     logger.info("New %s Node Watcher", platform)
-    if platform in (PlatformType.KUBERNETES, PlatformType.PY_KUBERNETES):
+    if platform in (
+        PlatformType.KUBERNETES,
+        PlatformType.PY_KUBERNETES,
+        PlatformType.PY_VOLCANO,
+    ):
         from dlrover.python.master.watcher.k8s_watcher import PodWatcher

         return PodWatcher(job_name, namespace)
@@ -43,5 +47,11 @@ def new_scale_plan_watcher(platform, job_name, namespace, job_uuid):
         )

         return RayScalePlanWatcher(job_name, namespace, job_uuid)
+    elif platform == PlatformType.PY_VOLCANO:
+        from dlrover.python.master.watcher.volcano_watcher import (
+            VolcanoScalePlanWatcher,
+        )
+
+        return VolcanoScalePlanWatcher(job_name, namespace, job_uuid)
     else:
         raise ValueError("Not support engine %s", platform)
dlrover/python/master/watcher/volcano_watcher.py

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+# Copyright 2022 The DLRover Authors. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+
+
+# The VolcanoScalePlanWatcher does nothing; Volcano itself handles scaling.
+class VolcanoScalePlanWatcher:
+    def __init__(self, job_name, namespace, job_uuid):
+        self.job_name = job_name
+        self.namespace = namespace
+        self.job_uuid = job_uuid
+
+    def watch(self):
+        while True:
+            time.sleep(1000)
+            yield None
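
The watcher satisfies the scale-plan interface with a generator that sleeps and only ever yields None, so the consuming loop effectively stays parked. A consumer sketch, with hypothetical names:

    from dlrover.python.master.watcher.volcano_watcher import (
        VolcanoScalePlanWatcher,
    )

    # Hypothetical job name, namespace, and UUID.
    watcher = VolcanoScalePlanWatcher("demo-job", "default", "demo-uuid")
    for plan in watcher.watch():
        assert plan is None  # Volcano performs the scaling itself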

dlrover/python/scheduler/factory.py

Lines changed: 8 additions & 0 deletions
@@ -26,6 +26,10 @@ def new_elastic_job(platform, job_name, namespace):
         from dlrover.python.scheduler.ray import RayElasticJob

         return RayElasticJob(job_name, namespace)
+    elif platform == PlatformType.PY_VOLCANO:
+        from dlrover.python.scheduler.volcano import VolcanoElasticJob
+
+        return VolcanoElasticJob(job_name, namespace)
     else:
         raise ValueError("Not support engine %s", platform)

@@ -42,5 +46,9 @@ def new_job_args(platform, job_name, namespace):
         return RayJobArgs(platform, namespace, job_name)
     elif platform == PlatformType.LOCAL:
         return LocalJobArgs(platform, namespace, job_name)
+    elif platform == PlatformType.PY_VOLCANO:
+        from dlrover.python.scheduler.volcano import VolcanoJobArgs
+
+        return VolcanoJobArgs(platform, namespace, job_name)
     else:
         raise ValueError("Not support platform %s", platform)
dlrover/python/scheduler/volcano.py

Lines changed: 131 additions & 0 deletions

@@ -0,0 +1,131 @@
+# Copyright 2022 The DLRover Authors. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+
+from dlrover.python.common.constants import (
+    DistributionStrategy,
+    NodeEnv,
+    NodeType,
+    OptimizeMode,
+)
+from dlrover.python.common.log import default_logger as logger
+from dlrover.python.common.node import NodeGroupResource, NodeResource
+from dlrover.python.scheduler.job import ElasticJob, JobArgs, NodeArgs
+from dlrover.python.scheduler.kubernetes import (
+    _dlrover_context,
+    convert_cpu_to_decimal,
+    convert_memory_to_mb,
+    k8sClient,
+)
+
+CONFIGMAP_SUFFIX = "-dlrover-conf"
+
+
+class VolcanoElasticJob(ElasticJob):
+    def __init__(self, job_name, namespace):
+        self._namespace = namespace
+        self._job_name = job_name
+
+    def get_node_name(self, type, id):
+        return "pod-name"
+
+    def get_node_service_addr(self, type, id):
+        return ""
+
+
+class VolcanoJobArgs(JobArgs):
+    def __init__(self, platform, namespace, job_name):
+        super(VolcanoJobArgs, self).__init__(platform, namespace, job_name)
+
+    def initilize(self):
+        self.user = os.getenv("USER", "")
+        k8s_client = k8sClient.singleton_instance(self.namespace)
+
+        # Get parameters from the configmap.
+        configmap = self._retry_to_get_configmap(k8s_client)
+        self.job_uuid = os.getenv(NodeEnv.JOB_UID, "")
+        self.distribution_strategy = configmap.data.get(
+            "distributionStrategy", DistributionStrategy.ALLREDUCE
+        )
+        self.optimizeMode = configmap.data.get(
+            "optimizeMode", OptimizeMode.SINGLE_JOB
+        )
+
+        # Get parameters from the volcano job.
+        vcjob = self._retry_to_get_vcjob(k8s_client)
+        for task in vcjob["spec"]["tasks"]:
+            if task["name"] == NodeType.WORKER:
+                restart_policy = task["template"]["spec"].get(
+                    "restartPolicy", ""
+                )
+                self.relaunch_always = restart_policy == "Always"
+
+            num = int(task.get("replicas", 0))
+            assert len(task["template"]["spec"]["containers"]) == 1
+            container = task["template"]["spec"]["containers"][0]
+            resources = container.get("resources", {})
+            requests = resources.get("requests", {})
+            cpu = convert_cpu_to_decimal(requests.get("cpu", 0))
+            if "memory" in requests:
+                memory = convert_memory_to_mb(requests["memory"])
+            else:
+                memory = 0
+            gpu_type = None
+            gpu_num = 0
+            for k, v in requests.items():
+                if "nvidia.com" in k:
+                    gpu_type = k
+                    gpu_num = int(v)
+            group_resource = NodeGroupResource(
+                num,
+                NodeResource(
+                    cpu=cpu,
+                    memory=memory,
+                    gpu_type=gpu_type,
+                    gpu_num=gpu_num,
+                ),
+            )
+            self.node_args[task["name"]] = NodeArgs(
+                group_resource,
+                process_timeout=_dlrover_context.seconds_to_timeout_task_process,
+            )
+        logger.info("Job args = %s", self.__dict__)
+
+    def _retry_to_get_configmap(self, k8s_client: k8sClient):
+        for _ in range(3):
+            configmap = k8s_client.get_configmap(
+                name=self.job_name + CONFIGMAP_SUFFIX,
+            )
+            if configmap:
+                return configmap
+            else:
+                time.sleep(5)
+        raise ValueError("Cannot get the configmap %s" % self.job_name)
+
+    def _retry_to_get_vcjob(self, k8s_client: k8sClient):
+        for _ in range(3):
+            vcjob = k8s_client.get_custom_resource(
+                name=self.job_name,
+                group="batch.volcano.sh",
+                version="v1alpha1",
+                plural="jobs",
+            )
+            if vcjob:
+                return vcjob
+            else:
+                time.sleep(5)
+        raise ValueError(
+            "Cannot get the training volcano job %s" % self.job_name
+        )
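
For reference, here is a minimal sketch of the custom-resource dict shape that initilize() walks above. The keys mirror the batch.volcano.sh/v1alpha1 Job spec as read by the parsing code; every concrete value is hypothetical:

    vcjob = {
        "spec": {
            "tasks": [
                {
                    "name": "worker",  # matched against NodeType.WORKER
                    "replicas": 2,
                    "template": {
                        "spec": {
                            "restartPolicy": "Always",
                            "containers": [
                                {
                                    "resources": {
                                        "requests": {
                                            "cpu": "4",
                                            "memory": "8Gi",
                                            "nvidia.com/gpu": "1",
                                        }
                                    }
                                }
                            ],
                        }
                    },
                }
            ]
        }
    }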

dlrover/trainer/torch/elastic_run.py

Lines changed: 14 additions & 1 deletion
@@ -211,7 +211,20 @@ def parse_args(args):
         action=check_env,
         help="Whether to test the communication performance.",
     )
-    return parser.parse_args(args)
+    args = parser.parse_args(args)
+
+    # Reconfigure nnodes from the environment when both bounds are set.
+    max_node = os.getenv(NodeEnv.MAX_NODE, None)
+    min_node = os.getenv(NodeEnv.MIN_NODE, None)
+    if max_node is not None and min_node is not None:
+        new_nnodes = "{}:{}".format(
+            int(min_node), int(max_node)
+        )
+        logger.info(
+            f"The nnodes will be reconfigured from {args.nnodes} to {new_nnodes}"
+        )
+        args.nnodes = new_nnodes
+    return args


 class ElasticLaunch:
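
With the MIN_NODE and MAX_NODE variables exported by the platform, the override behaves as in this sketch; the values are hypothetical:

    import os

    os.environ["MIN_NODE"] = "1"
    os.environ["MAX_NODE"] = "4"
    # parse_args(...) now rewrites args.nnodes to "1:4",
    # regardless of the --nnodes value passed on the command line.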
