
Commit e0d9ba4

Merge branch 'master' into loadams/pyproject-toml
2 parents 3601c29 + 3054b93 commit e0d9ba4

File tree

10 files changed: +557 -15 lines


blogs/deepspeed-gds/README.md

+2 -2

@@ -47,7 +47,7 @@ We used three benchmarking tools for our evaluations. The first is fio, the popu

## High-Performance I/O with CPU Buffers via NVMe Scaling

- Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments1. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.
+ Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.

<img src="./media/figure1.png" style="width:6.5in;height:3.42153in" />

@@ -85,4 +85,4 @@ In this blog post, we introduced DeepNVMe, an I/O optimization technology create


# Acknowlegements
- This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from Nivida.
+ This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from Nvidia.
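The paragraph touched by this hunk describes the fio/libaio microbenchmark setup. As a rough sketch of the kind of invocation involved (not taken from the blog or the commit; the file path, block size, and queue depth below are assumptions), a 1GB libaio read job could be driven from Python like this:

import subprocess

# Hypothetical fio job loosely matching the setup described above:
# 1GB sequential read against an NVMe-backed file via the libaio engine.
# The filename, block size, and iodepth values are illustrative assumptions.
cmd = [
    "fio",
    "--name=deepnvme_read",
    "--ioengine=libaio",
    "--rw=read",
    "--direct=1",
    "--bs=1M",
    "--size=1G",
    "--iodepth=32",
    "--filename=/mnt/nvme0/fio_testfile",
]
subprocess.run(cmd, check=True)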

deepspeed/launcher/runner.py

+1 -1

@@ -483,7 +483,7 @@ def main(args=None):
result = subprocess.check_output(hostname_cmd)
except subprocess.CalledProcessError as err:
logger.error(
- "Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr"
+ "Unable to detect suitable master address via 'hostname -I', please manually specify one via --master_addr"
)
raise err
args.master_addr = result.decode('utf-8').split()[0]

deepspeed/runtime/config.py

-1

@@ -801,7 +801,6 @@ def __init__(self, config: Union[str, dict], mpu=None, mesh_device=None):

def _initialize_params(self, param_dict):
self.train_batch_size = get_train_batch_size(param_dict)
- #print(f"beginning get_train_batch_size = {get_train_batch_size}")
self.train_micro_batch_size_per_gpu = get_train_micro_batch_size_per_gpu(param_dict)
self.gradient_accumulation_steps = get_gradient_accumulation_steps(param_dict)
self.steps_per_print = get_steps_per_print(param_dict)

deepspeed/runtime/constants.py

+1 -1

@@ -249,7 +249,7 @@
Optional comm data type for seq paralleism should be set as:
"seq_parallel_communication_data_type": "fp32"
'''
- SEQ_PARALLEL_COMMUNICATION_DATA_TYPE = "seq_parallel_comm_data_type"
+ SEQ_PARALLEL_COMMUNICATION_DATA_TYPE = "seq_parallel_communication_data_type"
SEQ_PARALLEL_COMMUNICATION_DATA_TYPE_DEFAULT = "fp32"

#########################################
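This hunk aligns the constant with the key name documented in the docstring above it. For reference, a minimal sketch of a DeepSpeed config dict carrying that key (the neighboring field and its value are placeholders for illustration, not from the commit):

# Minimal DeepSpeed config fragment using the corrected, documented key name.
# "train_batch_size" and its value are illustrative placeholders.
ds_config = {
    "train_batch_size": 8,
    "seq_parallel_communication_data_type": "fp32",
}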

deepspeed/runtime/data_pipeline/config.py

+31 -6

@@ -20,7 +20,6 @@ def get_data_efficiency_config(param_dict):
sub_param_dict = param_dict[DATA_EFFICIENCY]
output[DATA_SAMPLING] = get_data_sampling(sub_param_dict)
output[DATA_ROUTING] = get_data_routing(sub_param_dict)
-
return output


@@ -39,15 +38,14 @@ def get_data_efficiency_seed(param_dict):


def get_data_sampling(param_dict):
- output = {}
+ sub_param_dict = param_dict.get(DATA_SAMPLING, {})
+ output = copy.copy(sub_param_dict)
output[DATA_SAMPLING_ENABLED] = get_data_sampling_enabled(param_dict)
output[DATA_SAMPLING_NUM_EPOCHS] = get_data_sampling_num_epochs(param_dict)
output[DATA_SAMPLING_NUM_WORKERS] = get_data_sampling_num_workers(param_dict)
- if DATA_SAMPLING not in param_dict.keys():
- param_dict[DATA_SAMPLING] = {}
- sub_param_dict = param_dict[DATA_SAMPLING]
+ output[DATA_SAMPLING_PIN_MEMORY] = get_data_sampling_pin_memory(param_dict)
output[CURRICULUM_LEARNING] = get_curriculum_learning(sub_param_dict)
-
+ output[DYNAMIC_BATCHING] = get_dynamic_batching(sub_param_dict)
return output


@@ -73,6 +71,13 @@ def get_data_sampling_num_workers(param_dict):
return DATA_SAMPLING_NUM_WORKERS_DEFAULT


+ def get_data_sampling_pin_memory(param_dict):
+ if DATA_SAMPLING in param_dict.keys():
+ return get_scalar_param(param_dict[DATA_SAMPLING], DATA_SAMPLING_PIN_MEMORY, DATA_SAMPLING_PIN_MEMORY_DEFAULT)
+ else:
+ return DATA_SAMPLING_PIN_MEMORY_DEFAULT
+
+
def get_curriculum_learning(param_dict):
output = {}
output[CURRICULUM_LEARNING_ENABLED] = get_curriculum_learning_enabled(param_dict)

@@ -87,6 +92,26 @@ def get_curriculum_learning(param_dict):
return output


+ def get_dynamic_batching(param_dict):
+ output = copy.copy(param_dict.get(DYNAMIC_BATCHING, {}))
+ output[DYNAMIC_BATCHING_ENABLED] = bool(output.get(DYNAMIC_BATCHING_ENABLED, DYNAMIC_BATCHING_ENABLED_DEFAULT))
+ output[DYNAMIC_BATCHING_LR_SCALING_METHOD] = str(
+ output.get(DYNAMIC_BATCHING_LR_SCALING_METHOD, DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT))
+ output[DYNAMIC_BATCHING_MIN_BATCH_SIZE] = int(
+ output.get(DYNAMIC_BATCHING_MIN_BATCH_SIZE, DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT))
+ output[DYNAMIC_BATCHING_MAX_BATCH_SIZE] = int(output[DYNAMIC_BATCHING_MAX_BATCH_SIZE]) \
+ if DYNAMIC_BATCHING_MAX_BATCH_SIZE in output.keys() \
+ else DYNAMIC_BATCHING_MAX_BATCH_SIZE_DEFAULT
+ output[DYNAMIC_BATCHING_SEQUENCE_PICKING_ORDER] = str(
+ output.get(DYNAMIC_BATCHING_SEQUENCE_PICKING_ORDER, DYNAMIC_BATCHING_SEQUENCE_PICKING_ORDER_DEFAULT))
+ if output[DYNAMIC_BATCHING_ENABLED]:
+ assert DYNAMIC_BATCHING_MAX_TOKENS in output.keys(
+ ), f"Dynamic batching is enabled, so {DYNAMIC_BATCHING_MAX_TOKENS} must be specified"
+ output[DYNAMIC_BATCHING_MAX_TOKENS] = int(output[DYNAMIC_BATCHING_MAX_TOKENS])
+ output[DYNAMIC_BATCHING_VERBOSE] = bool(output.get(DYNAMIC_BATCHING_VERBOSE, False))
+ return output
+
+
def get_curriculum_learning_enabled(param_dict):
if CURRICULUM_LEARNING in param_dict.keys():
return get_scalar_param(param_dict[CURRICULUM_LEARNING], CURRICULUM_LEARNING_ENABLED,
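To make the new parsing path concrete, here is a small sketch (not part of the commit; the numeric values are made up) of a data_sampling sub-config that exercises the get_dynamic_batching helper added above:

from deepspeed.runtime.data_pipeline.config import get_dynamic_batching

# Hypothetical "data_sampling" sub-config; the values below are illustrative.
data_sampling_params = {
    "dynamic_batching": {
        "enabled": True,
        "lr_scaling_method": "linear",       # or "sqrt" / "none"
        "min_batch_size": 1,
        "max_batch_size": 64,
        "sequence_picking_order": "seqlen",  # or "random" / "dataloader"
        "max_tokens": 4096,                  # required once enabled is True
    }
}

parsed = get_dynamic_batching(data_sampling_params)
print(parsed["enabled"], parsed["max_tokens"], parsed["lr_scaling_method"])
# -> True 4096 linear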

deepspeed/runtime/data_pipeline/constants.py

+20

@@ -22,6 +22,8 @@
DATA_SAMPLING_NUM_EPOCHS_DEFAULT = 1000
DATA_SAMPLING_NUM_WORKERS = "num_workers"
DATA_SAMPLING_NUM_WORKERS_DEFAULT = 0
+ DATA_SAMPLING_PIN_MEMORY = "pin_memory"
+ DATA_SAMPLING_PIN_MEMORY_DEFAULT = False

#########################################
# Data efficiency - Data Sampling - Curriculum Learning

@@ -62,6 +64,24 @@
CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION = "data_cluster_current_position"
CURRICULUM_LEARNING_NP_RNG_STATE = "np_rng_state"

+ #########################################
+ # Data efficiency - Dynamic batching and LR scaling
+ #########################################
+ DYNAMIC_BATCHING = "dynamic_batching"
+ DYNAMIC_BATCHING_ENABLED = "enabled"
+ DYNAMIC_BATCHING_ENABLED_DEFAULT = False
+ DYNAMIC_BATCHING_METRICS_PATH = "metrics_path"
+ DYNAMIC_BATCHING_LR_SCALING_METHOD = "lr_scaling_method"  # "linear" / "sqrt" / "none"
+ DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT = "linear"
+ DYNAMIC_BATCHING_MIN_BATCH_SIZE = "min_batch_size"
+ DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT = 1
+ DYNAMIC_BATCHING_MAX_BATCH_SIZE = "max_batch_size"
+ DYNAMIC_BATCHING_MAX_BATCH_SIZE_DEFAULT = None
+ DYNAMIC_BATCHING_SEQUENCE_PICKING_ORDER = "sequence_picking_order"  # "random" / "seqlen" / "dataloader"
+ DYNAMIC_BATCHING_SEQUENCE_PICKING_ORDER_DEFAULT = "dataloader"  # "random" / "seqlen" / "dataloader"
+ DYNAMIC_BATCHING_MAX_TOKENS = "max_tokens"
+ DYNAMIC_BATCHING_VERBOSE = "verbose"
+
#########################################
# Curriculum Learning legacy implementation
#########################################
deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py

+6 -1

@@ -862,8 +862,13 @@ def test_compare_both_data_analyzers(dataset):
for path in output_paths:
with open(os.path.join(da.save_path, path), 'rb') as f1, \
open(os.path.join(dda.save_path, path), 'rb') as f2:
- if f1.read() != f2.read():
+ # if files have suffix .bin, they should be identical
+ if path.endswith(".bin"):
+ assert f1.read() == f2.read(), f"files {path} are not identical."
+ elif f1.read() != f2.read():
print(f"files {path} are not identical.")
+ dist.barrier()
+ dist.destroy_process_group()


if __name__ == "__main__":
