
Commit e0d9ba4

Merge branch 'master' into loadams/pyproject-toml
2 parents 3601c29 + 3054b93 commit e0d9ba4

File tree

10 files changed: +557 -15 lines


blogs/deepspeed-gds/README.md

+2 -2

@@ -47,7 +47,7 @@ We used three benchmarking tools for our evaluations. The first is fio, the popu

## High-Performance I/O with CPU Buffers via NVMe Scaling

- Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments1. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.
+ Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB data between NVMe and CPU memory. We configure fio to use the libaio backend for these experiments. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.

<img src="./media/figure1.png" style="width:6.5in;height:3.42153in" />

@@ -85,4 +85,4 @@ In this blog post, we introduced DeepNVMe, an I/O optimization technology create


# Acknowlegements
- This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from Nivida.
+ This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from Nvidia.
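The paragraph touched by this hunk describes the fio/libaio microbenchmark setup. As a rough sketch of the kind of invocation involved (not taken from the blog or the commit; the file path, block size, and queue depth below are assumptions), a 1GB libaio read job could be driven from Python like this:

import subprocess

# Hypothetical fio job loosely matching the setup described above:
# 1GB sequential read against an NVMe-backed file via the libaio engine.
# The filename, block size, and iodepth values are illustrative assumptions.
cmd = [
    "fio",
    "--name=deepnvme_read",
    "--ioengine=libaio",
    "--rw=read",
    "--direct=1",
    "--bs=1M",
    "--size=1G",
    "--iodepth=32",
    "--filename=/mnt/nvme0/fio_testfile",
]
subprocess.run(cmd, check=True)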

deepspeed/launcher/runner.py

+1 -1

@@ -483,7 +483,7 @@ def main(args=None):
result = subprocess.check_output(hostname_cmd)
except subprocess.CalledProcessError as err:
logger.error(
- "Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr"
+ "Unable to detect suitable master address via 'hostname -I', please manually specify one via --master_addr"
)
raise err
args.master_addr = result.decode('utf-8').split()[0]

deepspeed/runtime/config.py

-1

@@ -801,7 +801,6 @@ def __init__(self, config: Union[str, dict], mpu=None, mesh_device=None):

def _initialize_params(self, param_dict):
self.train_batch_size = get_train_batch_size(param_dict)
- #print(f"beginning get_train_batch_size = {get_train_batch_size}")
self.train_micro_batch_size_per_gpu = get_train_micro_batch_size_per_gpu(param_dict)
self.gradient_accumulation_steps = get_gradient_accumulation_steps(param_dict)
self.steps_per_print = get_steps_per_print(param_dict)

deepspeed/runtime/constants.py

+1 -1

@@ -249,7 +249,7 @@
Optional comm data type for seq paralleism should be set as:
"seq_parallel_communication_data_type": "fp32"
'''
- SEQ_PARALLEL_COMMUNICATION_DATA_TYPE = "seq_parallel_comm_data_type"
+ SEQ_PARALLEL_COMMUNICATION_DATA_TYPE = "seq_parallel_communication_data_type"
SEQ_PARALLEL_COMMUNICATION_DATA_TYPE_DEFAULT = "fp32"

#########################################
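This hunk aligns the constant with the key name documented in the docstring above it. For reference, a minimal sketch of a DeepSpeed config dict carrying that key (the neighboring field and its value are placeholders for illustration, not from the commit):

# Minimal DeepSpeed config fragment using the corrected, documented key name.
# "train_batch_size" and its value are illustrative placeholders.
ds_config = {
    "train_batch_size": 8,
    "seq_parallel_communication_data_type": "fp32",
}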

deepspeed/runtime/data_pipeline/config.py

+31 -6

@@ -20,7 +20,6 @@ def get_data_efficiency_config(param_dict):
sub_param_dict = param_dict[DATA_EFFICIENCY]
output[DATA_SAMPLING] = get_data_sampling(sub_param_dict)
output[DATA_ROUTING] = get_data_routing(sub_param_dict)
-
return output


@@ -39,15 +38,14 @@ def get_data_efficiency_seed(param_dict):


def get_data_sampling(param_dict):
- output = {}
+ sub_param_dict = param_dict.get(DATA_SAMPLING, {})
+ output = copy.copy(sub_param_dict)
output[DATA_SAMPLING_ENABLED] = get_data_sampling_enabled(param_dict)
output[DATA_SAMPLING_NUM_EPOCHS] = get_data_sampling_num_epochs(param_dict)
output[DATA_SAMPLING_NUM_WORKERS] = get_data_sampling_num_workers(param_dict)
- if DATA_SAMPLING not in param_dict.keys():
- param_dict[DATA_SAMPLING] = {}
- sub_param_dict = param_dict[DATA_SAMPLING]
+ output[DATA_SAMPLING_PIN_MEMORY] = get_data_sampling_pin_memory(param_dict)
output[CURRICULUM_LEARNING] = get_curriculum_learning(sub_param_dict)
-
+ output[DYNAMIC_BATCHING] = get_dynamic_batching(sub_param_dict)
return output


@@ -73,6 +71,13 @@ def get_data_sampling_num_workers(param_dict):
return DATA_SAMPLING_NUM_WORKERS_DEFAULT


+ def get_data_sampling_pin_memory(param_dict):
+ if DATA_SAMPLING in param_dict.keys():
+ return get_scalar_param(param_dict[DATA_SAMPLING], DATA_SAMPLING_PIN_MEMORY, DATA_SAMPLING_PIN_MEMORY_DEFAULT)
+ else:
+ return DATA_SAMPLING_PIN_MEMORY_DEFAULT
+
+
def get_curriculum_learning(param_dict):
output = {}
output[CURRICULUM_LEARNING_ENABLED] = get_curriculum_learning_enabled(param_dict)

@@ -87,6 +92,26 @@ def get_curriculum_learning(param_dict):
return output


+ def get_dynamic_batching(param_dict):
+ output = copy.copy(param_dict.get(DYNAMIC_BATCHING, {}))
+ output[DYNAMIC_BATCHING_ENABLED] = bool(output.get(DYNAMIC_BATCHING_ENABLED, DYNAMIC_BATCHING_ENABLED_DEFAULT))
+ output[DYNAMIC_BATCHING_LR_SCALING_METHOD] = str(
+ output.get(DYNAMIC_BATCHING_LR_SCALING_METHOD, DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT))
+ output[DYNAMIC_BATCHING_MIN_BATCH_SIZE] = int(
+ output.get(DYNAMIC_BATCHING_MIN_BATCH_SIZE, DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT))
+ output[DYNAMIC_BATCHING_MAX_BATCH_SIZE] = int(output[DYNAMIC_BATCHING_MAX_BATCH_SIZE]) \
+ if DYNAMIC_BATCHING_MAX_BATCH_SIZE in output.keys() \
+ else DYNAMIC_BATCHING_MAX_BATCH_SIZE_DEFAULT
+ output[DYNAMIC_BATCHING_SEQUENCE_PICKING_ORDER] = str(
+ output.get(DYNAMIC_BATCHING_SEQUENCE_PICKING_ORDER, DYNAMIC_BATCHING_SEQUENCE_PICKING_ORDER_DEFAULT))
+ if output[DYNAMIC_BATCHING_ENABLED]:
+ assert DYNAMIC_BATCHING_MAX_TOKENS in output.keys(
+ ), f"Dynamic batching is enabled, so {DYNAMIC_BATCHING_MAX_TOKENS} must be specified"
+ output[DYNAMIC_BATCHING_MAX_TOKENS] = int(output[DYNAMIC_BATCHING_MAX_TOKENS])
+ output[DYNAMIC_BATCHING_VERBOSE] = bool(output.get(DYNAMIC_BATCHING_VERBOSE, False))
+ return output
+
+
def get_curriculum_learning_enabled(param_dict):
if CURRICULUM_LEARNING in param_dict.keys():
return get_scalar_param(param_dict[CURRICULUM_LEARNING], CURRICULUM_LEARNING_ENABLED,
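To make the new parsing path concrete, here is a small sketch (not part of the commit; the numeric values are made up) of a data_sampling sub-config that exercises the get_dynamic_batching helper added above:

from deepspeed.runtime.data_pipeline.config import get_dynamic_batching

# Hypothetical "data_sampling" sub-config; the values below are illustrative.
data_sampling_params = {
    "dynamic_batching": {
        "enabled": True,
        "lr_scaling_method": "linear",       # or "sqrt" / "none"
        "min_batch_size": 1,
        "max_batch_size": 64,
        "sequence_picking_order": "seqlen",  # or "random" / "dataloader"
        "max_tokens": 4096,                  # required once enabled is True
    }
}

parsed = get_dynamic_batching(data_sampling_params)
print(parsed["enabled"], parsed["max_tokens"], parsed["lr_scaling_method"])
# -> True 4096 linear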

deepspeed/runtime/data_pipeline/constants.py

+20

@@ -22,6 +22,8 @@
DATA_SAMPLING_NUM_EPOCHS_DEFAULT = 1000
DATA_SAMPLING_NUM_WORKERS = "num_workers"
DATA_SAMPLING_NUM_WORKERS_DEFAULT = 0
+ DATA_SAMPLING_PIN_MEMORY = "pin_memory"
+ DATA_SAMPLING_PIN_MEMORY_DEFAULT = False

#########################################
# Data efficiency - Data Sampling - Curriculum Learning

@@ -62,6 +64,24 @@
CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION = "data_cluster_current_position"
CURRICULUM_LEARNING_NP_RNG_STATE = "np_rng_state"

+ #########################################
+ # Data efficiency - Dynamic batching and LR scaling
+ #########################################
+ DYNAMIC_BATCHING = "dynamic_batching"
+ DYNAMIC_BATCHING_ENABLED = "enabled"
+ DYNAMIC_BATCHING_ENABLED_DEFAULT = False
+ DYNAMIC_BATCHING_METRICS_PATH = "metrics_path"
+ DYNAMIC_BATCHING_LR_SCALING_METHOD = "lr_scaling_method"  # "linear" / "sqrt" / "none"
+ DYNAMIC_BATCHING_LR_SCALING_METHOD_DEFAULT = "linear"
+ DYNAMIC_BATCHING_MIN_BATCH_SIZE = "min_batch_size"
+ DYNAMIC_BATCHING_MIN_BATCH_SIZE_DEFAULT = 1
+ DYNAMIC_BATCHING_MAX_BATCH_SIZE = "max_batch_size"
+ DYNAMIC_BATCHING_MAX_BATCH_SIZE_DEFAULT = None
+ DYNAMIC_BATCHING_SEQUENCE_PICKING_ORDER = "sequence_picking_order"  # "random" / "seqlen" / "dataloader"
+ DYNAMIC_BATCHING_SEQUENCE_PICKING_ORDER_DEFAULT = "dataloader"  # "random" / "seqlen" / "dataloader"
+ DYNAMIC_BATCHING_MAX_TOKENS = "max_tokens"
+ DYNAMIC_BATCHING_VERBOSE = "verbose"
+
#########################################
# Curriculum Learning legacy implementation
#########################################
deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py

+6 -1

@@ -862,8 +862,13 @@ def test_compare_both_data_analyzers(dataset):
for path in output_paths:
with open(os.path.join(da.save_path, path), 'rb') as f1, \
open(os.path.join(dda.save_path, path), 'rb') as f2:
- if f1.read() != f2.read():
+ # if files have suffix .bin, they should be identical
+ if path.endswith(".bin"):
+ assert f1.read() == f2.read(), f"files {path} are not identical."
+ elif f1.read() != f2.read():
print(f"files {path} are not identical.")
+ dist.barrier()
+ dist.destroy_process_group()


if __name__ == "__main__":
