diff --git a/.gitignore b/.gitignore index 5eeb4c7..caf152a 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ build/* dist/* env/* nvidia_bobber.egg-info/ +*.out diff --git a/bobber/bobber.py b/bobber/bobber.py index 38ef00a..9c365d3 100644 --- a/bobber/bobber.py +++ b/bobber/bobber.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: MIT import bobber.lib.docker import json +import sys from argparse import ArgumentParser, ArgumentTypeError, Namespace from copy import copy from bobber import __version__ @@ -88,11 +89,20 @@ def parse_args(version: str) -> Namespace: # More general options which apply to a majority of the running commands # Note that all arguments prepended with '--' are optional + commands_parent.add_argument('--slurm', help='Run a test on an existing ' + 'SLURM cluster with Pyxis/Enroot installed', + action='store_true') + commands_parent.add_argument('--storage-path', help='Path at which the ' + 'filesystem under test is mounted', + required='--slurm' in sys.argv) commands_parent.add_argument('log_path', metavar='log-path', help='Path ' 'used to store log files on the head node') - commands_parent.add_argument('hosts', help='Comma-separated list of ' + commands_parent.add_argument('hosts', help='Number of hosts to queue a ' + 'job for in a SLURM cluster.' if '--slurm' + in sys.argv else 'Comma-separated list of ' 'hostnames or IP addresses', - type=unique_hosts) + type=int if '--slurm' in sys.argv + else unique_hosts) commands_parent.add_argument('--config-path', help='Read a JSON config ' 'file with expected parameters and use those ' 'values for testing. 
Ignores all other ' @@ -384,6 +394,15 @@ def execute_command(args: Namespace, version: str) -> NoReturn: bobber.lib.docker.cast(args.storage_path, args.ignore_gpu, version) elif args.command == LOAD: bobber.lib.docker.load(args.filename) + elif args.slurm and args.command == RUN_NCCL: + args = load_settings(args) + bobber.lib.system.slurm.run_nccl(args, version) + elif args.slurm and args.command == RUN_DALI: + args = load_settings(args) + bobber.lib.system.slurm.run_dali(args, version) + elif args.slurm and args.command == RUN_STG_META: + args = load_settings(args) + bobber.lib.system.slurm.run_meta(args, version) else: # Update the version to be used in filenames version_underscore = version.replace('.', '_') diff --git a/bobber/lib/analysis/dali.py b/bobber/lib/analysis/dali.py index c5cd736..5ac2ed2 100644 --- a/bobber/lib/analysis/dali.py +++ b/bobber/lib/analysis/dali.py @@ -139,6 +139,42 @@ def _update_results(image_type_match: dict, results: list) -> dict: return image_type_match +def _slurm_test_sections(log_contents: str) -> list: + """ + Parse the SLURM log test sections. + + The SLURM log files for DALI tests have a different structure to the output + which needs to be special-handled. These sections are parsed by reading + from the beginning of one sub-section (ie. small JPGs) until the first time + the next sub-section is encountered (ie. large JPGs). + + Parameters + ---------- + log_contents : str + A ``string`` of the complete contents from the log file. + + Returns + ------- + list + Returns a ``list`` of strings where each element is the complete output + from a test subsection. 
+ """ + small_jpg = re.findall('800x600/file_read_pipeline.*' + '?3840x2160/file_read_pipeline', + log_contents, re.DOTALL) + large_jpg = re.findall('3840x2160/file_read_pipeline.*' + '?800x600/tfrecord_pipeline', + log_contents, re.DOTALL) + small_tf = re.findall('800x600/tfrecord_pipeline.*' + '?3840x2160/tfrecord_pipeline', + log_contents, re.DOTALL) + large_tf = re.findall('3840x2160/tfrecord_pipeline.*' + 'OK', log_contents, re.DOTALL) + sections = [small_jpg, large_jpg, small_tf, large_tf] + sections = ['\n'.join(section) for section in sections] + return sections + + def _result_parsing(log_contents: str, systems: int, image_results: dict, log_file: str) -> dict: """ @@ -188,6 +224,10 @@ def _result_parsing(log_contents: str, systems: int, image_results: dict, ] test_sections = re.findall(r'RUN 1/1.*?OK', log_contents, re.DOTALL) + # The SLURM tests have a different layout and need to be grabbed + # appropriately + if '+ srun --nodes=' in log_contents: + test_sections = _slurm_test_sections(log_contents) if len(test_sections) != 4: print(f'Warning: Invalid number of results found in {log_file} log ' 'file. Skipping...') diff --git a/bobber/lib/docker/management.py b/bobber/lib/docker/management.py index b82e33a..d1ddeff 100644 --- a/bobber/lib/docker/management.py +++ b/bobber/lib/docker/management.py @@ -29,11 +29,23 @@ def __init__(self) -> NoReturn: try: self.client = docker.from_env() self.cli = docker.APIClient(timeout=600) + self.docker_running = True except docker.errors.DockerException as e: if 'error while fetching server api version' in str(e).lower(): - print('Error: Could not communicate with the Docker daemon.') - print('Ensure Docker is running with "systemctl start docker"') - sys.exit(DOCKER_COMMUNICATION_ERROR) + self.docker_running = False + + def _verify_docker_running(self, *args, **kwargs) -> None: + """ + Raise a DOCKER_COMMUNICATION_ERROR when Docker isn't running. 
+ + If a command is attempted to be run that requires Docker and Docker is + either not installed or not running, an error needs to be raised + gracefully to the user. + """ + if not self.docker_running: + print('Error: Could not communicate with the Docker daemon.') + print('Ensure Docker is running with "systemctl start docker"') + sys.exit(DOCKER_COMMUNICATION_ERROR) def _build_if_not_built(self, tag: str, bobber_version: str) -> NoReturn: """ @@ -102,6 +114,7 @@ def cast(self, storage_path: str, ignore_gpu: bool, bobber_version : string A ``string`` of the local version of Bobber, such as '5.0.0'. """ + self._verify_docker_running() tag = self.get_tag(bobber_version) self._build_if_not_built(tag, bobber_version) runtime = None @@ -155,6 +168,7 @@ def export(self, bobber_version: str) -> NoReturn: bobber_version : string A ``string`` of the local version of Bobber, such as '5.0.0'. """ + self._verify_docker_running() tag = self.get_tag(bobber_version) self._build_if_not_built(tag, bobber_version) filename = tag.replace('/', '_').replace(':', '_') @@ -177,6 +191,7 @@ def build(self, bobber_version: str) -> NoReturn: bobber_version : string A ``string`` of the local version of Bobber, such as '5.0.0'. """ + self._verify_docker_running() tag = self.get_tag(bobber_version) print('Building a new image. This may take a while...') # Set the path to the repository's parent directory. @@ -208,6 +223,7 @@ def load(self, filename: str) -> NoReturn: A ``string`` of the filename for the local tarball to load, such as './nvidia_bobber_5.0.0.tar'. """ + self._verify_docker_running() print(f'Importing {filename}. This may take a while...') with open(filename, 'rb') as image_file: self.client.images.load(image_file) @@ -233,6 +249,7 @@ def execute(self, command: str, environment: Optional[dict] = None, log_file : string (Optional) A ``string`` of the path and filename to optionally save output to.
""" + self._verify_docker_running() if not self.running: print('Bobber container not running. Launch a container with ' '"bobber cast" prior to running any tests.') @@ -281,6 +298,7 @@ def version_match(self, container: Container) -> bool: bool Returns `True` when the versions match and `False` when not. """ + self._verify_docker_running() if f'nvidia/bobber:{version}' not in container.image.tags: return False return True diff --git a/bobber/lib/exit_codes.py b/bobber/lib/exit_codes.py index dc65414..8a2ce49 100644 --- a/bobber/lib/exit_codes.py +++ b/bobber/lib/exit_codes.py @@ -8,3 +8,5 @@ CONTAINER_NOT_RUNNING = 32 # Bobber container not running NVIDIA_RUNTIME_ERROR = 33 # NVIDIA container runtime not found CONTAINER_VERSION_MISMATCH = 34 # Container different from application +SLURM_QUEUE_ERROR = 40 # Error queueing a SLURM job +SBATCH_CALL_ERROR = 41 # Error running sbatch diff --git a/bobber/lib/system/__init__.py b/bobber/lib/system/__init__.py index 548d2d4..827c2a2 100644 --- a/bobber/lib/system/__init__.py +++ b/bobber/lib/system/__init__.py @@ -1 +1,5 @@ # SPDX-License-Identifier: MIT +from bobber.lib.system import slurm + +run_dali = slurm.run_dali +run_nccl = slurm.run_nccl diff --git a/bobber/lib/system/slurm.py b/bobber/lib/system/slurm.py new file mode 100644 index 0000000..791c28f --- /dev/null +++ b/bobber/lib/system/slurm.py @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: MIT +import os +import subprocess +import sys +from argparse import Namespace +from bobber.lib.exit_codes import SBATCH_CALL_ERROR, SLURM_QUEUE_ERROR +from typing import NoReturn + + +def _slurm_scripts_path() -> str: + """ + Find the absolute path to the slurm_scripts directory. + + The slurm_scripts directory contains several *.sub files which are required + to launch test commands via SLURM. Depending on how and where Bobber is + installed on a system, the absolute path to this directory may change, but + the relative path is easy to find compared to this module. 
By allowing + Python to determine the absolute path to this module, the absolute path to + slurm_scripts can be found by combining the absolute path of this module + and the relative path to the slurm_scripts directory. + + Returns + ------- + str + Returns a ``string`` of the absolute path to the slurm_scripts + directory. + """ + directory = os.path.dirname(os.path.realpath(__file__)) + directory = os.path.join(directory, '../../slurm_scripts') + return directory + + +def _sbatch_path() -> str: + """ + Find the full path to the sbatch script. + + While launching a Python process without "shell=True" as is done for the + test commands below, the "sbatch" command is not available as Python + launches a new process without a proper PATH variable. Running "which + sbatch" with a shell instance provides the full path to sbatch which can + later be used directly to invoke the script directly instead of using the + alias. If sbatch is not installed on the system, the application will exit. + + Returns + ------- + str + Returns a ``string`` of the full local path to the sbatch script. + """ + result = subprocess.run('which sbatch', capture_output=True, shell=True) + if not result.stderr and result.stdout: + return str(result.stdout.strip().decode('ascii')) + else: + print('sbatch command not found. Please ensure SLURM is installed and ' + 'functional.') + sys.exit(SBATCH_CALL_ERROR) + + +def run_nccl(args: Namespace, version: str) -> NoReturn: + """ + Launch a multi-node NCCL test via SLURM. + + Launch a NCCL test for N-nodes managed by a SLURM cluster. Multiple tests + are queued-up as sbatch commands which will only launch once the previous + test has completed. + + Parameters + ---------- + args : Namespace + A ``Namespace`` of all settings specified by the user for the test. + version : str + A ``string`` of the Bobber version. 
+ """ + # Update the version to be used in filenames + version_underscore = version.replace('.', '_') + # If not sweeping, set the range of nodes from N-hosts to N-hosts for a + # single iteration of tests. + lower_bound = args.hosts + if args.sweep: + lower_bound = 1 + for hosts in range(lower_bound, args.hosts + 1): + for iteration in range(1, args.iterations + 1): + nccl_log = os.path.join(args.log_path, + f'nccl_iteration_{iteration}_' + f'gpus_{args.gpus}_' + f'nccl_max_{args.nccl_max}_' + f'gid_{args.compute_gid}_' + f'nccl_tc_{args.nccl_tc}_' + f'systems_{hosts}_' + f'version_{version_underscore}.log') + nccl_path = os.path.join(_slurm_scripts_path(), 'nccl.sub') + sbatch = _sbatch_path() + env = { + 'HOSTS': str(hosts), + 'FS_PATH': args.storage_path, + 'CONT_VERSION': f'nvcr.io/nvidian/bobber:{version}', + 'NCCL_MAX': str(args.nccl_max), + 'LOGDIR': args.log_path, + 'LOGPATH': nccl_log, + 'NCCL_IB_HCAS': args.nccl_ib_hcas, + 'COMPUTE_GID': str(args.compute_gid), + 'NCCL_TC': args.nccl_tc or '' + } + cmd = [f'{sbatch}', + '-N', + f'{hosts}', + f'--gpus-per-node={args.gpus}', + '--wait', + '--dependency=singleton', + f'{nccl_path}'] + try: + print('Running:', cmd) + subprocess.Popen(cmd, env=env) + except subprocess.CalledProcessError: + print('Error queueing SLURM job for NCCL tests. ' + 'See output for errors.') + sys.exit(SLURM_QUEUE_ERROR) + + +def run_dali(args: Namespace, version: str) -> NoReturn: + """ + Launch a multi-node DALI test via SLURM. + + Launch a DALI test for N-nodes managed by a SLURM cluster. Multiple tests + are queued-up as sbatch commands which will only launch once the previous + test has completed. + + Parameters + ---------- + args : Namespace + A ``Namespace`` of all settings specified by the user for the test. + version : str + A ``string`` of the Bobber version. 
+ """ + # Update the version to be used in filenames + version_underscore = version.replace('.', '_') + # If not sweeping, set the range of nodes from N-hosts to N-hosts for a + # single iteration of tests. + lower_bound = args.hosts + if args.sweep: + lower_bound = 1 + for hosts in range(lower_bound, args.hosts + 1): + for iteration in range(1, args.iterations + 1): + dali_log = os.path.join(args.log_path, + f'dali_iteration_{iteration}_' + f'gpus_{args.gpus}_' + f'batch_size_lg_{args.batch_size_lg}_' + f'batch_size_sm_{args.batch_size_sm}_' + f'systems_{hosts}_' + f'version_{version_underscore}.log') + dali_path = os.path.join(_slurm_scripts_path(), 'dali.sub') + sbatch = _sbatch_path() + env = { + 'HOSTS': str(hosts), + 'FS_PATH': args.storage_path, + 'CONT_VERSION': f'nvcr.io/nvidian/bobber:{version}', + 'GPUS': str(args.gpus), + 'LOGDIR': args.log_path, + 'LOGPATH': dali_log, + 'BATCH_SIZE_SM': str(args.batch_size_sm), + 'BATCH_SIZE_LG': str(args.batch_size_lg) + } + cmd = [f'{sbatch}', + '-N', + f'{hosts}', + f'--gpus-per-node={args.gpus}', + '--wait', + '--dependency=singleton', + f'{dali_path}'] + try: + print('Running:', cmd) + subprocess.Popen(cmd, env=env) + except subprocess.CalledProcessError: + print('Error queueing SLURM job for DALI tests. ' + 'See output for errors.') + sys.exit(SLURM_QUEUE_ERROR) + + +def run_meta(args: Namespace, version: str) -> NoReturn: + """ + Launch a multi-node metadata test via SLURM. + + Launch a metadata test for N-nodes managed by a SLURM cluster. Multiple + tests are queued-up as sbatch commands which will only launch once the + previous test has completed. + + Parameters + ---------- + args : Namespace + A ``Namespace`` of all settings specified by the user for the test. + version : str + A ``string`` of the Bobber version. 
+ """ + # Update the version to be used in filenames + version_underscore = version.replace('.', '_') + # If not sweeping, set the range of nodes from N-hosts to N-hosts for a + # single iteration of tests. + lower_bound = args.hosts + if args.sweep: + lower_bound = 1 + for hosts in range(lower_bound, args.hosts + 1): + for iteration in range(1, args.iterations + 1): + meta_log = os.path.join(args.log_path, + f'stg_meta_iteration_{iteration}_' + f'systems_{hosts}_' + f'version_{version_underscore}.log') + meta_path = os.path.join(_slurm_scripts_path(), 'mdtest.sub') + sbatch = _sbatch_path() + env = { + 'HOSTS': str(hosts), + 'FS_PATH': args.storage_path, + 'CONT_VERSION': f'nvcr.io/nvidian/bobber:{version}', + 'GPUS': str(args.gpus), + 'LOGDIR': args.log_path, + 'LOGPATH': meta_log + } + cmd = [f'{sbatch}', + '-N', + f'{hosts}', + '--wait', + '--dependency=singleton', + f'{meta_path}'] + try: + print('Running:', cmd) + subprocess.Popen(cmd, env=env) + except subprocess.CalledProcessError: + print('Error queueing SLURM job for metadata tests. 
' 'See output for errors.') + sys.exit(SLURM_QUEUE_ERROR) diff --git a/bobber/slurm_scripts/dali.sub b/bobber/slurm_scripts/dali.sub new file mode 100644 index 0000000..397e730 --- /dev/null +++ b/bobber/slurm_scripts/dali.sub @@ -0,0 +1,26 @@ +#!/bin/bash +#SBATCH --job-name bobber_dali +# SPDX-License-Identifier: MIT +set -euxo pipefail + +# Required vars +: "${HOSTS:=4}" +: "${FS_PATH:=/mnt/fs}" +: "${CONT_VERSION:=nvcr.io/nvidian/bobber:6.1.1}" +: "${LOGDIR:=test_logs/}" +: "${LOGPATH:=test_logs/dali.log}" +: "${BATCH_SIZE_LG:=150}" +: "${BATCH_SIZE_SM:=150}" + +mkdir -p ${LOGDIR} + +srun --nodes=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_setup.sh |& tee ${LOGPATH} +BATCH_SIZE=${BATCH_SIZE_SM} DATASET_PATH="/mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_slurm.sh |& tee -a ${LOGPATH} +srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3 +BATCH_SIZE=${BATCH_SIZE_LG} DATASET_PATH="/mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_slurm.sh |& tee -a ${LOGPATH} +srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3 +BATCH_SIZE=${BATCH_SIZE_SM} DATASET_PATH="/mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline/tfrecord-*" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_slurm.sh |& tee -a ${LOGPATH} +srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3 +BATCH_SIZE=${BATCH_SIZE_LG}
DATASET_PATH="/mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline/tfrecord-*" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_slurm.sh |& tee -a ${LOGPATH} +srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3 +srun --nodes=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/dali_cleanup.sh |& tee -a ${LOGPATH} diff --git a/bobber/slurm_scripts/mdtest.sub b/bobber/slurm_scripts/mdtest.sub new file mode 100644 index 0000000..8985cbc --- /dev/null +++ b/bobber/slurm_scripts/mdtest.sub @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --job-name bobber_mdtest +# SPDX-License-Identifier: MIT +set -euxo pipefail + +# Required vars +: "${HOSTS:=4}" +: "${FS_PATH:=/mnt/fs}" +: "${CONT_VERSION:=nvcr.io/nvidian/bobber:6.1.1}" +: "${LOGDIR:=test_logs/}" +: "${LOGPATH:=test_logs/mdtest.log}" + +mkdir -p ${LOGDIR} + +# Default to 44 threads per node for known working config +srun --nodes=${HOSTS} --ntasks-per-node=44 --mpi=pmix --exclusive --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/mdtest_slurm.sh |& tee ${LOGPATH} diff --git a/bobber/slurm_scripts/nccl.sub b/bobber/slurm_scripts/nccl.sub new file mode 100644 index 0000000..80a68dd --- /dev/null +++ b/bobber/slurm_scripts/nccl.sub @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name bobber_nccl +# SPDX-License-Identifier: MIT +set -euxo pipefail + +# Required vars +: "${HOSTS:=4}" +: "${FS_PATH:=/mnt/fs}" +: "${CONT_VERSION:=nvcr.io/nvidian/bobber:6.1.1}" +: "${NCCL_MAX:=1}" +: "${LOGDIR:=test_logs/}" +: "${LOGPATH:=test_logs/nccl.log}" +: "${NCCL_IB_HCAS:=}" +: "${COMPUTE_GID:=0}" +: "${NCCL_TC:=}" + +mkdir -p ${LOGDIR} + +NCCL_MAX=${NCCL_MAX} NCCL_IB_HCAS=${NCCL_IB_HCAS} COMPUTE_GID=${COMPUTE_GID} NCCL_TC=${NCCL_TC} srun --nodes=${HOSTS} --ntasks-per-node=8 --mpi=pmix
--exclusive --container-image ${CONT_VERSION} --container-mounts=${FS_PATH}:/mnt/fs_under_test /tests/nccl_slurm.sh |& tee ${LOGPATH} diff --git a/bobber/test_scripts/call_dali_slurm.sh b/bobber/test_scripts/call_dali_slurm.sh new file mode 100755 index 0000000..5355136 --- /dev/null +++ b/bobber/test_scripts/call_dali_slurm.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: MIT +if [ "x$GPUS" = "x" ]; then + GPUS=8 +fi + +if [ "x$BATCH_SIZE_SM" = "x" ]; then + BATCH_SIZE_SM=150 +fi + +if [ "x$BATCH_SIZE_LG" = "x" ]; then + BATCH_SIZE_LG=150 +fi + +if [[ "$DATASET" == *tfrecord* ]]; then + python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --tfrecord_pipeline_paths "$DATASET" +else + python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --file_read_pipeline_paths "$DATASET" +fi diff --git a/bobber/test_scripts/dali_cleanup.sh b/bobber/test_scripts/dali_cleanup.sh new file mode 100755 index 0000000..dae1753 --- /dev/null +++ b/bobber/test_scripts/dali_cleanup.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# SPDX-License-Identifier: MIT +rm -r /mnt/fs_under_test/imageinary_data diff --git a/bobber/test_scripts/dali_setup.sh b/bobber/test_scripts/dali_setup.sh new file mode 100755 index 0000000..132c661 --- /dev/null +++ b/bobber/test_scripts/dali_setup.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# SPDX-License-Identifier: MIT +if [ "x$GPUS" = "x" ]; then + GPUS=8 +fi + +GPUS_ZERO_BASE=$(($GPUS-1)) + +mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images +mkdir -p /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images +mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline +mkdir -p /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline +mkdir -p /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline.idx +mkdir -p 
/mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline.idx + +imagine create-images --path /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images --name 4k_image_ --width 3840 --height 2160 --count $(($GPUS*1000)) --image_format jpg --size +imagine create-images --path /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images --name small_image_ --width 800 --height 600 --count $(($GPUS*1000)) --image_format jpg --size + +imagine create-tfrecords --source_path /mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images/images --dest_path /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline --name tfrecord- --img_per_file 1000 +imagine create-tfrecords --source_path /mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images/images --dest_path /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline --name tfrecord- --img_per_file 1000 + +for i in $(seq 0 $GPUS_ZERO_BASE); do /dali/tools/tfrecord2idx /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline/tfrecord-$i /mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline.idx/tfrecord-$i; done +for i in $(seq 0 $GPUS_ZERO_BASE); do /dali/tools/tfrecord2idx /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline/tfrecord-$i /mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline.idx/tfrecord-$i; done diff --git a/bobber/test_scripts/dali_slurm.sh b/bobber/test_scripts/dali_slurm.sh new file mode 100755 index 0000000..e4c90b7 --- /dev/null +++ b/bobber/test_scripts/dali_slurm.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# SPDX-License-Identifier: MIT + +if [ "x$GPUS" = "x" ]; then + GPUS=8 +fi + +if [ "x$BATCH_SIZE" = "x" ]; then + BATCH_SIZE=150 +fi + +if [ "x$DATASET_PATH" = "x" ]; then + DATASET_PATH="/mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images" +fi +
+if [[ "$DATASET_PATH" == *tfrecord* ]]; then + python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --tfrecord_pipeline_paths "$DATASET_PATH" +else + python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --file_read_pipeline_paths "$DATASET_PATH" +fi diff --git a/bobber/test_scripts/mdtest_slurm.sh b/bobber/test_scripts/mdtest_slurm.sh new file mode 100755 index 0000000..17fd66e --- /dev/null +++ b/bobber/test_scripts/mdtest_slurm.sh @@ -0,0 +1,12 @@ +#!/bin/bash +#SBATCH --job-name bobber_mdtest +# SPDX-License-Identifier: MIT +set -euxo pipefail + +FSDIR=/mnt/fs_under_test +mkdir -p $FSDIR/mdtest + +# N-hosts * 44 (default thread count) processes +/io-500-dev/bin/mdtest -i 3 -I 4 -z 3 -b 8 -u -d $FSDIR/mdtest + +rm -rf $FSDIR/mdtest diff --git a/bobber/test_scripts/nccl_slurm.sh b/bobber/test_scripts/nccl_slurm.sh new file mode 100755 index 0000000..4a29e86 --- /dev/null +++ b/bobber/test_scripts/nccl_slurm.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-License-Identifier: MIT + +if [ "x$NCCL_IB_HCAS" = "x" ]; then + NCCL_IB_HCAS=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_6,mlx5_7,mlx5_8,mlx5_9 +fi + +if [ "x$NCCL_MAX" = "x" ]; then + NCCL_MAX=1 +fi + +if [ "x$COMPUTE_GID" = "x" ]; then + COMPUTE_GID=0 +fi + +if [ "x$NCCL_TC" = "x" ]; then + NCCL_TC='' +fi + +export NCCL_IB_HCA=$NCCL_IB_HCAS && \ + export NCCL_IB_TC=$NCCL_TC && \ + export NCCL_IB_GID_INDEX=$COMPUTE_GID && \ + export NCCL_IB_CUDA_SUPPORT=1 && \ + /nccl-tests/build/all_reduce_perf -b 8 -e ${NCCL_MAX}G -f 2 diff --git a/setup.py b/setup.py index ce61b05..d9832f4 100644 --- a/setup.py +++ b/setup.py @@ -18,12 +18,21 @@ 'bobber/lib/tests'], include_package_data=True, package_data={'': ['lib/docker/Dockerfile', + 'slurm_scripts/dali.sub', + 'slurm_scripts/mdtest.sub', + 'slurm_scripts/nccl.sub', 'test_scripts/call_dali_multi.sh', + 'test_scripts/call_dali_slurm.sh', + 'test_scripts/dali_cleanup.sh', 'test_scripts/dali_multi.sh', + 'test_scripts/dali_setup.sh', + 'test_scripts/dali_slurm.sh', 'test_scripts/fio_fill_single.sh', 'test_scripts/fio_multi.sh', 'test_scripts/mdtest_multi.sh', + 'test_scripts/mdtest_slurm.sh', 'test_scripts/nccl_multi.sh', +
'test_scripts/nccl_slurm.sh', 'test_scripts/setup_fio.sh']}, license='MIT', python_requires='>=3.6',