Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ build/*
dist/*
env/*
nvidia_bobber.egg-info/
*.out
23 changes: 21 additions & 2 deletions bobber/bobber.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: MIT
import bobber.lib.docker
import json
import sys
from argparse import ArgumentParser, ArgumentTypeError, Namespace
from copy import copy
from bobber import __version__
Expand Down Expand Up @@ -88,11 +89,20 @@ def parse_args(version: str) -> Namespace:

# More general options which apply to a majority of the running commands
# Note that arguments prefixed with '--' are optional, except that
# '--storage-path' becomes required when '--slurm' is passed
commands_parent.add_argument('--slurm', help='Run a test on an existing '
'SLURM cluster with Pyxis/Enroot installed',
action='store_true')
commands_parent.add_argument('--storage-path', help='Path at which the '
'filesystem under test is mounted',
required='--slurm' in sys.argv)
commands_parent.add_argument('log_path', metavar='log-path', help='Path '
'used to store log files on the head node')
commands_parent.add_argument('hosts', help='Comma-separated list of '
commands_parent.add_argument('hosts', help='Number of hosts to queue a '
'job for in a SLURM cluster.' if '--slurm'
in sys.argv else 'Comma-separated list of '
'hostnames or IP addresses',
type=unique_hosts)
type=int if '--slurm' in sys.argv
else unique_hosts)
commands_parent.add_argument('--config-path', help='Read a JSON config '
'file with expected parameters and use those '
'values for testing. Ignores all other '
Expand Down Expand Up @@ -384,6 +394,15 @@ def execute_command(args: Namespace, version: str) -> NoReturn:
bobber.lib.docker.cast(args.storage_path, args.ignore_gpu, version)
elif args.command == LOAD:
bobber.lib.docker.load(args.filename)
elif args.slurm and args.command == RUN_NCCL:
args = load_settings(args)
bobber.lib.system.slurm.run_nccl(args, version)
elif args.slurm and args.command == RUN_DALI:
args = load_settings(args)
bobber.lib.system.slurm.run_dali(args, version)
elif args.slurm and args.command == RUN_STG_META:
args = load_settings(args)
bobber.lib.system.slurm.run_meta(args, version)
else:
# Update the version to be used in filenames
version_underscore = version.replace('.', '_')
Expand Down
40 changes: 40 additions & 0 deletions bobber/lib/analysis/dali.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,42 @@ def _update_results(image_type_match: dict, results: list) -> dict:
return image_type_match


def _slurm_test_sections(log_contents: str) -> list:
"""
Parse the SLURM log test sections.

The SLURM log files for DALI tests have a different structure to the output
which needs to be special-handled. These sections are parsed by reading
from the beginning of one sub-section (ie. small JPGs) until the first time
the next sub-section is encountered (ie. large JPGs).

Parameters
----------
log_contents : str
A ``string`` of the complete contents from the log file.

Returns
-------
list
Returns a ``list`` of strings where each element is the complete output
from a test subsection.
"""
small_jpg = re.findall('800x600/file_read_pipeline.*'
'?3840x2160/file_read_pipeline',
log_contents, re.DOTALL)
large_jpg = re.findall('3840x2160/file_read_pipeline.*'
'?800x600/tfrecord_pipeline',
log_contents, re.DOTALL)
small_tf = re.findall('800x600/tfrecord_pipeline.*'
'?3840x2160/tfrecord_pipeline',
log_contents, re.DOTALL)
large_tf = re.findall('3840x2160/tfrecord_pipeline.*'
'OK', log_contents, re.DOTALL)
sections = [small_jpg, large_jpg, small_tf, large_tf]
sections = ['\n'.join(section) for section in sections]
return sections


def _result_parsing(log_contents: str, systems: int, image_results: dict,
log_file: str) -> dict:
"""
Expand Down Expand Up @@ -188,6 +224,10 @@ def _result_parsing(log_contents: str, systems: int, image_results: dict,
]

test_sections = re.findall(r'RUN 1/1.*?OK', log_contents, re.DOTALL)
# The SLURM tests have a different layout and need to be grabbed
# appropriately
if '+ srun --nodes=' in log_contents:
test_sections = _slurm_test_sections(log_contents)
if len(test_sections) != 4:
print(f'Warning: Invalid number of results found in {log_file} log '
'file. Skipping...')
Expand Down
24 changes: 21 additions & 3 deletions bobber/lib/docker/management.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,23 @@ def __init__(self) -> NoReturn:
try:
self.client = docker.from_env()
self.cli = docker.APIClient(timeout=600)
self.docker_running = True
except docker.errors.DockerException as e:
if 'error while fetching server api version' in str(e).lower():
print('Error: Could not communicate with the Docker daemon.')
print('Ensure Docker is running with "systemctl start docker"')
sys.exit(DOCKER_COMMUNICATION_ERROR)
self.docker_running = False

def _verify_docker_running(self, *args, **kwargs) -> None:
"""
Raise a DOCKER_COMMUNICATION_ERROR when Docker isn't running.

If a command is attempted to be run that requires Docker and Docker is
either not installed or not running, an error needs to be raised
gracefully to the user.
"""
if not self.docker_running:
print('Error: Could not communicate with the Docker daemon.')
print('Ensure Docker is running with "systemctl start docker"')
sys.exit(DOCKER_COMMUNICATION_ERROR)

def _build_if_not_built(self, tag: str, bobber_version: str) -> NoReturn:
"""
Expand Down Expand Up @@ -102,6 +114,7 @@ def cast(self, storage_path: str, ignore_gpu: bool,
bobber_version : string
A ``string`` of the local version of Bobber, such as '5.0.0'.
"""
self._verify_docker_running()
tag = self.get_tag(bobber_version)
self._build_if_not_built(tag, bobber_version)
runtime = None
Expand Down Expand Up @@ -155,6 +168,7 @@ def export(self, bobber_version: str) -> NoReturn:
bobber_version : string
A ``string`` of the local version of Bobber, such as '5.0.0'.
"""
self._verify_docker_running()
tag = self.get_tag(bobber_version)
self._build_if_not_built(tag, bobber_version)
filename = tag.replace('/', '_').replace(':', '_')
Expand All @@ -177,6 +191,7 @@ def build(self, bobber_version: str) -> NoReturn:
bobber_version : string
A ``string`` of the local version of Bobber, such as '5.0.0'.
"""
self._verify_docker_running()
tag = self.get_tag(bobber_version)
print('Building a new image. This may take a while...')
# Set the path to the repository's parent directory.
Expand Down Expand Up @@ -208,6 +223,7 @@ def load(self, filename: str) -> NoReturn:
A ``string`` of the filename for the local tarball to load, such as
'./nvidia_bobber_5.0.0.tar'.
"""
self._verify_docker_running()
print(f'Importing {filename}. This may take a while...')
with open(filename, 'rb') as image_file:
self.client.images.load(image_file)
Expand All @@ -233,6 +249,7 @@ def execute(self, command: str, environment: Optional[dict] = None,
log_file : string (Optional)
A ``string`` of the path and filename to optionally save output to.
"""
self._verify_docker_running()
if not self.running:
print('Bobber container not running. Launch a container with '
'"bobber cast" prior to running any tests.')
Expand Down Expand Up @@ -281,6 +298,7 @@ def version_match(self, container: Container) -> bool:
bool
Returns `True` when the versions match and `False` when not.
"""
self._verify_docker_running()
if f'nvidia/bobber:{version}' not in container.image.tags:
return False
return True
Expand Down
2 changes: 2 additions & 0 deletions bobber/lib/exit_codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@
CONTAINER_NOT_RUNNING = 32 # Bobber container not running
NVIDIA_RUNTIME_ERROR = 33 # NVIDIA container runtime not found
CONTAINER_VERSION_MISMATCH = 34 # Container different from application
SLURM_QUEUE_ERROR = 40 # Error queueing a SLURM job
SBATCH_CALL_ERROR = 41 # Error running sbatch
4 changes: 4 additions & 0 deletions bobber/lib/system/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
# SPDX-License-Identifier: MIT
from bobber.lib.system import slurm

# Package-level aliases so slurm.run_dali and slurm.run_nccl can be reached
# directly from this package.
# NOTE(review): bobber.py also calls slurm.run_meta, which has no alias here -
# confirm whether that omission is intentional.
run_dali = slurm.run_dali
run_nccl = slurm.run_nccl
Loading