Skip to content

S3 over RDMA - initial impl #8817

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .eslintrc.js
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ module.exports = {
'one-var': ['error', 'never'],

'@stylistic/js/space-before-function-paren': ['error', {
'anonymous': 'never',
'anonymous': 'ignore',
'named': 'never',
'asyncArrow': 'always'
}],
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# BUILD
node_modules
/build
/target
/noobaa.rpm

# TEST
Expand Down
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ endif
BUILD_S3SELECT?=1
BUILD_S3SELECT_PARQUET?=0

GYP_DEFINES?=
GYP_DEFINES+=$(if $(RDMA),"BUILD_RDMA_NAPI=1",)
GYP_DEFINES+=$(if $(CUDA),"BUILD_CUDA_NAPI=1",)
GYP_DEFINES+=$(if $(CUDA_PATH),"CUDA_PATH=$(CUDA_PATH)",)
GYP_DEFINES+=$(if $(CUOBJ_PATH),"CUOBJ_PATH=$(CUOBJ_PATH)",)


## RPM VARIABLES
DATE := $(shell date +'%Y%m%d')
NOOBAA_PKG_VERSION := $(shell jq -r '.version' < ./package.json)
Expand All @@ -106,7 +113,7 @@ default: build
# this target builds incrementally
build:
npm install
npm run build
GYP_DEFINES='$(GYP_DEFINES)' npm run build --verbose
.PHONY: build

clean_build:
Expand Down
15 changes: 12 additions & 3 deletions config.js
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,7 @@ config.NSFS_GLACIER_MIGRATE_LOG_THRESHOLD = 50 * 1024;
config.ANONYMOUS_ACCOUNT_NAME = 'anonymous';

config.NFSF_UPLOAD_STREAM_MEM_THRESHOLD = 8 * 1024 * 1024;
config.NFSF_DOWNLOAD_STREAM_MEM_THRESHOLD = 8 * 1024 * 1024;

// we want to change our handling related to EACCESS error
config.NSFS_LIST_IGNORE_ENTRY_ON_EACCES = true;
Expand Down Expand Up @@ -1091,6 +1092,14 @@ config.DEFAULT_REGION = 'us-east-1';

config.VACCUM_ANALYZER_INTERVAL = 86400000;


//////////////
/// RDMA ///
//////////////

config.RDMA_ENABLED = true; // TODO STILL EXPERIMENTAL - should be false by default


/////////////////////
// //
// OVERRIDES //
Expand Down Expand Up @@ -1173,7 +1182,7 @@ function _get_data_from_file(file_name) {
try {
data = fs.readFileSync(file_name).toString();
} catch (e) {
console.warn(`Error accrued while getting the data from ${file_name}: ${e}`);
// console.log(`Error accrued while getting the data from ${file_name}: ${e}`);
return;
}
return data;
Expand All @@ -1189,7 +1198,7 @@ function _get_config_root() {
const data = _get_data_from_file(redirect_path);
config_root = data.toString().trim();
} catch (err) {
console.warn('config.get_config_root - could not find custom config_root, will use the default config_root ', config_root);
// console.log('config.get_config_root - could not find custom config_root, will use the default config_root ', config_root);
}
return config_root;
}
Expand Down Expand Up @@ -1244,7 +1253,7 @@ function load_nsfs_nc_config() {
try {
if (!config.NSFS_NC_CONF_DIR) {
config.NSFS_NC_CONF_DIR = _get_config_root();
console.warn('load_nsfs_nc_config.setting config.NSFS_NC_CONF_DIR', config.NSFS_NC_CONF_DIR);
// console.warn('load_nsfs_nc_config.setting config.NSFS_NC_CONF_DIR', config.NSFS_NC_CONF_DIR);
}
const config_path = path.join(config.NSFS_NC_CONF_DIR, 'config.json');
const config_data = require(config_path);
Expand Down
113 changes: 113 additions & 0 deletions cudaguy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
Usage:
-----
CUDA_PATH="/usr/local/cuda"
CUOBJ_PATH="../cuObject-0.8.1-Linux_x86_64/src"
CUOBJ_LIBS="$CUOBJ_PATH/lib/libcuobjserver.so $CUOBJ_PATH/lib/libcuobjclient.so $CUOBJ_PATH/lib/libcufile.so.1.13.0 $CUOBJ_PATH/lib/libcufile_rdma.so.1.13.0"
g++ -o cudaguy cudaguy.cpp -I$CUDA_PATH/include/ -L$CUDA_PATH/lib64 -lcuda -I$CUOBJ_PATH/include/ $CUOBJ_LIBS
LD_PRELOAD=$CUOBJ_LIBS ./cudaguy
-----
*/

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "cuobjclient.h"
#include "protocol.h"
#include <cuda.h>

// CU_TRY: execute a CUDA Driver API call and abort the process (exit(1))
// with a readable error name if it returns anything but CUDA_SUCCESS.
// The do/while(0) wrapper makes the macro safe to use as a single statement
// (e.g. inside an un-braced if/else).
#define CU_TRY(fn) \
do { \
CUresult r = fn; \
if (r != CUDA_SUCCESS) { \
const char* cuda_err = ""; \
cuGetErrorName(r, &cuda_err); \
fprintf(stderr, "CUDA error: %s %s\n", cuda_err, #fn); \
exit(1); \
} \
} while (0)

/**
 * Stub GET callback matching the CUObjIOOps.get signature.
 * Only logs the request parameters and reports full success by returning
 * `size`; no data is actually transferred (this is a standalone test tool).
 *
 * NOTE(review): rdma_info is intentionally unused here — the real server-side
 * implementation would use it to post the RDMA transfer.
 */
ssize_t
cuobj_get(
    const void* handle,
    char* ptr,
    size_t size,
    loff_t offset,
    const cufileRDMAInfo_t* rdma_info)
{
    // loff_t is `long long` on glibc; cast explicitly so the format
    // specifier is correct on both 32-bit and 64-bit targets.
    fprintf(stderr, "cuobj_get: handle %p ptr %p size %zu offset %lld\n",
        handle, ptr, size, (long long)offset);
    return size;
}

/**
 * Stub PUT callback matching the CUObjIOOps.put signature.
 * Only logs the request parameters and reports full success by returning
 * `size`; no data is actually transferred (this is a standalone test tool).
 *
 * NOTE(review): rdma_info is intentionally unused here — the real server-side
 * implementation would use it to post the RDMA transfer.
 */
ssize_t
cuobj_put(
    const void* handle,
    const char* ptr,
    size_t size,
    loff_t offset,
    const cufileRDMAInfo_t* rdma_info)
{
    // loff_t is `long long` on glibc; cast explicitly so the format
    // specifier is correct on both 32-bit and 64-bit targets.
    fprintf(stderr, "cuobj_put: handle %p ptr %p size %zu offset %lld\n",
        handle, ptr, size, (long long)offset);
    return size;
}

/**
 * Standalone sanity test for CUDA + cuObject client integration:
 * initializes the driver API, allocates a device buffer filled with 'A',
 * registers it with cuObjClient, issues a dummy GET served by the logging
 * stubs above, and verifies the buffer contents by copying back to host.
 * Any CUDA failure aborts via CU_TRY; returns 0 on success.
 */
int
main()
{
    size_t size = 8 * 1024 * 1024; // 8 MB test buffer
    CUdevice cuda_device = 0;
    CUdevice cuda_device2 = 0;
    CUcontext cuda_ctx = 0;
    CUdeviceptr cuda_ptr = 0;
    CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
    char* host_ptr = (char*)malloc(size);
    if (!host_ptr) {
        fprintf(stderr, "malloc failed for %zu bytes\n", size);
        return 1;
    }

    CU_TRY(cuInit(0));
    CU_TRY(cuDeviceGet(&cuda_device, 0));
    // Use the device's primary context (shared with CUDA runtime users)
    // instead of creating a private one; released symmetrically below.
    // CU_TRY(cuCtxCreate(&cuda_ctx, 0, cuda_device));
    CU_TRY(cuDevicePrimaryCtxRetain(&cuda_ctx, cuda_device));
    CU_TRY(cuCtxSetCurrent(cuda_ctx));
    fprintf(stderr, "CUDA initialized: device %d context %p\n", cuda_device, (void*)cuda_ctx);

    // Sanity check: the current context should report the same device.
    CU_TRY(cuCtxGetDevice(&cuda_device2));
    fprintf(stderr, "CUDA get device %d\n", cuda_device2);

    // Fill the device buffer with 'A' so the round-trip can be verified.
    CU_TRY(cuMemAlloc(&cuda_ptr, size));
    CU_TRY(cuMemsetD8(cuda_ptr, 'A', size));
    CU_TRY(cuCtxSynchronize());
    fprintf(stderr, "CUDA allocated %p size %zu\n", (void*)cuda_ptr, size);

    CU_TRY(cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, cuda_ptr));
    fprintf(stderr, "CUDA buffer mem type: %d\n", mem_type);

    // Register the device buffer with the cuObject client, then issue a
    // dummy GET (handle=NULL) that is served by the logging stubs above.
    CUObjIOOps cuobj_ops = { .get = cuobj_get, .put = cuobj_put };
    cuObjClient cuobj_client(cuobj_ops);
    cuObjErr_t cuobj_err = cuobj_client.cuMemObjGetDescriptor((void*)cuda_ptr, size);
    fprintf(stderr, "cuObjClient::cuMemObjGetDescriptor: %d\n", cuobj_err);

    cuObjMemoryType_t cuobj_mem_type = cuObjClient::getMemoryType((void*)cuda_ptr);
    fprintf(stderr, "cuObjClient::getMemoryType: %d\n", cuobj_mem_type);

    ssize_t ret_size = cuobj_client.cuObjGet(NULL, (void*)cuda_ptr, size);
    fprintf(stderr, "cuObjClient::cuObjGet: %zd\n", ret_size);

    // Pre-fill host memory with 'B' so stale bytes cannot masquerade as a
    // successful copy, then pull the device buffer down.
    memset(host_ptr, 'B', size);
    CU_TRY(cuMemcpyDtoH(host_ptr, cuda_ptr, size));

    // skip repeating 'A' at the end, while keeping the first 10 chars,
    // and terminate the string for printing
    int i = size - 1;
    while (i > 10 && host_ptr[i] == 'A') --i;
    host_ptr[i] = '\0';
    fprintf(stderr, "CUDA copied to host: %s\n", host_ptr);

    free(host_ptr);
    CU_TRY(cuMemFree(cuda_ptr));
    CU_TRY(cuDevicePrimaryCtxRelease(cuda_device));
    // CU_TRY(cuCtxDestroy(cuda_ctx));
    fprintf(stderr, "CUDA freed\n");

    return 0;
}
109 changes: 109 additions & 0 deletions docs/design/S3-over-RDMA.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# S3 over RDMA (EXPERIMENTAL)

## Overview

S3 over RDMA is a new technology that enhances I/O performance directly to the application's memory, or directly to GPU memory! RDMA is extremely efficient: it bypasses the operating system, the TCP stack, and much of the networking CPU overhead. Layering S3 on top of RDMA fits like a glove for modern applications. And the same endpoints can serve both RDMA and non-RDMA clients with a simple HTTP header.

This feature is still EXPERIMENTAL and is not yet available for production use. This document outlines the usage and design of this feature.

## What is needed to use S3 over RDMA?

Hardware:
- High performance RDMA network 100G/.../800G
- Infiniband or RoCE (must support DC transport)
- Compute Nodes with optional GPU devices and NVIDIA CUDA toolkit
- Storage Nodes with NVMe drives, can be same as compute nodes

Software:
- RHEL / UBUNTU
- High performance file system (e.g GPFS)
- NooBaa RPM / build from source with RDMA support.
- NVIDIA's cuObject (beta) and cuFile RDMA libraries.


## Which applications can benefit from S3 over RDMA?

- boto3 - S3 sdk for python applications
- s3-connector-for-pytorch - library for AI/ML applications (data loaders, checkpoints, etc.)
- rclone - a standalone CLI that can copy data between files/dirs and S3
- nodejs - using aws-sdk-js-v3 to store data collected from web services
- (share with us your use case and we will add to the list...)

## Lets dig right in

- Clone the noobaa-core repository
- Install the required dependencies (nodejs, nasm, etc. - see the noobaa-core README)
- Standard build - simple `make` should succeed.

Build the project with RDMA support:

```bash
$ make RDMA=1
```

or with RDMA and CUDA support:

```bash
$ make RDMA=1 CUDA=1
```

Define the following runtime variables:

```bash
CUDA_PATH="$(realpath /usr/local/cuda)"
CUOBJ_PATH="$(realpath ../cuObject-0.8.1-Linux_x86_64/src)"
CUFILE_ENV_PATH_JSON="$(realpath ../cuobj.json)"
CUOBJ_LIBS="$CUOBJ_PATH/lib/libcuobjserver.so $CUOBJ_PATH/lib/libcuobjclient.so $CUOBJ_PATH/lib/libcufile.so.1.13.0 $CUOBJ_PATH/lib/libcufile_rdma.so.1.13.0"
```

**NOTE**: If compilation fails to find cuda_runtime.h use: `touch $CUOBJ_PATH/include/cuda_runtime.h`

Create the configuration directory as described in [this doc](https://github.com/noobaa/noobaa-core/blob/master/docs/NooBaaNonContainerized/GettingStarted.md#configuration) (no need to build and install RPM because we build from source), and finally start the noobaa server with RDMA support:

```bash
$ LD_PRELOAD=$CUOBJ_LIBS node src/cmd/nsfs
```

## Getting Started

First we use the s3perf tool in the noobaa repo to test the RDMA performance. Here is a basic example that continuously reads the same 8MB file for 120 seconds and reports the speed:

```bash
$ LD_PRELOAD="$CUOBJ_LIBS" \
CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" \
UV_THREADPOOL_SIZE=16 \
DISABLE_INIT_RANDOM_SEED=true \
node src/tools/s3perf.js \
--endpoint http://172.16.0.61:6001 \
--access_key "AK" --secret_key "SK" \
--bucket bucket1 --get file8M --samekey \
--time 120 --size_units MB --size 8 --concur 8 --forks 6 --rdma
```

Will output something like:

```sh
Feb-20 5:50:05.386 [/3039076] [LOG] CONSOLE:: S3: 11240.0 MB/sec (average 9650.2) | OPS: 1405 min:20.7ms max:50.8ms avg:34.2ms
Feb-20 5:50:06.386 [/3039076] [LOG] CONSOLE:: S3: 11216.0 MB/sec (average 9685.5) | OPS: 1402 min:20.3ms max:54.2ms avg:34.3ms
Feb-20 5:50:07.386 [/3039076] [LOG] CONSOLE:: S3: 11040.0 MB/sec (average 9715.4) | OPS: 1380 min:17.1ms max:55.8ms avg:34.7ms
Feb-20 5:50:08.387 [/3039076] [LOG] CONSOLE:: S3: 11024.0 MB/sec (average 9743.7) | OPS: 1378 min:17.4ms max:58.3ms avg:34.9ms
```

Remove the --rdma flag to compare the performance with and without RDMA.

```bash
Feb-20 5:53:16.867 [/3040865] [LOG] CONSOLE:: S3: 3931.9 MB/sec (average 3785.4) | OPS: 495 min:53.1ms max:169.3ms avg:98.0ms
Feb-20 5:53:17.869 [/3040865] [LOG] CONSOLE:: S3: 3918.4 MB/sec (average 3788.3) | OPS: 490 min:58.0ms max:161.3ms avg:98.0ms
Feb-20 5:53:18.869 [/3040865] [LOG] CONSOLE:: S3: 3978.2 MB/sec (average 3792.3) | OPS: 497 min:50.9ms max:157.1ms avg:97.2ms
Feb-20 5:53:19.871 [/3040865] [LOG] CONSOLE:: S3: 3949.0 MB/sec (average 3795.5) | OPS: 489 min:52.5ms max:159.1ms avg:96.6ms
```

The --cuda flag tests the performance using the GPU memory. It can be used with or without the --rdma flag. Currently this is failing. Stay tuned.

## Next steps

- Integrate S3 over RDMA to python applications
- Support multiple Server IP's
- Optimization for GPFS
Loading
Loading