Commit 15630a3

s3 over rdma
Signed-off-by: Guy Margalit <[email protected]>
1 parent 5eeeb78 · commit 15630a3


57 files changed: +5167 -1747 lines

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 # BUILD
 node_modules
 /build
+/target
 /noobaa.rpm
 
 # TEST

Makefile

Lines changed: 8 additions & 1 deletion
@@ -84,6 +84,13 @@ endif
 BUILD_S3SELECT?=1
 BUILD_S3SELECT_PARQUET?=0
 
+GYP_DEFINES?=
+GYP_DEFINES+=$(if $(RDMA),"BUILD_RDMA_NAPI=1",)
+GYP_DEFINES+=$(if $(CUDA),"BUILD_CUDA_NAPI=1",)
+GYP_DEFINES+=$(if $(CUDA_PATH),"CUDA_PATH=$(CUDA_PATH)",)
+GYP_DEFINES+=$(if $(CUOBJ_PATH),"CUOBJ_PATH=$(CUOBJ_PATH)",)
+
+
 ## RPM VARIABLES
 DATE := $(shell date +'%Y%m%d')
 NOOBAA_PKG_VERSION := $(shell jq -r '.version' < ./package.json)
@@ -106,7 +113,7 @@ default: build
 # this target builds incrementally
 build:
 	npm install
-	npm run build
+	GYP_DEFINES='$(GYP_DEFINES)' npm run build --verbose
 .PHONY: build
 
 clean_build:
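
The new variables are forwarded to node-gyp through the environment, so the native RDMA/CUDA bindings are opted into from the make command line. A hedged sketch of what the updated build target effectively runs for `make RDMA=1 CUDA=1` (it simply substitutes the `$(if ...)` conditionals above, and assumes node-gyp picks up `GYP_DEFINES` from the environment as usual):

```bash
# Effective build step for `make RDMA=1 CUDA=1` (sketch; exact whitespace in
# the expanded GYP_DEFINES value is not significant)
npm install
GYP_DEFINES='"BUILD_RDMA_NAPI=1" "BUILD_CUDA_NAPI=1"' npm run build --verbose
```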

config.js

Lines changed: 9 additions & 0 deletions
@@ -958,6 +958,7 @@ config.NSFS_GLACIER_MIGRATE_LOG_THRESHOLD = 50 * 1024;
 config.ANONYMOUS_ACCOUNT_NAME = 'anonymous';
 
 config.NFSF_UPLOAD_STREAM_MEM_THRESHOLD = 8 * 1024 * 1024;
+config.NFSF_DOWNLOAD_STREAM_MEM_THRESHOLD = 8 * 1024 * 1024;
 
 // we want to change our handling related to EACCESS error
 config.NSFS_LIST_IGNORE_ENTRY_ON_EACCES = true;
@@ -1091,6 +1092,14 @@ config.DEFAULT_REGION = 'us-east-1';
 
 config.VACCUM_ANALYZER_INTERVAL = 86400000;
 
+
+//////////////
+///  RDMA  ///
+//////////////
+
+config.RDMA_ENABLED = true; // TODO STILL EXPERIMENTAL - should be false by default
+
+
 /////////////////////
 //                 //
 //    OVERRIDES    //
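
Until the default flips to `false`, a deployment that wants to keep the experimental RDMA path off can override the flag instead of editing `config.js`. A minimal sketch, assuming a non-containerized setup that honors `config.json` overrides in its configuration directory (the path and override mechanism below are assumptions; adjust them to your environment, and merge by hand if the file already exists):

```bash
# Sketch: disable the experimental RDMA path via a config override
# (assumes /etc/noobaa.conf.d is the config directory and that top-level
# keys in config.json override the corresponding config.js values)
echo '{ "RDMA_ENABLED": false }' > /etc/noobaa.conf.d/config.json
```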

cudaguy.c

Lines changed: 0 additions & 59 deletions
This file was deleted.

cudaguy.cpp

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
/*
Usage:
-----
CUDA_PATH="/usr/local/cuda"
CUOBJ_PATH="../cuObject-0.8.1-Linux_x86_64/src"
CUOBJ_LIBS="$CUOBJ_PATH/lib/libcuobjserver.so $CUOBJ_PATH/lib/libcuobjclient.so $CUOBJ_PATH/lib/libcufile.so.1.13.0 $CUOBJ_PATH/lib/libcufile_rdma.so.1.13.0"
g++ -o cudaguy cudaguy.cpp -I$CUDA_PATH/include/ -L$CUDA_PATH/lib64 -lcuda -I$CUOBJ_PATH/include/ $CUOBJ_LIBS
LD_PRELOAD=$CUOBJ_LIBS ./cudaguy
-----
*/

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "cuobjclient.h"
#include "protocol.h"
#include <cuda.h>

#define CU_TRY(fn) \
    do { \
        CUresult r = fn; \
        if (r != CUDA_SUCCESS) { \
            const char* cuda_err = ""; \
            cuGetErrorName(r, &cuda_err); \
            fprintf(stderr, "CUDA error: %s %s\n", cuda_err, #fn); \
            exit(1); \
        } \
    } while (0)

// RDMA "get" callback stub - logs the request and reports the full size as transferred
ssize_t
cuobj_get(
    const void* handle,
    char* ptr,
    size_t size,
    loff_t offset,
    const cufileRDMAInfo_t* rdma_info)
{
    fprintf(stderr, "cuobj_get: handle %p ptr %p size %zu offset %ld\n", handle, ptr, size, offset);
    return size;
}

// RDMA "put" callback stub - logs the request and reports the full size as transferred
ssize_t
cuobj_put(
    const void* handle,
    const char* ptr,
    size_t size,
    loff_t offset,
    const cufileRDMAInfo_t* rdma_info)
{
    fprintf(stderr, "cuobj_put: handle %p ptr %p size %zu offset %ld\n", handle, ptr, size, offset);
    return size;
}

int
main()
{
    size_t size = 8 * 1024 * 1024;
    CUdevice cuda_device = 0;
    CUdevice cuda_device2 = 0;
    CUcontext cuda_ctx = 0;
    CUdeviceptr cuda_ptr = 0;
    CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
    char* host_ptr = (char*)malloc(size);

    CU_TRY(cuInit(0));
    CU_TRY(cuDeviceGet(&cuda_device, 0));
    // CU_TRY(cuCtxCreate(&cuda_ctx, 0, cuda_device));
    CU_TRY(cuDevicePrimaryCtxRetain(&cuda_ctx, cuda_device));
    CU_TRY(cuCtxSetCurrent(cuda_ctx));
    fprintf(stderr, "CUDA initialized: device %d context %p\n", cuda_device, (void*)cuda_ctx);

    CU_TRY(cuCtxGetDevice(&cuda_device2));
    fprintf(stderr, "CUDA get device %d\n", cuda_device2);

    // allocate a GPU buffer and fill it with 'A' so the copy back to host is recognizable
    CU_TRY(cuMemAlloc(&cuda_ptr, size));
    CU_TRY(cuMemsetD8(cuda_ptr, 'A', size));
    CU_TRY(cuCtxSynchronize());
    fprintf(stderr, "CUDA allocated %p size %zu\n", (void*)cuda_ptr, size);

    CU_TRY(cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, cuda_ptr));
    fprintf(stderr, "CUDA buffer mem type: %d\n", mem_type);

    // register the stub callbacks with a cuObj client and exercise the descriptor/get path on the GPU buffer
    CUObjIOOps cuobj_ops = { .get = cuobj_get, .put = cuobj_put };
    cuObjClient cuobj_client(cuobj_ops);
    cuObjErr_t cuobj_err = cuobj_client.cuMemObjGetDescriptor((void*)cuda_ptr, size);
    fprintf(stderr, "cuObjClient::cuMemObjGetDescriptor: %d\n", cuobj_err);

    cuObjMemoryType_t cuobj_mem_type = cuObjClient::getMemoryType((void*)cuda_ptr);
    fprintf(stderr, "cuObjClient::getMemoryType: %d\n", cuobj_mem_type);

    ssize_t ret_size = cuobj_client.cuObjGet(NULL, (void*)cuda_ptr, size);
    fprintf(stderr, "cuObjClient::cuObjGet: %zd\n", ret_size);

    memset(host_ptr, 'B', size);
    CU_TRY(cuMemcpyDtoH(host_ptr, cuda_ptr, size));

    // skip repeating 'A' at the end, while keeping the first 10 chars,
    // and terminate the string for printing
    int i = size - 1;
    while (i > 10 && host_ptr[i] == 'A') --i;
    host_ptr[i] = '\0';
    fprintf(stderr, "CUDA copied to host: %s\n", host_ptr);

    free(host_ptr);
    CU_TRY(cuMemFree(cuda_ptr));
    CU_TRY(cuDevicePrimaryCtxRelease(cuda_device));
    // CU_TRY(cuCtxDestroy(cuda_ctx));
    fprintf(stderr, "CUDA freed\n");

    return 0;
}

docs/design/S3-over-RDMA.md

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
# S3 over RDMA (EXPERIMENTAL)

## Overview

S3 over RDMA is a new capability that delivers S3 data directly into the application's memory, or even directly into GPU memory. RDMA is extremely efficient: it bypasses the operating system, the TCP stack, and much of the networking CPU overhead. Layering S3 on top of RDMA is a natural fit for modern applications, and the same endpoints can serve both RDMA and non-RDMA clients, selected by a simple HTTP header.

This feature is still EXPERIMENTAL and is not yet available for production use. This document outlines the usage and design of the feature.

## What is needed to use S3 over RDMA?

Hardware:
- High performance RDMA network 100G/.../800G
- InfiniBand or RoCE (must support DC transport)
- Compute nodes with optional GPU devices and the NVIDIA CUDA toolkit
- Storage nodes with NVMe drives (can be the same as the compute nodes)

Software:
- RHEL / Ubuntu
- High performance file system (e.g. GPFS)
- NooBaa RPM / build from source with RDMA support
- NVIDIA's cuObject (beta) and cuFile RDMA libraries

## Which applications can benefit from S3 over RDMA?

- boto3 - the S3 SDK for Python applications
- s3-connector-for-pytorch - a library for AI/ML applications (data loaders, checkpoints, etc.)
- rclone - a standalone CLI that can copy data between files/dirs and S3
- nodejs - using aws-sdk-js-v3 to store data collected from web services
- (share your use case with us and we will add it to the list...)

## Let's dig right in

- Clone the noobaa-core repository
- Install the required dependencies (nodejs, nasm, etc. - see the noobaa-core README)
- Standard build - a simple `make` should succeed

Build the project with RDMA support:

```bash
$ make RDMA=1
```

or with RDMA and CUDA support:

```bash
$ make RDMA=1 CUDA=1
```
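
The Makefile also forwards `CUDA_PATH` and `CUOBJ_PATH` into the gyp defines when they are set, so a CUDA build can point at a specific toolkit and cuObject tree. A hedged example using the paths defined in the next step:

```bash
$ make RDMA=1 CUDA=1 \
    CUDA_PATH=/usr/local/cuda \
    CUOBJ_PATH=../cuObject-0.8.1-Linux_x86_64/src
```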

Define the following runtime variables:

```bash
CUDA_PATH="$(realpath /usr/local/cuda)"
CUOBJ_PATH="$(realpath ../cuObject-0.8.1-Linux_x86_64/src)"
CUFILE_ENV_PATH_JSON="$(realpath ../cuobj.json)"
CUOBJ_LIBS="$CUOBJ_PATH/lib/libcuobjserver.so $CUOBJ_PATH/lib/libcuobjclient.so $CUOBJ_PATH/lib/libcufile.so.1.13.0 $CUOBJ_PATH/lib/libcufile_rdma.so.1.13.0"
```

**NOTE**: If compilation fails to find `cuda_runtime.h`, use: `touch $CUOBJ_PATH/include/cuda_runtime.h`

Create the configuration directory as described in [this doc](https://github.com/noobaa/noobaa-core/blob/master/docs/NooBaaNonContainerized/GettingStarted.md#configuration) (no need to build and install the RPM because we build from source), and finally start the NooBaa server with RDMA support:

```bash
$ LD_PRELOAD=$CUOBJ_LIBS node src/cmd/nsfs
```
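
Before benchmarking, it can help to verify that the endpoint answers regular (non-RDMA) S3 requests. A small sanity-check sketch using the standard AWS CLI; the endpoint, credentials, and bucket below are the same example values used in the next section:

```bash
$ AWS_ACCESS_KEY_ID="AK" AWS_SECRET_ACCESS_KEY="SK" \
    aws s3 ls s3://bucket1 --endpoint-url http://172.16.0.61:6001
```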

## Getting Started

First we use the s3perf tool from the noobaa repo to test the RDMA performance. Here is a basic example that reads the same 8MB file continuously and reports the speed:

```bash
$ LD_PRELOAD="$CUOBJ_LIBS" \
    CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" \
    UV_THREADPOOL_SIZE=16 \
    DISABLE_INIT_RANDOM_SEED=true \
    node src/tools/s3perf.js \
    --endpoint http://172.16.0.61:6001 \
    --access_key "AK" --secret_key "SK" \
    --bucket bucket1 --get file8M --samekey \
    --time 120 --size_units MB --size 8 --concur 8 --forks 6 --rdma
```

This will output something like:

```sh
Feb-20 5:50:05.386 [/3039076] [LOG] CONSOLE:: S3: 11240.0 MB/sec (average 9650.2) | OPS: 1405 min:20.7ms max:50.8ms avg:34.2ms
Feb-20 5:50:06.386 [/3039076] [LOG] CONSOLE:: S3: 11216.0 MB/sec (average 9685.5) | OPS: 1402 min:20.3ms max:54.2ms avg:34.3ms
Feb-20 5:50:07.386 [/3039076] [LOG] CONSOLE:: S3: 11040.0 MB/sec (average 9715.4) | OPS: 1380 min:17.1ms max:55.8ms avg:34.7ms
Feb-20 5:50:08.387 [/3039076] [LOG] CONSOLE:: S3: 11024.0 MB/sec (average 9743.7) | OPS: 1378 min:17.4ms max:58.3ms avg:34.9ms
```

Remove the `--rdma` flag to compare the performance with and without RDMA:

```bash
Feb-20 5:53:16.867 [/3040865] [LOG] CONSOLE:: S3: 3931.9 MB/sec (average 3785.4) | OPS: 495 min:53.1ms max:169.3ms avg:98.0ms
Feb-20 5:53:17.869 [/3040865] [LOG] CONSOLE:: S3: 3918.4 MB/sec (average 3788.3) | OPS: 490 min:58.0ms max:161.3ms avg:98.0ms
Feb-20 5:53:18.869 [/3040865] [LOG] CONSOLE:: S3: 3978.2 MB/sec (average 3792.3) | OPS: 497 min:50.9ms max:157.1ms avg:97.2ms
Feb-20 5:53:19.871 [/3040865] [LOG] CONSOLE:: S3: 3949.0 MB/sec (average 3795.5) | OPS: 489 min:52.5ms max:159.1ms avg:96.6ms
```

In this example the RDMA run sustains roughly 2.5x the throughput of the plain HTTP run (about 9.7 GB/s vs. 3.8 GB/s on average).

The `--cuda` flag tests the performance using GPU memory, and it can be used with or without the `--rdma` flag. Currently this is failing; stay tuned. (A sketch of the intended invocation follows below.)

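The sketch below mirrors the earlier s3perf command and simply appends `--cuda`; the parameters are the same example values as above, and it is not expected to pass yet:

```bash
$ LD_PRELOAD="$CUOBJ_LIBS" \
    CUFILE_ENV_PATH_JSON="$CUFILE_ENV_PATH_JSON" \
    UV_THREADPOOL_SIZE=16 \
    DISABLE_INIT_RANDOM_SEED=true \
    node src/tools/s3perf.js \
    --endpoint http://172.16.0.61:6001 \
    --access_key "AK" --secret_key "SK" \
    --bucket bucket1 --get file8M --samekey \
    --time 120 --size_units MB --size 8 --concur 8 --forks 6 --rdma --cuda
```
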
## Next steps

- Integrate S3 over RDMA into Python applications
- Support multiple server IPs
- Optimization for GPFS
