Skip to content

Commit 553a5c3

Browse files
authored
rpc : do not wait for response when sending RPC_CMD_SET_TENSOR (#12943)
RPC_CMD_SET_TENSOR always returns an empty response, and we send this command 4 times per token. We can improve text-generation (TG) speed by not waiting for this empty response. The performance impact of this change depends on the network latency.
1 parent 13be08d commit 553a5c3

File tree

2 files changed

+13
-7
lines changed

2 files changed

+13
-7
lines changed

ggml/include/ggml-rpc.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
extern "C" {
88
#endif
99

10-
#define RPC_PROTO_MAJOR_VERSION 1
10+
#define RPC_PROTO_MAJOR_VERSION 2
1111
#define RPC_PROTO_MINOR_VERSION 0
1212
#define RPC_PROTO_PATCH_VERSION 0
1313
#define GGML_RPC_MAX_SERVERS 16

ggml/src/ggml-rpc/ggml-rpc.cpp

+12-6
Original file line numberDiff line numberDiff line change
@@ -378,8 +378,8 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int
378378
}
379379

380380
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
381-
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
382-
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
381+
// No response
382+
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size) {
383383
uint8_t cmd_byte = cmd;
384384
if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) {
385385
return false;
@@ -390,6 +390,15 @@ static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cm
390390
if (!send_data(sock->fd, input, input_size)) {
391391
return false;
392392
}
393+
return true;
394+
}
395+
396+
// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) |
397+
// RPC response: | response_size (8 bytes) | response_data (response_size bytes) |
398+
static bool send_rpc_cmd(const std::shared_ptr<socket_t> & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) {
399+
if (!send_rpc_cmd(sock, cmd, input, input_size)) {
400+
return false;
401+
}
393402
// TODO: currently the output_size is always known, do we need support for commands with variable output size?
394403
// even if we do, we can skip sending output_size from the server for commands with known output size
395404
uint64_t out_size;
@@ -555,7 +564,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
555564
memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
556565
memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
557566
memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size);
558-
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
567+
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
559568
GGML_ASSERT(status);
560569
}
561570

@@ -1428,9 +1437,6 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
14281437
if (!server.set_tensor(input)) {
14291438
return;
14301439
}
1431-
if (!send_msg(sockfd, nullptr, 0)) {
1432-
return;
1433-
}
14341440
break;
14351441
}
14361442
case RPC_CMD_SET_TENSOR_HASH: {

0 commit comments

Comments
 (0)