diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini index ff148b2714..939bea21b6 100644 --- a/rel/overlay/etc/default.ini +++ b/rel/overlay/etc/default.ini @@ -724,6 +724,14 @@ partitioned||* = true ; *.example.com:443:[2001:db8::1]:443 ;connect_to = +; Compress outbound replication request bodies (_bulk_docs, _revs_diff) with gzip. +; Disabled by default. Only gzip is supported. Enable only when talking to CouchDB +; servers that support gzip Content-Encoding on inbound requests. +;compress_requests = false +;compress_min_size = 1024 +;compression_algorithm = gzip + + ; Some socket options that might boost performance in some scenarios: ; {nodelay, boolean()} ; {sndbuf, integer()} diff --git a/src/couch_replicator/COMPRESSION_CONFIG.md b/src/couch_replicator/COMPRESSION_CONFIG.md new file mode 100644 index 0000000000..a7e59c8c3d --- /dev/null +++ b/src/couch_replicator/COMPRESSION_CONFIG.md @@ -0,0 +1,16 @@ +# CouchDB Replicator Request Compression + +The replicator can optionally gzip-compress outbound request bodies for +`_bulk_docs` and `_revs_diff`. This reduces bandwidth during replication. +CouchDB already supports `Content-Encoding: gzip` on inbound requests, so no +server-side changes are needed. + +Compression is disabled by default. Relevant `[replicator]` config keys: + +```ini +[replicator] +compress_requests = false +compress_min_size = 1024 ; minimum body size in bytes before compressing +``` + +Metric: `couch_replicator.requests_compressed.gzip` — number of gzip-compressed requests sent. diff --git a/src/couch_replicator/priv/stats_descriptions.cfg b/src/couch_replicator/priv/stats_descriptions.cfg index 10821d8851..546b8af38e 100644 --- a/src/couch_replicator/priv/stats_descriptions.cfg +++ b/src/couch_replicator/priv/stats_descriptions.cfg @@ -146,3 +146,8 @@ {type, counter}, {desc, <<"number of times DNS overrides were applied to replication requests">>} ]}. + +{[couch_replicator, requests_compressed, gzip], [ + {type, counter}, + {desc, <<"number of HTTP requests compressed with gzip by the replicator">>} +]}. diff --git a/src/couch_replicator/src/couch_replicator_api_wrap.erl b/src/couch_replicator/src/couch_replicator_api_wrap.erl index 9364757d6c..b10288a36c 100644 --- a/src/couch_replicator/src/couch_replicator_api_wrap.erl +++ b/src/couch_replicator/src/couch_replicator_api_wrap.erl @@ -171,13 +171,14 @@ ensure_full_commit(#httpdb{} = Db) -> get_missing_revs(#httpdb{} = Db, IdRevs) -> JsonBody = {[{Id, couch_doc:revs_to_strs(Revs)} || {Id, Revs} <- IdRevs]}, + {Body, ExtraHeaders} = maybe_compress(?JSON_ENCODE(JsonBody)), send_req( Db, [ {method, post}, {path, "_revs_diff"}, - {body, ?JSON_ENCODE(JsonBody)}, - {headers, [{"Content-Type", "application/json"}]} + {body, Body}, + {headers, [{"Content-Type", "application/json"} | ExtraHeaders]} ], fun (200, _, {Props}) -> @@ -477,7 +478,7 @@ update_docs(#httpdb{} = HttpDb, DocList, Options, UpdateType) -> % Note: nginx and other servers don't like PUT/POST requests without % a Content-Length header, so we can't do a chunked transfer encoding % and JSON encode each doc only before sending it through the socket. - {Docs, Len} = lists:mapfoldl( + {Docs, _} = lists:mapfoldl( fun (#doc{} = Doc, Acc) -> Json = ?JSON_ENCODE(couch_doc:to_json_obj(Doc, [revs, attachments])), @@ -485,32 +486,27 @@ update_docs(#httpdb{} = HttpDb, DocList, Options, UpdateType) -> (Doc, Acc) -> {Doc, Acc + iolist_size(Doc)} end, - byte_size(Prefix) + byte_size(Suffix) + length(DocList) - 1, + 0, DocList ), - BodyFun = fun - (eof) -> - eof; - ([]) -> - {ok, Suffix, eof}; - ([prefix | Rest]) -> - {ok, Prefix, Rest}; - ([Doc]) -> - {ok, Doc, []}; - ([Doc | RestDocs]) -> - {ok, [Doc, ","], RestDocs} - end, + % Collect the full body into a binary so we can optionally gzip it. + % Content-Length is required (nginx etc. reject chunked PUT/POST). + RawBody = iolist_to_binary( + [Prefix, lists:join(",", Docs), Suffix] + ), + {Body, ExtraHeaders} = maybe_compress(RawBody), Headers = [ - {"Content-Length", Len}, + {"Content-Length", byte_size(Body)}, {"Content-Type", "application/json"}, {"X-Couch-Full-Commit", FullCommit} + | ExtraHeaders ], send_req( HttpDb, [ {method, post}, {path, "_bulk_docs"}, - {body, {BodyFun, [prefix | Docs]}}, + {body, Body}, {headers, Headers} ], fun @@ -1052,6 +1048,33 @@ header_value(Key, Headers, Default) -> _ -> Default end. + +%% Compress Body with gzip if enabled and body is large enough. +%% Returns {Body, ExtraHeaders} where ExtraHeaders may contain Content-Encoding. +maybe_compress(Body) when is_binary(Body) -> + case config:get_boolean("replicator", "compress_requests", false) of + true -> + Algorithm = config:get("replicator", "compression_algorithm", "gzip"), + MinSize = config:get_integer("replicator", "compress_min_size", 1024), + case byte_size(Body) >= MinSize of + true -> compress_with(Algorithm, Body); + false -> {Body, []} + end; + false -> + {Body, []} + end. + +compress_with("gzip", Body) -> + Compressed = zlib:gzip(Body), + couch_stats:increment_counter([couch_replicator, requests_compressed]), + couch_stats:increment_counter([couch_replicator, requests_compressed, gzip]), + {Compressed, [{"Content-Encoding", "gzip"}]}; +compress_with(Other, Body) -> + couch_log:warning( + "~p: unsupported compression_algorithm ~p, skipping compression", + [?MODULE, Other] + ), + {Body, []}. % Normalize an #httpdb{} or #db{} record such that it can be used for % comparisons. This means remove things like pids and also sort options / props. diff --git a/src/couch_replicator/src/couch_replicator_httpc.erl b/src/couch_replicator/src/couch_replicator_httpc.erl index 7f4f43afd5..4dd576b343 100644 --- a/src/couch_replicator/src/couch_replicator_httpc.erl +++ b/src/couch_replicator/src/couch_replicator_httpc.erl @@ -21,6 +21,7 @@ -export([stop_http_worker/0]). -export([full_url/2]). + -import(couch_util, [ get_value/2, get_value/3 diff --git a/src/couch_replicator/test/eunit/couch_replicator_compression_tests.erl b/src/couch_replicator/test/eunit/couch_replicator_compression_tests.erl new file mode 100644 index 0000000000..cee9a0c3ae --- /dev/null +++ b/src/couch_replicator/test/eunit/couch_replicator_compression_tests.erl @@ -0,0 +1,89 @@ +% Licensed under the Apache License, Version 2.0 (the "License"); you may not +% use this file except in compliance with the License. You may obtain a copy of +% the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +% License for the specific language governing permissions and limitations under +% the License. + +-module(couch_replicator_compression_tests). + +-include_lib("couch/include/couch_eunit.hrl"). +-include_lib("couch/include/couch_db.hrl"). + +-define(DOCS_COUNT, 10). +-define(TIMEOUT_EUNIT, 30). + +compression_test_() -> + { + "Replication compression tests", + { + foreach, + fun couch_replicator_test_helper:test_setup/0, + fun couch_replicator_test_helper:test_teardown/1, + [ + ?TDEF_FE(should_not_compress_by_default, ?TIMEOUT_EUNIT), + ?TDEF_FE(should_compress_when_enabled, ?TIMEOUT_EUNIT) + ] + } + }. + +should_not_compress_by_default({_Ctx, {Source, Target}}) -> + Before = couch_stats:sample([couch_replicator, requests_compressed, gzip]), + populate_db(Source, ?DOCS_COUNT), + replicate(Source, Target), + compare_dbs(Source, Target), + After = couch_stats:sample([couch_replicator, requests_compressed, gzip]), + ?assertEqual(Before, After). + +should_compress_when_enabled({_Ctx, {Source, Target}}) -> + config:set("replicator", "compress_requests", "true", false), + config:set("replicator", "compress_min_size", "10", false), + config:set("replicator", "compression_algorithm", "gzip", false), + Before = couch_stats:sample([couch_replicator, requests_compressed, gzip]), + populate_db(Source, ?DOCS_COUNT), + replicate(Source, Target), + compare_dbs(Source, Target), + After = couch_stats:sample([couch_replicator, requests_compressed, gzip]), + ?assert(After > Before), + config:delete("replicator", "compress_requests", false), + config:delete("replicator", "compress_min_size", false), + config:delete("replicator", "compression_algorithm", false). + +populate_db(DbName, Count) -> + Docs = lists:map( + fun(I) -> + Id = iolist_to_binary(io_lib:format("doc~p", [I])), + Data = list_to_binary(lists:duplicate(100, $x)), + {[ + {<<"_id">>, Id}, + {<<"value">>, I}, + {<<"data">>, Data} + ]} + end, + lists:seq(1, Count) + ), + {ok, _} = fabric:update_docs(DbName, Docs, [?ADMIN_CTX]), + ok. + +replicate(Source, Target) -> + SourceUrl = couch_replicator_test_helper:cluster_db_url(Source), + TargetUrl = couch_replicator_test_helper:cluster_db_url(Target), + RepObject = {[ + {<<"source">>, SourceUrl}, + {<<"target">>, TargetUrl}, + {<<"continuous">>, false} + ]}, + {ok, _} = couch_replicator_test_helper:replicate(RepObject). + +compare_dbs(Source, Target) -> + {ok, SourceInfo} = fabric:get_db_info(Source), + {ok, TargetInfo} = fabric:get_db_info(Target), + SourceDocCount = couch_util:get_value(doc_count, SourceInfo), + TargetDocCount = couch_util:get_value(doc_count, TargetInfo), + ?assertEqual(SourceDocCount, TargetDocCount), + ?assertEqual(?DOCS_COUNT, TargetDocCount).