Skip to content

Commit 77b9a8f

Browse files
authored
chore: introduce DEBUG COMPRESSION (#4620)
This function analyzes the compressibility of the keys using a single Huffman tree. For example, ``` >debug POPULATE 1000000 keyabcdef 10 OK > debug compression 1) max_symbol 2) (integer) 121 3) max_bits 4) (integer) 5 5) raw_size 6) (integer) 7861817 7) compressed_size 8) (integer) 4372270 9) ratio 10) "0.5561398847111297" ``` Signed-off-by: Roman Gershman <[email protected]>
1 parent afad75f commit 77b9a8f

12 files changed

+539
-13
lines changed

.pre-commit-config.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ default_stages: [commit]
22
exclude: |
33
(?x)(
44
src/redis/.* |
5+
src/huff/.* |
56
contrib/charts/dragonfly/ci/.* |
67
patches/.*
78
)

docs/build-from-source.md

+6-6
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,22 @@ but you can also run Dragonfly on older kernels as well.
1515
On Debian/Ubuntu:
1616

1717
```bash
18-
sudo apt install ninja-build libunwind-dev libboost-fiber-dev libssl-dev \
19-
autoconf-archive libtool cmake g++ libzstd-dev bison libxml2-dev zlib1g-dev
18+
sudo apt install ninja-build libunwind-dev libboost-context-dev libssl-dev \
19+
autoconf-archive libtool cmake g++ bison zlib1g-dev
2020
```
2121

2222
On Fedora:
2323

2424
```bash
25-
sudo dnf install -y automake boost-devel g++ git cmake libtool ninja-build libzstd-devel \
26-
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel libstdc++-static
25+
sudo dnf install -y automake boost-devel g++ git cmake libtool ninja-build \
26+
openssl-devel libunwind-devel autoconf-archive patch bison libstdc++-static
2727
```
2828

2929
On openSUSE:
3030

3131
```bash
32-
sudo zypper install automake boost-devel gcc-c++ git cmake libtool ninja libzstd-devel \
33-
openssl-devel libunwind-devel autoconf-archive patch bison libxml2-devel \
32+
sudo zypper install automake boost-devel gcc-c++ git cmake libtool ninja \
33+
openssl-devel libunwind-devel autoconf-archive patch bison \
3434
libboost_context-devel libboost_system-devel
3535
```
3636

src/core/CMakeLists.txt

+1-3
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ cxx_link(dash_bench dfly_core redis_test_lib)
3434

3535

3636

37-
find_library(ZSTD_LIB NAMES libzstd.a libzstdstatic.a zstd NAMES_PER_DIR REQUIRED)
38-
3937
cxx_test(dfly_core_test dfly_core TRDP::fast_float ${PCRE2_LIB} ${RE2_LIB} LABELS DFLY)
4038
cxx_test(compact_object_test dfly_core LABELS DFLY)
4139
cxx_test(extent_tree_test dfly_core LABELS DFLY)
@@ -51,7 +49,7 @@ cxx_test(flatbuffers_test dfly_core TRDP::flatbuffers LABELS DFLY)
5149
cxx_test(bloom_test dfly_core LABELS DFLY)
5250
cxx_test(allocation_tracker_test dfly_core absl::random_random LABELS DFLY)
5351
cxx_test(qlist_test dfly_core DATA testdata/list.txt.zst LABELS DFLY)
54-
cxx_test(zstd_test dfly_core ${ZSTD_LIB} LABELS DFLY)
52+
cxx_test(zstd_test dfly_core TRDP::zstd LABELS DFLY)
5553
cxx_test(top_keys_test dfly_core LABELS DFLY)
5654

5755
if(LIB_PCRE2)

src/huff/LICENSE

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
BSD License
2+
3+
For Zstandard software
4+
5+
Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
6+
7+
Redistribution and use in source and binary forms, with or without modification,
8+
are permitted provided that the following conditions are met:
9+
10+
* Redistributions of source code must retain the above copyright notice, this
11+
list of conditions and the following disclaimer.
12+
13+
* Redistributions in binary form must reproduce the above copyright notice,
14+
this list of conditions and the following disclaimer in the documentation
15+
and/or other materials provided with the distribution.
16+
17+
* Neither the name Facebook, nor Meta, nor the names of its contributors may
18+
be used to endorse or promote products derived from this software without
19+
specific prior written permission.
20+
21+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

src/huff/README.md

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
The code in this folder exposes internal functions that are used by ZSTD.
2+
These functions are part of the https://github.com/Cyan4973/FiniteStateEntropy project.
3+
4+
Since we already link to ZSTD, it is convenient that we get this functionality for free.

src/huff/hist.h

+82
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/* ******************************************************************
2+
* hist : Histogram functions
3+
* part of Finite State Entropy project
4+
* Copyright (c) Meta Platforms, Inc. and affiliates.
5+
*
6+
* You can contact the author at :
7+
* - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
8+
* - Public forum : https://groups.google.com/forum/#!forum/lz4c
9+
*
10+
* This source code is licensed under both the BSD-style license (found in the
11+
* LICENSE file in the root directory of this source tree) and the GPLv2 (found
12+
* in the COPYING file in the root directory of this source tree).
13+
* You may select, at your option, one of the above-listed licenses.
14+
****************************************************************** */
15+
16+
/* --- dependencies --- */
17+
#include <stddef.h> /* size_t */
18+
19+
20+
/* --- simple histogram functions --- */
21+
22+
/*! HIST_count():
23+
* Provides the precise count of each byte within a table 'count'.
24+
* 'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
25+
* Updates *maxSymbolValuePtr with actual largest symbol value detected.
26+
* @return : count of the most frequent symbol (which isn't identified).
27+
* or an error code, which can be tested using HIST_isError().
28+
* note : if return == srcSize, there is only one symbol.
29+
*/
30+
size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
31+
const void* src, size_t srcSize);
32+
33+
unsigned HIST_isError(size_t code); /**< tells if a return value is an error code */
34+
35+
36+
/* --- advanced histogram functions --- */
37+
38+
#define HIST_WKSP_SIZE_U32 1024
39+
#define HIST_WKSP_SIZE (HIST_WKSP_SIZE_U32 * sizeof(unsigned))
40+
/** HIST_count_wksp() :
41+
* Same as HIST_count(), but using an externally provided scratch buffer.
42+
* Benefit is this function will use very little stack space.
43+
* `workSpace` is a writable buffer which must be 4-bytes aligned,
44+
* `workSpaceSize` must be >= HIST_WKSP_SIZE
45+
*/
46+
size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
47+
const void* src, size_t srcSize,
48+
void* workSpace, size_t workSpaceSize);
49+
50+
/** HIST_countFast() :
51+
* same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr.
52+
* This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr`
53+
*/
54+
size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
55+
const void* src, size_t srcSize);
56+
57+
/** HIST_countFast_wksp() :
58+
* Same as HIST_countFast(), but using an externally provided scratch buffer.
59+
* `workSpace` is a writable buffer which must be 4-bytes aligned,
60+
* `workSpaceSize` must be >= HIST_WKSP_SIZE
61+
*/
62+
size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
63+
const void* src, size_t srcSize,
64+
void* workSpace, size_t workSpaceSize);
65+
66+
/*! HIST_count_simple() :
67+
* Same as HIST_countFast(), this function is unsafe,
68+
* and will segfault if any value within `src` is `> *maxSymbolValuePtr`.
69+
* It is also a bit slower for large inputs.
70+
* However, it does not need any additional memory (not even on stack).
71+
* @return : count of the most frequent symbol.
72+
* Note this function doesn't produce any error (i.e. it must succeed).
73+
*/
74+
unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
75+
const void* src, size_t srcSize);
76+
77+
/*! HIST_add() :
78+
* Lowest level: just add nb of occurrences of characters from @src into @count.
79+
* @count is not reset. @count array is presumed large enough (i.e. 1 KB).
80+
* This function does not need any additional stack memory.
81+
*/
82+
void HIST_add(unsigned* count, const void* src, size_t srcSize);

0 commit comments

Comments
 (0)