From 7b557a5e5d9280d0bda58b946453713ba9bf498e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Theodor=20Wu=CC=88bker?= Date: Tue, 9 Sep 2025 13:32:58 +0200 Subject: [PATCH] DiskANN: Modernize module Currently, an old implementation of the diskann python bindings is used. Microsoft has an updated implementation called diskannpy available now. Replace the old diskann module with a new version that uses the StaticMemoryIndex from diskannpy. Add corresponding config and dockerfile. Also reduce the complexity of the previous code by removing unnecessary functions, leading to a minimal version comparable to the hnswlib module. --- ann_benchmarks/algorithms/diskann/Dockerfile | 28 +-- ann_benchmarks/algorithms/diskann/config.yml | 162 ++------------ ann_benchmarks/algorithms/diskann/module.py | 221 ++++--------------- 3 files changed, 69 insertions(+), 342 deletions(-) diff --git a/ann_benchmarks/algorithms/diskann/Dockerfile b/ann_benchmarks/algorithms/diskann/Dockerfile index 18c0329df..2fe313bdb 100644 --- a/ann_benchmarks/algorithms/diskann/Dockerfile +++ b/ann_benchmarks/algorithms/diskann/Dockerfile @@ -1,29 +1,9 @@ FROM ann-benchmarks RUN apt-get update -RUN apt-get install -y wget git cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-dev python3 python3-setuptools python3-pip -RUN pip3 install pybind11 numpy +RUN apt-get install -y make cmake g++ libaio-dev libgoogle-perftools-dev clang-format libboost-all-dev python3.10-venv +RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libmkl-full-dev -RUN cd /tmp && wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB -RUN cd /tmp && apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB -RUN cd /tmp && rm GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB -RUN cd /tmp && sh -c 'echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list' -RUN apt-get update -RUN apt-get install -y intel-mkl-64bit-2020.0-088 - -RUN update-alternatives --install 
/usr/lib/x86_64-linux-gnu/libblas.so libblas.so-x86_64-linux-gnu /opt/intel/mkl/lib/intel64/libmkl_rt.so 150 -RUN update-alternatives --install /usr/lib/x86_64-linux-gnu/libblas.so.3 libblas.so.3-x86_64-linux-gnu /opt/intel/mkl/lib/intel64/libmkl_rt.so 150 -RUN update-alternatives --install /usr/lib/x86_64-linux-gnu/liblapack.so liblapack.so-x86_64-linux-gnu /opt/intel/mkl/lib/intel64/libmkl_rt.so 150 -RUN update-alternatives --install /usr/lib/x86_64-linux-gnu/liblapack.so.3 liblapack.so.3-x86_64-linux-gnu /opt/intel/mkl/lib/intel64/libmkl_rt.so 150 - -RUN echo "/opt/intel/lib/intel64" > /etc/ld.so.conf.d/mkl.conf -RUN echo "/opt/intel/mkl/lib/intel64" >> /etc/ld.so.conf.d/mkl.conf -RUN ldconfig -RUN echo "MKL_THREADING_LAYER=GNU" >> /etc/environment +RUN pip install diskannpy==0.7.0 -RUN git clone --single-branch --branch python_bindings https://github.com/microsoft/diskann -RUN mkdir -p diskann/build -RUN cd diskann/build && cmake -DCMAKE_BUILD_TYPE=Release .. -RUN cd diskann/build && make -j -RUN cd diskann/python && pip install -e . 
-RUN python3 -c 'import vamanapy' +RUN python -c 'import diskannpy' diff --git a/ann_benchmarks/algorithms/diskann/config.yml b/ann_benchmarks/algorithms/diskann/config.yml index 0a8be58c9..49fbef50b 100644 --- a/ann_benchmarks/algorithms/diskann/config.yml +++ b/ann_benchmarks/algorithms/diskann/config.yml @@ -1,139 +1,25 @@ float: - angular: - - base_args: ['@metric'] - constructor: Vamana - disabled: false - docker_tag: ann-benchmarks-diskann - module: ann_benchmarks.algorithms.diskann - name: vamana(diskann) - run_groups: - vamana_125_32_1: - args: [{alpha: 1, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_125_32_1-1: - args: [{alpha: 1.1, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_125_32_1-2: - args: [{alpha: 1.2, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_125_64_1: - args: [{alpha: 1, l_build: 125, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_125_64_1-1: - args: [{alpha: 1.1, l_build: 125, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_125_64_1-2: - args: [{alpha: 1.2, l_build: 125, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - - base_args: ['@metric'] - constructor: VamanaPQ - disabled: false - docker_tag: ann-benchmarks-diskann_pq - module: ann_benchmarks.algorithms.diskann - name: vamana-pq(diskann) - run_groups: - vamana_pq_125_32_1-2_14: - args: [{alpha: 1.2, chunks: 14, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1-2_28: - args: [{alpha: 1.2, chunks: 28, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1-2_42: - args: [{alpha: 1.2, chunks: 42, l_build: 125, 
max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1_14: - args: [{alpha: 1, chunks: 14, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1_28: - args: [{alpha: 1, chunks: 28, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1_42: - args: [{alpha: 1, chunks: 42, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_64_1-2_14: - args: [{alpha: 1.2, chunks: 14, l_build: 125, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_64_1-2_28: - args: [{alpha: 1.2, chunks: 28, l_build: 125, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_64_1-2_42: - args: [{alpha: 1.2, chunks: 42, l_build: 125, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_64_1_14: - args: [{alpha: 1, chunks: 14, l_build: 125, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_64_1_28: - args: [{alpha: 1, chunks: 28, l_build: 125, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_64_1_42: - args: [{alpha: 1, chunks: 42, l_build: 125, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - euclidean: - - base_args: ['@metric'] - constructor: Vamana - disabled: false - docker_tag: ann-benchmarks-diskann - module: ann_benchmarks.algorithms.diskann - name: vamana(diskann) - run_groups: - vamana_100_64_1: - args: [{alpha: 1, l_build: 100, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_100_64_1-1: - args: [{alpha: 1.1, l_build: 100, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 
100, 110, 120]] - vamana_100_64_1-2: - args: [{alpha: 1.2, l_build: 100, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_125_32_1: - args: [{alpha: 1, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_125_32_1-1: - args: [{alpha: 1.1, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_125_32_1-2: - args: [{alpha: 1.2, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - - base_args: ['@metric'] - constructor: VamanaPQ - disabled: false - docker_tag: ann-benchmarks-diskann_pq - module: ann_benchmarks.algorithms.diskann - name: vamana-pq(diskann) - run_groups: - vamana_pq_100_64_1-2_32: - args: [{alpha: 1.2, chunks: 32, l_build: 100, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_100_64_1_32: - args: [{alpha: 1, chunks: 32, l_build: 100, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1-2_112: - args: [{alpha: 1.2, chunks: 112, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1-2_32: - args: [{alpha: 1.2, chunks: 32, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1-2_96: - args: [{alpha: 1.2, chunks: 96, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1_112: - args: [{alpha: 1, chunks: 112, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1_32: - args: [{alpha: 1, chunks: 32, l_build: 125, max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_125_32_1_96: - args: [{alpha: 1, chunks: 96, l_build: 125, 
max_outdegree: 32}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_80_64_1-2_112: - args: [{alpha: 1.2, chunks: 112, l_build: 80, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_80_64_1-2_96: - args: [{alpha: 1.2, chunks: 96, l_build: 80, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_80_64_1_112: - args: [{alpha: 1, chunks: 112, l_build: 80, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] - vamana_pq_80_64_1_96: - args: [{alpha: 1, chunks: 96, l_build: 80, max_outdegree: 64}] - query_args: [[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]] + any: + - base_args: ['@metric'] + constructor: DiskANN + disabled: false + docker_tag: ann-benchmarks-diskann + module: ann_benchmarks.algorithms.diskann + name: diskann + run_groups: + R-60: + arg_groups: [{max_degree: 60, complexity: 100, alpha: 1.2}] + args: {} + query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + R-80: + arg_groups: [{max_degree: 80, complexity: 150, alpha: 1.2}] + args: {} + query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + R-100: + arg_groups: [{max_degree: 100, complexity: 175, alpha: 1.2}] + args: {} + query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] + R-150: + arg_groups: [{max_degree: 150, complexity: 200, alpha: 1.2}] + args: {} + query_args: [[10, 20, 40, 80, 120, 200, 400, 600, 800]] diff --git a/ann_benchmarks/algorithms/diskann/module.py b/ann_benchmarks/algorithms/diskann/module.py index 0a54b65f5..38797902c 100644 --- a/ann_benchmarks/algorithms/diskann/module.py +++ b/ann_benchmarks/algorithms/diskann/module.py @@ -1,191 +1,52 @@ -import os -import struct -import time - -import numpy as np -import vamanapy as vp +import diskannpy +import tempfile from ..base.module import BaseANN -class Vamana(BaseANN): - def __init__(self, metric, param): +class DiskANN(BaseANN): + def __init__(self, metric, 
method_param): + self.name = None + self.search_complexity = None self.metric = {"angular": "cosine", "euclidean": "l2"}[metric] - self.l_build = int(param["l_build"]) - self.max_outdegree = int(param["max_outdegree"]) - self.alpha = float(param["alpha"]) - print("Vamana: L_Build = " + str(self.l_build)) - print("Vamana: R = " + str(self.max_outdegree)) - print("Vamana: Alpha = " + str(self.alpha)) - self.params = vp.Parameters() - self.params.set("L", self.l_build) - self.params.set("R", self.max_outdegree) - self.params.set("C", 750) - self.params.set("alpha", self.alpha) - self.params.set("saturate_graph", False) - self.params.set("num_threads", 1) + self.method_param = method_param + self.index = None + self.tempdir = tempfile.TemporaryDirectory() def fit(self, X): - def bin_to_float(binary): - return struct.unpack("!f", struct.pack("!I", int(binary, 2)))[0] - - print("Vamana: Starting Fit...") - index_dir = "indices" - - if not os.path.exists(index_dir): - os.makedirs(index_dir) - - data_path = os.path.join(index_dir, "base.bin") - self.name = "Vamana-{}-{}-{}".format(self.l_build, self.max_outdegree, self.alpha) - save_path = os.path.join(index_dir, self.name) - print("Vamana: Index Stored At: " + save_path) - shape = [ - np.float32(bin_to_float("{:032b}".format(X.shape[0]))), - np.float32(bin_to_float("{:032b}".format(X.shape[1]))), - ] - X = X.flatten() - X = np.insert(X, 0, shape) - X.tofile(data_path) - - if not os.path.exists(save_path): - print("Vamana: Creating Index") - s = time.time() - if self.metric == "l2": - index = vp.SinglePrecisionIndex(vp.Metric.FAST_L2, data_path) - elif self.metric == "cosine": - index = vp.SinglePrecisionIndex(vp.Metric.INNER_PRODUCT, data_path) - else: - print("Vamana: Unknown Metric Error!") - index.build(self.params, []) - t = time.time() - print("Vamana: Index Build Time (sec) = " + str(t - s)) - index.save(save_path) - if os.path.exists(save_path): - print("Vamana: Loading Index: " + str(save_path)) - s = time.time() 
- if self.metric == "l2": - self.index = vp.SinglePrecisionIndex(vp.Metric.FAST_L2, data_path) - elif self.metric == "cosine": - self.index = vp.SinglePrecisionIndex(vp.Metric.INNER_PRODUCT, data_path) - else: - print("Vamana: Unknown Metric Error!") - self.index.load(file_name=save_path) - print("Vamana: Index Loaded") - self.index.optimize_graph() - print("Vamana: Graph Optimization Completed") - t = time.time() - print("Vamana: Index Load Time (sec) = " + str(t - s)) - else: - print("Vamana: Unexpected Index Build Time Error") - - print("Vamana: End of Fit") - - def set_query_arguments(self, l_search): - print("Vamana: L_Search = " + str(l_search)) - self.l_search = l_search - - def query(self, v, n): - return self.index.single_numpy_query(v, n, self.l_search) - - def batch_query(self, X, n): - self.num_queries = X.shape[0] - self.result = self.index.batch_numpy_query(X, n, self.num_queries, self.l_search) - - def get_batch_results(self): - return self.result.reshape((self.num_queries, self.result.shape[0] // self.num_queries)) - - -class VamanaPQ(BaseANN): - def __init__(self, metric, param): - self.metric = {"angular": "cosine", "euclidean": "l2"}[metric] - self.l_build = int(param["l_build"]) - self.max_outdegree = int(param["max_outdegree"]) - self.alpha = float(param["alpha"]) - self.chunks = int(param["chunks"]) - print("Vamana PQ: L_Build = " + str(self.l_build)) - print("Vamana PQ: R = " + str(self.max_outdegree)) - print("Vamana PQ: Alpha = " + str(self.alpha)) - print("Vamana PQ: Chunks = " + str(self.chunks)) - self.params = vp.Parameters() - self.params.set("L", self.l_build) - self.params.set("R", self.max_outdegree) - self.params.set("C", 750) - self.params.set("alpha", self.alpha) - self.params.set("saturate_graph", False) - self.params.set("num_chunks", self.chunks) - self.params.set("num_threads", 1) - - def fit(self, X): - def bin_to_float(binary): - return struct.unpack("!f", struct.pack("!I", int(binary, 2)))[0] - - print("Vamana PQ: Starting 
Fit...") - index_dir = "indices" - - if self.chunks > X.shape[1]: - raise ValueError - - if not os.path.exists(index_dir): - os.makedirs(index_dir) - - data_path = os.path.join(index_dir, "base.bin") - pq_path = os.path.join(index_dir, "pq_memory_index") - self.name = "VamanaPQ-{}-{}-{}".format(self.l_build, self.max_outdegree, self.alpha) - save_path = os.path.join(index_dir, self.name) - print("Vamana PQ: Index Stored At: " + save_path) - shape = [ - np.float32(bin_to_float("{:032b}".format(X.shape[0]))), - np.float32(bin_to_float("{:032b}".format(X.shape[1]))), - ] - X = X.flatten() - X = np.insert(X, 0, shape) - X.tofile(data_path) - - if not os.path.exists(save_path): - print("Vamana PQ: Creating Index") - s = time.time() - if self.metric == "l2": - index = vp.SinglePrecisionIndex(vp.Metric.FAST_L2, data_path) - elif self.metric == "cosine": - index = vp.SinglePrecisionIndex(vp.Metric.INNER_PRODUCT, data_path) - else: - print("Vamana PQ: Unknown Metric Error!") - index.pq_build(data_path, pq_path, self.params) - t = time.time() - print("Vamana PQ: Index Build Time (sec) = " + str(t - s)) - index.save(save_path) - if os.path.exists(save_path): - print("Vamana PQ: Loading Index: " + str(save_path)) - s = time.time() - if self.metric == "l2": - self.index = vp.SinglePrecisionIndex(vp.Metric.FAST_L2, data_path) - elif self.metric == "cosine": - self.index = vp.SinglePrecisionIndex(vp.Metric.INNER_PRODUCT, data_path) - else: - print("Vamana PQ: Unknown Metric Error!") - self.index.load(file_name=save_path) - print("Vamana PQ: Index Loaded") - self.index.pq_load(pq_prefix_path=pq_path) - print("Vamana PQ: PQ Data Loaded") - self.index.optimize_graph() - print("Vamana PQ: Graph Optimization Completed") - t = time.time() - print("Vamana PQ: Index Load Time (sec) = " + str(t - s)) - else: - print("Vamana PQ: Unexpected Index Build Time Error") - - print("Vamana PQ: End of Fit") - - def set_query_arguments(self, l_search): - print("Vamana PQ: L_Search = " + 
str(l_search)) - self.l_search = l_search + max_degree = self.method_param.get("max_degree", 64) + complexity = self.method_param.get("complexity", 128) + alpha = self.method_param.get("alpha", 1.2) + + diskannpy.build_memory_index( + data=X, + distance_metric=self.metric, + index_directory=self.tempdir.name, + complexity=complexity, # candidate NN list when building. Typically 75-200. At least as large as graph degree. + graph_degree=max_degree, # max graph degree. Typically 60-150. Higher means better recall. + num_threads=0, # uses max available processors + alpha=alpha, # controls number of points added to the graph + use_pq_build=False, # uses quantization to save index which reduces recall and disk space + use_opq=False, + ) + + self.index = diskannpy.StaticMemoryIndex( + index_directory=self.tempdir.name, + num_threads=1, + initial_search_complexity=100, # most common complexity during search. Working mem is initialized based off this * threads. + distance_metric=self.metric, + dimensions = None + ) + + def set_query_arguments(self, complexity): + self.search_complexity = complexity + self.name = "diskann (%s, 'complexity': %s)" % (self.method_param, complexity) def query(self, v, n): - return self.index.pq_single_numpy_query(v, n, self.l_search) + ids, distances = self.index.search(v, n, self.search_complexity) + return ids - def batch_query(self, X, n): - self.num_queries = X.shape[0] - self.result = self.index.pq_batch_numpy_query(X, n, self.num_queries, self.l_search) - def get_batch_results(self): - return self.result.reshape((self.num_queries, self.result.shape[0] // self.num_queries)) + def done(self): + self.tempdir.cleanup() + pass