def plot_memory_usage(csv_file, output_file, num_entries=1000, step=5):
    """Scatter-plot the RSS samples of a memory-monitor CSV and save the figure.

    Args:
        csv_file: Path to a CSV containing ``time_stamp`` and ``memory_bytes``
            columns.
        output_file: Image path passed to ``plt.savefig``.
        num_entries: How many leading rows to consider. ``None`` means all
            rows. Defaults to 1000, the value that used to be hard-coded.
        step: Keep every ``step``-th row of the selected prefix (default 5,
            the value that used to be hard-coded).
    """
    df = pd.read_csv(csv_file)
    # Convert memory from bytes to GB (binary gigabytes: 1024**3 bytes).
    df["memory_gb"] = df["memory_bytes"] / (1024**3)

    # Same row-selection contract as plot_combined(): optional prefix, then
    # down-sample so the scatter plot stays readable.
    if num_entries is None:
        num_entries = len(df)
    df = df.head(num_entries).iloc[::step]

    plt.figure(figsize=(30, 20))
    plt.scatter(df["time_stamp"], df["memory_gb"], label="Memory Usage (GB)", color='b', marker='o', s=50)
    plt.xlabel("Time (Unix Timestamp)")
    plt.ylabel("Memory (GB)")
    plt.title("Memory Usage Over Time")
    plt.legend()
    plt.grid()

    plt.savefig(output_file)  # Save as PNG file
    plt.close()  # Close the plot to free memory
# Progress lines look like:
#   [<float timestamp>] Building <count> vectors time: T<float> seconds
# Compiled once at import time instead of on every parse_log_file() call.
_BUILD_PROGRESS_RE = re.compile(r"\[(\d+\.\d+)\] Building (\d+) vectors time: T(\d+\.\d+) seconds")


def parse_log_file(log_file):
    """Extract (timestamp, index size) samples from an index-build log.

    This is the single definition of the parser; the module previously defined
    it twice with identical bodies, the second silently shadowing the first.

    Args:
        log_file: Path to the text log to scan.

    Returns:
        A ``(timestamps, index_sizes_m)`` tuple of equally long lists:
        timestamps as floats, index sizes as floats expressed in millions of
        vectors. Lines that do not match the progress pattern are ignored.
    """
    timestamps = []
    index_sizes_m = []  # Store index sizes in millions

    with open(log_file, 'r') as file:
        for line in file:
            match = _BUILD_PROGRESS_RE.search(line)
            if match is None:
                continue
            # The third group (elapsed build time) is not needed here.
            timestamp, index_size, _ = match.groups()
            timestamps.append(float(timestamp))
            index_sizes_m.append(int(index_size) / 1_000_000)  # Convert to millions

    return timestamps, index_sizes_m


def plot_index_size(log_file, output_file):
    """Plot index size (in millions of vectors) over time and save the figure.

    Args:
        log_file: Build log understood by ``parse_log_file``.
        output_file: Image path passed to ``plt.savefig``.
    """
    timestamps, index_sizes_m = parse_log_file(log_file)

    plt.figure(figsize=(10, 5))
    # parse_log_file() returns millions of vectors, so label the axis that way
    # (the labels used to claim plain "vectors").
    plt.plot(timestamps, index_sizes_m, label="Index Size (M vectors)", color='r')
    plt.xlabel("Time (Unix Timestamp)")
    plt.ylabel("Index Size (M vectors)")
    plt.title("Index Size Over Time")
    plt.legend()
    plt.grid()

    plt.savefig(output_file)  # Save as PNG file
    plt.close()  # Close the plot to free memory
def collect_process_memory(pid, output_file, sample_interval=1, print_interval=600):
    """Log the memory usage of process *pid* to a CSV file until it exits.

    Each sample writes one row with the columns ``time_stamp`` (Unix time),
    ``memory_bytes`` (RSS reported by psutil), ``memory_current`` and
    ``memory_high`` (values returned by ``get_memory_current`` and
    ``get_memory_high``).

    Args:
        pid: PID of the process to monitor.
        output_file: Path of the CSV file to (over)write.
        sample_interval: Seconds to sleep between samples (default 1).
        print_interval: Print a progress line every this many samples; must be
            at least 1. With the default 1-second sampling, 600 is roughly
            every 10 minutes.
    """
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["time_stamp", "memory_bytes", "memory_current", "memory_high"])

        # Bind to the process once. Re-resolving the PID on every sample could
        # silently start measuring an unrelated process if the PID is recycled.
        try:
            proc = psutil.Process(pid)
        except psutil.NoSuchProcess:
            print(f"Process {pid} has ended.")
            return

        samples_taken = 0
        # is_running() also detects PID reuse, unlike memory_info().
        while proc.is_running():
            try:
                memory = proc.memory_info().rss  # Get memory in bytes
            except psutil.NoSuchProcess:
                # The process exited between the liveness check and the read.
                break
            timestamp = time.time()  # Unix timestamp (float)

            # Read memory.current and memory.event::high from cgroup
            memory_current = get_memory_current(pid)
            memory_high = get_memory_high(pid)

            writer.writerow([timestamp, memory, memory_current, memory_high])
            # Flush every sample so the data survives the monitor being killed.
            file.flush()
            if samples_taken % print_interval == 0:
                print(f"{timestamp}, rss: {memory} bytes, {(memory / 1024 / 1024 / 1024):.4f} GB, memory.current: {(memory_current / 1024 / 1024 / 1024):.4f} GB, memory.event::high: {memory_high}")

            time.sleep(sample_interval)  # Sampling interval
            samples_taken += 1

        print(f"Process {pid} has ended.")
def format_index_size(size):
    """Render a vector count compactly.

    Counts below one million become whole thousands (``"<n>K"``, truncated);
    larger counts become millions with one decimal (``"<x.y>M"``).
    """
    if size < 1_000_000:
        return f"{size // 1_000}K"
    return f"{size / 1_000_000:.1f}M"


def parse_index_log(file_path, csv_filename='index_data.csv'):
    """Extract per-batch build time and memory usage from an index-build log.

    A record is a "Building <n> vectors time: T<secs> seconds" line that is
    immediately followed by a "Current index memory usage: ... <x> GB" line.
    A build line without such a follow-up line is ignored, including one on
    the very last line of the file, which used to raise ``IndexError``.

    The parsed rows are printed as a table and written to ``csv_filename``.

    Args:
        file_path: Path to the log file.
        csv_filename: Where to write the CSV export (defaults to the
            previously hard-coded ``'index_data.csv'``).

    Returns:
        A list of ``(index_size_label, batch_time_hr, memory_gb)`` tuples:
        the label from ``format_index_size``, the hours elapsed since the
        previous record rounded to 2 decimals (0 for the first record), and
        the memory usage as a string with 2 decimals.
    """
    with open(file_path, "r") as f:
        lines = f.readlines()

    time_pattern = re.compile(r'Building (\d+) vectors time: T([\d.]+) seconds')
    memory_pattern = re.compile(r'Current index memory usage: .* ([\d.]+) GB')

    index_data = []
    prev_time = None

    # Pair every line with its successor. The final line has no successor, so
    # it can never start a complete record and is never dereferenced past.
    for line, next_line in zip(lines, lines[1:]):
        time_match = time_pattern.search(line)
        if not time_match:
            continue
        memory_match = memory_pattern.search(next_line)  # Memory info is on the next line
        if not memory_match:
            continue

        index_size = int(time_match.group(1))
        time_elapsed = float(time_match.group(2))
        memory_usage = float(memory_match.group(1))

        # Compute time difference in hours
        batch_time_hr = 0 if prev_time is None else (time_elapsed - prev_time) / 3600
        prev_time = time_elapsed

        index_data.append(
            (format_index_size(index_size), round(batch_time_hr, 2), f"{memory_usage:.2f}")
        )

    # Print the result
    print(f"{'Index Size':<12} {'Batch Time (hr)':<15} {'Memory Usage (GB)':<18}")
    print("=" * 50)
    for row in index_data:
        print(f"{row[0]:<12} {row[1]:<15} {row[2]:<18}")

    # Export data to CSV
    with open(csv_filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Index Size", "Batch Time (hr)", "Memory Usage (GB)"])  # Write header
        writer.writerows(index_data)  # Write the data

    print(f"Data saved to {csv_filename}")

    return index_data
color="tab:blue", label="Batch Time (hr)") +ax1.tick_params(axis='y', labelcolor="tab:blue") + +# Set the range and ticks for the left y-axis (Batch Time) +ax1.set_ylim(0, 100) # Adjust this based on your data range +ax1.set_yticks(np.arange(0, 101, 10)) # Set tick marks every 10 units (you can adjust this) + +# Create a second y-axis to plot Memory Usage +ax2 = ax1.twinx() +ax2.set_ylabel("Memory Usage (GB)", color="tab:green") +ax2.plot(index_sizes_numeric, memory_usages, marker='s', linestyle='--', color="tab:green", label="Memory Usage (GB)") +ax2.tick_params(axis='y', labelcolor="tab:green") + + +# Set the range and ticks for the right y-axis (Memory Usage) +ax2.set_ylim(0, 3) # Adjust this based on your data range +ax2.set_yticks(np.arange(0, 3.1, 0.5)) # Set tick marks every 0.5 units (you can adjust this) + + +# Title and grid +plt.title("Build Index") +ax1.grid(True) + +x_value_5M = 5 # Since x-axis is in millions + +# Add a vertical dashed line at x = 5M +ax1.axvline(x=x_value_5M, color='red', linestyle='--', linewidth=1) + +# Add a note next to the line +y_pos = 0.9 +ax1.text( + x_value_5M + 1.5, # Slightly shift the text to the right + ax1.get_ylim()[1] * y_pos, # Position at 80% of the y-axis max value + "Process limit: 5GB", + fontsize=12, color="red", + ha="left", va="center", + bbox=dict(facecolor='white', alpha=0.7, edgecolor='red') +) + +ax1.text( + x_value_5M - 1.5, # Slightly shift the text to the right + ax1.get_ylim()[1] * y_pos, # Position at 80% of the y-axis max value + "Process limit: 2GB", + fontsize=12, color="red", + ha="right", va="center", + bbox=dict(facecolor='white', alpha=0.7, edgecolor='red') +) + + +# Save the figure to a file +plt.tight_layout() # Adjust layout to make sure everything fits +plt.savefig("index_growth.png", dpi=300, bbox_inches="tight") + +print("Graph saved as 'index_growth.png'") diff --git a/parse_query_log.py b/parse_query_log.py new file mode 100644 index 000000000..69ea217c2 --- /dev/null +++ 
# Header announcing a new batch of queries, e.g.
#   "Running 10000 queries benchmark with params: efR: 200, k: 10"
_BATCH_HEADER_RE = re.compile(r"Running \d+ queries benchmark with params: efR: (\d+), k: (\d+)")
# One line per executed query, e.g.
#   "[1700000000.12] Query 17: recall=0.98, time=3.21 seconds"
_QUERY_LINE_RE = re.compile(r"\[\d+\.\d+\] Query \d+: recall=([\d\.]+), time=([\d\.]+) seconds")


def _batch_stats(recall_sum, row_count, final_time, query_count):
    """Return ``(avg_recall, qps, avg_latency)`` or ``None`` if the batch is unusable."""
    if row_count == 0 or final_time <= 0:
        # No query lines (or a zero total time): no meaningful averages exist.
        return None
    return (recall_sum / row_count, query_count / final_time, final_time / query_count)


def parse_benchmark(file_path, query_count=9000):
    """Aggregate per-batch recall, QPS and latency from a query benchmark log.

    A batch starts at a "Running ... queries benchmark with params" line and
    runs until the next such line or the end of the file. The ``time`` of the
    last query line in a batch is used as the total time of the whole batch.

    Args:
        file_path: Path to the benchmark log.
        query_count: Number of queries the batch total time is divided by.
            Defaults to 9000, the value that used to be hard-coded.

    Returns:
        A nested mapping ``results[k][efR] = (avg_recall, qps, avg_latency)``.
        ``k`` and ``efR`` are the digit strings exactly as found in the log.
        Batches without any query line are omitted (they previously raised
        ``ZeroDivisionError``).

    Raises:
        ValueError: If ``query_count`` is not positive.
    """
    if query_count <= 0:
        raise ValueError("query_count must be positive")

    results = defaultdict(dict)  # Group by k and efR
    ef_r = None
    k = None
    recall_sum = 0.0
    row_count = 0
    final_time = 0.0  # Time reported by the last query of the current batch

    def store_current_batch():
        if ef_r is None:
            return  # No batch header seen yet
        stats = _batch_stats(recall_sum, row_count, final_time, query_count)
        if stats is not None:
            results[k][ef_r] = stats

    with open(file_path, "r") as f:
        for line in f:
            batch_match = _BATCH_HEADER_RE.match(line)
            if batch_match:
                # A new header closes the batch collected so far.
                store_current_batch()
                ef_r, k = batch_match.group(1), batch_match.group(2)
                recall_sum, row_count, final_time = 0.0, 0, 0.0
                continue

            query_match = _QUERY_LINE_RE.match(line)
            if query_match:
                recall_sum += float(query_match.group(1))
                final_time = float(query_match.group(2))  # Keep only the last query's time
                row_count += 1

    # Save the last batch
    store_current_batch()

    return results


def save_results_to_csv(results, output_file):
    """Write the mapping produced by ``parse_benchmark`` to a CSV file.

    Columns: efR, k, Average Recall, QPS, Avg Latency (s); every numeric
    value is rounded to 2 decimals.
    """
    # NOTE(review): rounding latency in seconds to 2 decimals prints 0.0 for
    # anything under 5 ms; kept as-is because consumers may rely on it.
    with open(output_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["efR", "k", "Average Recall", "QPS", "Avg Latency (s)"])
        for k, batch in results.items():
            for ef_r, batch_results in batch.items():
                writer.writerow([ef_r, k, round(batch_results[0], 2), round(batch_results[1], 2), round(batch_results[2], 2)])


def _plot_metric(series_by_k, metric_key, y_label, title, output_path):
    """Draw one line per k for ``metric_key`` against efR and save the figure."""
    plt.figure(figsize=(10, 6))
    for k, series in series_by_k.items():
        plt.plot(series["efR_values"], series[metric_key], marker='o', label=f'k = {k}')
    plt.xlabel("efR")
    plt.ylabel(y_label)
    plt.xticks(rotation=45)
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.savefig(output_path)  # Save as PNG
    plt.close()


def plot_results(results, output_dir="query_graphs"):
    """Plot QPS and average latency against efR, one line per k value.

    Saves ``qps.png`` and ``avg_latency.png`` into ``output_dir``, which is
    created if it does not exist.

    Args:
        results: Mapping ``results[k][efR] = (avg_recall, qps, avg_latency)``;
            efR keys must be convertible with ``int``.
        output_dir: Target directory (defaults to the previously hard-coded
            ``"query_graphs"``).
    """
    import os  # Local import: this module's import block does not include os.

    os.makedirs(output_dir, exist_ok=True)

    series_by_k = {}
    for k, batch in results.items():
        # Sort efR values numerically; the keys are digit strings, so a plain
        # sort would put "100" before "20".
        ordered = sorted(batch.items(), key=lambda item: int(item[0]))
        series_by_k[k] = {
            # Real lists rather than a dict view, which matplotlib may not
            # treat as a sequence.
            "efR_values": [ef_r for ef_r, _ in ordered],
            "qps_list": [values[1] for _, values in ordered],  # QPS values
            "k_latency_list": [values[2] for _, values in ordered],  # Latency values
        }

    # Plot QPS
    _plot_metric(
        series_by_k,
        "qps_list",
        "Queries Per Second (QPS)",
        "Queries Per Second (QPS) vs efR for Different k Values",
        os.path.join(output_dir, "qps.png"),
    )

    # Plot Latency
    _plot_metric(
        series_by_k,
        "k_latency_list",
        "Average Latency [s]",
        "Average Latency [s] vs efR for Different k Values",
        os.path.join(output_dir, "avg_latency.png"),
    )


if __name__ == "__main__":
    # Example usage. Guarded so importing this module does not parse and plot.
    file_path = "results_M_60_efC_75_vec_10000000_q_10000_madvise_None_bs_10240_mem_limit_2G_pid_171927_log.txt"  # Change this to your log file
    output_csv = "benchmark_query_results.csv"
    results = parse_benchmark(file_path)
    save_results_to_csv(results, output_csv)
    plot_results(results)
+27,17 @@ classifiers = [ [tool.poetry.dependencies] python = "^3.10" - -[tool.poetry.dev-dependencies] +datasets = "^3.3.0" numpy = "*" hnswlib = "^0.6.2" pytest = "^8.3" scipy = "^1.14.1" h5py = "^3.7.0" ml_dtypes = "*" +psutil = "*" +matplotlib = "*" +pandas = "*" + [build-system] diff --git a/src/VecSim/algorithms/brute_force/brute_force.h b/src/VecSim/algorithms/brute_force/brute_force.h index fecf3fc42..977f83e46 100644 --- a/src/VecSim/algorithms/brute_force/brute_force.h +++ b/src/VecSim/algorithms/brute_force/brute_force.h @@ -50,8 +50,8 @@ class BruteForceIndex : public VecSimIndexAbstract { VecSimIndexInfo info() const override; VecSimInfoIterator *infoIterator() const override; VecSimIndexBasicInfo basicInfo() const override; - VecSimBatchIterator *newBatchIterator(const void *queryBlob, - VecSimQueryParams *queryParams) const override; + // VecSimBatchIterator *newBatchIterator(const void *queryBlob, + // VecSimQueryParams *queryParams) const override; bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const override; labelType getVectorLabel(idType id) const { return idToLabelMapping.at(id); } @@ -130,8 +130,8 @@ class BruteForceIndex : public VecSimIndexAbstract { virtual void setVectorId(labelType label, idType id) = 0; virtual void resizeLabelLookup(size_t new_max_elements) = 0; - virtual VecSimBatchIterator * - newBatchIterator_Instance(void *queryBlob, VecSimQueryParams *queryParams) const = 0; + // virtual VecSimBatchIterator * + // newBatchIterator_Instance(void *queryBlob, VecSimQueryParams *queryParams) const = 0; #ifdef BUILD_TESTS #include "VecSim/algorithms/brute_force/brute_force_friend_tests.h" @@ -344,17 +344,17 @@ VecSimInfoIterator *BruteForceIndex::infoIterator() const { return infoIterator; } -template -VecSimBatchIterator * -BruteForceIndex::newBatchIterator(const void *queryBlob, - VecSimQueryParams *queryParams) const { - auto *queryBlobCopy = - this->allocator->allocate_aligned(this->dataSize, 
this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); - this->preprocessQueryInPlace(queryBlobCopy); - // Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end. - return newBatchIterator_Instance(queryBlobCopy, queryParams); -} +// template +// VecSimBatchIterator * +// BruteForceIndex::newBatchIterator(const void *queryBlob, +// VecSimQueryParams *queryParams) const { +// auto *queryBlobCopy = +// this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); +// memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); +// this->preprocessQueryInPlace(queryBlobCopy); +// // Ownership of queryBlobCopy moves to BF_BatchIterator that will free it at the end. +// return newBatchIterator_Instance(queryBlobCopy, queryParams); +// } template bool BruteForceIndex::preferAdHocSearch(size_t subsetSize, size_t k, diff --git a/src/VecSim/algorithms/brute_force/brute_force_multi.h b/src/VecSim/algorithms/brute_force/brute_force_multi.h index 999f5ac8b..0b1136c3b 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_multi.h +++ b/src/VecSim/algorithms/brute_force/brute_force_multi.h @@ -7,7 +7,7 @@ #pragma once #include "brute_force.h" -#include "bfm_batch_iterator.h" +// #include "bfm_batch_iterator.h" #include "VecSim/utils/updatable_heap.h" #include "VecSim/utils/vec_utils.h" @@ -80,11 +80,11 @@ class BruteForceIndex_Multi : public BruteForceIndex { vecsim_stl::updatable_max_heap(this->allocator); } - inline BF_BatchIterator * - newBatchIterator_Instance(void *queryBlob, VecSimQueryParams *queryParams) const override { - return new (this->allocator) - BFM_BatchIterator(queryBlob, this, queryParams, this->allocator); - } + // inline BF_BatchIterator * + // newBatchIterator_Instance(void *queryBlob, VecSimQueryParams *queryParams) const override { + // return new (this->allocator) + // BFM_BatchIterator(queryBlob, this, queryParams, this->allocator); + // } #ifdef 
BUILD_TESTS #include "VecSim/algorithms/brute_force/brute_force_multi_tests_friends.h" diff --git a/src/VecSim/algorithms/brute_force/brute_force_single.h b/src/VecSim/algorithms/brute_force/brute_force_single.h index 0c27615e9..c9c42cd46 100644 --- a/src/VecSim/algorithms/brute_force/brute_force_single.h +++ b/src/VecSim/algorithms/brute_force/brute_force_single.h @@ -82,11 +82,11 @@ class BruteForceIndex_Single : public BruteForceIndex { vecsim_stl::max_priority_queue(this->allocator); } - BF_BatchIterator * - newBatchIterator_Instance(void *queryBlob, VecSimQueryParams *queryParams) const override { - return new (this->allocator) - BFS_BatchIterator(queryBlob, this, queryParams, this->allocator); - } + // BF_BatchIterator * + // newBatchIterator_Instance(void *queryBlob, VecSimQueryParams *queryParams) const override { + // return new (this->allocator) + // BFS_BatchIterator(queryBlob, this, queryParams, this->allocator); + // } #ifdef BUILD_TESTS #include "VecSim/algorithms/brute_force/brute_force_friend_tests.h" diff --git a/src/VecSim/algorithms/hnsw/graph_data.h b/src/VecSim/algorithms/hnsw/graph_data.h index 28df1167b..abf489570 100644 --- a/src/VecSim/algorithms/hnsw/graph_data.h +++ b/src/VecSim/algorithms/hnsw/graph_data.h @@ -4,70 +4,195 @@ #include #include #include + #include "VecSim/utils/vec_utils.h" +#include "VecSim/containers/mapped_mem.h" +#include "VecSim/vec_sim_interface.h" template using candidatesList = vecsim_stl::vector>; typedef uint16_t linkListSize; +namespace graphDataUtils { +static size_t levelIdx(size_t level) { + return level - 1; // no need to store id's offset for level 0, it's sequential. 
+} +} // namespace graphDataUtils +struct ElementInMemoryData { + ElementInMemoryData(vecsim_stl::vector offsets, + std::shared_ptr allocator) + : offsetIdxAtLevel(offsets, allocator), + incomingUnidirectionalEdges(offsetIdxAtLevel.size() + 1, allocator) + // ,offsetIdxAtLevel(new(allocator) vecsim_stl::vector(offsets.size(), allocator)) + { + for (auto &inc_edges_ptr : incomingUnidirectionalEdges) { + inc_edges_ptr = new (allocator) vecsim_stl::vector(allocator); + } + // (*offsetIdxAtLevel) = offsets; + } + ElementInMemoryData(ElementInMemoryData &&other) noexcept + : offsetIdxAtLevel(std::move(other.offsetIdxAtLevel)), + incomingUnidirectionalEdges(std::move(other.incomingUnidirectionalEdges)) {} + + mutable std::mutex neighborsGuard; + // offsetAtLevel[i] = relative offset of the element data in i = + // graphDataUtils::levelIdx(level). + vecsim_stl::vector offsetIdxAtLevel; // offsets of the element at each level > 0. + vecsim_stl::vector *> incomingUnidirectionalEdges; + + ~ElementInMemoryData() { + for (auto &inc_edges_ptr : incomingUnidirectionalEdges) { + delete inc_edges_ptr; + } + } + size_t getOffsetAtLevel(size_t level) const { + return offsetIdxAtLevel.at(graphDataUtils::levelIdx(level)); + } + + size_t getMaxlevel() const { return offsetIdxAtLevel.size(); } +}; + +/******* Disk structs *******/ +struct LevelsMappedMemContainer { // TODO: separate struct for level 0 + LevelsMappedMemContainer(size_t elementDataSize, std::shared_ptr allocator, + size_t cap = 0, bool is_level0 = false) + : mappedMems(cap, allocator), DataSize(elementDataSize) { + is_level0 ? 
offsetLevel = 0 : offsetLevel = 1; + } + + void destroy(size_t elementDataSize, size_t block_size_bytes) { + for (size_t i = 0; i < mappedMems.size(); i++) { + mappedMems[i].destroy(elementDataSize, block_size_bytes); + } + } + + // Return data of the element at offset_id in level + char *getOffsetIdDataByLevel(idType offset_id, size_t level) const { + return mappedMems[level - offsetLevel].mapped_addr + offset_id * DataSize; + } + + // Append element to the end of 0, 1, 2...elem_max_level mappedMems + // Returns the offset index of the new element + void appendElementUpToLevel(const void *element, size_t element_size_bytes, + size_t elem_max_level = 0) { + for (size_t level = offsetLevel; level <= elem_max_level; level++) { + mappedMems[level - offsetLevel].appendElement(element, element_size_bytes); + } + } + + size_t getElemCountByLevel(size_t level) const { + return mappedMems[level - offsetLevel].get_elem_count(); + } + + size_t getLevelsCount() const { return mappedMems.size(); } + + void UpdateMaxLevel(size_t maxLevel) { + if (getLevelsCount() < maxLevel) { + VecSimIndexInterface::log_external("debug", "Updateing max level to %zu", maxLevel); + mappedMems.resize(maxLevel); + } + } + + bool growByBlockUpTolevel(size_t elementDataSize, size_t block_size_bytes, size_t maxLevel) { + bool is_resized = false; + for (size_t level = offsetLevel; level <= maxLevel; level++) { + is_resized |= + mappedMems[level - offsetLevel].growByBlock(elementDataSize, block_size_bytes); + } + return is_resized; + } + + vecsim_stl::vector mappedMems; + size_t DataSize; + size_t offsetLevel; +}; + +struct DiskElementMetaData { + DiskElementMetaData(size_t toplevel) : toplevel(toplevel) {} + const size_t toplevel; +}; + +// Used to read data from disk struct ElementLevelData { // A list of ids that are pointing to the node where each edge is *unidirectional* vecsim_stl::vector *incomingUnidirectionalEdges; - linkListSize numLinks; - // Flexible array member - 
https://en.wikipedia.org/wiki/Flexible_array_member - // Using this trick, we can have the links list as part of the ElementLevelData struct, and - // avoid the need to dereference a pointer to get to the links list. We have to calculate the - // size of the struct manually, as `sizeof(ElementLevelData)` will not include this member. We - // do so in the constructor of the index, under the name `levelDataSize` (and - // `elementGraphDataSize`). Notice that this member must be the last member of the struct and - // all nesting structs. - idType links[]; - - explicit ElementLevelData(std::shared_ptr allocator) - : incomingUnidirectionalEdges(new(allocator) vecsim_stl::vector(allocator)), - numLinks(0) {} - - linkListSize getNumLinks() const { return this->numLinks; } + + // Cache the currlinks to avoid reading from disk + linkListSize currLinks; + + // Pointer to disk mapped memory + // {linkListSize numLinks, idType link0, idType link1, ...} + char *linksData; + + // explicit ElementLevelData(std::shared_ptr allocator) + // : incomingUnidirectionalEdges(new(allocator) vecsim_stl::vector(allocator)), + // numLinks(0) {} + + ElementLevelData() = default; + explicit ElementLevelData(vecsim_stl::vector *incEdgesPtr, char *linksMappedMem) + : incomingUnidirectionalEdges(incEdgesPtr), currLinks(*((linkListSize *)linksMappedMem)), + linksData(linksMappedMem) {} + + linkListSize getNumLinks() const { return this->currLinks; } + + idType *getLinksArray() const { + return (idType *)((linkListSize *)linksData + 1); // skip numLinks + } idType getLinkAtPos(size_t pos) const { - assert(pos < numLinks); - return this->links[pos]; + assert(pos < currLinks); + return getLinksArray()[pos]; } const vecsim_stl::vector &getIncomingEdges() const { return *incomingUnidirectionalEdges; } std::vector copyLinks() { std::vector links_copy; - links_copy.assign(links, links + numLinks); + idType *links = getLinksArray(); + links_copy.assign(links, links + currLinks); return links_copy; } // 
Sets the outgoing links of the current element. // Assumes that the object has the capacity to hold all the links. void setLinks(vecsim_stl::vector &links) { - numLinks = links.size(); - memcpy(this->links, links.data(), numLinks * sizeof(idType)); + currLinks = links.size(); + *(linkListSize *)linksData = currLinks; + memcpy(getLinksArray(), links.data(), currLinks * sizeof(idType)); } template - void setLinks(candidatesList &links) { - numLinks = 0; - for (auto &link : links) { - this->links[numLinks++] = link.second; + void setLinks(candidatesList &cand_links) { + currLinks = cand_links.size(); + *(linkListSize *)linksData = currLinks; + idType *links = getLinksArray(); + for (auto &link : cand_links) { + links = link.second; + links++; } } - void popLink() { this->numLinks--; } - void setNumLinks(linkListSize num) { this->numLinks = num; } - void setLinkAtPos(size_t pos, idType node_id) { this->links[pos] = node_id; } - void appendLink(idType node_id) { this->links[this->numLinks++] = node_id; } + void popLink() { + this->currLinks--; + *(linkListSize *)linksData = currLinks; + } + void setNumLinks(linkListSize num) { + this->currLinks = num; + *(linkListSize *)linksData = currLinks; + } + void setLinkAtPos(size_t pos, idType node_id) { this->getLinksArray()[pos] = node_id; } + void appendLink(idType node_id) { + this->getLinksArray()[this->currLinks++] = node_id; + *(linkListSize *)linksData = currLinks; + } void removeLink(idType node_id) { + idType *links = getLinksArray(); size_t i = 0; - for (; i < numLinks; i++) { + for (; i < currLinks; i++) { if (links[i] == node_id) { - links[i] = links[numLinks - 1]; + links[i] = links[currLinks - 1]; break; } } - assert(i < numLinks && "Corruption in HNSW index"); // node_id not found - error - numLinks--; + assert(i < currLinks && "Corruption in HNSW index"); // node_id not found - error + popLink(); } void newIncomingUnidirectionalEdge(idType node_id) { this->incomingUnidirectionalEdges->push_back(node_id); @@ 
-84,43 +209,198 @@ struct ElementLevelData { } }; -struct ElementGraphData { - size_t toplevel; - std::mutex neighborsGuard; - ElementLevelData *others; - ElementLevelData level0; - - ElementGraphData(size_t maxLevel, size_t high_level_size, - std::shared_ptr allocator) - : toplevel(maxLevel), others(nullptr), level0(allocator) { - if (toplevel > 0) { - others = (ElementLevelData *)allocator->callocate(high_level_size * toplevel); - if (others == nullptr) { - throw std::runtime_error("VecSim index low memory error"); - } - for (size_t i = 0; i < maxLevel; i++) { - new ((char *)others + i * high_level_size) ElementLevelData(allocator); - } - } +struct DiskElementGraphDataCopy { + size_t toplevel; // TODO: redundant ? + vecsim_stl::vector levelsData; + mutable std::mutex *neighborsGuard; + + DiskElementGraphDataCopy(size_t toplevel, + const vecsim_stl::vector &levelsData, + std::mutex &neighborsGuard) + : toplevel(toplevel), levelsData(levelsData), neighborsGuard(&neighborsGuard) {} + + const ElementLevelData &getElementLevelData(size_t level) const { + assert(level <= toplevel); + return levelsData[level]; + } + + ElementLevelData &getElementLevelData(size_t level) { + assert(level <= toplevel); + return levelsData[level]; } - ~ElementGraphData() = delete; // should be destroyed using `destroy' - void destroy(size_t levelDataSize, std::shared_ptr allocator) { - delete this->level0.incomingUnidirectionalEdges; - ElementLevelData *cur_ld = this->others; - for (size_t i = 0; i < this->toplevel; i++) { - delete cur_ld->incomingUnidirectionalEdges; - cur_ld = reinterpret_cast(reinterpret_cast(cur_ld) + - levelDataSize); + void lockNodeLinks() const { (neighborsGuard)->lock(); } + + void unlockNodeLinks() const { (neighborsGuard)->unlock(); } + + void destroy() { + for (size_t i = 0; i < levelsData.size(); i++) { + delete levelsData[i].incomingUnidirectionalEdges; } - allocator->free_allocation(this->others); } - ElementLevelData &getElementLevelData(size_t level, size_t 
levelDataSize) { - assert(level <= this->toplevel); +}; +struct GraphData : public VecsimBaseObject { + // LevelsMappedMemContainer(size_t elementDataSize, std::shared_ptr allocator, + // size_t cap = 0, bool is_level0 = false) + size_t level0DataSize; // size of each element in level0 + size_t levelDataSize; // size of each element in levels > 0 + LevelsMappedMemContainer MetaDatasAndLevel0; // A file for all elements' meta data + level0 + LevelsMappedMemContainer levelsData; // File for each level + vecsim_stl::vector InMemoryElementsData; // ElementInMemoryData elements + size_t level0DatablockSizeBytes; // page size of the system + size_t levelsDatablockSizeBytes; // page size of the system + + GraphData(size_t M0, size_t M, std::shared_ptr allocator) + : VecsimBaseObject(allocator), + // each element contains: DiskElementMetaData, numLinks, link0, link1, ... link(M0-1) + level0DataSize(sizeof(DiskElementMetaData) + sizeof(linkListSize) + M0 * sizeof(idType)), + // each element contains: numLinks, link0, link1, ... 
link(M-1) + levelDataSize(sizeof(linkListSize) + M * sizeof(idType)), + MetaDatasAndLevel0(level0DataSize, allocator, 1, true), + levelsData(levelDataSize, allocator), InMemoryElementsData(allocator) { + + size_t pageSize = static_cast(sysconf(_SC_PAGE_SIZE)); + + // let one data block be at least 1 page + level0DatablockSizeBytes = MAX(pageSize, level0DataSize * DEFAULT_BLOCK_SIZE); + levelsDatablockSizeBytes = MAX(pageSize, levelDataSize * DEFAULT_BLOCK_SIZE); + }; + + ~GraphData() { + MetaDatasAndLevel0.destroy(level0DataSize, level0DatablockSizeBytes); + levelsData.destroy(levelDataSize, levelsDatablockSizeBytes); + } + + ElementLevelData getElementLevelData(idType internal_id, size_t level) const { + const ElementInMemoryData &inMemoryData = InMemoryElementsData[internal_id]; + vecsim_stl::vector *inc_edges = inMemoryData.incomingUnidirectionalEdges[level]; + char *linksData = nullptr; if (level == 0) { - return this->level0; + linksData = MetaDatasAndLevel0.getOffsetIdDataByLevel(internal_id, level) + + sizeof(DiskElementMetaData); + } else { + size_t offsetAtlevel = inMemoryData.getOffsetAtLevel(level); + linksData = levelsData.getOffsetIdDataByLevel(offsetAtlevel, level); + } + return ElementLevelData(inc_edges, linksData); + } + + void UpdateMaxLevel(size_t newMaxLevel) { levelsData.UpdateMaxLevel(newMaxLevel); } + + void appendElement(size_t toplevel, labelType label, size_t id) { + // emplace space in levels data if needed for the new element + growByBlock(toplevel); + + // Add the in memory data + vecsim_stl::vector offsets(toplevel, this->allocator); + for (size_t i = 1; i <= toplevel; i++) { + idType elem_index_at_level = levelsData.getElemCountByLevel(i); + offsets[i - 1] = elem_index_at_level; + } + + InMemoryElementsData.emplace_back(offsets, this->allocator); + + // create ElementLevel0Data + char level0Data[this->level0DataSize] = {0}; + + DiskElementMetaData metadata(toplevel); + memcpy(level0Data, &metadata, sizeof(DiskElementMetaData)); + 
MetaDatasAndLevel0.appendElementUpToLevel(level0Data, this->level0DataSize, 0); + + // add to all level up to toplevel + char levelData[this->levelDataSize] = {0}; + levelsData.appendElementUpToLevel(levelData, this->levelDataSize, toplevel); + } + + size_t getElemMaxLevel(idType id) { return InMemoryElementsData[id].getMaxlevel(); } + + void removeElement(size_t id) { + // TODO: make sure we freed the element memory before overriding it + // TODO: make sure inMemoryData is handled properly in hnsw.h + + // override the element data with the last element data + // Do the same for the rest of the levels' + // size_t elem_max_level = getElemMaxLevel(id); + // for (size_t i = 0; i < elem_max_level; i++) { + // idType last_element_internal_id = levelsData[i]->last_elem_id; + // char *last_elem_level_file_ptr = getLevelDataByInternalId(i, + // last_element_internal_id); char *elem_level_file_ptr = + // getLevelDataByInternalId(element_internal_id); memcpy(elem_level_file_ptr, + // last_elem_level_file_ptr, this->elementlevelDataSize); + // } + + // // create ElementInMemoryData + // InMemoryElementsData.addElement(inMemoryData, id); + } + + void growByBlock(size_t maxLevel) { + MetaDatasAndLevel0.growByBlockUpTolevel(level0DataSize, level0DatablockSizeBytes, 0); + if (maxLevel > 0) { + levelsData.UpdateMaxLevel(maxLevel); + levelsData.growByBlockUpTolevel(levelDataSize, levelsDatablockSizeBytes, maxLevel); + } + if ((InMemoryElementsData.size() % DEFAULT_BLOCK_SIZE) == 0) { + InMemoryElementsData.reserve(InMemoryElementsData.size() + DEFAULT_BLOCK_SIZE); } - return *reinterpret_cast(reinterpret_cast(this->others) + - (level - 1) * levelDataSize); + } + + DiskElementGraphDataCopy getGraphDataByInternalId(idType internal_id) const { + const ElementInMemoryData &elemInMemData = InMemoryElementsData[internal_id]; + size_t toplevel = elemInMemData.getMaxlevel(); + vecsim_stl::vector levelsData(toplevel + 1, this->allocator); + for (size_t level = 0; level <= toplevel; 
level++) { + levelsData[level] = getElementLevelData(internal_id, level); + } + + return DiskElementGraphDataCopy(toplevel, levelsData, elemInMemData.neighborsGuard); + } + + void lockNodeLinks(idType internal_id) const { + InMemoryElementsData[internal_id].neighborsGuard.lock(); + } + + void unlockNodeLinks(idType internal_id) const { + InMemoryElementsData[internal_id].neighborsGuard.unlock(); } }; + +// struct ElementGraphData { +// size_t toplevel; +// std::mutex neighborsGuard; +// ElementLevelData *others; +// ElementLevelData level0; + +// ElementGraphData(size_t maxLevel, size_t high_level_size, +// std::shared_ptr allocator) +// : toplevel(maxLevel), others(nullptr), level0(allocator) { +// if (toplevel > 0) { +// others = (ElementLevelData *)allocator->callocate(high_level_size * toplevel); +// if (others == nullptr) { +// throw std::runtime_error("VecSim index low memory error"); +// } +// for (size_t i = 0; i < maxLevel; i++) { +// new ((char *)others + i * high_level_size) ElementLevelData(allocator); +// } +// } +// } +// ~ElementGraphData() = delete; // should be destroyed using `destroy' + +// void destroy(size_t levelDataSize, std::shared_ptr allocator) { +// delete this->level0.incomingUnidirectionalEdges; +// ElementLevelData *cur_ld = this->others; +// for (size_t i = 0; i < this->toplevel; i++) { +// delete cur_ld->incomingUnidirectionalEdges; +// cur_ld = reinterpret_cast(reinterpret_cast(cur_ld) + +// levelDataSize); +// } +// allocator->free_allocation(this->others); +// } +// ElementLevelData &getElementLevelData(size_t level, size_t levelDataSize) { +// assert(level <= this->toplevel); +// if (level == 0) { +// return this->level0; +// } +// return *reinterpret_cast(reinterpret_cast(this->others) + +// (level - 1) * levelDataSize); +// } +// }; diff --git a/src/VecSim/algorithms/hnsw/hnsw.h b/src/VecSim/algorithms/hnsw/hnsw.h index f4a9ef235..db91927aa 100644 --- a/src/VecSim/algorithms/hnsw/hnsw.h +++ b/src/VecSim/algorithms/hnsw/hnsw.h 
@@ -21,9 +21,12 @@ #include "VecSim/tombstone_interface.h" #ifdef BUILD_TESTS +#ifdef SERIALIZE + #include "hnsw_serialization_utils.h" #include "VecSim/utils/serializer.h" #endif +#endif #include #include @@ -82,9 +85,12 @@ template class HNSWIndex : public VecSimIndexAbstract, public VecSimIndexTombstone #ifdef BUILD_TESTS +#ifdef SERIALIZE + , public Serializer #endif +#endif { protected: // Index build parameters @@ -98,8 +104,8 @@ class HNSWIndex : public VecSimIndexAbstract, double epsilon; // Index meta-data (based on the data dimensionality and index parameters) - size_t elementGraphDataSize; - size_t levelDataSize; + // size_t elementGraphDataSize; + // size_t levelDataSize; double mult; // Index level generator of the top level for a new element @@ -112,8 +118,9 @@ class HNSWIndex : public VecSimIndexAbstract, size_t maxLevel; // this is the top level of the entry point's element // Index data - vecsim_stl::vector graphDataBlocks; + vecsim_stl::vector graphDataBlocks; // not in use vecsim_stl::vector idToMetaData; + GraphData graphData; // Used for marking the visited nodes in graph scans (the pool supports parallel graph scans). // This is mutable since the object changes upon search operations as well (which are const). @@ -123,8 +130,10 @@ class HNSWIndex : public VecSimIndexAbstract, #ifdef BUILD_TESTS #include "VecSim/algorithms/hnsw/hnsw_base_tests_friends.h" +#ifdef SERIALIZE #include "hnsw_serializer_declarations.h" #endif +#endif protected: HNSWIndex() = delete; // default constructor is disabled. 
@@ -177,12 +186,12 @@ class HNSWIndex : public VecSimIndexAbstract, void greedySearchLevel(const void *vector_data, size_t level, idType &curObj, DistType &curDist, void *timeoutCtx = nullptr, VecSimQueryReply_Code *rc = nullptr) const; void repairConnectionsForDeletion(idType element_internal_id, idType neighbour_id, - ElementLevelData &node_level, + const ElementLevelData &node_level, ElementLevelData &neighbor_level, size_t level, vecsim_stl::vector &neighbours_bitmap); void replaceEntryPoint(); - void SwapLastIdWithDeletedId(idType element_internal_id, ElementGraphData *last_element, + void SwapLastIdWithDeletedId(idType element_internal_id, DiskElementGraphDataCopy *last_element, const void *last_element_data); /** Add vector functions */ @@ -245,8 +254,8 @@ class HNSWIndex : public VecSimIndexAbstract, void unlockSharedIndexDataGuard() const; void lockNodeLinks(idType node_id) const; void unlockNodeLinks(idType node_id) const; - void lockNodeLinks(ElementGraphData *node_data) const; - void unlockNodeLinks(ElementGraphData *node_data) const; + void lockNodeLinks(const DiskElementGraphDataCopy &node_data) const; + void unlockNodeLinks(const DiskElementGraphDataCopy &node_data) const; VisitedNodesHandler *getVisitedList() const; void returnVisitedList(VisitedNodesHandler *visited_nodes_handler) const; VecSimIndexInfo info() const override; @@ -254,9 +263,15 @@ class HNSWIndex : public VecSimIndexAbstract, VecSimInfoIterator *infoIterator() const override; bool preferAdHocSearch(size_t subsetSize, size_t k, bool initial_check) const override; const char *getDataByInternalId(idType internal_id) const; - ElementGraphData *getGraphDataByInternalId(idType internal_id) const; - ElementLevelData &getElementLevelData(idType internal_id, size_t level) const; - ElementLevelData &getElementLevelData(ElementGraphData *element, size_t level) const; + DiskElementGraphDataCopy getGraphDataByInternalId(idType internal_id) const; + const ElementLevelData 
&getElementLevelData(idType internal_id, size_t level) const; + const ElementLevelData &getElementLevelData(const DiskElementGraphDataCopy &element, + size_t level) const; + ElementLevelData getElementLevelDataForWrite(const DiskElementGraphDataCopy &element, + size_t level); + ElementLevelData getElementLevelDataForWrite(idType internal_id, size_t level) { + return getElementLevelDataForWrite(getGraphDataByInternalId(internal_id), level); + } idType searchBottomLayerEP(const void *query_data, void *timeoutCtx, VecSimQueryReply_Code *rc) const; @@ -389,10 +404,9 @@ const char *HNSWIndex::getDataByInternalId(idType internal_i } template -ElementGraphData * +DiskElementGraphDataCopy HNSWIndex::getGraphDataByInternalId(idType internal_id) const { - return (ElementGraphData *)graphDataBlocks[internal_id / this->blockSize].getElement( - internal_id % this->blockSize); + return graphData.getGraphDataByInternalId(internal_id); } template @@ -403,15 +417,22 @@ size_t HNSWIndex::getRandomLevel(double reverse_size) { } template -ElementLevelData &HNSWIndex::getElementLevelData(idType internal_id, - size_t level) const { - return getGraphDataByInternalId(internal_id)->getElementLevelData(level, this->levelDataSize); +const ElementLevelData &HNSWIndex::getElementLevelData(idType internal_id, + size_t level) const { + return getGraphDataByInternalId(internal_id).getElementLevelData(level); } template -ElementLevelData &HNSWIndex::getElementLevelData(ElementGraphData *graph_data, - size_t level) const { - return graph_data->getElementLevelData(level, this->levelDataSize); +const ElementLevelData & +HNSWIndex::getElementLevelData(const DiskElementGraphDataCopy &elem_graph_data, + size_t level) const { + return elem_graph_data.getElementLevelData(level); +} + +template +ElementLevelData HNSWIndex::getElementLevelDataForWrite( + const DiskElementGraphDataCopy &elem_graph_data, size_t level) { + return elem_graph_data.getElementLevelData(level); } template @@ -427,6 +448,8 @@ void 
HNSWIndex::returnVisitedList( template void HNSWIndex::markDeletedInternal(idType internalId) { + + return; // not supported // Here we are holding the global index data guard (and the main index lock of the tiered index // for shared ownership). assert(internalId < this->curElementCount); @@ -479,26 +502,37 @@ void HNSWIndex::unlockSharedIndexDataGuard() const { indexDataGuard.unlock_shared(); } +// template +// void HNSWIndex::lockNodeLinks(const DiskElementGraphDataCopy &node_data) +// const { +// graphData.lock(); +// } + +// template +// void HNSWIndex::unlockNodeLinks(const DiskElementGraphDataCopy &node_data) +// const { +// node_data.neighborsGuard.unlock(); +// } + template -void HNSWIndex::lockNodeLinks(ElementGraphData *node_data) const { - node_data->neighborsGuard.lock(); +void HNSWIndex::lockNodeLinks(idType node_id) const { + graphData.lockNodeLinks(node_id); } template -void HNSWIndex::unlockNodeLinks(ElementGraphData *node_data) const { - node_data->neighborsGuard.unlock(); +void HNSWIndex::unlockNodeLinks(idType node_id) const { + graphData.unlockNodeLinks(node_id); } template -void HNSWIndex::lockNodeLinks(idType node_id) const { - lockNodeLinks(getGraphDataByInternalId(node_id)); +void HNSWIndex::lockNodeLinks(const DiskElementGraphDataCopy &data) const { + data.lockNodeLinks(); } template -void HNSWIndex::unlockNodeLinks(idType node_id) const { - unlockNodeLinks(getGraphDataByInternalId(node_id)); +void HNSWIndex::unlockNodeLinks(const DiskElementGraphDataCopy &data) const { + data.unlockNodeLinks(); } - /** * helper functions */ @@ -525,9 +559,9 @@ void HNSWIndex::processCandidate( tag_t visited_tag, vecsim_stl::abstract_priority_queue &top_candidates, candidatesMaxHeap &candidate_set, DistType &lowerBound) const { - ElementGraphData *cur_element = getGraphDataByInternalId(curNodeId); + auto cur_element = getGraphDataByInternalId(curNodeId); lockNodeLinks(cur_element); - ElementLevelData &node_level = getElementLevelData(cur_element, layer); + 
const ElementLevelData &node_level = getElementLevelData(cur_element, layer); linkListSize num_links = node_level.getNumLinks(); if (num_links > 0) { @@ -536,7 +570,7 @@ void HNSWIndex::processCandidate( __builtin_prefetch(elements_tags + node_level.getLinkAtPos(0)); // Pre-fetch first candidate data block address. next_data = getDataByInternalId(node_level.getLinkAtPos(0)); - __builtin_prefetch(next_data); + __builtin_prefetch(next_data); // TODO: check if this is needed for (linkListSize j = 0; j < num_links - 1; j++) { idType candidate_id = node_level.getLinkAtPos(j); @@ -609,9 +643,9 @@ void HNSWIndex::processCandidate_RangeSearch( tag_t visited_tag, std::unique_ptr &results, candidatesMaxHeap &candidate_set, DistType dyn_range, DistType radius) const { - auto *cur_element = getGraphDataByInternalId(curNodeId); + auto cur_element = getGraphDataByInternalId(curNodeId); lockNodeLinks(cur_element); - ElementLevelData &node_level = getElementLevelData(cur_element, layer); + const ElementLevelData &node_level = getElementLevelData(cur_element, layer); linkListSize num_links = node_level.getNumLinks(); if (num_links > 0) { @@ -893,14 +927,14 @@ idType HNSWIndex::mutuallyConnectNewElement( assert(top_candidates_list.size() <= M && "Should be not be more than M candidates returned by the heuristic"); - auto *new_node_level = getGraphDataByInternalId(new_node_id); - ElementLevelData &new_node_level_data = getElementLevelData(new_node_level, level); + auto new_node_level = getGraphDataByInternalId(new_node_id); + ElementLevelData new_node_level_data = getElementLevelDataForWrite(new_node_level, level); assert(new_node_level_data.getNumLinks() == 0 && "The newly inserted element should have blank link list"); for (auto &neighbor_data : top_candidates_list) { idType selected_neighbor = neighbor_data.second; // neighbor's id - auto *neighbor_graph_data = getGraphDataByInternalId(selected_neighbor); + auto neighbor_graph_data = getGraphDataByInternalId(selected_neighbor); if 
(new_node_id < selected_neighbor) { lockNodeLinks(new_node_level); lockNodeLinks(neighbor_graph_data); @@ -931,7 +965,8 @@ idType HNSWIndex::mutuallyConnectNewElement( continue; } - ElementLevelData &neighbor_level_data = getElementLevelData(neighbor_graph_data, level); + ElementLevelData neighbor_level_data = + getElementLevelDataForWrite(neighbor_graph_data, level); // if the neighbor's neighbors list has the capacity to add the new node, make the update // and finish. @@ -954,7 +989,7 @@ idType HNSWIndex::mutuallyConnectNewElement( template void HNSWIndex::repairConnectionsForDeletion( - idType element_internal_id, idType neighbour_id, ElementLevelData &node_level, + idType element_internal_id, idType neighbour_id, const ElementLevelData &node_level, ElementLevelData &neighbor_level, size_t level, vecsim_stl::vector &neighbours_bitmap) { if (isMarkedDeleted(neighbour_id)) { @@ -1014,7 +1049,7 @@ void HNSWIndex::repairConnectionsForDeletion( // we should remove it from the node's incoming edges. // otherwise, edge turned from bidirectional to one directional, // and it should be saved in the neighbor's incoming edges. - auto &node_level_data = getElementLevelData(node_id, level); + auto node_level_data = getElementLevelData(node_id, level); if (!node_level_data.removeIncomingUnidirectionalEdgeIfExists(neighbour_id)) { neighbor_level.newIncomingUnidirectionalEdge(node_id); } @@ -1029,7 +1064,7 @@ void HNSWIndex::repairConnectionsForDeletion( for (size_t i = 0; i < neighbor_level.getNumLinks(); i++) { idType node_id = neighbor_level.getLinkAtPos(i); if (!neighbour_orig_neighbours_set[node_id]) { - ElementLevelData &node_level = getElementLevelData(node_id, level); + ElementLevelData node_level = getElementLevelData(node_id, level); // If the node has an edge to the neighbour as well, remove it from the incoming nodes // of the neighbour. Otherwise, we need to update the edge as unidirectional incoming. 
bool bidirectional_edge = false; @@ -1053,8 +1088,10 @@ void HNSWIndex::repairConnectionsForDeletion( template void HNSWIndex::replaceEntryPoint() { + return; // not supported + idType old_entry_point_id = entrypointNode; - auto *old_entry_point = getGraphDataByInternalId(old_entry_point_id); + auto old_entry_point = getGraphDataByInternalId(old_entry_point_id); // Sets an (arbitrary) new entry point, after deleting the current entry point. while (old_entry_point_id == entrypointNode) { @@ -1067,14 +1104,14 @@ void HNSWIndex::replaceEntryPoint() { // Go over the entry point's neighbors at the top level. lockNodeLinks(old_entry_point); - ElementLevelData &old_ep_level = getElementLevelData(old_entry_point, maxLevel); + ElementLevelData old_ep_level = getElementLevelData(old_entry_point, maxLevel); // Tries to set the (arbitrary) first neighbor as the entry point which is not deleted, // if exists. for (size_t i = 0; i < old_ep_level.getNumLinks(); i++) { if (!isMarkedDeleted(old_ep_level.getLinkAtPos(i))) { if (!isInProcess(old_ep_level.getLinkAtPos(i))) { entrypointNode = old_ep_level.getLinkAtPos(i); - unlockNodeLinks(old_entry_point); + unlockNodeLinks(old_entry_point_id); return; } else { // Store this candidate which is currently being inserted into the graph in @@ -1091,19 +1128,20 @@ void HNSWIndex::replaceEntryPoint() { for (DataBlock &graph_data_block : graphDataBlocks) { size_t size = graph_data_block.getLength(); for (size_t i = 0; i < size; i++) { - auto cur_element = (ElementGraphData *)graph_data_block.getElement(i); - if (cur_element->toplevel == maxLevel && cur_id != old_entry_point_id && - !isMarkedDeleted(cur_id)) { - // Found a non element in the current max level. - if (!isInProcess(cur_id)) { - entrypointNode = cur_id; - return; - } else if (candidate_in_process == INVALID_ID) { - // This element is still in process, and there hasn't been another candidate - // in process that has found in this level. 
- candidate_in_process = cur_id; - } - } + // auto cur_element = (ElementGraphData *)graph_data_block.getElement(i); + // if (cur_element->toplevel == maxLevel && cur_id != old_entry_point_id && + // !isMarkedDeleted(cur_id)) { + // // Found a non element in the current max level. + // if (!isInProcess(cur_id)) { + // entrypointNode = cur_id; + // return; + // } else if (candidate_in_process == INVALID_ID) { + // // This element is still in process, and there hasn't been another + // candidate + // // in process that has found in this level. + // candidate_in_process = cur_id; + // } + // } cur_id++; } } @@ -1130,23 +1168,26 @@ void HNSWIndex::replaceEntryPoint() { template void HNSWIndex::SwapLastIdWithDeletedId(idType element_internal_id, - ElementGraphData *last_element, + DiskElementGraphDataCopy *last_element, const void *last_element_data) { + return; // not supported + // Swap label - this is relevant when the last element's label exists (it is not marked as // deleted). if (!isMarkedDeleted(curElementCount)) { replaceIdOfLabel(getExternalLabel(curElementCount), element_internal_id, curElementCount); } + // for each level of the element to remove, swap the data with the last element in this level. // Swap neighbours for (size_t level = 0; level <= last_element->toplevel; level++) { - auto &cur_level = getElementLevelData(last_element, level); + auto cur_level = getElementLevelData(last_element, level); // Go over the neighbours that also points back to the last element whose is going to // change, and update the id. 
for (size_t i = 0; i < cur_level.getNumLinks(); i++) { idType neighbour_id = cur_level.getLinkAtPos(i); - ElementLevelData &neighbor_level = getElementLevelData(neighbour_id, level); + ElementLevelData neighbor_level = getElementLevelData(neighbour_id, level); bool bidirectional_edge = false; for (size_t j = 0; j < neighbor_level.getNumLinks(); j++) { @@ -1168,7 +1209,7 @@ void HNSWIndex::SwapLastIdWithDeletedId(idType element_inter // Next, go over the rest of incoming edges (the ones that are not bidirectional) and make // updates. for (auto incoming_edge : cur_level.getIncomingEdges()) { - ElementLevelData &incoming_neighbor_level = getElementLevelData(incoming_edge, level); + ElementLevelData incoming_neighbor_level = getElementLevelData(incoming_edge, level); for (size_t j = 0; j < incoming_neighbor_level.getNumLinks(); j++) { if (incoming_neighbor_level.getLinkAtPos(j) == curElementCount) { incoming_neighbor_level.setLinkAtPos(j, element_internal_id); @@ -1178,18 +1219,21 @@ void HNSWIndex::SwapLastIdWithDeletedId(idType element_inter } } + // top level, mutex, element levels data, level0 data // Move the last element's data to the deleted element's place - auto element = getGraphDataByInternalId(element_internal_id); - memcpy((void *)element, last_element, this->elementGraphDataSize); + // auto element = getGraphDataByInternalId(element_internal_id); + // memcpy((void *)element, last_element, this->elementGraphDataSize); - auto data = getDataByInternalId(element_internal_id); - memcpy((void *)data, last_element_data, this->dataSize); + // // copy the vector itself + // auto data = getDataByInternalId(element_internal_id); + // memcpy((void *)data, last_element_data, this->dataSize); - this->idToMetaData[element_internal_id] = this->idToMetaData[curElementCount]; + // // no need to touch + // this->idToMetaData[element_internal_id] = this->idToMetaData[curElementCount]; - if (curElementCount == this->entrypointNode) { - this->entrypointNode = 
element_internal_id; - } + // if (curElementCount == this->entrypointNode) { + // this->entrypointNode = element_internal_id; + // } } // This function is greedily searching for the closest candidate to the given data point at the @@ -1217,9 +1261,9 @@ void HNSWIndex::greedySearchLevel(const void *vector_data, s } changed = false; - auto *element = getGraphDataByInternalId(bestCand); + auto element = getGraphDataByInternalId(bestCand); lockNodeLinks(element); - ElementLevelData &node_level_data = getElementLevelData(element, level); + const ElementLevelData &node_level_data = getElementLevelData(element, level); for (int i = 0; i < node_level_data.getNumLinks(); i++) { idType candidate = node_level_data.getLinkAtPos(i); @@ -1253,10 +1297,10 @@ HNSWIndex::safeCollectAllNodeIncomingNeighbors(idType node_i vecsim_stl::vector incoming_neighbors(this->allocator); auto element = getGraphDataByInternalId(node_id); - for (size_t level = 0; level <= element->toplevel; level++) { + for (size_t level = 0; level <= element.toplevel; level++) { // Save the node neighbor's in the current level while holding its neighbors lock. lockNodeLinks(element); - auto &node_level_data = getElementLevelData(element, level); + auto node_level_data = getElementLevelData(element, level); // Store the deleted element's neighbours. auto neighbors_copy = node_level_data.copyLinks(); unlockNodeLinks(element); @@ -1264,9 +1308,9 @@ HNSWIndex::safeCollectAllNodeIncomingNeighbors(idType node_i // Go over the neighbours and collect tho ones that also points back to the removed node. for (auto neighbour_id : neighbors_copy) { // Hold the neighbor's lock while we are going over its neighbors. 
- auto *neighbor = getGraphDataByInternalId(neighbour_id); + auto neighbor = getGraphDataByInternalId(neighbour_id); lockNodeLinks(neighbor); - ElementLevelData &neighbour_level_data = getElementLevelData(neighbor, level); + const ElementLevelData &neighbour_level_data = getElementLevelData(neighbor, level); for (size_t j = 0; j < neighbour_level_data.getNumLinks(); j++) { // A bidirectional edge was found - this connection should be repaired. @@ -1306,7 +1350,6 @@ void HNSWIndex::resizeIndexCommon(size_t new_max_elements) { template void HNSWIndex::growByBlock() { size_t new_max_elements = maxElements + this->blockSize; - graphDataBlocks.emplace_back(this->blockSize, this->elementGraphDataSize, this->allocator); resizeIndexCommon(new_max_elements); } @@ -1337,7 +1380,7 @@ void HNSWIndex::mutuallyUpdateForRepairedNode( lockNodeLinks(nodes_to_update[i]); } - ElementLevelData &node_level = getElementLevelData(node_id, level); + ElementLevelData node_level = getElementLevelData(node_id, level); // Perform mutual updates: go over the node's neighbors and overwrite the neighbors to remove // that are still exist. @@ -1391,7 +1434,7 @@ void HNSWIndex::mutuallyUpdateForRepairedNode( // remove it from the incoming edges set. Otherwise, the edge is created unidirectional, so // we add it to the unidirectional edges set. Note: we assume that all updates occur // mutually and atomically, then can rely on this assumption. - auto &chosen_node_level_data = getElementLevelData(chosen_id, level); + auto chosen_node_level_data = getElementLevelData(chosen_id, level); if (!node_level.removeIncomingUnidirectionalEdgeIfExists(chosen_id)) { chosen_node_level_data.newIncomingUnidirectionalEdge(node_id); } @@ -1419,9 +1462,9 @@ void HNSWIndex::repairNodeConnections(idType node_id, size_t // Go over the repaired node neighbors, collect the non-deleted ones to be neighbors candidates // after the repair as well. 
- auto *element = getGraphDataByInternalId(node_id); + auto element = getGraphDataByInternalId(node_id); lockNodeLinks(element); - ElementLevelData &node_level_data = getElementLevelData(element, level); + const ElementLevelData &node_level_data = getElementLevelData(element, level); for (size_t j = 0; j < node_level_data.getNumLinks(); j++) { node_orig_neighbours_set[node_level_data.getLinkAtPos(j)] = true; // Don't add the removed element to the candidates. @@ -1452,9 +1495,9 @@ void HNSWIndex::repairNodeConnections(idType node_id, size_t nodes_to_update.push_back(deleted_neighbor_id); neighbors_to_remove.push_back(deleted_neighbor_id); - auto *neighbor = getGraphDataByInternalId(deleted_neighbor_id); + auto neighbor = getGraphDataByInternalId(deleted_neighbor_id); lockNodeLinks(neighbor); - ElementLevelData &neighbor_level_data = getElementLevelData(neighbor, level); + const ElementLevelData &neighbor_level_data = getElementLevelData(neighbor, level); for (size_t j = 0; j < neighbor_level_data.getNumLinks(); j++) { // Don't add removed elements to the candidates, nor nodes that are already in the @@ -1516,7 +1559,7 @@ void HNSWIndex::mutuallyRemoveNeighborAtPos(ElementLevelData size_t pos) { // Now we know that we are looking at a neighbor that needs to be removed. 
auto removed_node = node_level.getLinkAtPos(pos); - ElementLevelData &removed_node_level = getElementLevelData(removed_node, level); + ElementLevelData removed_node_level = getElementLevelDataForWrite(removed_node, level); // Perform the mutual update: // if the removed node id (the node's neighbour to be removed) // wasn't pointing to the node (i.e., the edge was uni-directional), @@ -1559,7 +1602,7 @@ void HNSWIndex::insertElementToGraph(idType element_id, for (auto level = static_cast(max_common_level); level >= 0; level--) { candidatesMaxHeap top_candidates = searchLayer(curr_element, vector_data, level, efConstruction); - // If the entry point was marked deleted between iterations, we may recieve an empty + // If the entry point was marked deleted between iterations, we may receive an empty // candidates set. if (!top_candidates.empty()) { curr_element = mutuallyConnectNewElement(element_id, top_candidates, level); @@ -1587,11 +1630,10 @@ HNSWIndex::HNSWIndex(const HNSWParams *params, const IndexComponents &components, size_t random_seed) : VecSimIndexAbstract(abstractInitParams, components), - VecSimIndexTombstone(), maxElements(0), graphDataBlocks(this->allocator), - idToMetaData(this->allocator), visitedNodesHandlerPool(0, this->allocator) { + VecSimIndexTombstone(), maxElements(0), M(params->M ? params->M : HNSW_DEFAULT_M), + M0(this->M * 2), graphDataBlocks(this->allocator), idToMetaData(this->allocator), + graphData(this->M0, this->M, this->allocator), visitedNodesHandlerPool(0, this->allocator) { - M = params->M ? 
params->M : HNSW_DEFAULT_M; - M0 = M * 2; if (M0 > UINT16_MAX) throw std::runtime_error("HNSW index parameter M is too large: argument overflow"); @@ -1612,15 +1654,13 @@ HNSWIndex::HNSWIndex(const HNSWParams *params, mult = 1 / log(1.0 * M); levelGenerator.seed(random_seed); - elementGraphDataSize = sizeof(ElementGraphData) + sizeof(idType) * M0; - levelDataSize = sizeof(ElementLevelData) + sizeof(idType) * M; + // elementGraphDataSize = sizeof(ElementGraphData) + sizeof(idType) * M0; + // levelDataSize = sizeof(ElementLevelData) + sizeof(idType) * M; } template HNSWIndex::~HNSWIndex() { - for (idType id = 0; id < curElementCount; id++) { - getGraphDataByInternalId(id)->destroy(this->levelDataSize, this->allocator); - } + this->log(VecSimCommonStrings::LOG_VERBOSE_STRING, "HNSWIndex destructor"); } /** @@ -1629,16 +1669,20 @@ HNSWIndex::~HNSWIndex() { template void HNSWIndex::removeAndSwap(idType internalId) { + + return; // Not supported + // Sanity check - the id to remove cannot be the entry point, as it should have been replaced // upon marking it as deleted. assert(entrypointNode != internalId); auto element = getGraphDataByInternalId(internalId); // Remove the deleted id form the relevant incoming edges sets in which it appears. 
- for (size_t level = 0; level <= element->toplevel; level++) { - ElementLevelData &cur_level = getElementLevelData(element, level); + for (size_t level = 0; level <= element.toplevel; level++) { + const ElementLevelData &cur_level = getElementLevelData(element, level); for (size_t i = 0; i < cur_level.getNumLinks(); i++) { - ElementLevelData &neighbour = getElementLevelData(cur_level.getLinkAtPos(i), level); + ElementLevelData neighbour = + getElementLevelDataForWrite(cur_level.getLinkAtPos(i), level); // Note that in case of in-place delete, we might have not accounted for this edge in // in the unidirectional edges, since there is no point in keeping it there temporarily // (we know we will get here and remove this deleted id permanently). @@ -1653,32 +1697,34 @@ void HNSWIndex::removeAndSwap(idType internalId) { } // Free the element's resources - element->destroy(this->levelDataSize, this->allocator); + // element->destroy(this->levelDataSize, this->allocator); // We can say now that the element has removed completely from index. --curElementCount; // Get the last element's metadata and data. // If we are deleting the last element, we already destroyed it's metadata. - auto *last_element_data = getDataByInternalId(curElementCount); - DataBlock &last_gd_block = graphDataBlocks.back(); - auto last_element = (ElementGraphData *)last_gd_block.removeAndFetchLastElement(); - // Swap the last id with the deleted one, and invalidate the last id data. - if (curElementCount != internalId) { - SwapLastIdWithDeletedId(internalId, last_element, last_element_data); - } + // auto *last_element_data = getDataByInternalId(curElementCount); + // DataBlock &last_gd_block = graphDataBlocks.back(); + // auto last_element = (ElementGraphData *)last_gd_block.removeAndFetchLastElement(); - // If we need to free a complete block and there is at least one block between the - // capacity and the size. 
- this->vectors->removeElement(curElementCount); - if (curElementCount % this->blockSize == 0) { - shrinkByBlock(); - } + // // Swap the last id with the deleted one, and invalidate the last id data. + // if (curElementCount != internalId) { + // SwapLastIdWithDeletedId(internalId, last_element, last_element_data); + // } + + // // If we need to free a complete block and there is at least one block between the + // // capacity and the size. + // this->vectors->removeElement(curElementCount); + // if (curElementCount % this->blockSize == 0) { + // shrinkByBlock(); + // } } template void HNSWIndex::removeAndSwapMarkDeletedElement(idType internalId) { + return; // Not supported removeAndSwap(internalId); // element is permanently removed from the index, it is no longer counted as marked deleted. --numMarkedDeleted; @@ -1687,12 +1733,13 @@ void HNSWIndex::removeAndSwapMarkDeletedElement(idType inter template void HNSWIndex::removeVectorInPlace(const idType element_internal_id) { + return; // Not supported vecsim_stl::vector neighbours_bitmap(this->allocator); // Go over the element's nodes at every level and repair the effected connections. - auto element = getGraphDataByInternalId(element_internal_id); - for (size_t level = 0; level <= element->toplevel; level++) { - ElementLevelData &cur_level = getElementLevelData(element, level); + DiskElementGraphDataCopy element = getGraphDataByInternalId(element_internal_id); + for (size_t level = 0; level <= element.toplevel; level++) { + const ElementLevelData &cur_level = getElementLevelData(element, level); // Reset the neighbours' bitmap for the current level. neighbours_bitmap.assign(curElementCount, false); // Store the deleted element's neighbours set in a bitmap for fast access. @@ -1702,42 +1749,42 @@ void HNSWIndex::removeVectorInPlace(const idType element_int // Go over the neighbours that also points back to the removed point and make a local // repair. 
for (size_t i = 0; i < cur_level.getNumLinks(); i++) { - idType neighbour_id = cur_level.getLinkAtPos(i); - ElementLevelData &neighbor_level = getElementLevelData(neighbour_id, level); - - bool bidirectional_edge = false; - for (size_t j = 0; j < neighbor_level.getNumLinks(); j++) { - // If the edge is bidirectional, do repair for this neighbor. - if (neighbor_level.getLinkAtPos(j) == element_internal_id) { - bidirectional_edge = true; - repairConnectionsForDeletion(element_internal_id, neighbour_id, cur_level, - neighbor_level, level, neighbours_bitmap); - break; - } - } - - // If this edge is uni-directional, we should remove the element from the neighbor's - // incoming edges. - if (!bidirectional_edge) { - // This should always return true (remove should succeed). - bool res = - neighbor_level.removeIncomingUnidirectionalEdgeIfExists(element_internal_id); - (void)res; - assert(res && "The edge should be in the incoming unidirectional edges"); - } + // idType neighbour_id = cur_level.getLinkAtPos(i); + // ElementLevelData neighbor_level = getElementLevelDataForWrite(neighbour_id, level); + + // bool bidirectional_edge = false; + // for (size_t j = 0; j < neighbor_level.getNumLinks(); j++) { + // // If the edge is bidirectional, do repair for this neighbor. + // if (neighbor_level.getLinkAtPos(j) == element_internal_id) { + // bidirectional_edge = true; + // repairConnectionsForDeletion(element_internal_id, neighbour_id, cur_level, + // neighbor_level, level, neighbours_bitmap); + // break; + // } + // } + + // // If this edge is uni-directional, we should remove the element from the neighbor's + // // incoming edges. + // if (!bidirectional_edge) { + // // This should always return true (remove should succeed). 
+ // bool res = + // neighbor_level.removeIncomingUnidirectionalEdgeIfExists(element_internal_id); + // (void)res; + // assert(res && "The edge should be in the incoming unidirectional edges"); + // } } // Next, go over the rest of incoming edges (the ones that are not bidirectional) and make // repairs. - for (auto incoming_edge : cur_level.getIncomingEdges()) { - repairConnectionsForDeletion(element_internal_id, incoming_edge, cur_level, - getElementLevelData(incoming_edge, level), level, - neighbours_bitmap); - } + // for (auto incoming_edge : cur_level.getIncomingEdges()) { + // repairConnectionsForDeletion(element_internal_id, incoming_edge, cur_level, + // getElementLevelData(incoming_edge, level), level, + // neighbours_bitmap); + // } } if (entrypointNode == element_internal_id) { // Replace entry point if needed. - assert(element->toplevel == maxLevel); + assert(element.toplevel == maxLevel); replaceEntryPoint(); } // Finally, remove the element from the index and make a swap with the last internal id to @@ -1759,35 +1806,15 @@ HNSWAddVectorState HNSWIndex::storeNewElement(labelType labe state.newElementId = curElementCount++; // Create the new element's graph metadata. - // We must assign manually enough memory on the stack and not just declare an `ElementGraphData` - // variable, since it has a flexible array member. - auto tmpData = this->allocator->allocate_unique(this->elementGraphDataSize); - memset(tmpData.get(), 0, this->elementGraphDataSize); - ElementGraphData *cur_egd = (ElementGraphData *)(tmpData.get()); - // Allocate memory (inside `ElementGraphData` constructor) for the links in higher levels and - // initialize this memory to zeros. The reason for doing it here is that we might mark this - // vector as deleted BEFORE we finish its indexing. In that case, we will collect the incoming - // edges to this element in every level, and try to access its link lists in higher levels. 
- // Therefore, we allocate it here and initialize it with zeros, (otherwise we might crash...) - try { - new (cur_egd) ElementGraphData(state.elementMaxLevel, levelDataSize, this->allocator); - } catch (std::runtime_error &e) { - this->log(VecSimCommonStrings::LOG_WARNING_STRING, - "Error - allocating memory for new element failed due to low memory"); - throw e; - } - if (indexSize() > indexCapacity()) { growByBlock(); - } else if (state.newElementId % this->blockSize == 0) { - // If we had an initial capacity, we might have to allocate new blocks for the graph data. - this->graphDataBlocks.emplace_back(this->blockSize, this->elementGraphDataSize, - this->allocator); } + // Graphdata size is checked upon every insertion // Insert the new element to the data block this->vectors->addElement(vector_data, state.newElementId); - this->graphDataBlocks.back().addElement(cur_egd); + // this->graphDataBlocks.back().addElement(cur_egd); + this->graphData.appendElement(state.elementMaxLevel, label, state.newElementId); // We mark id as in process *before* we set it in the label lookup, so that IN_PROCESS flag is // set when checking if label . this->idToMetaData[state.newElementId] = ElementMetaData(label); @@ -2319,8 +2346,8 @@ HNSWIndex::getHNSWElementNeighbors(size_t label, int ***neig idType id = ids[0]; auto graph_data = this->getGraphDataByInternalId(id); lockNodeLinks(graph_data); - *neighborsData = new int *[graph_data->toplevel + 2]; - for (size_t level = 0; level <= graph_data->toplevel; level++) { + *neighborsData = new int *[graph_data.toplevel + 2]; + for (size_t level = 0; level <= graph_data.toplevel; level++) { auto &level_data = this->getElementLevelData(graph_data, level); assert(level_data.getNumLinks() <= (level > 0 ? 
this->getM() : 2 * this->getM())); (*neighborsData)[level] = new int[level_data.getNumLinks() + 1]; @@ -2329,11 +2356,14 @@ HNSWIndex::getHNSWElementNeighbors(size_t label, int ***neig (*neighborsData)[level][i + 1] = (int)idToMetaData.at(level_data.getLinkAtPos(i)).label; } } - (*neighborsData)[graph_data->toplevel + 1] = nullptr; + (*neighborsData)[graph_data.toplevel + 1] = nullptr; unlockNodeLinks(graph_data); return VecSimDebugCommandCode_OK; } #ifdef BUILD_TESTS -#include "hnsw_serializer.h" +#ifdef SERIALIZE + +// #include "hnsw_serializer.h" +#endif #endif diff --git a/src/VecSim/algorithms/hnsw/hnsw_multi.h b/src/VecSim/algorithms/hnsw/hnsw_multi.h index 972c981e4..ec3044f77 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_multi.h +++ b/src/VecSim/algorithms/hnsw/hnsw_multi.h @@ -7,7 +7,7 @@ #pragma once #include "hnsw.h" -#include "hnsw_multi_batch_iterator.h" +// #include "hnsw_multi_batch_iterator.h" #include "VecSim/utils/updatable_heap.h" template @@ -57,6 +57,8 @@ class HNSWIndex_Multi : public HNSWIndex { : HNSWIndex(params, abstractInitParams, components, random_seed), labelLookup(this->allocator) {} #ifdef BUILD_TESTS +#ifdef SERIALIZE + // Ctor to be used before loading a serialized index. Can be used from v2 and up. 
HNSWIndex_Multi(std::ifstream &input, const HNSWParams *params, const AbstractIndexInitParams &abstractInitParams, @@ -65,6 +67,7 @@ class HNSWIndex_Multi : public HNSWIndex { : HNSWIndex(input, params, abstractInitParams, components, version), labelLookup(this->maxElements, this->allocator) {} +#endif void getDataByLabel(labelType label, std::vector> &vectors_output) const override { @@ -90,8 +93,8 @@ class HNSWIndex_Multi : public HNSWIndex { } inline size_t indexLabelCount() const override; - VecSimBatchIterator *newBatchIterator(const void *queryBlob, - VecSimQueryParams *queryParams) const override; + // VecSimBatchIterator *newBatchIterator(const void *queryBlob, + // VecSimQueryParams *queryParams) const override; int deleteVector(labelType label) override; int addVector(const void *vector_data, labelType label) override; @@ -195,18 +198,18 @@ int HNSWIndex_Multi::addVector(const void *vector_data, cons return 1; // We always add the vector, no overrides in multi. } -template -VecSimBatchIterator * -HNSWIndex_Multi::newBatchIterator(const void *queryBlob, - VecSimQueryParams *queryParams) const { - auto queryBlobCopy = - this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); - this->preprocessQueryInPlace(queryBlobCopy); - // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end. 
- return new (this->allocator) HNSWMulti_BatchIterator( - queryBlobCopy, this, queryParams, this->allocator); -} +// template +// VecSimBatchIterator * +// HNSWIndex_Multi::newBatchIterator(const void *queryBlob, +// VecSimQueryParams *queryParams) const { +// auto queryBlobCopy = +// this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); +// memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); +// this->preprocessQueryInPlace(queryBlobCopy); +// // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end. +// return new (this->allocator) HNSWMulti_BatchIterator( +// queryBlobCopy, this, queryParams, this->allocator); +// } /** * Marks an element with the given label deleted, does NOT really change the current graph. diff --git a/src/VecSim/algorithms/hnsw/hnsw_single.h b/src/VecSim/algorithms/hnsw/hnsw_single.h index c9ef19ead..d5924ba00 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_single.h +++ b/src/VecSim/algorithms/hnsw/hnsw_single.h @@ -7,7 +7,7 @@ #pragma once #include "hnsw.h" -#include "hnsw_single_batch_iterator.h" +// #include "hnsw_single_batch_iterator.h" template class HNSWIndex_Single : public HNSWIndex { @@ -34,6 +34,8 @@ class HNSWIndex_Single : public HNSWIndex { : HNSWIndex(params, abstractInitParams, components, random_seed), labelLookup(this->allocator) {} #ifdef BUILD_TESTS +#ifdef SERIALIZE + // Ctor to be used before loading a serialized index. Can be used from v2 and up. 
HNSWIndex_Single(std::ifstream &input, const HNSWParams *params, const AbstractIndexInitParams &abstractInitParams, @@ -42,6 +44,7 @@ class HNSWIndex_Single : public HNSWIndex { : HNSWIndex(input, params, abstractInitParams, components, version), labelLookup(this->maxElements, this->allocator) {} +#endif void getDataByLabel(labelType label, std::vector> &vectors_output) const override { @@ -64,8 +67,8 @@ class HNSWIndex_Single : public HNSWIndex { new (this->allocator) vecsim_stl::default_results_container(cap, this->allocator)); } size_t indexLabelCount() const override; - VecSimBatchIterator *newBatchIterator(const void *queryBlob, - VecSimQueryParams *queryParams) const override; + // VecSimBatchIterator *newBatchIterator(const void *queryBlob, + // VecSimQueryParams *queryParams) const override; int deleteVector(labelType label) override; int addVector(const void *vector_data, labelType label) override; @@ -155,18 +158,18 @@ int HNSWIndex_Single::addVector(const void *vector_data, return label_exists ? 0 : 1; } -template -VecSimBatchIterator * -HNSWIndex_Single::newBatchIterator(const void *queryBlob, - VecSimQueryParams *queryParams) const { - auto queryBlobCopy = - this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); - memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); - this->preprocessQueryInPlace(queryBlobCopy); - // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end. 
- return new (this->allocator) HNSWSingle_BatchIterator( - queryBlobCopy, this, queryParams, this->allocator); -} +// template +// VecSimBatchIterator * +// HNSWIndex_Single::newBatchIterator(const void *queryBlob, +// VecSimQueryParams *queryParams) const { +// auto queryBlobCopy = +// this->allocator->allocate_aligned(this->dataSize, this->preprocessors->getAlignment()); +// memcpy(queryBlobCopy, queryBlob, this->dim * sizeof(DataType)); +// this->preprocessQueryInPlace(queryBlobCopy); +// // Ownership of queryBlobCopy moves to HNSW_BatchIterator that will free it at the end. +// return new (this->allocator) HNSWSingle_BatchIterator( +// queryBlobCopy, this, queryParams, this->allocator); +// } /** * Marks an element with the given label deleted, does NOT really change the current graph. diff --git a/src/VecSim/algorithms/hnsw/hnsw_tiered.h b/src/VecSim/algorithms/hnsw/hnsw_tiered.h index ee63e97ad..158980dc3 100644 --- a/src/VecSim/algorithms/hnsw/hnsw_tiered.h +++ b/src/VecSim/algorithms/hnsw/hnsw_tiered.h @@ -195,14 +195,14 @@ class TieredHNSWIndex : public VecSimTieredIndex { VecSimIndexInfo info() const override; VecSimIndexBasicInfo basicInfo() const override; VecSimInfoIterator *infoIterator() const override; - VecSimBatchIterator *newBatchIterator(const void *queryBlob, - VecSimQueryParams *queryParams) const override { - size_t blobSize = this->frontendIndex->getDim() * sizeof(DataType); - void *queryBlobCopy = this->allocator->allocate(blobSize); - memcpy(queryBlobCopy, queryBlob, blobSize); - return new (this->allocator) - TieredHNSW_BatchIterator(queryBlobCopy, this, queryParams, this->allocator); - } + // VecSimBatchIterator *newBatchIterator(const void *queryBlob, + // VecSimQueryParams *queryParams) const override { + // size_t blobSize = this->frontendIndex->getDim() * sizeof(DataType); + // void *queryBlobCopy = this->allocator->allocate(blobSize); + // memcpy(queryBlobCopy, queryBlob, blobSize); + // return new (this->allocator) + // 
TieredHNSW_BatchIterator(queryBlobCopy, this, queryParams, this->allocator); + // } inline void setLastSearchMode(VecSearchMode mode) override { return this->backendIndex->setLastSearchMode(mode); } diff --git a/src/VecSim/containers/data_blocks_container.cpp b/src/VecSim/containers/data_blocks_container.cpp index 2f4ccbc3e..424f72fe8 100644 --- a/src/VecSim/containers/data_blocks_container.cpp +++ b/src/VecSim/containers/data_blocks_container.cpp @@ -56,6 +56,8 @@ std::unique_ptr DataBlocksContainer::getIterator() c } #ifdef BUILD_TESTS +#ifdef SERIALIZE + void DataBlocksContainer::saveVectorsData(std::ostream &output) const { // Save data blocks for (size_t i = 0; i < this->numBlocks(); i++) { @@ -103,6 +105,7 @@ void DataBlocksContainer::restoreBlocks(std::istream &input, size_t num_vectors, } } +#endif void DataBlocksContainer::shrinkToFit() { this->blocks.shrink_to_fit(); } size_t DataBlocksContainer::numBlocks() const { return this->blocks.size(); } diff --git a/src/VecSim/containers/data_blocks_container.h b/src/VecSim/containers/data_blocks_container.h index 692f663fd..45d34df03 100644 --- a/src/VecSim/containers/data_blocks_container.h +++ b/src/VecSim/containers/data_blocks_container.h @@ -38,10 +38,13 @@ class DataBlocksContainer : public VecsimBaseObject, public RawDataContainer { std::unique_ptr getIterator() const override; #ifdef BUILD_TESTS +#ifdef SERIALIZE + void saveVectorsData(std::ostream &output) const override; // Use that in deserialization when file was created with old version (v3) that serialized // the blocks themselves and not just thw raw vector data. 
void restoreBlocks(std::istream &input, size_t num_vectors, Serializer::EncodingVersion); +#endif void shrinkToFit(); size_t numBlocks() const; #endif diff --git a/src/VecSim/containers/mapped_mem.h b/src/VecSim/containers/mapped_mem.h new file mode 100644 index 000000000..b9893a43d --- /dev/null +++ b/src/VecSim/containers/mapped_mem.h @@ -0,0 +1,227 @@ +#pragma once +#include +#include +#include +#include +#include "VecSim/vec_sim_interface.h" + +struct MappedMem { + MappedMem() : mapped_addr(nullptr), curr_size(0) { + // create a temporary file + fd = open(".", O_TMPFILE | O_EXCL | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); + if (fd == -1) { + VecSimIndexInterface::log_external("debug", "Failed to open file with error: %s", std::strerror(errno)); + + throw std::runtime_error("Failed to open file " + std::string("with error: ") + + std::strerror(errno)); + } + } + + // Transfer ownership of the file descriptor and the mapped memory and invalidate the current + // object + MappedMem(MappedMem &&other) noexcept + : mapped_addr(other.mapped_addr), curr_size(other.curr_size), fd(other.fd) { + other.mapped_addr = nullptr; + other.curr_size = 0; + other.fd = -1; + } + + // Move assignment operator + MappedMem &operator=(MappedMem &&other) { + if (this != &other) { + // Clean up current resources + if (fd != -1) { + throw std::runtime_error( + "overrding a mapped mem object with another mapped mem object is not allowed!"); + } + + // Transfer ownership + mapped_addr = other.mapped_addr; + curr_size = other.curr_size; + fd = other.fd; + + // Invalidate the moved-from object + other.mapped_addr = nullptr; + other.curr_size = 0; + other.fd = -1; + } + return *this; + } + + ~MappedMem() { + if (fd != -1) { + close(fd); + } + } + void destroy(size_t element_size_bytes, size_t block_size_bytes) { + if (!curr_size) + return; + // unmap memory + size_t total_bytes = curr_size * element_size_bytes; + size_t num_blocks = (total_bytes + block_size_bytes - 1) / 
block_size_bytes; + size_t fileSize = num_blocks * block_size_bytes; + munmap(mapped_addr, fileSize); + } + + + void appendElement(const void *element, size_t element_size_bytes) { + // write element to the end of the file + memcpy(mapped_addr + curr_size * element_size_bytes, element, element_size_bytes); + ++curr_size; + } + + const char *getElement(idType id, size_t element_size_bytes) const { + return mapped_addr + id * element_size_bytes; + } + + size_t get_elem_count() const { return curr_size; } + + bool is_full(size_t element_size_bytes, size_t block_size_bytes) const { + // if curr_size * element_size_bytes is a multiple of block_size_bytes, return true + return (curr_size * element_size_bytes) % block_size_bytes == 0; + } + + // return true if the memory is full and we had to resize it + bool growByBlock(size_t element_size_bytes, size_t block_size_bytes) { + // if curr_size * element_size_bytes is a multiple of block_size_bytes, return true + if (is_full(element_size_bytes, block_size_bytes)) { + // Resize the file to the required size + size_t curr_file_size_bytes = element_size_bytes * curr_size; + size_t new_file_size = curr_file_size_bytes + block_size_bytes; + int status = posix_fallocate(fd, 0, new_file_size); + if (status != 0) { + switch (status) { + case EBADF: + throw std::runtime_error("EBADF: Invalid file descriptor"); + case EFBIG: + throw std::runtime_error("EFBIG: offset+size exceeds the maximum file size"); + case EINVAL: + throw std::runtime_error( + "EINVAL: offset was less than 0, or size was less than or equal to \ + 0, or the underlying filesystem does not support the operation."); + case ENODEV: + throw std::runtime_error("ENODEV: fd does not refer to a regular file.."); + case ENOSPC: + throw std::runtime_error( + "ENOSPC: There is not enough space left on the device containing \ + the file referred to by fd"); + case EOPNOTSUPP: + throw std::runtime_error( + "EOPNOTSUPP: The filesystem containing the file referred to by fd does \ 
+ not support this operation. This error code can be returned by C \ + libraries that don't perform the emulation shown in CAVEATS, \ + such as musl libc."); + case ESPIPE: + throw std::runtime_error("ESPIPE: fd refers to a pipe."); + default: + throw std::runtime_error("posix_fallocate failed"); + } + } + + if (curr_size) { + char *remmapd_addr = (char *)mremap(mapped_addr, curr_file_size_bytes, + new_file_size, MREMAP_MAYMOVE); + if (remmapd_addr == MAP_FAILED) { + throw std::runtime_error("Failed to remmap memory " + + std::string("with error: ") + std::strerror(errno)); + } + mapped_addr = remmapd_addr; + } else { // first initialization + int mmap_flags = MAP_SHARED; + + // map memory + mapped_addr = static_cast( + mmap(NULL, new_file_size, PROT_READ | PROT_WRITE, mmap_flags, fd, 0)); + if (mapped_addr == MAP_FAILED) { + throw std::runtime_error("Failed to map file " + std::string("with error: ") + + std::strerror(errno)); + } + } + // Give advise about sequential access to ensure the entire element is loaded into + // memory + // TODO: benchmark different madvise options + + // flush changes to disk + // if (curr_file_size_bytes != 0) { + // if (msync(mapped_addr, curr_file_size_bytes, MS_SYNC) == -1) { + // throw std::runtime_error("msync failed " + std::string("with error: ") + + // std::strerror(errno)); + // } + + // if (madvise(mapped_addr, curr_file_size_bytes, MADV_DONTNEED) == -1) { + // throw std::runtime_error("madvise failed " + std::string("with error: ") + + // std::strerror(errno)); + // } + // VecSimIndexInterface::log_external( + // "debug", "madvise was called with MADV_DONTNEED on %zu bytes mapped from fd: %d.", + // curr_file_size_bytes, fd); + // } + VecSimIndexInterface::log_external("debug", "madvise was not set"); + return true; + } + + return false; + } + + char *mapped_addr; + size_t curr_size; // current element count in mapped memory + int fd; +}; + +struct VectorsMappedMemContainer : public VecsimBaseObject, public MappedMem { + + 
VectorsMappedMemContainer(size_t block_size_bytes, size_t element_size_bytes, + std::shared_ptr allocator) + : VecsimBaseObject(allocator), MappedMem(), element_bytes_count(element_size_bytes), + block_size_bytes(block_size_bytes) {} + + const char *getElement(size_t id) const { + return MappedMem::getElement(id, element_bytes_count); + } + + void addElement(const void *elem, size_t id) { + assert(id == curr_size); + // grow if needed + growByBlock(element_bytes_count, block_size_bytes); + this->appendElement(elem, element_bytes_count); + } + + size_t element_bytes_count; + size_t block_size_bytes; + + /************************ No op functions to enable compilation *******************/ + void removeElement(size_t id) {} + void updateElement(size_t id, const void *element) {} + + struct Iterator { + /** + * This is an abstract interface, constructor/destructor should be implemented by the + * derived classes + */ + explicit Iterator(const VectorsMappedMemContainer &container_) + : container(container_), cur_id(0){}; + virtual ~Iterator() = default; + + /** + * The basic iterator operations API + */ + virtual bool hasNext() const { return this->cur_id != this->container.curr_size; }; + virtual const char *next() { + if (this->hasNext()) { + return this->container.getElement(this->cur_id++); + } + return nullptr; + } + virtual void reset() { cur_id = 0; }; + + const VectorsMappedMemContainer &container; + size_t cur_id; + }; + + /** + * Create a new iterator. Should be freed by the iterator's destroctor. 
+ */ + std::unique_ptr getIterator() const { + return std::make_unique(*this); + } +}; diff --git a/src/VecSim/containers/raw_data_container_interface.h b/src/VecSim/containers/raw_data_container_interface.h index 2be992bde..4cecf3b1e 100644 --- a/src/VecSim/containers/raw_data_container_interface.h +++ b/src/VecSim/containers/raw_data_container_interface.h @@ -58,9 +58,12 @@ struct RawDataContainer { virtual std::unique_ptr getIterator() const = 0; #ifdef BUILD_TESTS +#ifdef SERIALIZE + /** * Save the raw data of all elements in the container to the output stream. */ virtual void saveVectorsData(std::ostream &output) const = 0; #endif +#endif }; diff --git a/src/VecSim/index_factories/hnsw_factory.cpp b/src/VecSim/index_factories/hnsw_factory.cpp index 8bd7f50dc..8d1a7a82f 100644 --- a/src/VecSim/index_factories/hnsw_factory.cpp +++ b/src/VecSim/index_factories/hnsw_factory.cpp @@ -136,10 +136,11 @@ size_t EstimateInitialSize(const HNSWParams *params, bool is_normalized) { size_t EstimateElementSize(const HNSWParams *params) { size_t M = (params->M) ? params->M : HNSW_DEFAULT_M; - size_t elementGraphDataSize = sizeof(ElementGraphData) + sizeof(idType) * M * 2; + (void)M; + // size_t elementGraphDataSize = sizeof(ElementGraphData) + sizeof(idType) * M * 2; - size_t size_total_data_per_element = - elementGraphDataSize + params->dim * VecSimType_sizeof(params->type); + // size_t size_total_data_per_element = + // elementGraphDataSize + params->dim * VecSimType_sizeof(params->type); // when reserving space for new labels in the lookup hash table, each entry is a pointer to a // label node (bucket). @@ -156,10 +157,12 @@ size_t EstimateElementSize(const HNSWParams *params) { * 2. The incoming edges that aren't bidirectional are stored in a dynamic array * (vecsim_stl::vector) Those edges' memory *is omitted completely* from this estimation. 
*/ - return size_meta_data + size_total_data_per_element; + // return size_meta_data + size_total_data_per_element; + return size_meta_data; } #ifdef BUILD_TESTS +#ifdef SERIALIZE template inline VecSimIndex *NewIndex_ChooseMultiOrSingle(std::ifstream &input, const HNSWParams *params, @@ -259,5 +262,6 @@ VecSimIndex *NewIndex(const std::string &location, bool is_normalized) { } } #endif +#endif }; // namespace HNSWFactory diff --git a/src/VecSim/index_factories/hnsw_factory.h b/src/VecSim/index_factories/hnsw_factory.h index 3f99a560c..5acb43273 100644 --- a/src/VecSim/index_factories/hnsw_factory.h +++ b/src/VecSim/index_factories/hnsw_factory.h @@ -26,12 +26,15 @@ size_t EstimateInitialSize(const HNSWParams *params, bool is_normalized = false) size_t EstimateElementSize(const HNSWParams *params); #ifdef BUILD_TESTS +#ifdef SERIALIZE + // Factory function to be used before loading a serialized index. // @params is only used for backward compatibility with V1. It won't be used if V2 and up is loaded. 
// Required fields: type, dim, metric and multi // Permission fields that *** must be initalized to zero ***: blockSize, epsilon * VecSimIndex *NewIndex(const std::string &location, bool is_normalized = false); +#endif #endif }; // namespace HNSWFactory diff --git a/src/VecSim/utils/vecsim_stl.h b/src/VecSim/utils/vecsim_stl.h index 0b24c2258..892fe303c 100644 --- a/src/VecSim/utils/vecsim_stl.h +++ b/src/VecSim/utils/vecsim_stl.h @@ -29,6 +29,12 @@ class vector : public VecsimBaseObject, public std::vector>(cap, alloc) {} explicit vector(size_t cap, T val, const std::shared_ptr &alloc) : VecsimBaseObject(alloc), std::vector>(cap, val, alloc) {} + explicit vector(const vector &other, const std::shared_ptr &alloc) + : VecsimBaseObject(alloc), std::vector>(alloc) { + this->assign(other.begin(), other.end()); + } + explicit vector(const vector &&other, const std::shared_ptr &alloc) + : VecsimBaseObject(alloc), std::vector>(std::move(other)) {} bool remove(T element) { auto it = std::find(this->begin(), this->end(), element); diff --git a/src/VecSim/vec_sim.cpp b/src/VecSim/vec_sim.cpp index 64378c7b0..7830c14df 100644 --- a/src/VecSim/vec_sim.cpp +++ b/src/VecSim/vec_sim.cpp @@ -23,6 +23,8 @@ extern "C" void VecSim_SetLogCallbackFunction(logCallbackFunction callback) { VecSimIndex::setLogCallbackFunction(callback); } +extern "C" void VecSim_ResetLogCallbackFunction() { VecSimIndex::resetLogCallbackFunction(); } + extern "C" void VecSim_SetWriteMode(VecSimWriteMode mode) { VecSimIndex::setWriteMode(mode); } static VecSimResolveCode _ResolveParams_EFRuntime(VecSimAlgo index_type, VecSimRawParam rparam, @@ -245,7 +247,8 @@ extern "C" VecSimIndexBasicInfo VecSimIndex_BasicInfo(VecSimIndex *index) { extern "C" VecSimBatchIterator *VecSimBatchIterator_New(VecSimIndex *index, const void *queryBlob, VecSimQueryParams *queryParams) { - return index->newBatchIterator(queryBlob, queryParams); + return NULL; + // return index->newBatchIterator(queryBlob, queryParams); } extern 
"C" void VecSimTieredIndex_GC(VecSimIndex *index) { diff --git a/src/VecSim/vec_sim.h b/src/VecSim/vec_sim.h index 80d43d74c..96bb38096 100644 --- a/src/VecSim/vec_sim.h +++ b/src/VecSim/vec_sim.h @@ -231,6 +231,7 @@ void VecSim_SetTimeoutCallbackFunction(timeoutCallbackFunction callback); * @param callback logCallbackFunction function. should get void* and return void. */ void VecSim_SetLogCallbackFunction(logCallbackFunction callback); +void VecSim_ResetLogCallbackFunction(); /** * @brief Allow 3rd party to set the write mode for tiered index - async insert/delete using diff --git a/src/VecSim/vec_sim_common.h b/src/VecSim/vec_sim_common.h index de93dcddc..c42d53e08 100644 --- a/src/VecSim/vec_sim_common.h +++ b/src/VecSim/vec_sim_common.h @@ -24,7 +24,7 @@ extern "C" { #define HNSW_DEFAULT_EF_C 200 #define HNSW_DEFAULT_EF_RT 10 #define HNSW_DEFAULT_EPSILON 0.01 -#define DEFAULT_BLOCK_SIZE 1024 +#define DEFAULT_BLOCK_SIZE 1024 * 10 #define HNSW_INVALID_LEVEL SIZE_MAX #define INVALID_JOB_ID UINT_MAX diff --git a/src/VecSim/vec_sim_index.h b/src/VecSim/vec_sim_index.h index d12b23d11..e101684fe 100644 --- a/src/VecSim/vec_sim_index.h +++ b/src/VecSim/vec_sim_index.h @@ -17,6 +17,7 @@ #include "info_iterator_struct.h" #include "containers/data_blocks_container.h" #include "containers/raw_data_container_interface.h" +#include "containers/mapped_mem.h" #include #include @@ -78,7 +79,7 @@ struct VecSimIndexAbstract : public VecSimIndexInterface { bool isMulti; // Determines if the index should multi-index or not. void *logCallbackCtx; // Context for the log callback. - RawDataContainer *vectors; // The raw vectors data container. + VectorsMappedMemContainer *vectors; // The raw vectors data container. 
/** * @brief Get the common info object @@ -109,8 +110,8 @@ struct VecSimIndexAbstract : public VecSimIndexInterface { lastMode(EMPTY_MODE), isMulti(params.multi), logCallbackCtx(params.logCtx) { assert(VecSimType_sizeof(vecType)); assert(dataSize); - this->vectors = new (this->allocator) DataBlocksContainer( - this->blockSize, this->dataSize, this->allocator, this->getAlignment()); + this->vectors = new (this->allocator) VectorsMappedMemContainer( + this->blockSize * this->dataSize, this->dataSize, this->allocator); } /** diff --git a/src/VecSim/vec_sim_interface.cpp b/src/VecSim/vec_sim_interface.cpp index 0051c5030..77f906181 100644 --- a/src/VecSim/vec_sim_interface.cpp +++ b/src/VecSim/vec_sim_interface.cpp @@ -16,3 +16,7 @@ void Vecsim_Log(void *ctx, const char *level, const char *message) { timeoutCallbackFunction VecSimIndexInterface::timeoutCallback = [](void *ctx) { return 0; }; logCallbackFunction VecSimIndexInterface::logCallback = Vecsim_Log; VecSimWriteMode VecSimIndexInterface::asyncWriteMode = VecSim_WriteAsync; + +void VecSimIndexInterface::resetLogCallbackFunction() { + VecSimIndexInterface::logCallback = Vecsim_Log; +} diff --git a/src/VecSim/vec_sim_interface.h b/src/VecSim/vec_sim_interface.h index 920a9943c..0951557bf 100644 --- a/src/VecSim/vec_sim_interface.h +++ b/src/VecSim/vec_sim_interface.h @@ -20,6 +20,22 @@ struct VecSimIndexInterface : public VecsimBaseObject { public: + static void log_external(const char *level, const char *fmt, ...) 
{ + if (VecSimIndexInterface::logCallback) { + // Format the message and call the callback + va_list args; + va_start(args, fmt); + int len = vsnprintf(NULL, 0, fmt, args); + va_end(args); + char *buf = new char[len + 1]; + va_start(args, fmt); + vsnprintf(buf, len + 1, fmt, args); + va_end(args); + logCallback(nullptr, level, buf); + delete[] buf; + } + } + /** * @brief Construct a new Vec Sim Index object * @@ -141,8 +157,8 @@ struct VecSimIndexInterface : public VecsimBaseObject { * @param queryBlob binary representation of the query vector. Blob size should match the index * data type and dimension. The index is responsible to process the query vector. */ - virtual VecSimBatchIterator *newBatchIterator(const void *queryBlob, - VecSimQueryParams *queryParams) const = 0; + // virtual VecSimBatchIterator *newBatchIterator(const void *queryBlob, + // VecSimQueryParams *queryParams) const = 0; /** * @brief Return True if heuristics says that it is better to use ad-hoc brute-force * search over the index instead of using batch iterator. @@ -194,6 +210,8 @@ struct VecSimIndexInterface : public VecsimBaseObject { VecSimIndexInterface::logCallback = callback; } + static void resetLogCallbackFunction(); + /** * @brief Allow 3rd party to set the write mode for tiered index - async insert/delete using * background jobs, or insert/delete inplace. 
diff --git a/src/python_bindings/bindings.cpp b/src/python_bindings/bindings.cpp index 836c113bc..3727ed72b 100644 --- a/src/python_bindings/bindings.cpp +++ b/src/python_bindings/bindings.cpp @@ -199,6 +199,8 @@ class PyVecSimIndex { size_t indexMemory() { return this->index->getAllocationSize(); } + size_t getBlockSize() const { return index->info().commonInfo.basicInfo.blockSize; } + virtual PyBatchIterator createBatchIterator(const py::object &input, VecSimQueryParams *query_params) { py::array query(input); @@ -226,7 +228,13 @@ class PyVecSimIndex { } } - virtual ~PyVecSimIndex() = default; // Delete function was given to the shared pointer object + void disableLogCallback() { VecSim_SetLogCallbackFunction(nullptr); } + + void resetLogCallback() { VecSim_ResetLogCallbackFunction(); } + + virtual ~PyVecSimIndex() { + resetLogCallback(); + }; // Delete function was given to the shared pointer object }; class PyHNSWLibIndex : public PyVecSimIndex { @@ -278,40 +286,65 @@ class PyHNSWLibIndex : public PyVecSimIndex { this->index = std::shared_ptr(VecSimIndex_New(¶ms), VecSimIndex_Free); } - // @params is required only in V1. 
- explicit PyHNSWLibIndex(const std::string &location) { - this->index = - std::shared_ptr(HNSWFactory::NewIndex(location), VecSimIndex_Free); - } - - void setDefaultEf(size_t ef) { - auto *hnsw = reinterpret_cast *>(index.get()); - hnsw->setEf(ef); - } - void saveIndex(const std::string &location) { + size_t getMaxLevel() const { auto type = VecSimIndex_Info(this->index.get()).commonInfo.basicInfo.type; if (type == VecSimType_FLOAT32) { auto *hnsw = dynamic_cast *>(index.get()); - hnsw->saveIndex(location); + return hnsw->getMaxLevel(); } else if (type == VecSimType_FLOAT64) { auto *hnsw = dynamic_cast *>(index.get()); - hnsw->saveIndex(location); + return hnsw->getMaxLevel(); } else if (type == VecSimType_BFLOAT16) { auto *hnsw = dynamic_cast *>(index.get()); - hnsw->saveIndex(location); + return hnsw->getMaxLevel(); } else if (type == VecSimType_FLOAT16) { auto *hnsw = dynamic_cast *>(index.get()); - hnsw->saveIndex(location); + return hnsw->getMaxLevel(); } else if (type == VecSimType_INT8) { auto *hnsw = dynamic_cast *>(index.get()); - hnsw->saveIndex(location); + return hnsw->getMaxLevel(); } else if (type == VecSimType_UINT8) { auto *hnsw = dynamic_cast *>(index.get()); - hnsw->saveIndex(location); + return hnsw->getMaxLevel(); } else { throw std::runtime_error("Invalid index data type"); } } + + // @params is required only in V1. 
+ // explicit PyHNSWLibIndex(const std::string &location) { + // this->index = + // std::shared_ptr(HNSWFactory::NewIndex(location), VecSimIndex_Free); + // } + + void setDefaultEf(size_t ef) { + auto *hnsw = reinterpret_cast *>(index.get()); + hnsw->setEf(ef); + } + // void saveIndex(const std::string &location) { + // auto type = VecSimIndex_Info(this->index.get()).commonInfo.basicInfo.type; + // if (type == VecSimType_FLOAT32) { + // auto *hnsw = dynamic_cast *>(index.get()); + // hnsw->saveIndex(location); + // } else if (type == VecSimType_FLOAT64) { + // auto *hnsw = dynamic_cast *>(index.get()); + // hnsw->saveIndex(location); + // } else if (type == VecSimType_BFLOAT16) { + // auto *hnsw = dynamic_cast *>(index.get()); + // hnsw->saveIndex(location); + // } else if (type == VecSimType_FLOAT16) { + // auto *hnsw = dynamic_cast *>(index.get()); + // hnsw->saveIndex(location); + // } else if (type == VecSimType_INT8) { + // auto *hnsw = dynamic_cast *>(index.get()); + // hnsw->saveIndex(location); + // } else if (type == VecSimType_UINT8) { + // auto *hnsw = dynamic_cast *>(index.get()); + // hnsw->saveIndex(location); + // } else { + // throw std::runtime_error("Invalid index data type"); + // } + // } py::object searchKnnParallel(const py::object &input, size_t k, VecSimQueryParams *query_params, int n_threads) { @@ -422,36 +455,36 @@ class PyHNSWLibIndex : public PyVecSimIndex { } } - bool checkIntegrity() { - auto type = VecSimIndex_Info(this->index.get()).commonInfo.basicInfo.type; - if (type == VecSimType_FLOAT32) { - return dynamic_cast *>(this->index.get()) - ->checkIntegrity() - .valid_state; - } else if (type == VecSimType_FLOAT64) { - return dynamic_cast *>(this->index.get()) - ->checkIntegrity() - .valid_state; - } else if (type == VecSimType_BFLOAT16) { - return dynamic_cast *>(this->index.get()) - ->checkIntegrity() - .valid_state; - } else if (type == VecSimType_FLOAT16) { - return dynamic_cast *>(this->index.get()) - ->checkIntegrity() - 
.valid_state; - } else if (type == VecSimType_INT8) { - return dynamic_cast *>(this->index.get()) - ->checkIntegrity() - .valid_state; - } else if (type == VecSimType_UINT8) { - return dynamic_cast *>(this->index.get()) - ->checkIntegrity() - .valid_state; - } else { - throw std::runtime_error("Invalid index data type"); - } - } + // bool checkIntegrity() { + // auto type = VecSimIndex_Info(this->index.get()).commonInfo.basicInfo.type; + // if (type == VecSimType_FLOAT32) { + // return dynamic_cast *>(this->index.get()) + // ->checkIntegrity() + // .valid_state; + // } else if (type == VecSimType_FLOAT64) { + // return dynamic_cast *>(this->index.get()) + // ->checkIntegrity() + // .valid_state; + // } else if (type == VecSimType_BFLOAT16) { + // return dynamic_cast *>(this->index.get()) + // ->checkIntegrity() + // .valid_state; + // } else if (type == VecSimType_FLOAT16) { + // return dynamic_cast *>(this->index.get()) + // ->checkIntegrity() + // .valid_state; + // } else if (type == VecSimType_INT8) { + // return dynamic_cast *>(this->index.get()) + // ->checkIntegrity() + // .valid_state; + // } else if (type == VecSimType_UINT8) { + // return dynamic_cast *>(this->index.get()) + // ->checkIntegrity() + // .valid_state; + // } else { + // throw std::runtime_error("Invalid index data type"); + // } + // } PyBatchIterator createBatchIterator(const py::object &input, VecSimQueryParams *query_params) override { py::array query(input); @@ -624,26 +657,30 @@ PYBIND11_MODULE(VecSim, m) { .def("index_size", &PyVecSimIndex::indexSize) .def("index_type", &PyVecSimIndex::indexType) .def("index_memory", &PyVecSimIndex::indexMemory) + .def("index_block_size", &PyVecSimIndex::getBlockSize) .def("create_batch_iterator", &PyVecSimIndex::createBatchIterator, py::arg("query_blob"), py::arg("query_param") = nullptr) - .def("get_vector", &PyVecSimIndex::getVector); + .def("get_vector", &PyVecSimIndex::getVector) + .def("disable_logs", &PyVecSimIndex::disableLogCallback) + 
.def("reset_logs", &PyVecSimIndex::resetLogCallback); py::class_(m, "HNSWIndex") .def(py::init([](const HNSWParams ¶ms) { return new PyHNSWLibIndex(params); }), py::arg("params")) - .def(py::init([](const std::string &location) { return new PyHNSWLibIndex(location); }), - py::arg("location")) + // .def(py::init([](const std::string &location) { return new PyHNSWLibIndex(location); }), + // py::arg("location")) .def("set_ef", &PyHNSWLibIndex::setDefaultEf) - .def("save_index", &PyHNSWLibIndex::saveIndex) + // .def("save_index", &PyHNSWLibIndex::saveIndex) .def("knn_parallel", &PyHNSWLibIndex::searchKnnParallel, py::arg("queries"), py::arg("k"), py::arg("query_param") = nullptr, py::arg("num_threads") = -1) .def("add_vector_parallel", &PyHNSWLibIndex::addVectorsParallel, py::arg("vectors"), py::arg("labels"), py::arg("num_threads") = -1) - .def("check_integrity", &PyHNSWLibIndex::checkIntegrity) + // .def("check_integrity", &PyHNSWLibIndex::checkIntegrity) .def("range_parallel", &PyHNSWLibIndex::searchRangeParallel, py::arg("queries"), py::arg("radius"), py::arg("query_param") = nullptr, py::arg("num_threads") = -1) .def("create_batch_iterator", &PyHNSWLibIndex::createBatchIterator, py::arg("query_blob"), - py::arg("query_param") = nullptr); + py::arg("query_param") = nullptr) + .def("index_max_level", &PyHNSWLibIndex::getMaxLevel); py::class_(m, "TieredIndex") .def("wait_for_index", &PyTieredIndex::WaitForIndex, py::arg("waiting_duration") = 10) diff --git a/tests/flow/download_dataset.py b/tests/flow/download_dataset.py new file mode 100644 index 000000000..6edda2ed7 --- /dev/null +++ b/tests/flow/download_dataset.py @@ -0,0 +1,111 @@ +from datasets import load_dataset +import numpy as np +import pickle +import time +import os + +DOWNLOAD_DATASET = False +VERIFY_DATASET = False +env_var = os.environ.get('DOWNLOAD_MULTILANG_DATASET') +if env_var == 'false': + DOWNLOAD_DATASET = False +if env_var == 'true': + DOWNLOAD_DATASET = True + +env_var = 
os.environ.get('VERIFY_MULTILANG_DATASET') +if env_var == 'false': + VERIFY_DATASET = False +if env_var == 'true': + VERIFY_DATASET = True + +lang = "en" #Use the Simple English Wikipedia subset +# num_vectors_train = 2000 +# num_vectors_test = 1 +num_vectors_train = 10_000_000 +num_vectors_test = 10_000 +num_vectors = num_vectors_train + num_vectors_test + +dim = 1024 +docs = load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3", lang, split="train", streaming=True) +vecs = np.zeros((num_vectors, dim), dtype=np.float32) # Use float32 for memory efficiency + +fields = ['emb', '_id', 'text', 'title'] +# files format: multilang_n_{num_vectors_train}_q_{num_vectors_test}_{field}.pik + +# dict of : +data = {} + +file_base_name = f"multilang_n_{num_vectors_train}_q_{num_vectors_test}" +def download_dataset(): + should_download = 'y' + for field in fields: + file = f"{file_base_name}_{field}.pik" + if os.path.exists(file): + should_download = input(f"{field} file exists. Should override? (y/n)") + else: + should_download = input(f"{field} file does not exist. Should we create it? 
(y/n)") + if should_download.lower() == 'y': + print(f"Downloading {field} data to {file}") + if field == 'emb': + data[field] = vecs + else: + data[field] = [] + + if data == {}: + print("Nothing to download") + return + + counter = 0 + start_time = time.time() # Start timing + for doc in docs: + if counter == num_vectors: + break + for key in data.keys(): + if key == 'emb': + vecs[counter] = doc[key] # add to pre-allocated numpy array + else: + data[key].append(doc[key]) # add to meta data list + + counter += 1 + end_time = time.time() # End timing + print('load time: ',f"T{end_time - start_time:.4f} seconds") + start_time = time.time() # Start timing + + for key in data.keys(): + with open(f"{file_base_name}_{key}.pik", 'wb') as f: + pickle.dump(data[key], f) + +def load_dataset_from_disk(): + file = f"{file_base_name}_emb.pik" + with open(file,'rb') as f: + unpickled_array = pickle.load(f) + # dim = unpickled_array.shape[1] + print('Array shape: ' + str(unpickled_array.shape)) + print('Data type: '+str(type(unpickled_array))) + +def verify_downloaded_dataset(): + for field in fields: + file = f"{file_base_name}_{field}.pik" + print(f"Verifying {file}") + if os.path.exists(file): + with open(file,'rb') as f: + unpickled_array = pickle.load(f) + if field == 'emb': + assert unpickled_array.shape == (num_vectors, dim) + print("Ensure no zero vectors in the array") + for i in range(num_vectors): + assert np.any(unpickled_array[i]), f"Array at index {i} is all zeros" + elif field == '_id': + assert len(unpickled_array) == num_vectors + elif field == 'text': + assert len(unpickled_array) == num_vectors + elif field == 'title': + assert len(unpickled_array) == num_vectors + print(f"{field} - ok") + +if DOWNLOAD_DATASET == True: + download_dataset() +if VERIFY_DATASET == True: + verify_downloaded_dataset() +# load_dataset_from_disk() +# diff --git a/tests/flow/test_mashoo.py b/tests/flow/test_mashoo.py new file mode 100644 index 000000000..05ce1f887 --- /dev/null +++ 
b/tests/flow/test_mashoo.py @@ -0,0 +1,142 @@ +import os +os.environ['DOWNLOAD_MULTILANG_DATASET'] = 'false' +os.environ['VERIFY_MULTILANG_DATASET'] = 'false' + +# from download_dataset import file_base_name, num_vectors_train, num_vectors_test # if download_dataset() is not commented it will run +num_vectors_train = 10_000_000 +num_vectors_test = 10_000 +file_base_name = f"multilang_n_{num_vectors_train}_q_{num_vectors_test}" +import pickle +import time +vectors_file_name = f"{file_base_name}_emb.pik" + +def split_pickle_file(vectors_file_name, splits): + with open(vectors_file_name,'rb') as f: + print(f"loading vectors files from {vectors_file_name}") + unpickled_array = pickle.load(f) + print('Array shape: ' + str(unpickled_array.shape)) + print('Data type: '+str(type(unpickled_array))) + + vectors_data = unpickled_array[:num_vectors_train] + queries_data = unpickled_array[num_vectors_train:num_vectors_train + num_vectors_test] + assert len(vectors_data) == num_vectors_train + assert len(queries_data) == num_vectors_test + + batch_size = num_vectors_train // splits + if num_vectors_train % splits != 0: + batch_size += 1 + curr_idx = 0 + + only_vecs_file_name = f"multilang_n_{num_vectors_train}_emb" + + for split in range(splits): + with open(f"{only_vecs_file_name}_split_{split}.pik", 'wb') as f: + pickle.dump(vectors_data[curr_idx:curr_idx + batch_size], f) + + # Save queries data + only_queries_file_name = f"multilang_q_{num_vectors_test}_emb" + with open(f"{only_queries_file_name}.pik", 'wb') as f: + pickle.dump(queries_data, f) + + +def get_vector_file_count(): + splits = 0 + while os.path.exists(f"{only_vecs_file_name}_split_{splits}.pik"): + splits += 1 + return splits + +def open_pickled_file(filename): + with open(filename,'rb') as f: + print(f"loading {filename}") + unpickled_array = pickle.load(f) + print('Array shape: ' + str(unpickled_array.shape)) + print('Data type: '+str(type(unpickled_array))) + + return unpickled_array +only_vecs_file_name = 
f"multilang_n_{num_vectors_train}_emb" + +import numpy as np +def check_queries(): + queries_data = open_pickled_file(f"multilang_q_{num_vectors_test}_emb.pik") + queries_from_all_vectors_data = open_pickled_file(vectors_file_name)[num_vectors_train:num_vectors_train + num_vectors_test] + assert len(queries_data) == len(queries_from_all_vectors_data) + assert len(queries_data) == num_vectors_test + print(f"queries_file_shape: {queries_data.shape}") + print(f"queries_from_all_vectors_file_shape: {queries_from_all_vectors_data.shape}") + assert np.array_equal(queries_data, queries_from_all_vectors_data) + +def timed_populate_index(index= None, num_vectors = num_vectors_train): + batches_count = get_vector_file_count() + total_time = 0 + total_vectors = 0 + sum_vec_sanity = 0 + for i in range(batches_count): + filename = f"{only_vecs_file_name}_split_{i}.pik" + with open(filename,'rb') as f: + print(f"loading {filename}") + vectors_data = pickle.load(f) + print('Array shape: ' + str(vectors_data.shape)) + print('Data type: '+str(type(vectors_data))) + len_vectors_data = len(vectors_data) + + # limit the number of vectors to num_vectors + if total_vectors + len_vectors_data > num_vectors: + len_vectors_data = num_vectors - total_vectors + start_time = time.time() # Start timing + for i, vector in enumerate(vectors_data[:len_vectors_data]): + sum_vec_sanity += 1 + end_time = time.time() # End timing + total_time += end_time - start_time + total_vectors += len_vectors_data + print(f"Batch {i}: vectors: {len_vectors_data} took: T{end_time - start_time:.4f} seconds") + print(f"expected {total_vectors} vectors, sanity check: {sum_vec_sanity}") + +file_name_prefix = "multilang" +def create_ground_truth_file_name(num_vectors, num_queries): + return f"{file_name_prefix}_n_{num_vectors}_q_{num_queries}_gt.npy" + +def check_gt(num_vectors=num_vectors_train, num_queries=num_vectors_test): + gt_file_name = create_ground_truth_file_name(num_vectors, num_queries) + print(f"loading 
ground truth file from {gt_file_name}") + my_gt = np.load(gt_file_name, allow_pickle=True).item() + queries_data = open_pickled_file(f"multilang_q_{num_queries}_emb.pik") + print(f"my queries data len: {len(queries_data)}") + + # print("my_gt[0]", my_gt[1]) + print("type(my_gt.items()[1])", type(my_gt)) + omer_gt_prefix = "/home/ubuntu/VectorSimilarity/ground_truth" + + for i in range(num_queries): + gt_labels, gt_distances = my_gt[i] + omer_gt_labels_file_name = f"{omer_gt_prefix}/ids{i}.npy" + + omer_gt = np.load(omer_gt_labels_file_name) + if not np.array_equal(gt_labels[0], omer_gt[0]): + print(f"gt_labels for query {i} are not equal") + for j, label in enumerate(gt_labels[0]): + if label != omer_gt[0][j]: + print(f"label {j} is diffrent. my_gt: {label}, omer_gt: {omer_gt[0][j]}") + print(f"distance is: {gt_distances[0][j]}") + print() + + # omer_gt_vector_file_name = f"{omer_gt_prefix}/vector{i}.npy" + # try: + # omer_query = np.load(omer_gt_vector_file_name) + # print(f"omer query: {omer_query}") + # except EOFError as e: + # print("EOFError at query: ", i) + # if not np.array_equal(queries_data[i], omer_query[0]): + # print(f"queries_data for query {i} are not equal") + # for j, val in enumerate(queries_data[i]): + # if val != omer_query[0][j]: + # print(f"val {j} is diffrent. 
my_gt: {val}, omer_gt: {omer_query[0][j]}") + # print() + # try: + # assert np.array_equal(queries_data[i], np.load(omer_gt_vector_file_name)), f"query {i} is not equal" + # except EOFError as e: + # print("EOFError at query: ", i) + +check_gt() +# split_pickle_file(vectors_file_name, 5) +# timed_populate_index() +# check_queries() diff --git a/tests/flow/test_mmap.py b/tests/flow/test_mmap.py new file mode 100644 index 000000000..1482c0a8d --- /dev/null +++ b/tests/flow/test_mmap.py @@ -0,0 +1,444 @@ + +# use to disable automatic dataset download in download_dataset.py +import os +os.environ['DOWNLOAD_MULTILANG_DATASET'] = 'false' +os.environ['VERIFY_MULTILANG_DATASET'] = 'false' + +# from download_dataset import file_base_name, num_vectors_train, num_vectors_test # if download_dataset() is not commented it will run +num_vectors_train = 10_000_000 +num_vectors_test = 10_000 +file_base_name = f"multilang_n_{num_vectors_train}_q_{num_vectors_test}" + +import pickle +from VecSim import * +from common import create_hnsw_index, create_flat_index +import hnswlib +from numpy.testing import assert_allclose +import numpy as np +import time +import json +import psutil + +# To check file size at runtime look for the smallest fd that is marked as deleted +# this is level 0 vectors file. 
+# run stat /proc/283956/fd/ --dereference +# Then divide it by the level 0 element data size to get the number of currently indexed vectors +# for example, M = 60, (M0 = 120): +# level0 data size: 2 * M * sizeof(id_type) + sizeof(metadata_struct) + sizeof(num_links_data_type) +# 2 * 60 * 4 + 8 + 2 = 490 +# divide it by 490 (bytes per vector) + +# this one works as well: +# sudo lsof -p `pgrep pytest` | awk '$4 ~ /^[0-9]+u$/' + +# Globals +RUN_SANITY = False +RUN_BM = True +RUN_GT = False + +MAX_K = 100 +MMAP_ADVISE = "None" +ENABLE_LOGS = False +LOAD_ALL_VECTORS = True + +PROCESS_LIMIT_HIGH = "2G" +PROCESS_LIMIT_MAX = "None" + +TIMESTAMP_PRINT = True + +file_name_prefix = "multilang" +all_vectors_file_name = f"{file_base_name}_emb.pik" +only_vecs_file_name = f"multilang_n_{num_vectors_train}_emb" +queries_file_name = f"multilang_q_{num_vectors_test}_emb.pik" + +default_print = print +def timestamped_print(*args, **kwargs): + """ Custom print function that prepends a timestamp to each message. """ + timestamp = time.time() # Unix timestamp + default_print(f"[{timestamp}]", *args, **kwargs) + +# Override the built-in print function globally +if TIMESTAMP_PRINT: print = timestamped_print + +def get_rss_memory_usage_bytes(): + process = psutil.Process() + return process.memory_info().rss # RSS in bytes + +def create_ground_truth_file_name(num_vectors, num_queries): + return f"{file_name_prefix}_n_{num_vectors}_q_{num_queries}_gt.npy" + +def open_pickled_file(filename, mode='rb'): + with open(filename, 'rb') as f: + print(f"loading {filename}") + data = pickle.load(f) + print('Array shape: ' + str(data.shape)) + print('Data type: '+str(type(data))) + + return data + +queries_data = open_pickled_file(queries_file_name) +dim = queries_data.shape[1] +if LOAD_ALL_VECTORS: + vectors_data = open_pickled_file(all_vectors_file_name) + input(f"PID: {os.getpid()} Done loading vectors, press enter to continue") + +def get_vector_file_count(): + splits = 0 + while 
os.path.exists(f"{only_vecs_file_name}_split_{splits}.pik"): + splits += 1 + return splits + +def timed_populate_index(index, num_vectors, check_memory_interval=0): + check_memory_interval = num_vectors // 10 if check_memory_interval == 0 else check_memory_interval + + batches_count = get_vector_file_count() + build_time = 0 + total_vectors = 0 + done = False + for i in range(batches_count): + filename = f"{only_vecs_file_name}_split_{i}.pik" + with open(filename,'rb') as f: + if done == True: + break + print(f"loading {filename}") + vectors_data = pickle.load(f) + print('Array shape: ' + str(vectors_data.shape)) + print('Data type: '+str(type(vectors_data))) + len_vectors_data = len(vectors_data) + + # limit the number of vectors to num_vectors + if total_vectors + len_vectors_data > num_vectors: + len_vectors_data = num_vectors - total_vectors + done = True + + start_time = time.time() # Start timing + for vector in vectors_data[:len_vectors_data]: + index.add_vector(vector, total_vectors) + total_vectors += 1 + + if total_vectors % check_memory_interval == 0: + end_time = time.time() + build_time += end_time - start_time + print(f"Building {total_vectors} vectors time: ",f"T{build_time:.4f} seconds") + curr_mem = index.index_memory() + print(f"Current memory usage: {curr_mem} bytes, {(curr_mem / 1024 / 1024):.4f} MB, {(index.index_memory() / 1024 / 1024 / 1024):.4f} GB") + end_time = time.time() # End timing + build_time += end_time - start_time + print(f"Batch {i}: vectors: {len_vectors_data} took: T{end_time - start_time:.4f} seconds") + + print(f"printing a vec: {vectors_data[0]}") + + return build_time + + +def build_grount_truth(num_vectors=num_vectors_train, num_queries=num_vectors_test, dim=1024): + index = create_flat_index(dim, VecSimMetric_L2, VecSimType_FLOAT32) + print("\nBuilding Flat index") + # build_time = timed_populate_index(index, num_vectors) + start_time = time.time() # Start timing + for i, vector in enumerate(vectors_data[:num_vectors]): + 
index.add_vector(vector, i) + + end_time = time.time() + build_time = end_time - start_time + print('Building time: ',f"T{build_time:.4f} seconds") + + print(f"Get {MAX_K} ground truth vectors for each query vector") + + knn_results = {} + for i, query_vector in enumerate(queries_data[:num_queries]): + labels, distances = index.knn_query(query_vector, k=MAX_K) + knn_results[i] = (labels, distances) + + ground_truth_file_name = create_ground_truth_file_name(num_vectors, num_queries) + np.save(ground_truth_file_name, knn_results, allow_pickle=True) + +if RUN_GT: + build_grount_truth(num_vectors=1_000_000) + +def load_gt(num_vectors, num_queries): + ground_truth_file_name = create_ground_truth_file_name(num_vectors, num_queries) + print(f"loading {ground_truth_file_name}") + return np.load(ground_truth_file_name, allow_pickle=True).item() + +def write_result_to_file(result, filename="results.json", override=False): + print(f"writing results to file {filename}") + mode = 'a' if not override else 'w' + try: + with open(filename, mode) as f: + json.dump(result, f, indent=4) + f.write('\n') # Write each result on a new line + except Exception as e: + print(f"Failed to write result to file: {e}") + +def bm_query(index, k, efR, gt_results, num_queries=num_vectors_test): + print(f"\nRunning {num_queries} queries benchmark with params: efR: {efR}, k: {k}") + index.set_ef(efR) + total_query_time = 0 + recall = 0 + total_recall = 0 + for i, query_data in enumerate(queries_data[:num_queries]): + start_time = time.time() + hnsw_labels, hnsw_distances = index.knn_query(query_data, k=k) + end_time = time.time() + total_query_time += end_time - start_time + + # compute recall + gt_labels, gt_distances = gt_results[i] + recall = set(hnsw_labels.flatten()).intersection(set(gt_labels.flatten())) + # print(f"hnsw_labels.flatten(): {hnsw_labels.flatten()}") + # print(f"gt_labels.flatten(): {gt_labels.flatten()}") + recall = float(len(recall)) / float(k) + total_recall += recall + if i % 
1000 == 0: + print(f"Query {i}: recall={recall}, time={total_query_time:.4f} seconds") + + avg_query_time = total_query_time / num_queries + avg_recall = total_recall / num_queries + + result = { + "num_queries": num_queries, + "k": k, + "efR": efR, + "total_query_time": total_query_time, + "avg_query_time": avg_query_time, + "avg_recall": avg_recall + } + + return result + + +def bm_test_case(M, efC, Ks_efR, num_vectors=num_vectors_train, num_queries=num_vectors_test): + index = create_hnsw_index( + dim, + num_vectors, + VecSimMetric_L2, + VecSimType_FLOAT32, + m=M, + ef_construction=efC, + ) + + if ENABLE_LOGS == False: index.disable_logs() + + index_block_size = index.index_block_size() + print(f'''\nBuilding index of size {num_vectors:,} with params: ", f"M: {M}, efC: {efC}, index_block_size: {index_block_size}, madvise: {MMAP_ADVISE}, + process high limit: {PROCESS_LIMIT_HIGH}, process max limit: {PROCESS_LIMIT_MAX}\n + ''', flush=True) + + check_memory_interval = num_vectors // 50 + + if LOAD_ALL_VECTORS == False: open_pickled_file(all_vectors_file_name) + build_time = 0 + start_time = time.time() # Start timing + for i, vector in enumerate(vectors_data[:num_vectors]): + index.add_vector(vector, i) + + if i % check_memory_interval == 0: + end_time = time.time() + build_time += end_time - start_time + print(f"Building {i} vectors time: ",f"T{build_time:.4f} seconds", flush=True) + curr_mem = index.index_memory() + print(f"Current index memory usage: {curr_mem} bytes, {(curr_mem / 1024 / 1024):.4f} MB, {(index.index_memory() / 1024 / 1024 / 1024):.4f} GB") + + # proc_rss = get_rss_memory_usage_bytes() + # print(f"Current process RSS memory usage: {proc_rss} bytes, {(proc_rss / 1024 / 1024):.4f} MB, {(proc_rss / 1024 / 1024 / 1024):.4f} GB") + start_time = time.time() + + print('\nBuilding time: ',f"{build_time:.4f} seconds, {(build_time / 60):.4f} m, {(build_time / 60 / 60):.4f} h\n") + index_max_level = index.index_max_level() + print(f"index_max_level: 
{index_max_level}") + final_allocations_mem_gb = index.index_memory()/ 1024 / 1024 / 1024 + print(f"index allocation size: {final_allocations_mem_gb:.4f} GB") + + # Sanity checks + print(f"disk hnsw index contains {(index.index_size()):,} vectors") + + random_query_index = np.random.randint(0, num_vectors) + + # query with a vector from the dataset + index_query_data = index.get_vector(random_query_index)[0] + query_data = vectors_data[random_query_index] + assert np.array_equal(query_data, index_query_data) + print("query_data is equal to index_query_data") + + # expect to get the same vector back with distance of 0 + labels, distances = index.knn_query(query_data, k=1) + print(f"testing vector: {random_query_index}") + print("labels: ", labels) + print("distances: ", distances) + sanity_checks = { + "index_size": "Pass" if (index.index_size() == num_vectors) else "Fail", + "query_label_result": "Pass" if (labels[0][0] == random_query_index) else "Fail", + "distance_check": "Pass" if (distances[0][0] == float(0)) else "Fail", + } + failure_info = {} + + + if sanity_checks["index_size"] == "Fail": + failure_info["index_size"] = { + "expected": index.index_size(), + "actual": num_vectors + } + + if sanity_checks["query_label_result"] == "Fail": + failure_info["query_label_result"] = { + "expected": int(labels[0][0]), + "actual": random_query_index + } + + if sanity_checks["distance_check"] == "Fail": + failure_info["distance_check"] = { + "expected": distances[0][0], + "actual": float(0) + } + benchmark_settings = { + "mmap_advise": MMAP_ADVISE, + "index_block_size": index_block_size, + "process_limit_high": PROCESS_LIMIT_HIGH, + "process_limit_max": PROCESS_LIMIT_MAX, + } + + build_result = { + "M": M, + "efC": efC, + "num_vectors": num_vectors, + "build_time": build_time, + "max_level": index_max_level, + "allocations_mem_gb": final_allocations_mem_gb, + "sanity_checks": sanity_checks, + "failure_info": failure_info, + } + + gt_results = load_gt(num_vectors, 
num_queries) + knn_bm_results = [] + for k, efR in Ks_efR: + queries_reslts = bm_query(index, k=k, efR=efR, gt_results=gt_results) + knn_bm_results.append(queries_reslts) + + result = { + "benchmark_settings": benchmark_settings, + "build_bm": build_result, + "knn_bm_results": knn_bm_results, + } + + results_file_name = f"results_M_{M}_efC_{efC}_vec_{num_vectors}_q_{num_queries}_madvise_{MMAP_ADVISE}_bs_{index_block_size}_mem_limit_{PROCESS_LIMIT_HIGH}.json" + write_result_to_file(result, filename=results_file_name, override=True) + +def bm(): + input(f"PID: {os.getpid()} Press Enter to continue...") + # print(f"PID: {os.getpid()}", flush=True) + # time.sleep(20) + # Build params + Ms_efC = [(60, 75), (120, 150), (150, 150), (200, 120), (200, 150)] + + # query params + Ks = [1, 10, 100] + factors = [200, 100, 20, 10, 2] + factors = [2, 10, 20, 100, 200] + Ks_efR = [] + max_efR = 200 + for k in Ks: + for factor in factors: + if k * factor <= max_efR: + Ks_efR.append((k, k * factor)) + + for M, efC in Ms_efC[:1]: + bm_test_case(M=M, efC=efC, Ks_efR=Ks_efR) + +def sanity_vecsim_mmap(): + efRuntime = 10 + M = 16 + efConstruction = 100 + + num_vectors = min(1000, num_vectors_train - 1) + + index = create_hnsw_index( + dim, + num_vectors_train, + VecSimMetric_L2, + VecSimType_FLOAT32, + m=M, + ef_construction=efConstruction, + ef_runtime=efRuntime, + ) + print("\ndisk hnsw index created") + + print("Create hnswlib index for sanity testing") + p = hnswlib.Index(space='l2', dim=dim) + p.init_index(max_elements=num_vectors_train, ef_construction=efConstruction, M=M) + p.set_ef(efRuntime) + + vectors_data = open_pickled_file(all_vectors_file_name) + for i, vector in enumerate(vectors_data[:num_vectors]): + index.add_vector(vector, i) + p.add_items(vector, i) + + print(f"disk hnsw index containts {index.index_size()} vectors") + print(f"hnswlib index containts {p.get_current_count()} vectors") + + print("Testing knn") + query_data = queries_data[0] + hnswlib_labels, 
hnswlib_distances = p.knn_query(query_data, k=10) + redis_labels, redis_distances = index.knn_query(query_data, 10) + assert_allclose(hnswlib_labels, redis_labels, rtol=1e-5, atol=0) + # print(f"redis labels = {redis_labels}, hnswlib labels = {hnswlib_labels}") + assert_allclose(hnswlib_distances, redis_distances, rtol=1e-5, atol=0) + # print(f"redis distances = {redis_distances}, hnswlib distances = {hnswlib_distances}") + print("Testing knn ok") + +def gt_vecsim_mmap(): + num_vectors = min(1000, num_vectors_train - 1) + efRuntime = 10 + M = 16 + efConstruction = 100 + metric = VecSimMetric_L2 + data_type = VecSimType_FLOAT32 + index = create_hnsw_index( + dim=dim, + num_elements=num_vectors, + metric=metric, + data_type=data_type, + m=M, + ef_construction=efConstruction, + ef_runtime=efRuntime, + ) + print("\ndisk hnsw index created") + + bf_index = create_flat_index(dim = dim, metric=metric, data_type=data_type) + print("\ndisk bf_index index created") + + vectors_data = open_pickled_file(all_vectors_file_name) + for i, vector in enumerate(vectors_data[:num_vectors]): + index.add_vector(vector, i) + bf_index.add_vector(vector, i) + + print(f"disk hnsw index contains {index.index_size()} vectors") + print(f"disk bf index contains {bf_index.index_size()} vectors") + + print("Testing knn") + query_data = queries_data[0] + flat_labels, flat_distances = bf_index.knn_query(query_data, k=10) + hnsw_labels, hnsw_distances = index.knn_query(query_data, 10) + + fail = False + res = np.allclose(flat_labels, hnsw_labels, rtol=1e-5, atol=0) + if not res: + print("Testing knn labels not ok") + fail = True + print(f"hnsw labels = {hnsw_labels}, flat labels = {flat_labels}") + res = np.allclose(flat_distances, hnsw_distances, rtol=1e-5, atol=0) + if not res: + print("Testing knn dists not ok") + fail = True + print(f"hnsw distances = {hnsw_distances}, flat distances = {flat_distances}") + assert not fail + +def test_sanity(): + if (RUN_SANITY): + sanity_vecsim_mmap() + 
gt_vecsim_mmap() + gt_vecsim_mmap() + +def test_bm(): + if (RUN_BM): + bm() diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 5d55b497e..96f97a3cd 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -30,43 +30,46 @@ endif() include(${root}/cmake/x86_64InstructionFlags.cmake) -add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp unit_test_utils.cpp) -add_executable(test_hnsw_parallel test_hnsw_parallel.cpp ../utils/mock_thread_pool.cpp unit_test_utils.cpp) -add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp ../utils/mock_thread_pool.cpp unit_test_utils.cpp) -add_executable(test_allocator test_allocator.cpp ../utils/mock_thread_pool.cpp unit_test_utils.cpp) -add_executable(test_spaces test_spaces.cpp) -add_executable(test_types test_types.cpp) -add_executable(test_common ../utils/mock_thread_pool.cpp test_common.cpp unit_test_utils.cpp) -add_executable(test_components test_components.cpp ../utils/mock_thread_pool.cpp unit_test_utils.cpp) -add_executable(test_bf16 ../utils/mock_thread_pool.cpp test_bf16.cpp unit_test_utils.cpp) -add_executable(test_fp16 ../utils/mock_thread_pool.cpp test_fp16.cpp unit_test_utils.cpp) -add_executable(test_int8 ../utils/mock_thread_pool.cpp test_int8.cpp unit_test_utils.cpp) -add_executable(test_uint8 ../utils/mock_thread_pool.cpp test_uint8.cpp unit_test_utils.cpp) +add_executable(test_hnsw_mmap ../utils/mock_thread_pool.cpp test_hnsw_mmap.cpp unit_test_utils.cpp) +# add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp unit_test_utils.cpp) +# add_executable(test_hnsw_parallel test_hnsw_parallel.cpp ../utils/mock_thread_pool.cpp unit_test_utils.cpp) +# add_executable(test_bruteforce test_bruteforce.cpp test_bruteforce_multi.cpp ../utils/mock_thread_pool.cpp unit_test_utils.cpp) +# add_executable(test_allocator test_allocator.cpp 
../utils/mock_thread_pool.cpp unit_test_utils.cpp) +# add_executable(test_spaces test_spaces.cpp) +# add_executable(test_types test_types.cpp) +# add_executable(test_common ../utils/mock_thread_pool.cpp test_common.cpp unit_test_utils.cpp) +# add_executable(test_components test_components.cpp ../utils/mock_thread_pool.cpp unit_test_utils.cpp) +# add_executable(test_bf16 ../utils/mock_thread_pool.cpp test_bf16.cpp unit_test_utils.cpp) +# add_executable(test_fp16 ../utils/mock_thread_pool.cpp test_fp16.cpp unit_test_utils.cpp) +# add_executable(test_int8 ../utils/mock_thread_pool.cpp test_int8.cpp unit_test_utils.cpp) +# add_executable(test_uint8 ../utils/mock_thread_pool.cpp test_uint8.cpp unit_test_utils.cpp) -target_link_libraries(test_hnsw PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_hnsw_parallel PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_bruteforce PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_allocator PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_spaces PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_common PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_components PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_types PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_bf16 PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_fp16 PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_int8 PUBLIC gtest_main VectorSimilarity) -target_link_libraries(test_uint8 PUBLIC gtest_main VectorSimilarity) +target_link_libraries(test_hnsw_mmap PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_hnsw PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_hnsw_parallel PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_bruteforce PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_allocator PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_spaces 
PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_common PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_components PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_types PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_bf16 PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_fp16 PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_int8 PUBLIC gtest_main VectorSimilarity) +# target_link_libraries(test_uint8 PUBLIC gtest_main VectorSimilarity) include(GoogleTest) -gtest_discover_tests(test_hnsw) -gtest_discover_tests(test_hnsw_parallel) -gtest_discover_tests(test_bruteforce) -gtest_discover_tests(test_allocator) -gtest_discover_tests(test_spaces) -gtest_discover_tests(test_common) -gtest_discover_tests(test_components) -gtest_discover_tests(test_types) -gtest_discover_tests(test_bf16 TEST_PREFIX BF16UNIT_) -gtest_discover_tests(test_fp16 TEST_PREFIX FP16UNIT_) -gtest_discover_tests(test_int8 TEST_PREFIX INT8UNIT_) -gtest_discover_tests(test_uint8 TEST_PREFIX UINT8UNIT_) +gtest_discover_tests(test_hnsw_mmap) +# gtest_discover_tests(test_hnsw) +# gtest_discover_tests(test_hnsw_parallel) +# gtest_discover_tests(test_bruteforce) +# gtest_discover_tests(test_allocator) +# gtest_discover_tests(test_spaces) +# gtest_discover_tests(test_common) +# gtest_discover_tests(test_components) +# gtest_discover_tests(test_types) +# gtest_discover_tests(test_bf16 TEST_PREFIX BF16UNIT_) +# gtest_discover_tests(test_fp16 TEST_PREFIX FP16UNIT_) +# gtest_discover_tests(test_int8 TEST_PREFIX INT8UNIT_) +# gtest_discover_tests(test_uint8 TEST_PREFIX UINT8UNIT_) diff --git a/tests/unit/test_hnsw_mmap.cpp b/tests/unit/test_hnsw_mmap.cpp new file mode 100644 index 000000000..b3c13edb8 --- /dev/null +++ b/tests/unit/test_hnsw_mmap.cpp @@ -0,0 +1,1979 @@ +/* + *Copyright Redis Ltd. 
2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "gtest/gtest.h" +#include "VecSim/vec_sim.h" +#include "VecSim/vec_sim_debug.h" +#include "VecSim/algorithms/hnsw/hnsw_single.h" +#include "VecSim/index_factories/hnsw_factory.h" +#include "unit_test_utils.h" +// #include "VecSim/utils/serializer.h" +// #include "VecSim/query_result_definitions.h" +// #include +// #include +// #include +// #include + +template +class HNSWTest : public ::testing::Test { +public: + using data_t = typename index_type_t::data_t; + using dist_t = typename index_type_t::dist_t; + +protected: + VecSimIndex *CreateNewIndex(HNSWParams ¶ms, bool is_multi = false) { + return test_utils::CreateNewIndex(params, index_type_t::get_index_type(), is_multi); + } + VecSimIndex *CreateNewIndex(BFParams ¶ms, bool is_multi = false) { + return test_utils::CreateNewIndex(params, index_type_t::get_index_type(), is_multi); + } + HNSWIndex *CastToHNSW(VecSimIndex *index) { + return reinterpret_cast *>(index); + } + + HNSWIndex_Single *CastToHNSW_Single(VecSimIndex *index) { + return reinterpret_cast *>(index); + } +}; + +// DataTypeSet, TEST_DATA_T and TEST_DIST_T are defined in unit_test_utils.h + +TYPED_TEST_SUITE(HNSWTest, DataTypeSet); + +TYPED_TEST(HNSWTest, brute_force_vector_add_test) { + size_t n = 100; + size_t k = 11; + size_t dim = 4; + + BFParams params = {.dim = dim, .metric = VecSimMetric_L2}; + + VecSimIndex *index = this->CreateNewIndex(params); + + for (size_t i = 0; i < n; i++) { + GenerateAndAddVector(index, dim, i, i); + } + ASSERT_EQ(VecSimIndex_IndexSize(index), n); + + TEST_DATA_T query[] = {50, 50, 50, 50}; + auto verify_res = [&](size_t id, double score, size_t index) { ASSERT_EQ(id, (index + 45)); }; + runTopKSearchTest(index, query, k, verify_res, nullptr, BY_ID); + + VecSimIndex_Free(index); +} + +TYPED_TEST(HNSWTest, hnsw_vector_add_search_test) { + size_t dim = 4; + 
size_t bs = DEFAULT_BLOCK_SIZE; + size_t num_vec = bs * 2 + (DEFAULT_BLOCK_SIZE / 2); + size_t k = 11; + + HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = 16, .efConstruction = 200}; + + VecSimIndex *index = this->CreateNewIndex(params); + + ASSERT_EQ(VecSimIndex_IndexSize(index), 0); + for (size_t i = 0; i < num_vec; i++) { + GenerateAndAddVector(index, dim, i, i); + } + + TEST_DATA_T val = num_vec / 2; + TEST_DATA_T query[] = {val, val, val, val}; + auto verify_res = [&](size_t id, double score, size_t index) { + size_t diff_id = (id > val) ? (id - val) : (val - id); + ASSERT_EQ(diff_id, (index + 1) / 2); + ASSERT_EQ(score, (4 * ((index + 1) / 2) * ((index + 1) / 2))); + }; + runTopKSearchTest(index, query, k, verify_res); + runTopKSearchTest(index, query, 0, verify_res); // For sanity, search for nothing + k = num_vec; + runTopKSearchTest(index, query, k, verify_res); + + ASSERT_EQ(VecSimIndex_IndexSize(index), num_vec); + VecSimIndex_Free(index); +} + +TYPED_TEST(HNSWTest, hnsw_blob_sanity_test) { + size_t dim = 4; + size_t bs = 1; +#define ASSERT_HNSW_BLOB_EQ(id, blob) \ + do { \ + const void *v = hnsw_index->getDataByInternalId(id); \ + ASSERT_FALSE(memcmp(v, blob, sizeof(blob))); \ + } while (0) + + HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .blockSize = bs}; + + VecSimIndex *index = this->CreateNewIndex(params); + + ASSERT_EQ(VecSimIndex_IndexSize(index), 0); + + TEST_DATA_T a[dim], b[dim], c[dim], d[dim]; + for (size_t i = 0; i < dim; i++) { + a[i] = (TEST_DATA_T)0; + b[i] = (TEST_DATA_T)1; + c[i] = (TEST_DATA_T)2; + d[i] = (TEST_DATA_T)3; + } + HNSWIndex *hnsw_index = this->CastToHNSW(index); + + VecSimIndex_AddVector(index, a, 42); + ASSERT_EQ(VecSimIndex_IndexSize(index), 1); + ASSERT_HNSW_BLOB_EQ(0, a); + ASSERT_EQ(hnsw_index->getExternalLabel(0), 42); + + VecSimIndex_AddVector(index, b, 46); + ASSERT_EQ(VecSimIndex_IndexSize(index), 2); + ASSERT_HNSW_BLOB_EQ(1, b); + ASSERT_EQ(hnsw_index->getExternalLabel(1), 46); + + 
VecSimIndex_Free(index); +} + +/**** resizing cases ****/ + +// Add up to capacity. +TYPED_TEST(HNSWTest, resizeIndex) { + size_t dim = 4; + size_t n = 10; + size_t bs = 3; + + HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .blockSize = bs}; + + VecSimIndex *index = this->CreateNewIndex(params); + + // Add up to n. + for (size_t i = 0; i < n; i++) { + GenerateAndAddVector(index, dim, i, i); + } + + // Initial capacity is rounded up to the block size. + size_t extra_cap = n % bs == 0 ? 0 : bs - n % bs; + // The size (+extra) and the capacity should be equal. + ASSERT_EQ(index->indexCapacity(), VecSimIndex_IndexSize(index) + extra_cap); + // The capacity shouldn't be changed. + ASSERT_EQ(index->indexCapacity(), n + extra_cap); + + VecSimIndex_Free(index); +} + +// Test empty index edge cases. +TYPED_TEST(HNSWTest, emptyIndex) { + size_t dim = 4; + size_t bs = 6; + + HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .blockSize = bs}; + + VecSimIndex *index = this->CreateNewIndex(params); + + ASSERT_EQ(VecSimIndex_IndexSize(index), 0); + size_t curr_capacity = index->indexCapacity(); + + VecSimIndex_Free(index); +} + +TYPED_TEST(HNSWTest, hnsw_vector_search_test) { + size_t n = 100; + size_t k = 11; + size_t dim = 4; + + HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = 16, .efConstruction = 200}; + + VecSimIndex *index = this->CreateNewIndex(params); + + for (size_t i = 0; i < n; i++) { + GenerateAndAddVector(index, dim, i, i); + } + ASSERT_EQ(VecSimIndex_IndexSize(index), n); + + TEST_DATA_T query[] = {50, 50, 50, 50}; + auto verify_res = [&](size_t id, double score, size_t index) { + size_t diff_id = (id > 50) ? 
(id - 50) : (50 - id); + ASSERT_EQ(diff_id, (index + 1) / 2); + ASSERT_EQ(score, (4 * ((index + 1) / 2) * ((index + 1) / 2))); + }; + runTopKSearchTest(index, query, k, verify_res); + runTopKSearchTest(index, query, 0, verify_res); // For sanity, search for nothing + VecSimIndex_Free(index); +} + +TYPED_TEST(HNSWTest, hnsw_vector_search_by_id_test) { + size_t n = 100; + size_t dim = 4; + size_t k = 11; + + HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = 16, .efConstruction = 200}; + + VecSimIndex *index = this->CreateNewIndex(params); + + for (size_t i = 0; i < n; i++) { + GenerateAndAddVector(index, dim, i, i); + } + ASSERT_EQ(VecSimIndex_IndexSize(index), n); + + TEST_DATA_T query[] = {50, 50, 50, 50}; + auto verify_res = [&](size_t id, double score, size_t index) { ASSERT_EQ(id, (index + 45)); }; + runTopKSearchTest(index, query, k, verify_res, nullptr, BY_ID); + + VecSimIndex_Free(index); +} + +// TYPED_TEST(HNSWTest, hnsw_indexing_same_vector) { +// size_t n = 100; +// size_t dim = 4; +// size_t k = 10; + +// HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = 16, .efConstruction = 200}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// for (size_t i = 0; i < n; i++) { +// GenerateAndAddVector(index, dim, i, i / 10); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); + +// // Run a query where all the results are supposed to be {5,5,5,5} (different ids). 
+// TEST_DATA_T query[] = {4.9, 4.95, 5.05, 5.1}; +// auto verify_res = [&](size_t id, double score, size_t index) { +// ASSERT_TRUE(id >= 50 && id < 60 && score <= 1); +// }; +// runTopKSearchTest(index, query, k, verify_res); + +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, hnsw_reindexing_same_vector) { +// size_t n = 100; +// size_t dim = 4; +// size_t k = 10; + +// HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = 16, .efConstruction = 200}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// for (size_t i = 0; i < n; i++) { +// GenerateAndAddVector(index, dim, i, i / 10); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); + +// // Run a query where all the results are supposed to be {5,5,5,5} (different ids). +// TEST_DATA_T query[] = {4.9, 4.95, 5.05, 5.1}; +// auto verify_res = [&](size_t id, double score, size_t index) { +// ASSERT_TRUE(id >= 50 && id < 60 && score <= 1); +// }; +// runTopKSearchTest(index, query, k, verify_res); + +// for (size_t i = 0; i < n; i++) { +// VecSimIndex_DeleteVector(index, i); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), 0); + +// // Reinsert the same vectors under the same ids +// for (size_t i = 0; i < n; i++) { +// GenerateAndAddVector(index, dim, i, i / 10); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); + +// // Run the same query again +// runTopKSearchTest(index, query, k, verify_res); + +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, hnsw_reindexing_same_vector_different_id) { +// size_t n = 100; +// size_t dim = 4; +// size_t k = 10; + +// HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = 16, .efConstruction = 200}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// for (size_t i = 0; i < n; i++) { +// GenerateAndAddVector(index, dim, i, i / 10); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); + +// // Run a query where all the results are supposed to be {5,5,5,5} (different ids). 
+// TEST_DATA_T query[] = {4.9, 4.95, 5.05, 5.1}; +// auto verify_res = [&](size_t id, double score, size_t index) { +// ASSERT_TRUE(id >= 50 && id < 60 && score <= 1); +// }; +// runTopKSearchTest(index, query, k, verify_res); + +// for (size_t i = 0; i < n; i++) { +// VecSimIndex_DeleteVector(index, i); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), 0); + +// // Reinsert the same vectors under different ids than before +// for (size_t i = 0; i < n; i++) { +// GenerateAndAddVector(index, dim, i + 10, i / 10); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); + +// // Run the same query again +// auto verify_res_different_id = [&](int id, double score, size_t index) { +// ASSERT_TRUE(id >= 60 && id < 70 && score <= 1); +// }; +// runTopKSearchTest(index, query, k, verify_res_different_id); + +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, sanity_reinsert_1280) { +// size_t n = 5; +// size_t d = 1280; +// size_t k = 5; + +// HNSWParams params = {.dim = d, .metric = VecSimMetric_L2, .M = 16, .efConstruction = 200}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// auto *vectors = new TEST_DATA_T[n * d]; + +// // Generate random vectors in every iteration and inert them under different ids. +// for (size_t iter = 1; iter <= 3; iter++) { +// for (size_t i = 0; i < n; i++) { +// for (size_t j = 0; j < d; j++) { +// (vectors + i * d)[j] = (TEST_DATA_T)rand() / (TEST_DATA_T)(RAND_MAX) / 100; +// } +// } +// auto expected_ids = std::set(); +// for (size_t i = 0; i < n; i++) { +// VecSimIndex_AddVector(index, (vectors + i * d), i * iter); +// expected_ids.insert(i * iter); +// } +// auto verify_res = [&](size_t id, double score, size_t index) { +// ASSERT_TRUE(expected_ids.find(id) != expected_ids.end()); +// expected_ids.erase(id); +// }; + +// // Send arbitrary vector (the first) and search for top k. This should return all the +// // vectors that were inserted in this iteration - verify their ids. 
+// runTopKSearchTest(index, vectors, k, verify_res); + +// // Remove vectors form current iteration. +// for (size_t i = 0; i < n; i++) { +// VecSimIndex_DeleteVector(index, i * iter); +// } +// } +// delete[] vectors; +// VecSimIndex_Free(index); +// } + +TYPED_TEST(HNSWTest, test_hnsw_info) { + size_t n = 100; + size_t d = 128; + + // Build with default args + HNSWParams params = {.dim = d, .metric = VecSimMetric_L2}; + + VecSimIndex *index = this->CreateNewIndex(params); + + VecSimIndexInfo info = VecSimIndex_Info(index); + ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB); + ASSERT_EQ(info.commonInfo.basicInfo.dim, d); + // Default args. + ASSERT_FALSE(info.commonInfo.basicInfo.isMulti); + ASSERT_EQ(info.commonInfo.basicInfo.blockSize, DEFAULT_BLOCK_SIZE); + ASSERT_EQ(info.hnswInfo.M, HNSW_DEFAULT_M); + ASSERT_EQ(info.hnswInfo.efConstruction, HNSW_DEFAULT_EF_C); + ASSERT_EQ(info.hnswInfo.efRuntime, HNSW_DEFAULT_EF_RT); + ASSERT_DOUBLE_EQ(info.hnswInfo.epsilon, HNSW_DEFAULT_EPSILON); + ASSERT_EQ(info.commonInfo.basicInfo.type, params.type); + VecSimIndex_Free(index); + + d = 1280; + size_t bs = 42; + params.dim = d; + params.blockSize = bs, params.M = 200, params.efConstruction = 1000, params.efRuntime = 500, + params.epsilon = 0.005; + + index = this->CreateNewIndex(params); + info = VecSimIndex_Info(index); + ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB); + ASSERT_EQ(info.commonInfo.basicInfo.dim, d); + // User args. + ASSERT_FALSE(info.commonInfo.basicInfo.isMulti); + ASSERT_EQ(info.commonInfo.basicInfo.blockSize, bs); + ASSERT_EQ(info.hnswInfo.efConstruction, 1000); + ASSERT_EQ(info.hnswInfo.M, 200); + ASSERT_EQ(info.hnswInfo.efRuntime, 500); + ASSERT_EQ(info.hnswInfo.epsilon, 0.005); + ASSERT_EQ(info.commonInfo.basicInfo.type, params.type); + ASSERT_FALSE(info.commonInfo.basicInfo.isTiered); + + // Validate that Static info returns the right restricted info as well. 
+ VecSimIndexBasicInfo s_info = VecSimIndex_BasicInfo(index); + ASSERT_EQ(info.commonInfo.basicInfo.algo, s_info.algo); + ASSERT_EQ(info.commonInfo.basicInfo.dim, s_info.dim); + ASSERT_EQ(info.commonInfo.basicInfo.blockSize, s_info.blockSize); + ASSERT_EQ(info.commonInfo.basicInfo.type, s_info.type); + ASSERT_EQ(info.commonInfo.basicInfo.isMulti, s_info.isMulti); + ASSERT_EQ(info.commonInfo.basicInfo.type, s_info.type); + ASSERT_EQ(info.commonInfo.basicInfo.isTiered, s_info.isTiered); + + VecSimIndex_Free(index); +} + +TYPED_TEST(HNSWTest, test_basic_hnsw_info_iterator) { + size_t n = 100; + size_t d = 128; + + VecSimMetric metrics[3] = {VecSimMetric_Cosine, VecSimMetric_IP, VecSimMetric_L2}; + for (size_t i = 0; i < 3; i++) { + // Build with default args. + // Build with default args + HNSWParams params = {.dim = d, .metric = metrics[i]}; + + VecSimIndex *index = this->CreateNewIndex(params); + + VecSimIndexInfo info = VecSimIndex_Info(index); + VecSimInfoIterator *infoIter = VecSimIndex_InfoIterator(index); + compareHNSWIndexInfoToIterator(info, infoIter); + VecSimInfoIterator_Free(infoIter); + VecSimIndex_Free(index); + } +} + +TYPED_TEST(HNSWTest, test_dynamic_hnsw_info_iterator) { + size_t n = 100; + size_t d = 128; + + HNSWParams params = {.dim = d, + .metric = VecSimMetric_L2, + .M = 100, + .efConstruction = 250, + .efRuntime = 400, + .epsilon = 0.004}; + + VecSimIndex *index = this->CreateNewIndex(params); + + VecSimIndexInfo info = VecSimIndex_Info(index); + VecSimInfoIterator *infoIter = VecSimIndex_InfoIterator(index); + ASSERT_EQ(100, info.hnswInfo.M); + ASSERT_EQ(250, info.hnswInfo.efConstruction); + ASSERT_EQ(400, info.hnswInfo.efRuntime); + ASSERT_EQ(0.004, info.hnswInfo.epsilon); + ASSERT_EQ(0, info.commonInfo.indexSize); + ASSERT_EQ(-1, info.hnswInfo.max_level); + ASSERT_EQ(-1, info.hnswInfo.entrypoint); + ASSERT_EQ(params.type, info.commonInfo.basicInfo.type); + compareHNSWIndexInfoToIterator(info, infoIter); + VecSimInfoIterator_Free(infoIter); + 
+ TEST_DATA_T v[d]; + for (size_t i = 0; i < d; i++) { + v[i] = (TEST_DATA_T)i; + } + // Add vector. + VecSimIndex_AddVector(index, v, 1); + info = VecSimIndex_Info(index); + infoIter = VecSimIndex_InfoIterator(index); + ASSERT_EQ(1, info.commonInfo.indexSize); + ASSERT_EQ(1, info.hnswInfo.entrypoint); + ASSERT_GE(1, info.hnswInfo.max_level); + compareHNSWIndexInfoToIterator(info, infoIter); + VecSimInfoIterator_Free(infoIter); + + // Perform (or simulate) Search in all modes. + VecSimIndex_AddVector(index, v, 0); + auto res = VecSimIndex_TopKQuery(index, v, 1, nullptr, BY_SCORE); + VecSimQueryReply_Free(res); + info = VecSimIndex_Info(index); + infoIter = VecSimIndex_InfoIterator(index); + ASSERT_EQ(STANDARD_KNN, info.commonInfo.lastMode); + compareHNSWIndexInfoToIterator(info, infoIter); + VecSimInfoIterator_Free(infoIter); + + ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 1, 1, true)); + info = VecSimIndex_Info(index); + infoIter = VecSimIndex_InfoIterator(index); + ASSERT_EQ(HYBRID_ADHOC_BF, info.commonInfo.lastMode); + compareHNSWIndexInfoToIterator(info, infoIter); + VecSimInfoIterator_Free(infoIter); + + // Set the index size artificially so that BATCHES mode will be selected by the heuristics. + auto actual_element_count = this->CastToHNSW(index)->curElementCount; + this->CastToHNSW(index)->curElementCount = 1e6; + auto &label_lookup = this->CastToHNSW_Single(index)->labelLookup; + for (size_t i = 0; i < 1e6; i++) { + label_lookup[i] = i; + } + ASSERT_FALSE(VecSimIndex_PreferAdHocSearch(index, 10, 1, true)); + info = VecSimIndex_Info(index); + infoIter = VecSimIndex_InfoIterator(index); + ASSERT_EQ(HYBRID_BATCHES, info.commonInfo.lastMode); + compareHNSWIndexInfoToIterator(info, infoIter); + VecSimInfoIterator_Free(infoIter); + + // Simulate the case where another call to the heuristics is done after realizing that + // the subset size is smaller, and change the policy as a result. 
+ ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 1, 10, false)); + info = VecSimIndex_Info(index); + infoIter = VecSimIndex_InfoIterator(index); + ASSERT_EQ(HYBRID_BATCHES_TO_ADHOC_BF, info.commonInfo.lastMode); + compareHNSWIndexInfoToIterator(info, infoIter); + VecSimInfoIterator_Free(infoIter); + + this->CastToHNSW(index)->curElementCount = actual_element_count; + VecSimIndex_Free(index); +} +TYPED_TEST(HNSWTest, test_query_runtime_params_default_build_args) { + size_t n = 100; + size_t d = 4; + size_t k = 11; + + // Build with default args. + + HNSWParams params = {.dim = d, .metric = VecSimMetric_L2}; + + VecSimIndex *index = this->CreateNewIndex(params); + + for (size_t i = 0; i < n; i++) { + GenerateAndAddVector(index, d, i, i); + } + ASSERT_EQ(VecSimIndex_IndexSize(index), n); + + auto verify_res = [&](size_t id, double score, size_t index) { + size_t diff_id = (id > 50) ? (id - 50) : (50 - id); + ASSERT_EQ(diff_id, (index + 1) / 2); + ASSERT_EQ(score, (4 * ((index + 1) / 2) * ((index + 1) / 2))); + }; + TEST_DATA_T query[] = {50, 50, 50, 50}; + runTopKSearchTest(index, query, k, verify_res); + + VecSimIndexInfo info = VecSimIndex_Info(index); + // Check that default args did not change. + ASSERT_EQ(info.hnswInfo.M, HNSW_DEFAULT_M); + ASSERT_EQ(info.hnswInfo.efConstruction, HNSW_DEFAULT_EF_C); + ASSERT_EQ(info.hnswInfo.efRuntime, HNSW_DEFAULT_EF_RT); + + // Run same query again, set efRuntime to 300. + HNSWRuntimeParams hnswRuntimeParams = {.efRuntime = 300}; + VecSimQueryParams queryParams = CreateQueryParams(hnswRuntimeParams); + runTopKSearchTest(index, query, k, verify_res, &queryParams); + + info = VecSimIndex_Info(index); + // Check that default args did not change. 
+ ASSERT_EQ(info.hnswInfo.M, HNSW_DEFAULT_M); + ASSERT_EQ(info.hnswInfo.efConstruction, HNSW_DEFAULT_EF_C); + ASSERT_EQ(info.hnswInfo.efRuntime, HNSW_DEFAULT_EF_RT); + + VecSimIndex_Free(index); +} + +TYPED_TEST(HNSWTest, test_query_runtime_params_user_build_args) { + size_t n = 100; + size_t d = 4; + size_t M = 100; + size_t efConstruction = 300; + size_t efRuntime = 500; + + // Build with user args. + + HNSWParams params = {.dim = d, + .metric = VecSimMetric_L2, + .M = M, + .efConstruction = efConstruction, + .efRuntime = efRuntime}; + + VecSimIndex *index = this->CreateNewIndex(params); + + for (size_t i = 0; i < n; i++) { + GenerateAndAddVector(index, d, i, i); + } + ASSERT_EQ(VecSimIndex_IndexSize(index), n); + + auto verify_res = [&](size_t id, double score, size_t index) { + size_t diff_id = (id > 50) ? (id - 50) : (50 - id); + ASSERT_EQ(diff_id, (index + 1) / 2); + ASSERT_EQ(score, (4 * ((index + 1) / 2) * ((index + 1) / 2))); + }; + TEST_DATA_T query[] = {50, 50, 50, 50}; + + size_t k = 11; + runTopKSearchTest(index, query, k, verify_res); + + VecSimIndexInfo info = VecSimIndex_Info(index); + // Check that user args did not change. + ASSERT_EQ(info.hnswInfo.M, M); + ASSERT_EQ(info.hnswInfo.efConstruction, efConstruction); + ASSERT_EQ(info.hnswInfo.efRuntime, efRuntime); + + // Run same query again, set efRuntime to 300. + HNSWRuntimeParams hnswRuntimeParams = {.efRuntime = 300}; + VecSimQueryParams queryParams = CreateQueryParams(hnswRuntimeParams); + runTopKSearchTest(index, query, k, verify_res, &queryParams); + + info = VecSimIndex_Info(index); + // Check that user args did not change. 
+ ASSERT_EQ(info.hnswInfo.M, M); + ASSERT_EQ(info.hnswInfo.efConstruction, efConstruction); + ASSERT_EQ(info.hnswInfo.efRuntime, efRuntime); + + VecSimIndex_Free(index); +} + +TYPED_TEST(HNSWTest, hnsw_search_empty_index) { + size_t n = 100; + size_t k = 11; + size_t d = 4; + + HNSWParams params = {.dim = d, .metric = VecSimMetric_L2}; + + VecSimIndex *index = this->CreateNewIndex(params); + + ASSERT_EQ(VecSimIndex_IndexSize(index), 0); + + TEST_DATA_T query[] = {50, 50, 50, 50}; + + // We do not expect any results. + VecSimQueryReply *res = VecSimIndex_TopKQuery(index, query, k, NULL, BY_SCORE); + ASSERT_EQ(VecSimQueryReply_Len(res), 0); + VecSimQueryReply_Iterator *it = VecSimQueryReply_GetIterator(res); + ASSERT_EQ(VecSimQueryReply_IteratorNext(it), nullptr); + VecSimQueryReply_IteratorFree(it); + VecSimQueryReply_Free(res); + + VecSimIndex_Free(index); +} + +TYPED_TEST(HNSWTest, hnsw_test_inf_score) { + size_t n = 4; + size_t k = 4; + size_t dim = 2; + + HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2}; + + VecSimIndex *index = this->CreateNewIndex(params); + + TEST_DATA_T inf_val = GetInfVal(params.type); + ASSERT_FALSE(std::isinf(inf_val)); + + TEST_DATA_T query[] = {M_PI, M_PI}; + TEST_DATA_T v1[] = {M_PI, M_PI}; + TEST_DATA_T v2[] = {inf_val, inf_val}; + TEST_DATA_T v3[] = {M_E, M_E}; + TEST_DATA_T v4[] = {-inf_val, -inf_val}; + + VecSimIndex_AddVector(index, v1, 1); + VecSimIndex_AddVector(index, v2, 2); + VecSimIndex_AddVector(index, v3, 3); + VecSimIndex_AddVector(index, v4, 4); + ASSERT_EQ(VecSimIndex_IndexSize(index), 4); + + auto verify_res = [&](size_t id, double score, size_t index) { + if (index == 0) { + ASSERT_EQ(1, id); + } else if (index == 1) { + ASSERT_EQ(3, id); + } else { + ASSERT_TRUE(id == 2 || id == 4); + ASSERT_TRUE(std::isinf(score)); + } + }; + runTopKSearchTest(index, query, k, verify_res); + VecSimIndex_Free(index); +} + +// Tests VecSimIndex_New failure on bad M parameter. Should return null. 
+TYPED_TEST(HNSWTest, hnsw_bad_params) {
+    // Index creation is expected to fail (return NULL) for each of the M values below.
+    size_t dim = 10000000;
+    size_t bad_M[] = {
+        1,          // Will fail because 1/log(M).
+        100000000,  // Will fail on M * 2 overflow.
+        UINT16_MAX, // Will fail on M * 2 overflow.
+    };
+
+    for (size_t M : bad_M) {
+
+        HNSWParams params = {.dim = dim,
+                             .metric = VecSimMetric_L2,
+                             .M = M,
+                             .efConstruction = 250,
+                             .efRuntime = 400,
+                             .epsilon = 0.004};
+
+        VecSimIndex *index = this->CreateNewIndex(params);
+
+        ASSERT_TRUE(index == NULL) << "Failed on M=" << M;
+    }
+}
+
+// TYPED_TEST(HNSWTest, hnsw_delete_entry_point) {
+//     size_t n = 10000;
+//     size_t dim = 4;
+//     size_t M = 2;
+
+//     HNSWParams params = {
+//         .dim = dim, .metric = VecSimMetric_L2, .M = M, .efConstruction = 0, .efRuntime = 0};
+
+//     VecSimIndex *index = this->CreateNewIndex(params);
+
+//     ASSERT_TRUE(index != NULL);
+
+//     int64_t vec[dim];
+//     for (size_t i = 0; i < dim; i++)
+//         vec[i] = i;
+//     for (size_t j = 0; j < n; j++)
+//         VecSimIndex_AddVector(index, vec, j);
+
+//     VecSimIndexInfo info = VecSimIndex_Info(index);
+
+//     while (info.commonInfo.indexSize > 0) {
+//         ASSERT_NO_THROW(VecSimIndex_DeleteVector(index, info.hnswInfo.entrypoint));
+//         info = VecSimIndex_Info(index);
+//     }
+//     VecSimIndex_Free(index);
+// }
+
+// TYPED_TEST(HNSWTest, hnsw_override) {
+//     size_t n = 100;
+//     size_t dim = 4;
+//     size_t M = 8;
+//     size_t ef = 300;
+
+//     HNSWParams params = {
+//         .dim = dim, .metric = VecSimMetric_L2, .M = M, .efConstruction = 20, .efRuntime = ef};
+
+//     VecSimIndex *index = this->CreateNewIndex(params);
+
+//     ASSERT_TRUE(index != nullptr);
+
+//     // Insert n == 100 vectors.
+//     for (size_t i = 0; i < n; i++) {
+//         GenerateAndAddVector(index, dim, i, i);
+//     }
+//     ASSERT_EQ(VecSimIndex_IndexSize(index), n);
+
+//     // Insert again 300 vectors, the first 100 will be overwritten (deleted first).
+// n = 300; +// for (size_t i = 0; i < n; i++) { +// GenerateAndAddVector(index, dim, i, i); +// } + +// TEST_DATA_T query[dim]; +// GenerateVector(query, dim, n); +// // This is testing a bug fix - before we had the seconder sorting by id in CompareByFirst, +// // the graph got disconnected due to the deletion of some node followed by a bad repairing of +// // one of its neighbours. Here, we ensure that we get all the nodes in the graph as results. +// auto verify_res = [&](size_t id, double score, size_t index) { +// ASSERT_TRUE(id == n - 1 - index); +// }; +// runTopKSearchTest(index, query, 300, verify_res); + +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, hnsw_batch_iterator_basic) { +// size_t dim = 4; +// size_t M = 8; +// size_t ef = 20; +// size_t n = 1000; + +// HNSWParams params = { +// .dim = dim, .metric = VecSimMetric_L2, .M = M, .efConstruction = ef, .efRuntime = ef}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// // For every i, add the vector (i,i,i,i) under the label i. +// for (size_t i = 0; i < n; i++) { +// GenerateAndAddVector(index, dim, i, i); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); + +// // Query for (n,n,n,n) vector (recall that n-1 is the largest id in te index). +// TEST_DATA_T query[dim]; +// GenerateVector(query, dim, n); + +// VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr); +// size_t iteration_num = 0; + +// // Get the 5 vectors whose ids are the maximal among those that hasn't been returned yet +// // in every iteration. The results order should be sorted by their score (distance from the +// // query vector), which means sorted from the largest id to the lowest. 
+// size_t n_res = 5; +// while (VecSimBatchIterator_HasNext(batchIterator)) { +// std::vector expected_ids(n_res); +// for (size_t i = 0; i < n_res; i++) { +// expected_ids[i] = (n - iteration_num * n_res - i - 1); +// } +// auto verify_res = [&](size_t id, double score, size_t index) { +// ASSERT_TRUE(expected_ids[index] == id); +// }; +// runBatchIteratorSearchTest(batchIterator, n_res, verify_res); +// iteration_num++; +// } +// ASSERT_EQ(iteration_num, n / n_res); +// VecSimBatchIterator_Free(batchIterator); + +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, hnsw_batch_iterator_reset) { +// size_t dim = 4; +// size_t n = 1000; +// size_t M = 8; +// size_t ef = 20; + +// HNSWParams params = { +// .dim = dim, .metric = VecSimMetric_L2, .M = M, .efConstruction = ef, .efRuntime = ef}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// for (size_t i = 0; i < n; i++) { +// GenerateAndAddVector(index, dim, i, i); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); + +// // Query for (n,n,n,n) vector (recall that n-1 is the largest id in te index). +// TEST_DATA_T query[dim]; +// GenerateVector(query, dim, n); + +// VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr); + +// // Get the 100 vectors whose ids are the maximal among those that hasn't been returned yet, +// in +// // every iteration. Run this flow for 3 times, and reset the iterator. 
+// size_t n_res = 100; +// size_t re_runs = 3; + +// for (size_t take = 0; take < re_runs; take++) { +// size_t iteration_num = 0; +// while (VecSimBatchIterator_HasNext(batchIterator)) { +// std::vector expected_ids(n_res); +// for (size_t i = 0; i < n_res; i++) { +// expected_ids[i] = (n - iteration_num * n_res - i - 1); +// } +// auto verify_res = [&](size_t id, double score, size_t index) { +// ASSERT_TRUE(expected_ids[index] == id); +// }; +// runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_SCORE); +// iteration_num++; +// } +// ASSERT_EQ(iteration_num, n / n_res); +// VecSimBatchIterator_Reset(batchIterator); +// } +// VecSimBatchIterator_Free(batchIterator); +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, hnsw_batch_iterator_batch_size_1) { +// size_t dim = 4; +// size_t n = 1000; +// size_t M = 8; +// size_t ef = 2; + +// HNSWParams params = { +// .dim = dim, .metric = VecSimMetric_L2, .M = M, .efConstruction = ef, .efRuntime = ef}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// for (size_t i = 0; i < n; i++) { +// // Set labels to be different than the internal ids. +// GenerateAndAddVector(index, dim, n - i, i); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); + +// TEST_DATA_T query[dim]; +// GenerateVector(query, dim, n); + +// VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr); +// size_t iteration_num = 0; +// size_t n_res = 1, expected_n_res = 1; +// while (VecSimBatchIterator_HasNext(batchIterator)) { +// iteration_num++; +// // Expect to get results in the reverse order of labels - which is the order of the +// distance +// // from the query vector. Get one result in every iteration. 
+// auto verify_res = [&](size_t id, double score, size_t index) { +// ASSERT_TRUE(id == iteration_num); +// }; +// runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_SCORE, expected_n_res); +// } + +// ASSERT_EQ(iteration_num, n); +// VecSimBatchIterator_Free(batchIterator); +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, hnsw_batch_iterator_advanced) { +// size_t dim = 4; +// size_t n = 500; +// size_t M = 8; +// size_t ef = n; + +// HNSWParams params = { +// .dim = dim, .metric = VecSimMetric_L2, .M = M, .efConstruction = ef, .efRuntime = ef}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// TEST_DATA_T query[dim]; +// GenerateVector(query, dim, n); +// VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr); + +// // Try to get results even though there are no vectors in the index. +// VecSimQueryReply *res = VecSimBatchIterator_Next(batchIterator, 10, BY_SCORE); +// ASSERT_EQ(VecSimQueryReply_Len(res), 0); +// VecSimQueryReply_Free(res); +// ASSERT_FALSE(VecSimBatchIterator_HasNext(batchIterator)); +// VecSimBatchIterator_Free(batchIterator); + +// // Insert one vector and query again. The internal id will be 0. +// VecSimIndex_AddVector(index, query, n); +// batchIterator = VecSimBatchIterator_New(index, query, nullptr); +// res = VecSimBatchIterator_Next(batchIterator, 10, BY_SCORE); +// ASSERT_EQ(VecSimQueryReply_Len(res), 1); +// VecSimQueryReply_Free(res); +// ASSERT_FALSE(VecSimBatchIterator_HasNext(batchIterator)); +// VecSimBatchIterator_Free(batchIterator); + +// // Insert vectors to the index and re-create the batch iterator. +// for (size_t i = 1; i < n; i++) { +// GenerateAndAddVector(index, dim, i, i); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); +// batchIterator = VecSimBatchIterator_New(index, query, nullptr); + +// // Try to get 0 results. 
+// res = VecSimBatchIterator_Next(batchIterator, 0, BY_SCORE); +// ASSERT_EQ(VecSimQueryReply_Len(res), 0); +// VecSimQueryReply_Free(res); + +// // n_res does not divide into ef or vice versa - expect leftovers between the graph scans. +// size_t n_res = 7; +// size_t iteration_num = 0; + +// while (VecSimBatchIterator_HasNext(batchIterator)) { +// iteration_num++; +// std::vector expected_ids; +// // We ask to get the results sorted by ID in a specific batch (in ascending order), but +// // in every iteration the ids should be lower than the previous one, according to the +// // distance from the query. +// for (size_t i = 1; i <= n_res; i++) { +// expected_ids.push_back(n - iteration_num * n_res + i); +// } +// auto verify_res = [&](size_t id, double score, size_t index) { +// ASSERT_TRUE(expected_ids[index] == id); +// }; +// if (iteration_num <= n / n_res) { +// runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_ID); +// } else { +// // In the last iteration there are n%n_res results left to return. +// // remove the first ids that aren't going to be returned since we pass the index +// size. for (size_t i = 0; i < n_res - n % n_res; i++) { +// expected_ids.erase(expected_ids.begin()); +// } +// runBatchIteratorSearchTest(batchIterator, n_res, verify_res, BY_ID, n % n_res); +// } +// } +// ASSERT_EQ(iteration_num, n / n_res + 1); +// // Try to get more results even though there are no. 
+//     res = VecSimBatchIterator_Next(batchIterator, 1, BY_SCORE);
+//     ASSERT_EQ(VecSimQueryReply_Len(res), 0);
+//     VecSimQueryReply_Free(res);
+
+//     VecSimBatchIterator_Free(batchIterator);
+//     VecSimIndex_Free(index);
+// }
+
+// Exercises VecSimIndex_ResolveParams with the "ef_runtime" raw parameter: empty input, a valid
+// value, unknown / partial names, malformed values, a query type that rejects it, duplicates,
+// and its interaction with the hybrid-policy parameters.
+TYPED_TEST(HNSWTest, hnsw_resolve_ef_runtime_params) {
+    size_t dim = 4;
+    size_t M = 8;
+    size_t ef = 2;
+
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .M = M, .efConstruction = ef, .efRuntime = ef};
+
+    VecSimIndex *index = this->CreateNewIndex(params);
+
+    // `zero` is an all-zero reference used to verify that resolving nothing leaves qparams zeroed.
+    VecSimQueryParams qparams, zero;
+    bzero(&zero, sizeof(VecSimQueryParams));
+
+    std::vector<VecSimRawParam> rparams;
+
+    // Test with empty runtime params.
+    for (VecsimQueryType query_type : test_utils::query_types) {
+        ASSERT_EQ(
+            VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams, query_type),
+            VecSim_OK);
+    }
+    ASSERT_EQ(memcmp(&qparams, &zero, sizeof(VecSimQueryParams)), 0);
+
+    rparams.push_back(VecSimRawParam{"ef_runtime", strlen("ef_runtime"), "100", strlen("100")});
+
+    ASSERT_EQ(
+        VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams, QUERY_TYPE_KNN),
+        VecSim_OK);
+    ASSERT_EQ(qparams.hnswRuntimeParams.efRuntime, 100);
+
+    rparams[0] = (VecSimRawParam){.name = "wrong_name", .nameLen = 10, .value = "100", .valLen = 3};
+    ASSERT_EQ(
+        VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams, QUERY_TYPE_NONE),
+        VecSimParamResolverErr_UnknownParam);
+
+    // Testing for legal prefix but only partial parameter name.
+    rparams[0] = (VecSimRawParam){.name = "ef_run", .nameLen = 6, .value = "100", .valLen = 3};
+    ASSERT_EQ(
+        VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams, QUERY_TYPE_NONE),
+        VecSimParamResolverErr_UnknownParam);
+
+    // Non-numeric value.
+    rparams[0] =
+        (VecSimRawParam){.name = "ef_runtime", .nameLen = 10, .value = "wrong_val", .valLen = 9};
+    ASSERT_EQ(
+        VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams, QUERY_TYPE_KNN),
+        VecSimParamResolverErr_BadValue);
+
+    // ef_runtime is reported as an unknown parameter for range queries.
+    rparams[0] = (VecSimRawParam){.name = "ef_runtime", .nameLen = 10, .value = "100", .valLen = 3};
+    ASSERT_EQ(VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams,
+                                        QUERY_TYPE_RANGE),
+              VecSimParamResolverErr_UnknownParam);
+
+    // Negative value.
+    rparams[0] = (VecSimRawParam){.name = "ef_runtime", .nameLen = 10, .value = "-30", .valLen = 3};
+    ASSERT_EQ(
+        VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams, QUERY_TYPE_KNN),
+        VecSimParamResolverErr_BadValue);
+
+    // Non-integer value.
+    rparams[0] =
+        (VecSimRawParam){.name = "ef_runtime", .nameLen = 10, .value = "1.618", .valLen = 5};
+    ASSERT_EQ(
+        VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams, QUERY_TYPE_KNN),
+        VecSimParamResolverErr_BadValue);
+
+    // The same parameter given twice.
+    rparams[0] = (VecSimRawParam){.name = "ef_runtime", .nameLen = 10, .value = "100", .valLen = 3};
+    rparams.push_back(
+        (VecSimRawParam){.name = "ef_runtime", .nameLen = 10, .value = "100", .valLen = 3});
+    ASSERT_EQ(
+        VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams, QUERY_TYPE_KNN),
+        VecSimParamResolverErr_AlreadySet);
+
+    /** Testing with hybrid query params - cases which are only relevant for HNSW index. **/
+    // Cannot set ef_runtime param with "hybrid_policy" which is "ADHOC_BF"
+    rparams[1] = (VecSimRawParam){.name = "HYBRID_POLICY",
+                                  .nameLen = strlen("HYBRID_POLICY"),
+                                  .value = "ADHOC_BF",
+                                  .valLen = strlen("ADHOC_BF")};
+    ASSERT_EQ(VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams,
+                                        QUERY_TYPE_HYBRID),
+              VecSimParamResolverErr_InvalidPolicy_AdHoc_With_EfRuntime);
+
+    rparams[1] = (VecSimRawParam){.name = "HYBRID_POLICY",
+                                  .nameLen = strlen("HYBRID_POLICY"),
+                                  .value = "BATCHES",
+                                  .valLen = strlen("BATCHES")};
+    rparams.push_back((VecSimRawParam){.name = "batch_size",
+                                       .nameLen = strlen("batch_size"),
+                                       .value = "50",
+                                       .valLen = strlen("50")});
+    ASSERT_EQ(VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams,
+                                        QUERY_TYPE_HYBRID),
+              VecSim_OK);
+    ASSERT_EQ(qparams.searchMode, HYBRID_BATCHES);
+    ASSERT_EQ(qparams.batchSize, 50);
+    ASSERT_EQ(qparams.hnswRuntimeParams.efRuntime, 100);
+
+    VecSimIndex_Free(index);
+}
+
+// Exercises VecSimIndex_ResolveParams with the "epsilon" raw parameter: it is accepted only for
+// range queries, and unknown / partial names, malformed values and duplicates are rejected.
+TYPED_TEST(HNSWTest, hnsw_resolve_epsilon_runtime_params) {
+    size_t dim = 4;
+    size_t M = 8;
+    size_t ef = 2;
+
+    HNSWParams params = {
+        .dim = dim, .metric = VecSimMetric_L2, .M = M, .efConstruction = ef, .efRuntime = ef};
+
+    VecSimIndex *index = this->CreateNewIndex(params);
+
+    VecSimQueryParams qparams;
+
+    std::vector<VecSimRawParam> rparams;
+
+    rparams.push_back((VecSimRawParam){.name = "epsilon",
+                                       .nameLen = strlen("epsilon"),
+                                       .value = "0.001",
+                                       .valLen = strlen("0.001")});
+
+    // epsilon is rejected for every query type other than range.
+    for (VecsimQueryType query_type : {QUERY_TYPE_NONE, QUERY_TYPE_KNN, QUERY_TYPE_HYBRID}) {
+        ASSERT_EQ(
+            VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams, query_type),
+            VecSimParamResolverErr_InvalidPolicy_NRange);
+    }
+
+    ASSERT_EQ(VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams,
+                                        QUERY_TYPE_RANGE),
+              VecSim_OK);
+    ASSERT_FLOAT_EQ(qparams.hnswRuntimeParams.epsilon, 0.001);
+
+    rparams[0] = (VecSimRawParam){.name = "wrong_name",
+                                  .nameLen = strlen("wrong_name"),
+                                  .value = "0.001",
+                                  .valLen = strlen("0.001")};
+    ASSERT_EQ(VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams,
+                                        QUERY_TYPE_RANGE),
+              VecSimParamResolverErr_UnknownParam);
+
+    // Testing for legal prefix but only partial parameter name.
+    rparams[0] = (VecSimRawParam){
+        .name = "epsi", .nameLen = strlen("epsi"), .value = "0.001", .valLen = strlen("0.001")};
+    ASSERT_EQ(
+        VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams, QUERY_TYPE_NONE),
+        VecSimParamResolverErr_UnknownParam);
+
+    // Non-numeric value.
+    rparams[0] = (VecSimRawParam){
+        .name = "epsilon", .nameLen = strlen("epsilon"), .value = "wrong_val", .valLen = 9};
+    ASSERT_EQ(VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams,
+                                        QUERY_TYPE_RANGE),
+              VecSimParamResolverErr_BadValue);
+
+    // Negative value.
+    rparams[0] = (VecSimRawParam){
+        .name = "epsilon", .nameLen = strlen("epsilon"), .value = "-30", .valLen = 3};
+    ASSERT_EQ(VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams,
+                                        QUERY_TYPE_RANGE),
+              VecSimParamResolverErr_BadValue);
+
+    // The same parameter given twice.
+    rparams[0] = (VecSimRawParam){.name = "epsilon",
+                                  .nameLen = strlen("epsilon"),
+                                  .value = "0.001",
+                                  .valLen = strlen("0.001")};
+    rparams.push_back((VecSimRawParam){.name = "epsilon",
+                                       .nameLen = strlen("epsilon"),
+                                       .value = "0.001",
+                                       .valLen = strlen("0.001")});
+    ASSERT_EQ(VecSimIndex_ResolveParams(index, rparams.data(), rparams.size(), &qparams,
+                                        QUERY_TYPE_RANGE),
+              VecSimParamResolverErr_AlreadySet);
+
+    VecSimIndex_Free(index);
+}
+
+// Creates one index per metric (the metric enum values 0..2 are used as array indices), adds the
+// same four vectors to each, and compares VecSimIndex_GetDistanceFrom_Unsafe against
+// precomputed distances. Unknown labels are expected to yield NaN.
+TYPED_TEST(HNSWTest, hnsw_get_distance) {
+    size_t n = 4;
+    size_t dim = 2;
+    // constexpr so that the array below is a regular array rather than a variable-length one.
+    constexpr size_t numIndex = 3;
+    VecSimIndex *index[numIndex];
+    std::vector<double> distances;
+
+    TEST_DATA_T v1[] = {M_PI, M_PI};
+    TEST_DATA_T v2[] = {M_E, M_E};
+    TEST_DATA_T v3[] = {M_PI, M_E};
+    TEST_DATA_T v4[] = {M_SQRT2, -M_SQRT2};
+
+    HNSWParams params = {.dim = dim};
+
+    for (size_t i = 0; i < numIndex; i++) {
+        params.metric = (VecSimMetric)i;
+        index[i] = this->CreateNewIndex(params);
+        VecSimIndex_AddVector(index[i], v1, 1);
+        VecSimIndex_AddVector(index[i], v2, 2);
+        VecSimIndex_AddVector(index[i], v3, 3);
+        VecSimIndex_AddVector(index[i], v4, 4);
+        ASSERT_EQ(VecSimIndex_IndexSize(index[i]), 4);
+    }
+
+    TEST_DATA_T *query = v1;
+    // `norm` aliases v2, which is normalized in place only after it was added to the indexes.
+    TEST_DATA_T *norm = v2;                   // {e, e}
+    VecSim_Normalize(norm, dim, params.type); // now {1/sqrt(2), 1/sqrt(2)}
+    ASSERT_FLOAT_EQ(norm[0], 1.0 / sqrt(2.0));
+    ASSERT_FLOAT_EQ(norm[1], 1.0 / sqrt(2.0));
+    double dist;
+
+    // distances array values were calculated locally for fp32 vectors
+    // using VecSim library.
+
+    // VecSimMetric_L2
+    distances = {0, 0.3583844006061554, 0.1791922003030777, 23.739208221435547};
+    for (size_t i = 0; i < n; i++) {
+        dist = VecSimIndex_GetDistanceFrom_Unsafe(index[VecSimMetric_L2], i + 1, query);
+        ASSERT_NEAR(dist, distances[i], 1e-5);
+    }
+
+    // VecSimMetric_IP
+    distances = {-18.73921012878418, -16.0794677734375, -17.409339904785156, 1};
+    for (size_t i = 0; i < n; i++) {
+        dist = VecSimIndex_GetDistanceFrom_Unsafe(index[VecSimMetric_IP], i + 1, query);
+        ASSERT_NEAR(dist, distances[i], 1e-5);
+    }
+
+    // VecSimMetric_Cosine
+    distances = {5.9604644775390625e-08, 5.9604644775390625e-08, 0.0025991201400756836, 1};
+    for (size_t i = 0; i < n; i++) {
+        dist = VecSimIndex_GetDistanceFrom_Unsafe(index[VecSimMetric_Cosine], i + 1, norm);
+        ASSERT_NEAR(dist, distances[i], 1e-5);
+    }
+
+    // Bad values
+    dist = VecSimIndex_GetDistanceFrom_Unsafe(index[VecSimMetric_Cosine], 0, norm);
+    ASSERT_TRUE(std::isnan(dist));
+    dist = VecSimIndex_GetDistanceFrom_Unsafe(index[VecSimMetric_L2], 46, query);
+    ASSERT_TRUE(std::isnan(dist));
+
+    // Clean-up.
+    for (size_t i = 0; i < numIndex; i++) {
+        VecSimIndex_Free(index[i]);
+    }
+}
+
+// Checks the "prefer ad-hoc search" heuristic against a table of expected answers, faking the
+// index size instead of really inserting vectors, and then checks two empty-index corner cases.
+TYPED_TEST(HNSWTest, preferAdHocOptimization) {
+    // Save the expected result for every combination that represent a different leaf in the tree.
+    // map: [k, index_size, dim, M, r] -> res
+    // The key holds floats so that the ratio `r` can be stored next to the integral values.
+    std::map<std::vector<float>, bool> combinations;
+    combinations[{5, 1000, 5, 5, 0.5}] = true;
+    combinations[{5, 6000, 5, 5, 0.1}] = true;
+    combinations[{5, 6000, 5, 5, 0.2}] = false;
+    combinations[{5, 6000, 60, 5, 0.5}] = false;
+    combinations[{5, 6000, 60, 15, 0.5}] = true;
+    combinations[{15, 6000, 50, 5, 0.5}] = true;
+    combinations[{5, 700000, 60, 5, 0.05}] = true;
+    combinations[{5, 800000, 60, 5, 0.05}] = false;
+    combinations[{10, 800000, 60, 5, 0.01}] = true;
+    combinations[{10, 800000, 60, 5, 0.05}] = false;
+    combinations[{10, 800000, 60, 5, 0.1}] = false;
+    combinations[{10, 60000, 100, 5, 0.1}] = true;
+    combinations[{10, 80000, 100, 5, 0.1}] = false;
+    combinations[{10, 60000, 100, 60, 0.1}] = true;
+    combinations[{10, 60000, 100, 5, 0.3}] = false;
+    combinations[{20, 60000, 100, 5, 0.1}] = true;
+    combinations[{20, 60000, 100, 5, 0.2}] = false;
+    combinations[{20, 60000, 100, 20, 0.1}] = true;
+    combinations[{20, 350000, 100, 20, 0.1}] = true;
+    combinations[{20, 350000, 100, 20, 0.2}] = false;
+
+    for (auto &comb : combinations) {
+        auto k = (size_t)comb.first[0];
+        auto index_size = (size_t)comb.first[1];
+        auto dim = (size_t)comb.first[2];
+        auto M = (size_t)comb.first[3];
+        auto r = comb.first[4];
+
+        // Create index and check for the expected output of "prefer ad-hoc" heuristics.
+        HNSWParams params = {
+            .dim = dim, .metric = VecSimMetric_L2, .M = M, .efConstruction = 1, .efRuntime = 1};
+
+        VecSimIndex *index = this->CreateNewIndex(params);
+
+        // Set the index size artificially to be the required one.
+        this->CastToHNSW(index)->curElementCount = index_size;
+        for (size_t i = 0; i < index_size; i++) {
+            this->CastToHNSW_Single(index)->labelLookup[i] = i;
+        }
+        ASSERT_EQ(VecSimIndex_IndexSize(index), index_size);
+        bool res = VecSimIndex_PreferAdHocSearch(index, (size_t)(r * (float)index_size), k, true);
+        ASSERT_EQ(res, comb.second);
+        // Clean-up. The element count is reset before freeing since no vectors were really added.
+        this->CastToHNSW(index)->curElementCount = 0;
+        VecSimIndex_Free(index);
+    }
+
+    // Corner cases - empty index.
+
+    HNSWParams params = {.dim = 4, .metric = VecSimMetric_L2};
+
+    VecSimIndex *index = this->CreateNewIndex(params);
+
+    ASSERT_TRUE(VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
+
+    // Corner cases - subset size is greater than index size.
+    ASSERT_EQ(VecSimIndex_PreferAdHocSearch(index, 42, 50, true),
+              VecSimIndex_PreferAdHocSearch(index, 0, 50, true));
+
+    VecSimIndex_Free(index);
+}
+
+// Adds n vectors of the form (i/n, 1, 1, 1) under label i to a cosine index, queries with
+// (1, 1, 1, 1), and expects the top results to be the highest labels in descending order, with
+// scores equal to the distance from the normalized query.
+TYPED_TEST(HNSWTest, testCosine) {
+    // constexpr so that the arrays below are regular arrays rather than variable-length ones.
+    constexpr size_t dim = 4;
+    size_t n = 100;
+
+    HNSWParams params = {.dim = dim, .metric = VecSimMetric_Cosine};
+
+    VecSimIndex *index = this->CreateNewIndex(params);
+
+    for (size_t i = 1; i <= n; i++) {
+        TEST_DATA_T f[dim];
+        f[0] = (TEST_DATA_T)i / n;
+        for (size_t j = 1; j < dim; j++) {
+            f[j] = 1.0;
+        }
+        VecSimIndex_AddVector(index, f, i);
+    }
+    ASSERT_EQ(VecSimIndex_IndexSize(index), n);
+
+    TEST_DATA_T query[dim];
+    TEST_DATA_T normalized_query[dim];
+    GenerateVector(query, dim, 1.0);
+    GenerateVector(normalized_query, dim, 1.0);
+    VecSim_Normalize(normalized_query, dim, params.type);
+
+    auto verify_res = [&](size_t id, double score, size_t result_rank) {
+        ASSERT_EQ(id, (n - result_rank));
+        TEST_DATA_T expected_score = index->getDistanceFrom_Unsafe(id, normalized_query);
+        ASSERT_DOUBLE_EQ(score, expected_score);
+    };
+
+    runTopKSearchTest(index, query, 10, verify_res);
+
+    VecSimIndex_Free(index);
+}
+
+// TYPED_TEST(HNSWTest, testSizeEstimation) {
+//     size_t dim = 4;
+//     size_t n = 200;
+//     size_t bs = 256;
+//     size_t M = 64;
+
+//     HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .blockSize = bs, .M = M};
+
+//     VecSimIndex *index = this->CreateNewIndex(params);
+//     // EstimateInitialSize is called after CreateNewIndex because params struct is
+//     // changed in CreateNewIndex.
+//     size_t estimation = EstimateInitialSize(params);
+//     size_t actual = index->getAllocationSize();
+
+//     ASSERT_EQ(estimation, actual);
+
+//     // Fill the initial capacity + fill the last block.
+//     for (size_t i = 0; i < n; i++) {
+//         GenerateAndAddVector(index, dim, i);
+//     }
+
+//     idType cur = n;
+//     while (index->indexSize() % bs != 0) {
+//         GenerateAndAddVector(index, dim, cur++);
+//     }
+
+//     // Estimate the memory delta of adding a single vector that requires a full new block.
+//     estimation = EstimateElementSize(params) * bs;
+//     size_t before = index->getAllocationSize();
+//     GenerateAndAddVector(index, dim, bs, bs);
+//     actual = index->getAllocationSize() - before;
+
+//     // We check that the actual size is within 1% of the estimation.
+//     ASSERT_GE(estimation, actual * 0.99);
+//     ASSERT_LE(estimation, actual * 1.01);
+
+//     VecSimIndex_Free(index);
+// }
+
+// Installs a timeout callback that always reports a timeout and checks that top-k queries
+// return the TimedOut code with an empty reply - first on a single-level graph, then once the
+// graph has at least one level above 0. The callback is restored to "never times out" at the end.
+TYPED_TEST(HNSWTest, testTimeoutReturn) {
+    // constexpr so that `query` below is a regular array rather than a variable-length one.
+    constexpr size_t dim = 4;
+    VecSimQueryReply *rep;
+
+    HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .blockSize = 5};
+
+    VecSimIndex *index = this->CreateNewIndex(params);
+
+    GenerateAndAddVector(index, dim, 0, 1.0);
+
+    // Stays installed for every query below.
+    VecSim_SetTimeoutCallbackFunction([](void *ctx) { return 1; }); // Always times out
+
+    TEST_DATA_T query[dim];
+    GenerateVector(query, dim, 1.0);
+    // Checks return code on timeout.
+    rep = VecSimIndex_TopKQuery(index, query, 1, NULL, BY_ID);
+    ASSERT_EQ(VecSimQueryReply_GetCode(rep), VecSim_QueryReply_TimedOut);
+    ASSERT_EQ(VecSimQueryReply_Len(rep), 0);
+    VecSimQueryReply_Free(rep);
+
+    // Add a second vector; the graph is expected to still have a single level at this point.
+    GenerateAndAddVector(index, dim, 1, 1.0);
+    ASSERT_EQ(VecSimIndex_Info(index).hnswInfo.max_level, 0);
+
+    // Fail on searching bottom layer entry point.
+    // We need to have at least 1 vector in layer higher than 0 to fail there.
+    size_t next = 0;
+    while (VecSimIndex_Info(index).hnswInfo.max_level == 0) {
+        GenerateAndAddVector(index, dim, next, 1.0);
+        ++next;
+    }
+
+    rep = VecSimIndex_TopKQuery(index, query, 2, NULL, BY_ID);
+    ASSERT_EQ(VecSimQueryReply_GetCode(rep), VecSim_QueryReply_TimedOut);
+    ASSERT_EQ(VecSimQueryReply_Len(rep), 0);
+    VecSimQueryReply_Free(rep);
+
+    VecSimIndex_Free(index);
+    VecSim_SetTimeoutCallbackFunction([](void *ctx) { return 0; }); // Cleanup.
+}
+
+// TYPED_TEST(HNSWTest, testTimeoutReturn_batch_iterator) {
+//     size_t dim = 4;
+//     size_t n = 2;
+//     VecSimQueryReply *rep;
+
+//     HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2};
+
+//     VecSimIndex *index = this->CreateNewIndex(params);
+
+//     for (size_t i = 0; i < n; i++) {
+//         GenerateAndAddVector(index, dim, 46 - i, 1.0);
+//     }
+
+//     ASSERT_EQ(VecSimIndex_IndexSize(index), n);
+
+//     // Fail on second batch (after some calculation already completed in the first one).
+//     TEST_DATA_T query[dim];
+//     GenerateVector(query, dim, 1.0);
+//     VecSimBatchIterator *batchIterator = VecSimBatchIterator_New(index, query, nullptr);
+
+//     rep = VecSimBatchIterator_Next(batchIterator, 1, BY_ID);
+//     ASSERT_EQ(VecSimQueryReply_GetCode(rep), VecSim_QueryReply_OK);
+//     ASSERT_NE(VecSimQueryReply_Len(rep), 0);
+//     VecSimQueryReply_Free(rep);
+
+//     VecSim_SetTimeoutCallbackFunction([](void *ctx) { return 1; }); // Always times out.
+//     rep = VecSimBatchIterator_Next(batchIterator, 1, BY_ID);
+//     ASSERT_EQ(VecSimQueryReply_GetCode(rep), VecSim_QueryReply_TimedOut);
+//     ASSERT_EQ(VecSimQueryReply_Len(rep), 0);
+//     VecSimQueryReply_Free(rep);
+
+//     VecSimBatchIterator_Free(batchIterator);
+
+//     // Fail on first batch (while calculating).
+// auto timeoutcb = [](void *ctx) { +// static size_t flag = 1; +// if (flag) { +// flag = 0; +// return 0; +// } else { +// return 1; +// } +// }; +// VecSim_SetTimeoutCallbackFunction(timeoutcb); // Fails on second call. +// batchIterator = VecSimBatchIterator_New(index, query, nullptr); + +// rep = VecSimBatchIterator_Next(batchIterator, 2, BY_ID); +// ASSERT_EQ(VecSimQueryReply_GetCode(rep), VecSim_QueryReply_TimedOut); +// ASSERT_EQ(VecSimQueryReply_Len(rep), 0); +// VecSimQueryReply_Free(rep); + +// VecSimBatchIterator_Free(batchIterator); + +// // Fail on searching bottom layer entry point. +// // We need to have at least 1 vector in layer higher than 0 to fail there. +// size_t next = 0; +// while (VecSimIndex_Info(index).hnswInfo.max_level == 0) { +// GenerateAndAddVector(index, dim, next++, 1.0); +// } +// VecSim_SetTimeoutCallbackFunction([](void *ctx) { return 1; }); // Always times out. +// batchIterator = VecSimBatchIterator_New(index, query, nullptr); + +// rep = VecSimBatchIterator_Next(batchIterator, 2, BY_ID); +// ASSERT_EQ(VecSimQueryReply_GetCode(rep), VecSim_QueryReply_TimedOut); +// ASSERT_EQ(VecSimQueryReply_Len(rep), 0); +// VecSimQueryReply_Free(rep); + +// VecSimBatchIterator_Free(batchIterator); + +// VecSimIndex_Free(index); +// VecSim_SetTimeoutCallbackFunction([](void *ctx) { return 0; }); // Cleanup. +// } + +// TYPED_TEST(HNSWTest, rangeQuery) { +// size_t n = 5000; +// size_t dim = 4; + +// HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// for (size_t i = 0; i < n; i++) { +// GenerateAndAddVector(index, dim, i, i); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); + +// size_t pivot_id = n / 2; // the id to return vectors around it. 
+// TEST_DATA_T query[dim]; +// GenerateVector(query, dim, pivot_id); + +// auto verify_res_by_score = [&](size_t id, double score, size_t index) { +// ASSERT_EQ(std::abs(int(id - pivot_id)), (index + 1) / 2); +// ASSERT_EQ(score, dim * pow((index + 1) / 2, 2)); +// }; +// uint expected_num_results = 11; +// // To get 11 results in the range [pivot_id - 5, pivot_id + 5], set the radius as the L2 +// score +// // in the boundaries. +// double radius = dim * pow(expected_num_results / 2, 2); +// runRangeQueryTest(index, query, radius, verify_res_by_score, expected_num_results, BY_SCORE); + +// // Rerun with a given query params. This high epsilon value will cause the range search main +// // loop to break since we insert a candidate whose distance is within the dynamic range +// // boundaries at the beginning of the search, but when this candidate is popped out from the +// // queue, it's no longer within the dynamic range boundaries. +// HNSWRuntimeParams hnswRuntimeParams = {.epsilon = 1.0}; +// auto query_params = CreateQueryParams(hnswRuntimeParams); +// runRangeQueryTest(index, query, radius, verify_res_by_score, expected_num_results, BY_SCORE, +// &query_params); + +// // Get results by id. 
+// auto verify_res_by_id = [&](size_t id, double score, size_t index) { +// ASSERT_EQ(id, pivot_id - expected_num_results / 2 + index); +// ASSERT_EQ(score, dim * pow(std::abs(int(id - pivot_id)), 2)); +// }; +// runRangeQueryTest(index, query, radius, verify_res_by_id, expected_num_results); + +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, rangeQueryCosine) { +// size_t n = 800; +// size_t dim = 4; + +// HNSWParams params = {.dim = dim, .metric = VecSimMetric_Cosine}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// for (size_t i = 0; i < n; i++) { +// TEST_DATA_T f[dim]; +// f[0] = TEST_DATA_T(i + 1) / n; +// for (size_t j = 1; j < dim; j++) { +// f[j] = 1.0; +// } +// // Use as label := n - (internal id) +// VecSimIndex_AddVector(index, f, n - i); +// } + +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); +// TEST_DATA_T query[dim]; +// TEST_DATA_T normalized_query[dim]; +// for (size_t i = 0; i < dim; i++) { +// query[i] = 1.0; +// normalized_query[i] = 1.0; +// } + +// VecSim_Normalize(normalized_query, dim, params.type); +// auto verify_res = [&](size_t id, double score, size_t result_rank) { +// ASSERT_EQ(id, result_rank + 1); +// double expected_score = index->getDistanceFrom_Unsafe(id, normalized_query); +// ASSERT_EQ(score, expected_score); +// }; +// uint expected_num_results = 31; +// // Calculate the score of the 31st distant vector from the query vector (whose id should be +// 30) +// // to get the radius. +// double radius = index->getDistanceFrom_Unsafe(31, normalized_query); +// runRangeQueryTest(index, query, radius, verify_res, expected_num_results, BY_SCORE); + +// // Return results BY_ID should give the same results. 
+// runRangeQueryTest(index, query, radius, verify_res, expected_num_results, BY_ID); + +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, HNSWSerializationCurrentVersion) { + +// size_t dim = 4; +// size_t n = 1001; +// size_t n_labels[] = {n, 100}; +// size_t M = 8; +// size_t ef = 10; +// double epsilon = 0.004; +// bool is_multi[] = {false, true}; +// std::string multiToString[] = {"single", "multi_100labels"}; + +// HNSWParams params{.type = TypeParam::get_index_type(), +// .dim = dim, +// .metric = VecSimMetric_L2, +// .M = M, +// .efConstruction = ef, +// .efRuntime = ef, +// .epsilon = epsilon}; + +// // Test for multi and single + +// for (size_t i = 0; i < 2; ++i) { +// // Set index type. +// params.multi = is_multi[i]; + +// // Generate and add vectors to an index. +// VecSimIndex *index = this->CreateNewIndex(params, is_multi[i]); +// HNSWIndex *hnsw_index = this->CastToHNSW(index); + +// std::vector data(n * dim); +// std::mt19937 rng; +// rng.seed(47); +// std::uniform_real_distribution<> distrib; +// for (size_t i = 0; i < n * dim; ++i) { +// data[i] = (TEST_DATA_T)distrib(rng); +// } +// for (size_t j = 0; j < n; ++j) { +// VecSimIndex_AddVector(index, data.data() + dim * j, j % n_labels[i]); +// } + +// auto file_name = std::string(getenv("ROOT")) + "/tests/unit/1k-d4-L2-M8-ef_c10_" + +// VecSimType_ToString(TypeParam::get_index_type()) + "_" + +// multiToString[i] + +// ".hnsw_current_version"; + +// // Save the index with the default version (V4). +// hnsw_index->saveIndex(file_name); + +// // Fetch info after saving, as memory size change during saving. 
+// VecSimIndexInfo info = VecSimIndex_Info(index); +// ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB); +// ASSERT_EQ(info.hnswInfo.M, M); +// ASSERT_EQ(info.hnswInfo.efConstruction, ef); +// ASSERT_EQ(info.hnswInfo.efRuntime, ef); +// ASSERT_EQ(info.commonInfo.indexSize, n); +// ASSERT_EQ(info.commonInfo.basicInfo.metric, VecSimMetric_L2); +// ASSERT_EQ(info.commonInfo.basicInfo.type, TypeParam::get_index_type()); +// ASSERT_EQ(info.commonInfo.basicInfo.dim, dim); +// ASSERT_EQ(info.commonInfo.indexLabelCount, n_labels[i]); + +// VecSimIndex_Free(index); + +// // Load the index from the file. +// VecSimIndex *serialized_index = HNSWFactory::NewIndex(file_name); +// auto *serialized_hnsw_index = this->CastToHNSW(serialized_index); + +// // Verify that the index was loaded as expected. +// ASSERT_TRUE(serialized_hnsw_index->checkIntegrity().valid_state); +// ASSERT_EQ(serialized_hnsw_index->getVersion(), Serializer::EncodingVersion_V4); + +// VecSimIndexInfo info2 = VecSimIndex_Info(serialized_index); +// ASSERT_EQ(info2.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB); +// ASSERT_EQ(info2.hnswInfo.M, M); +// ASSERT_EQ(info2.commonInfo.basicInfo.isMulti, is_multi[i]); +// ASSERT_EQ(info2.commonInfo.basicInfo.blockSize, DEFAULT_BLOCK_SIZE); +// ASSERT_EQ(info2.hnswInfo.efConstruction, ef); +// ASSERT_EQ(info2.hnswInfo.efRuntime, ef); +// ASSERT_EQ(info2.commonInfo.indexSize, n); +// ASSERT_EQ(info2.commonInfo.basicInfo.metric, VecSimMetric_L2); +// ASSERT_EQ(info2.commonInfo.basicInfo.type, TypeParam::get_index_type()); +// ASSERT_EQ(info2.commonInfo.basicInfo.dim, dim); +// ASSERT_EQ(info2.commonInfo.indexLabelCount, n_labels[i]); +// ASSERT_EQ(info2.hnswInfo.epsilon, epsilon); + +// // Check the functionality of the loaded index. 
+ +// // Add and delete vector +// GenerateAndAddVector(serialized_index, dim, n); + +// VecSimIndex_DeleteVector(serialized_index, 1); + +// size_t n_per_label = n / n_labels[i]; +// ASSERT_TRUE(serialized_hnsw_index->checkIntegrity().valid_state); +// ASSERT_EQ(VecSimIndex_IndexSize(serialized_index), n + 1 - n_per_label); + +// // Clean up. +// remove(file_name.c_str()); +// VecSimIndex_Free(serialized_index); +// } +// } + +// TYPED_TEST(HNSWTest, HNSWSerializationV3) { +// if (TypeParam::get_index_type() != VecSimType_FLOAT32) { +// GTEST_SKIP(); +// } +// size_t dim = 4; +// size_t n = 1001; +// size_t n_labels[] = {n, 100}; +// size_t M = 8; +// size_t ef = 10; +// double epsilon = 0.004; +// size_t blockSize = 2; +// bool is_multi[] = {false, true}; +// std::string multiToString[] = {"single", "multi_100labels"}; + +// HNSWParams params{.type = TypeParam::get_index_type(), +// .dim = dim, +// .metric = VecSimMetric_L2, +// .blockSize = blockSize, +// .M = M, +// .efConstruction = ef, +// .efRuntime = ef, +// .epsilon = epsilon}; + +// // Test for multi and single + +// for (size_t i = 0; i < 2; ++i) { +// // Set index type. +// params.multi = is_multi[i]; +// auto file_name = std::string(getenv("ROOT")) + "/tests/unit/data/1k-d4-L2-M8-ef_c10_" + +// VecSimType_ToString(TypeParam::get_index_type()) + "_" + +// multiToString[i] + +// ".v3"; + +// // Load the index from the file. +// VecSimIndex *serialized_index = HNSWFactory::NewIndex(file_name); +// auto *serialized_hnsw_index = this->CastToHNSW(serialized_index); + +// // Verify that the index was loaded as expected. 
+// ASSERT_EQ(serialized_hnsw_index->getVersion(), Serializer::EncodingVersion_V3); +// ASSERT_TRUE(serialized_hnsw_index->checkIntegrity().valid_state); + +// VecSimIndexInfo info = VecSimIndex_Info(serialized_index); +// ASSERT_EQ(info.commonInfo.basicInfo.algo, VecSimAlgo_HNSWLIB); +// ASSERT_EQ(info.hnswInfo.M, M); +// ASSERT_EQ(info.commonInfo.basicInfo.isMulti, is_multi[i]); +// ASSERT_EQ(info.commonInfo.basicInfo.blockSize, blockSize); +// ASSERT_EQ(info.hnswInfo.efConstruction, ef); +// ASSERT_EQ(info.hnswInfo.efRuntime, ef); +// ASSERT_EQ(info.commonInfo.indexSize, n); +// ASSERT_EQ(info.commonInfo.basicInfo.metric, VecSimMetric_L2); +// ASSERT_EQ(info.commonInfo.basicInfo.type, TypeParam::get_index_type()); +// ASSERT_EQ(info.commonInfo.basicInfo.dim, dim); +// ASSERT_EQ(info.commonInfo.indexLabelCount, n_labels[i]); +// ASSERT_EQ(info.hnswInfo.epsilon, epsilon); + +// // Check the functionality of the loaded index. + +// // Add and delete vector +// GenerateAndAddVector(serialized_index, dim, n); + +// VecSimIndex_DeleteVector(serialized_index, 1); + +// size_t n_per_label = n / n_labels[i]; +// ASSERT_TRUE(serialized_hnsw_index->checkIntegrity().valid_state); +// ASSERT_EQ(VecSimIndex_IndexSize(serialized_index), n + 1 - n_per_label); + +// // Clean up. 
+// VecSimIndex_Free(serialized_index); +// } +// } + +// TYPED_TEST(HNSWTest, markDelete) { +// size_t n = 100; +// size_t k = 11; +// size_t dim = 4; +// VecSimBatchIterator *batchIterator; + +// HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2}; + +// VecSimIndex *index = this->CreateNewIndex(params); +// // Try marking and a non-existing label +// ASSERT_EQ(this->CastToHNSW(index)->markDelete(0), +// vecsim_stl::vector(index->getAllocator())); + +// for (size_t i = 0; i < n; i++) { +// GenerateAndAddVector(index, dim, i, i); +// } +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); +// TEST_DATA_T query[dim]; +// GenerateVector(query, dim, n / 2); + +// // Search for k results around the middle. expect to find them. +// auto verify_res = [&](size_t id, double score, size_t index) { +// size_t diff_id = (id > 50) ? (id - 50) : (50 - id); +// ASSERT_EQ(diff_id, (index + 1) / 2); +// ASSERT_EQ(score, (4 * ((index + 1) / 2) * ((index + 1) / 2))); +// }; +// runTopKSearchTest(index, query, k, verify_res); +// runRangeQueryTest(index, query, dim * k * k / 4 - 1, verify_res, k, BY_SCORE); +// batchIterator = VecSimBatchIterator_New(index, query, nullptr); +// runBatchIteratorSearchTest(batchIterator, k, verify_res); +// VecSimBatchIterator_Free(batchIterator); + +// unsigned char ep_reminder = index->info().hnswInfo.entrypoint % 2; +// // Mark as deleted half of the vectors, including the entrypoint. +// for (labelType label = 0; label < n; label++) +// if (label % 2 == ep_reminder) +// ASSERT_EQ(this->CastToHNSW(index)->markDelete(label), +// vecsim_stl::vector(1, label, index->getAllocator())); + +// ASSERT_EQ(this->CastToHNSW(index)->getNumMarkedDeleted(), n / 2); +// ASSERT_EQ(VecSimIndex_IndexSize(index), n); + +// // Search for k results around the middle. expect to find only even results. +// auto verify_res_half = [&](size_t id, double score, size_t index) { +// ASSERT_NE(id % 2, ep_reminder); +// size_t diff_id = (id > 50) ? 
(id - 50) : (50 - id); +// size_t expected_id = index % 2 ? index + 1 : index; +// ASSERT_EQ(diff_id, expected_id); +// ASSERT_EQ(score, (dim * expected_id * expected_id)); +// }; +// runTopKSearchTest(index, query, k, verify_res_half); +// runRangeQueryTest(index, query, dim * k * k - 1, verify_res_half, k, BY_SCORE); +// batchIterator = VecSimBatchIterator_New(index, query, nullptr); +// runBatchIteratorSearchTest(batchIterator, k, verify_res_half); +// VecSimBatchIterator_Free(batchIterator); + +// // Add a new vector, make sure it has no link to a deleted vector +// GenerateAndAddVector(index, dim, n, n); +// for (size_t level = 0; level <= +// this->CastToHNSW(index)->getGraphDataByInternalId(n)->toplevel; +// level++) { +// ElementLevelData &cur = this->CastToHNSW(index)->getElementLevelData(n, level); +// for (size_t idx = 0; idx < cur.numLinks; idx++) { +// ASSERT_TRUE(cur.links[idx] % 2 != ep_reminder) +// << "Got a link to " << cur.links[idx] << " on level " << level; +// } +// } + +// // Re-add the previously marked vectors (under new internal ids). +// for (labelType label = 0; label < n; label++) { +// if (label % 2 == ep_reminder) { +// GenerateAndAddVector(index, dim, label, label); +// } +// } + +// ASSERT_EQ(VecSimIndex_IndexSize(index), n + n / 2 + 1); +// ASSERT_EQ(this->CastToHNSW(index)->getNumMarkedDeleted(), n / 2); + +// // Search for k results around the middle again. expect to find the same results we +// // found in the first search. 
+// runTopKSearchTest(index, query, k, verify_res); +// runRangeQueryTest(index, query, dim * k * k / 4 - 1, verify_res, k, BY_SCORE); +// batchIterator = VecSimBatchIterator_New(index, query, nullptr); +// runBatchIteratorSearchTest(batchIterator, k, verify_res); +// VecSimBatchIterator_Free(batchIterator); + +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, allMarkedDeletedLevel) { +// size_t dim = 4; +// size_t M = 2; + +// HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = M}; + +// VecSimIndex *index = this->CreateNewIndex(params); + +// size_t num_multi_layered = 0; +// labelType max_id = 0; + +// // Add vectors to the index until we have 10 multi-layered vectors. +// do { +// GenerateAndAddVector(index, dim, max_id, max_id); +// if (this->CastToHNSW(index)->getGraphDataByInternalId(max_id)->toplevel > 0) { +// num_multi_layered++; +// } +// max_id++; +// } while (num_multi_layered < 10); + +// // Mark all vectors with multi-layers as deleted. +// for (labelType label = 0; label < max_id; label++) { +// if (this->CastToHNSW(index)->getGraphDataByInternalId(label)->toplevel > 0) { +// this->CastToHNSW(index)->markDelete(label); +// } +// } + +// size_t max_level = index->info().hnswInfo.max_level; + +// // Re-add a new vector until its level is equal to the max level of the index. +// do { +// GenerateAndAddVector(index, dim, max_id, max_id); +// } while (this->CastToHNSW(index)->getGraphDataByInternalId(max_id)->toplevel < max_level); + +// // If we passed the previous loop, it means that we successfully added a vector without +// invalid +// // memory access. + +// // For completeness, we also check index integrity. 
+// ASSERT_TRUE(this->CastToHNSW(index)->checkIntegrity().valid_state); + +// VecSimIndex_Free(index); +// } + +// TYPED_TEST(HNSWTest, repairNodeConnectionsBasic) { +// size_t dim = 8; +// size_t n = dim; +// size_t M = 8; + +// HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = M}; +// VecSimIndex *index = this->CreateNewIndex(params); +// auto *hnsw_index = this->CastToHNSW(index); + +// // Add 8 vectors, expect to get a full graph in level 0 (all nodes pairs are connected) +// TEST_DATA_T vec[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +// for (size_t i = 0; i < n; i++) { +// vec[i] = 1.0; +// VecSimIndex_AddVector(index, vec, i); +// vec[i] = 0.0; +// } +// for (size_t i = 0; i < n; i++) { +// ElementLevelData &cur = hnsw_index->getElementLevelData(i, 0); +// ASSERT_EQ(cur.numLinks, n - 1); +// } + +// // Mark element 0 as deleted, and repair all of its neighbors. +// hnsw_index->markDelete(0); +// ASSERT_EQ(this->CastToHNSW(index)->checkIntegrity().connections_to_repair, n - 1); +// for (size_t i = 1; i < n; i++) { +// hnsw_index->repairNodeConnections(i, 0); +// // After the repair expect that to have all nodes except for element 0 as neighbors. +// ElementLevelData &cur = hnsw_index->getElementLevelData(i, 0); +// ASSERT_EQ(cur.numLinks, n - 2); +// } + +// // Mark elements 1 and 2 as deleted. +// hnsw_index->markDelete(1); +// hnsw_index->markDelete(2); +// for (size_t i = 3; i < n; i++) { +// hnsw_index->repairNodeConnections(i, 0); +// // After the repair expect that to have all nodes except for elements 0-2 as neighbors. +// ElementLevelData &cur = hnsw_index->getElementLevelData(i, 0); +// ASSERT_EQ(cur.numLinks, n - 4); +// } + +// // For completeness, we also check index integrity. 
+// ASSERT_TRUE(this->CastToHNSW(index)->checkIntegrity().valid_state); + +// VecSimIndex_Free(index); +// } + +TYPED_TEST(HNSWTest, getElementNeighbors) { + size_t dim = 4; + size_t n = 0; + size_t M = 20; + + HNSWParams params = {.dim = dim, .metric = VecSimMetric_L2, .M = M}; + VecSimIndex *index = this->CreateNewIndex(params); + auto *hnsw_index = this->CastToHNSW(index); + + // Add vectors until we have at least 2 vectors at level 1. + size_t vectors_in_higher_levels = 0; + while (vectors_in_higher_levels < 2) { + GenerateAndAddVector(index, dim, n, n); + if (hnsw_index->getGraphDataByInternalId(n).toplevel > 0) { + vectors_in_higher_levels++; + } + n++; + } + // Go over all vectors and validate that the getElementNeighbors debug command returns the + // neighbors properly. + for (size_t id = 0; id < n; id++) { + int **neighbors_output; + VecSimDebug_GetElementNeighborsInHNSWGraph(index, id, &neighbors_output); + auto graph_data = hnsw_index->getGraphDataByInternalId(id); + for (size_t l = 0; l <= graph_data.toplevel; l++) { + ElementLevelData level_data = hnsw_index->getElementLevelData(graph_data, l); + auto &neighbours = neighbors_output[l]; + ASSERT_EQ(neighbours[0], level_data.getNumLinks()); + for (size_t j = 1; j <= neighbours[0]; j++) { + ASSERT_EQ(neighbours[j], level_data.getLinkAtPos(j - 1)); + } + } + VecSimDebug_ReleaseElementNeighborsInHNSWGraph(neighbors_output); + } + VecSimIndex_Free(index); +} + +TYPED_TEST(HNSWTest, FitMemoryTest) { + size_t dim = 4; + HNSWParams params = {.dim = dim, .blockSize = DEFAULT_BLOCK_SIZE}; + VecSimIndex *index = this->CreateNewIndex(params); + + // Fit memory to initial capacity shouldn't have any affect since the ctor initializes label2id + // size to the initial capacity. 
+ size_t initial_memory = index->getAllocationSize(); + index->fitMemory(); + ASSERT_EQ(index->getAllocationSize(), initial_memory); + + // Add vector + GenerateAndAddVector(index, dim, 0); + initial_memory = index->getAllocationSize(); + index->fitMemory(); + // Due to the initial capacity, the memory for the vector was already allocated + ASSERT_EQ(index->getAllocationSize(), initial_memory); + + VecSimIndex_Free(index); +}