Skip to content

Omer knn2 heaps #561

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 87 additions & 20 deletions src/VecSim/algorithms/brute_force/brute_force.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
#include <ranges>
#include <sys/param.h>

#include <chrono>
#include <thread>

using spaces::dist_func_t;

template <typename DataType, typename DistType>
Expand Down Expand Up @@ -232,42 +235,106 @@ BruteForceIndex<DataType, DistType>::topKQuery(const void *queryBlob, size_t k,
if (0 == k) {
return rep;
}

auto processed_query_ptr = this->preprocessQuery(queryBlob);
const void *processed_query = processed_query_ptr.get();
DistType upperBound = std::numeric_limits<DistType>::lowest();
vecsim_stl::abstract_priority_queue<DistType, labelType> *TopCandidates =
getNewMaxPriorityQueue();

// For vector, compute its scores and update the Top candidates max heap
auto vectors_it = vectors->getIterator();
idType curr_id = 0;

// create H1 from notebook algorithm
// starting with container, reserving memory for speed
// this is the container Omer is familiar with, so it should probably be changed later
// Q - I see below (line 262) assert curr_id == count, should I use count instead of size?
std::vector<std::tuple<DistType, labelType>> heap1;
heap1.reserve(vectors->size());
// Step 1 - make a container (c++ vector) of vector distance scores

while (auto *vector = vectors_it->next()) {
// Omer - ask what this does exactly
if (VECSIM_TIMEOUT(timeoutCtx)) {
rep->code = VecSim_QueryReply_TimedOut;
delete TopCandidates;
return rep;
}
auto score = this->calcDistance(vector, processed_query);
// If we have less than k or a better score, insert it.
if (score < upperBound || TopCandidates->size() < k) {
TopCandidates->emplace(score, getVectorLabel(curr_id));
if (TopCandidates->size() > k) {
// If we now have more than k results, pop the worst one.
TopCandidates->pop();
}
upperBound = TopCandidates->top().first;
}
heap1.emplace_back(score, getVectorLabel(curr_id));
++curr_id;
}
assert(curr_id == this->count);

rep->results.resize(TopCandidates->size());
for (auto &result : std::ranges::reverse_view(rep->results)) {
std::tie(result.score, result.id) = TopCandidates->top();
TopCandidates->pop();
if (this->count <= k) {
std::sort(heap1.begin(), heap1.end(),
[](const auto &a, const auto &b) { return std::get<0>(a) < std::get<0>(b); });
rep->results.resize(this->count);
auto result_iter = rep->results.begin();
for (const auto &vect : heap1) {
std::tie(result_iter->score, result_iter->id) = vect;
++result_iter;
}
return rep;
}
delete TopCandidates;
std::this_thread::sleep_for(std::chrono::seconds(1));

// Step 2 - min heapify H1
// The comparator should probably be written outside
std::make_heap(heap1.begin(), heap1.end(),
[](const auto &a, const auto &b) { return std::get<0>(a) > std::get<0>(b); });

// Step 3 Create empty candidate heap - H2
// Its size is not going to be bigger than 2k so it can be reserved
// Can probably reserve k+1 but need to make sure
// We are going to save the index of the element in H1 hence size_t in the tuple
std::vector<std::tuple<DistType, size_t>> heap2;
heap2.reserve(k + 1);

// Step4 - insert root of H1 into H2
// The root of H1 is in the front of the vector
heap2.emplace_back(std::get<0>(heap1.front()), 0);

// Steps 5 and 6 loop

rep->results.resize(k);
auto result_iter = rep->results.begin();
size_t counter = 0;
while (counter < k) {
// Step 5 insert root of H2 into result
auto selected = heap2.front();
size_t selected_heap1_index = std::get<1>(selected);
std::tie(result_iter->score, result_iter->id) = heap1[selected_heap1_index];
counter++;
if (counter >= k)
// This check might be faulty loop logic or bad coding but works for now
// but it is important to check to avoid redundant pop and 2 inserts
{
break;
}
// Step 6.1 pop the root of H2
// To do so - std::pop_heap & v.pop_back()
std::pop_heap(heap2.begin(), heap2.end(),
[](const auto &a, const auto &b) { return std::get<0>(a) > std::get<0>(b); });
heap2.pop_back();
// Step 6.2 insert the children of the root with respect to H1

size_t left_child = 2 * selected_heap1_index + 1;

if (left_child < heap1.size()) {
heap2.emplace_back(std::get<0>(heap1[left_child]), left_child);
std::push_heap(heap2.begin(), heap2.end(), [](const auto &a, const auto &b) {
return std::get<0>(a) > std::get<0>(b);
});
}
// Inserting into a vector acting as a heap is emplace_back followed by push_heap
size_t right_child = 2 * selected_heap1_index + 2;

if (left_child < heap1.size()) {
heap2.emplace_back(std::get<0>(heap1[right_child]), right_child);
std::push_heap(heap2.begin(), heap2.end(), [](const auto &a, const auto &b) {
return std::get<0>(a) > std::get<0>(b);
});
}

++result_iter;
}

return rep;
}

Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_bf16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,9 +400,9 @@ void BF16Test::search_by_score_test(params_t index_params) {
GenerateVector(query, 50); // {50, 50, 50, 50}
// Vectors values are equal to the id, so the 11 closest vectors are
// 45, 46...50 (closest), 51...55
static size_t expected_res_order[] = {50, 49, 51, 48, 52, 47, 53, 46, 54, 45, 55};
static size_t expected_diff_order[] = {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5};
auto verify_res = [&](size_t id, double score, size_t index) {
ASSERT_EQ(id, expected_res_order[index]);
ASSERT_EQ(id > 50 ? id - 50 : 50 - id, expected_diff_order[index]);
ASSERT_EQ(score, 4 * (50 - id) * (50 - id)); // L2 distance
};

Expand Down
3 changes: 2 additions & 1 deletion tests/unit/test_bruteforce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ TYPED_TEST(BruteForceTest, test_delete_swap_block) {
ASSERT_EQ(id, index + 1);
}
};
runTopKSearchTest(index, query, k, verify_res);

VecSimIndex_Free(index);
}

Expand Down Expand Up @@ -662,6 +662,7 @@ TYPED_TEST(BruteForceTest, brute_force_vector_search_test_l2) {

auto verify_res = [&](size_t id, double score, size_t index) {
size_t diff_id = (id > 50) ? (id - 50) : (50 - id);
std::cout << "diff_id is:" << diff_id << "with score" << score << std::endl;
ASSERT_EQ(diff_id, (index + 1) / 2);
ASSERT_EQ(score, (4 * ((index + 1) / 2) * ((index + 1) / 2)));
};
Expand Down
5 changes: 3 additions & 2 deletions tests/unit/test_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,9 +404,10 @@ void FP16Test::search_by_score_test(params_t index_params) {
GenerateVector(query, 50); // {50, 50, 50, 50}
// Vectors values are equal to the id, so the 11 closest vectors are
// 45, 46...50 (closest), 51...55
static size_t expected_res_order[] = {50, 49, 51, 48, 52, 47, 53, 46, 54, 45, 55};
// static size_t expected_index_diff_order[] = {50, 49, 51, 48, 52, 47, 53, 46, 54, 45, 55};
static size_t expected_index_diff_order[] = {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5};
auto verify_res = [&](size_t id, double score, size_t index) {
ASSERT_EQ(id, expected_res_order[index]);
ASSERT_EQ(id > 50 ? id - 50 : 50 - id, expected_index_diff_order[index]);
ASSERT_EQ(score, 4 * (50 - id) * (50 - id)); // L2 distance
};

Expand Down
Loading