Skip to content

Commit 4e7dd21

Browse files
rchitale7 and navneet1v authored
Increase the minimum doc count to 5 (#52)
Signed-off-by: Rohan Chitale <[email protected]> Co-authored-by: Navneet Verma <[email protected]>
1 parent b4781e7 commit 4e7dd21

19 files changed

+297
-320
lines changed

API.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ Request Parameters
1515
"doc_id_path" : "DocIdPath", // File path all doc Ids are written to
1616
"tenant_id": "UniqueClusterID", // Unique identifier for the cluster making the request
1717
"dimension": 768,
18-
"doc_count": 1000000,
18+
"doc_count": 1000000, // Must be greater than 4
1919
"data_type": "float",
2020
"engine": "faiss",
2121
"index_parameters": {
@@ -41,6 +41,7 @@ Request Response:
4141
* The top two parameters are derived by the KNN plugin from the customer's repo setting. `container_name` is used to specifically refer to the "bucket" (or non-S3 equivalent), rather than the name of the repository itself.
4242
* Tenant ID is included so that it can be used for billing, authorization, etc.
4343
* Dimension, doc count, and data type are deliberately placed at the top level of the JSON so the build service can read them first and quickly calculate the workload size
44+
* Doc count must be greater than 4 (i.e., at least 5), since doc counts of 1, 3, and 4 have been observed to consistently fail during index builds
4445
* Engine: If, in the future, different workers host different engines (such as Lucene), this parameter will act as an extension point. This is not a required parameter.
4546
* Space type is inside `index_parameters` since it is only used in index creation
4647
* By including an `algorithm` field in index parameters, we leave the door open for IVF and other future algorithms

remote_vector_index_builder/app/models/workflow.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# compatible open source license.
77

88
from pydantic import BaseModel
9-
from core.common.models.index_build_parameters import IndexBuildParameters
9+
from core.common.models import IndexBuildParameters
1010

1111

1212
class BuildWorkflow(BaseModel):

remote_vector_index_builder/core/common/models/__init__.py

+2-22
Original file line numberDiff line numberDiff line change
@@ -6,28 +6,8 @@
66
# compatible open source license.
77

88
from .index_build_parameters import SpaceType
9-
10-
from .index_builder.cagra_graph_build_algo import CagraGraphBuildAlgo
11-
12-
139
from .index_build_parameters import IndexBuildParameters
1410
from .vectors_dataset import VectorsDataset
15-
from .index_builder.response.faiss_gpu_build_index_output import (
16-
FaissGpuBuildIndexOutput,
17-
)
18-
from .index_builder.response.faiss_cpu_build_index_output import (
19-
FaissCpuBuildIndexOutput,
20-
)
21-
from .index_builder.faiss_gpu_index_builder import FaissGPUIndexBuilder
22-
from .index_builder.faiss_cpu_index_builder import FaissCPUIndexBuilder
2311

24-
__all__ = [
25-
"SpaceType",
26-
"CagraGraphBuildAlgo",
27-
"IndexBuildParameters",
28-
"VectorsDataset",
29-
"FaissGpuBuildIndexOutput",
30-
"FaissCpuBuildIndexOutput",
31-
"FaissGPUIndexBuilder",
32-
"FaissCPUIndexBuilder",
33-
]
12+
13+
__all__ = ["SpaceType", "IndexBuildParameters", "VectorsDataset"]

remote_vector_index_builder/core/common/models/index_build_parameters.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ class IndexBuildParameters(BaseModel):
144144
doc_id_path: str
145145
tenant_id: str = ""
146146
dimension: int = Field(gt=0)
147-
doc_count: int = Field(gt=1)
147+
doc_count: int = Field(gt=4)
148148
data_type: DataType = DataType.FLOAT
149149
engine: Engine = Engine.FAISS
150150
index_parameters: IndexParameters = Field(default_factory=IndexParameters)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Copyright OpenSearch Contributors
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# The OpenSearch Contributors require contributions made to
5+
# this file be licensed under the Apache-2.0 license or a
6+
# compatible open source license.
7+
8+
from .cagra_graph_build_algo import CagraGraphBuildAlgo
9+
from .response.faiss_gpu_build_index_output import (
10+
FaissGpuBuildIndexOutput,
11+
)
12+
from .response.faiss_cpu_build_index_output import (
13+
FaissCpuBuildIndexOutput,
14+
)
15+
from .faiss_gpu_index_builder import FaissGPUIndexBuilder
16+
from .faiss_cpu_index_builder import FaissCPUIndexBuilder
17+
18+
__all__ = [
19+
"CagraGraphBuildAlgo",
20+
"FaissGpuBuildIndexOutput",
21+
"FaissCpuBuildIndexOutput",
22+
"FaissGPUIndexBuilder",
23+
"FaissCPUIndexBuilder",
24+
]

remote_vector_index_builder/core/common/models/index_builder/faiss/faiss_gpu_index_cagra_builder.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,15 @@
1010
from dataclasses import field, dataclass
1111

1212
from core.common.models import (
13-
CagraGraphBuildAlgo,
1413
VectorsDataset,
1514
SpaceType,
15+
)
16+
from core.common.models.index_builder import (
17+
CagraGraphBuildAlgo,
1618
FaissGpuBuildIndexOutput,
1719
FaissGPUIndexBuilder,
1820
)
21+
1922
from core.index_builder.index_builder_utils import configure_metric
2023
from .ivf_pq_build_cagra_config import IVFPQBuildCagraConfig
2124
from .ivf_pq_search_cagra_config import IVFPQSearchCagraConfig

remote_vector_index_builder/core/common/models/index_builder/faiss/faiss_index_hnsw_cagra_builder.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import faiss
99
from dataclasses import dataclass
1010
from typing import Dict, Any
11-
from core.common.models import (
11+
from core.common.models.index_builder import (
1212
FaissCpuBuildIndexOutput,
1313
FaissGpuBuildIndexOutput,
1414
FaissCPUIndexBuilder,

remote_vector_index_builder/core/common/models/index_builder/faiss_cpu_index_builder.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# compatible open source license.
77

88
from abc import ABC, abstractmethod
9-
from core.common.models import (
9+
from core.common.models.index_builder import (
1010
FaissCpuBuildIndexOutput,
1111
FaissGpuBuildIndexOutput,
1212
)

remote_vector_index_builder/core/common/models/index_builder/faiss_gpu_index_builder.py

+3
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
from core.common.models import (
1111
SpaceType,
1212
VectorsDataset,
13+
)
14+
15+
from core.common.models.index_builder import (
1316
FaissGpuBuildIndexOutput,
1417
)
1518

remote_vector_index_builder/core/index_builder/index_builder_utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import faiss
1111

1212

13-
from core.common.models.index_build_parameters import (
13+
from core.common.models import (
1414
SpaceType,
1515
)
1616

remote_vector_index_builder/core/object_store/object_store_factory.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from typing import Any, Dict
99

1010
from core.common.exceptions import UnsupportedObjectStoreTypeError
11-
from core.common.models.index_build_parameters import IndexBuildParameters
11+
from core.common.models import IndexBuildParameters
1212
from core.object_store.object_store import ObjectStore
1313
from core.object_store.s3.s3_object_store import S3ObjectStore
1414
from core.object_store.types import ObjectStoreType

remote_vector_index_builder/core/object_store/s3/s3_object_store.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from botocore.config import Config
2020
from botocore.exceptions import ClientError
2121
from core.common.exceptions import BlobError
22-
from core.common.models.index_build_parameters import IndexBuildParameters
22+
from core.common.models import IndexBuildParameters
2323
from core.object_store.object_store import ObjectStore
2424

2525
logger = logging.getLogger(__name__)

remote_vector_index_builder/core/tasks.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@
3333
from io import BytesIO
3434
from typing import Any, Dict, Optional
3535

36-
from core.common.models.index_build_parameters import IndexBuildParameters
37-
from core.common.models.vectors_dataset import VectorsDataset
36+
from core.common.models import IndexBuildParameters
37+
from core.common.models import VectorsDataset
3838
from core.index_builder.faiss.faiss_index_build_service import FaissIndexBuildService
3939
from core.object_store.object_store_factory import ObjectStoreFactory
4040

test_remote_vector_index_builder/conftest.py

+22-180
Original file line numberDiff line numberDiff line change
@@ -5,187 +5,29 @@
55
# this file be licensed under the Apache-2.0 license or a
66
# compatible open source license.
77

8-
import sys
9-
from types import ModuleType
10-
from unittest.mock import Mock
11-
128
import pytest
13-
14-
15-
class DeletionTracker:
16-
"""Helper class to track object deletions"""
17-
18-
def __init__(self):
19-
self.deleted_objects = set()
20-
21-
def mark_deleted(self, obj_id):
22-
self.deleted_objects.add(obj_id)
23-
24-
def is_deleted(self, obj_id):
25-
return obj_id in self.deleted_objects
26-
27-
def reset(self):
28-
self.deleted_objects.clear()
29-
30-
31-
# Create global deletion tracker
32-
_deletion_tracker = DeletionTracker()
9+
from core.common.models.index_build_parameters import (
10+
AlgorithmParameters,
11+
IndexBuildParameters,
12+
IndexParameters,
13+
SpaceType,
14+
)
3315

3416

3517
@pytest.fixture
36-
def deletion_tracker():
37-
"""Fixture to provide access to deletion tracker"""
38-
_deletion_tracker.reset() # Reset before each test
39-
return _deletion_tracker
40-
41-
42-
@pytest.fixture(autouse=True)
43-
def reset_deletion_tracker():
44-
"""Reset deletion tracker before each test"""
45-
_deletion_tracker.reset()
46-
yield
47-
48-
49-
class MockGpuIndexCagra:
50-
"""Mock for faiss.GpuIndexCagra with deletion tracking"""
51-
52-
def __init__(self, *args, **kwargs):
53-
self.id = id(self)
54-
self.thisown = False
55-
self.args = args
56-
self.kwargs = kwargs
57-
58-
def __del__(self):
59-
print("deleting MockGpuIndexCagra:", self.id)
60-
_deletion_tracker.mark_deleted(self.id)
61-
62-
@property
63-
def is_deleted(self):
64-
return _deletion_tracker.is_deleted(self.id)
65-
66-
def copyTo(self, cpu_index):
67-
"""Mock implementation of copyTo method"""
68-
if not isinstance(cpu_index, MockIndexHNSWCagra):
69-
raise TypeError("Target must be IndexHNSWCagra")
70-
# Simulate copying data to CPU index
71-
return True
72-
73-
74-
class MockIndexIDMap:
75-
"""Mock for faiss.IndexIDMap with deletion tracking"""
76-
77-
def __init__(self, *args, **kwargs):
78-
self.id = id(self)
79-
self.own_fields = False
80-
self.index = None
81-
self.args = args
82-
self.kwargs = kwargs
83-
84-
def __del__(self):
85-
print("deleting MockIndexIDMap:", self.id)
86-
_deletion_tracker.mark_deleted(self.id)
87-
88-
@property
89-
def is_deleted(self):
90-
return _deletion_tracker.is_deleted(self.id)
91-
92-
def add_with_ids(self, vectors, ids):
93-
pass
94-
95-
96-
class MockIndexHNSWCagra(Mock):
97-
"""Mock for faiss.IndexHNSWCagra"""
98-
99-
def __init__(self, *args, **kwargs):
100-
super().__init__(*args, **kwargs)
101-
self.hnsw = Mock()
102-
self.base_level_only = True
103-
104-
def __del__(self):
105-
_deletion_tracker.mark_deleted(self.id)
106-
107-
@property
108-
def is_deleted(self):
109-
return _deletion_tracker.is_deleted(self.id)
110-
111-
112-
class MockIVFPQBuildCagraConfig:
113-
"""Mock class for faiss.IVFPQBuildCagraConfig"""
114-
115-
def __init__(self):
116-
self.n_lists = 1024
117-
self.kmeans_n_iters = 20
118-
self.kmeans_trainset_fraction = 0.5
119-
self.pq_bits = 8
120-
self.pq_dim = 0
121-
self.conservative_memory_allocation = True
122-
123-
124-
class MockIVFPQSearchCagraConfig:
125-
"""Mock class for faiss.IVFPQSearchCagraConfig"""
126-
127-
def __init__(self):
128-
self.n_probes = 20
129-
130-
131-
class MockGpuIndexCagraConfig:
132-
"""Mock class for faiss.GpuIndexCagraConfig"""
133-
134-
def __init__(self):
135-
self.intermediate_graph_degree = 64
136-
self.graph_degree = 32
137-
self.store_dataset = False
138-
self.device = 0
139-
self.refine_rate = 2.0
140-
self.build_algo = None
141-
self.ivf_pq_build_config = None
142-
self.ivf_pq_search_config = None
143-
144-
145-
class FaissMock(ModuleType):
146-
"""Complete mock for faiss module"""
147-
148-
def __init__(self):
149-
super().__init__("faiss")
150-
# Classes
151-
self.StandardGpuResources = Mock()
152-
self.GpuIndexCagra = MockGpuIndexCagra
153-
self.IndexIDMap = MockIndexIDMap
154-
self.IndexHNSWCagra = MockIndexHNSWCagra
155-
self.IVFPQBuildCagraConfig = MockIVFPQBuildCagraConfig
156-
self.IVFPQSearchCagraConfig = MockIVFPQSearchCagraConfig
157-
self.GpuIndexCagraConfig = MockGpuIndexCagraConfig
158-
159-
# Enums
160-
self.graph_build_algo_IVF_PQ = 1
161-
162-
self.METRIC_L2 = 0
163-
self.METRIC_INNER_PRODUCT = 1
164-
165-
self._num_threads = None
166-
self.omp_set_num_threads = self._omp_set_num_threads
167-
self.omp_get_num_threads = self._omp_get_num_threads
168-
169-
self.write_index = self._write_index
170-
171-
def _omp_set_num_threads(self, num_threads: int) -> None:
172-
self._num_threads = num_threads
173-
174-
def _omp_get_num_threads(self) -> int:
175-
return self._num_threads
176-
177-
def _write_index(self, index, filepath):
178-
if not isinstance(filepath, str):
179-
raise TypeError("Filepath must be a string")
180-
if not index:
181-
raise ValueError("Index cannot be None")
182-
try:
183-
with open(filepath, "wb") as f:
184-
f.write(b"MOCK_INDEX")
185-
except IOError as e:
186-
raise IOError(f"Failed to write to {filepath}: {str(e)}")
187-
188-
189-
# Create the mock and patch faiss
190-
faiss_mock = FaissMock()
191-
sys.modules["faiss"] = faiss_mock
18+
def index_build_parameters():
19+
"""Create sample IndexBuildParameters for testing"""
20+
return IndexBuildParameters(
21+
container_name="testbucket",
22+
vector_path="vec.knnvec",
23+
doc_id_path="doc.knndid",
24+
dimension=3,
25+
doc_count=5,
26+
index_parameters=IndexParameters(
27+
space_type=SpaceType.INNERPRODUCT,
28+
algorithm_parameters=AlgorithmParameters(
29+
ef_construction=200, ef_search=200
30+
),
31+
),
32+
repository_type="s3",
33+
)

0 commit comments

Comments
 (0)