Skip to content

Commit aeaa0e3

Browse files
Add vector storage and search
1 parent 5b42a1a commit aeaa0e3

14 files changed

Lines changed: 1404 additions & 17 deletions

File tree

README.md

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ The data model is simple:
1818

1919
- **Entity** — a named node with a `type` and optional JSON `properties`
2020
- **Link** — a directed edge from a *subject* entity to an *object* entity, labelled with a *predicate* string and optional JSON `properties`
21+
- **Embedding** — a numeric vector attached to an entity, labelled with the name of the model that produced it (`embeddingModel`) and optional JSON `properties`
2122

2223
Example: `(Experiment "SAXS run 42") --[produced]--> (Dataset "raw_001.h5")`
2324

@@ -103,6 +104,48 @@ mutation {
103104
}
104105
```
105106

107+
### Create an embedding
108+
109+
```bash
110+
curl -X POST http://localhost:8080/splash_links/embeddings \
111+
-H 'Content-Type: application/json' \
112+
-d '{
113+
"entityId": "<entity-id>",
114+
"embeddingModel": "text-embedding-3-small",
115+
"vector": [0.12, -0.03, 0.88],
116+
"properties": {"chunk": 1}
117+
}'
118+
```
119+
120+
### Find nearest embeddings
121+
122+
Embeddings are created, fetched, and deleted through the REST API, while nearest-neighbor search is exposed through GraphQL. Search ranks results by cosine distance, so a lower `distance` means a closer match. For PostgreSQL, embeddings are stored in a native `pgvector` column; for SQLite, they are stored as compact binary blobs and searched in-process.
123+
124+
```graphql
125+
query {
126+
nearestEmbeddings(
127+
vector: [0.11, -0.02, 0.90]
128+
embeddingModel: "text-embedding-3-small"
129+
limit: 5
130+
) {
131+
distance
132+
embedding {
133+
id
134+
entityId
135+
entity { name }
136+
}
137+
}
138+
}
139+
```
140+
141+
### Fetch or delete embeddings
142+
143+
```bash
144+
curl http://localhost:8080/splash_links/embeddings/<embedding-id>
145+
curl 'http://localhost:8080/splash_links/embeddings?entityId=<entity-id>&embeddingModel=text-embedding-3-small'
146+
curl -X DELETE http://localhost:8080/splash_links/embeddings/<embedding-id>
147+
```
148+
106149
### Health check
107150

108151
```
@@ -132,6 +175,13 @@ pixi run links -- --subject <entity-id> # outgoing from a node
132175
pixi run links -- --object <entity-id> # incoming to a node
133176
```
134177

178+
### List embeddings
179+
180+
```bash
181+
pixi run embeddings -- --entity <entity-id>
182+
splash-links embeddings --model text-embedding-3-small --limit 10
183+
```
184+
135185
### Raw SQLite shell
136186

137187
```bash
@@ -221,6 +271,7 @@ Tests require ≥ 90% coverage and will fail the build if that threshold is not
221271
| `docs` | `pixi run docs` | Serve MkDocs site locally |
222272
| `entities` | `pixi run entities` | List entities in the database |
223273
| `links` | `pixi run links` | List links in the database |
274+
| `embeddings` | `pixi run embeddings` | List embeddings in the database |
224275
| `db` | `pixi run db` | Open raw SQLite interactive shell |
225276

226277
Pass extra flags after `--`, e.g. `pixi run entities -- --type Experiment --limit 5`.
@@ -258,6 +309,16 @@ SPLASH_LINKS_DB=links.sqlite pixi run serve
258309
SPLASH_LINKS_DB=/data/links.sqlite pixi run serve
259310
```
260311

312+
#### PostgreSQL with pgvector
313+
314+
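PostgreSQL nearest-neighbor search uses the `pgvector` extension. The Alembic migration will create the extension automatically when the database role has permission to do so. If it does not, ask a database administrator to run `CREATE EXTENSION vector;` in the target database before applying the migration.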
315+
316+
```bash
317+
SPLASH_LINKS_DB=postgresql+psycopg2://user:pass@host/dbname pixi run serve
318+
```
319+
320+
Embeddings use dialect-specific storage. PostgreSQL stores them in a native `pgvector` column, while SQLite stores packed float32 data in a BLOB. Base64 is intentionally not used, since it would only increase storage size and parsing overhead.
321+
261322
#### PostgreSQL (recommended for production / multi-user deployments)
262323

263324
Use PostgreSQL when you need concurrent writes, role-based access control, or want to run the service behind a load balancer. A `docker-compose.yml` is provided that starts a Postgres instance alongside the application:

_tests/test_base_client.py

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from splash_links.client import base as base_module
66
from splash_links.client import tiled as tiled_module
7-
from splash_links.client.base import Entity, LinksClient, from_uri
7+
from splash_links.client.base import EmbeddingMatch, Entity, LinksClient, from_uri
88
from splash_links.client.tiled import TiledEntity, _node_name, _node_properties, _node_uri, from_entity
99
from splash_links.client.tiled import get_or_create_entity as tiled_get_or_create
1010

@@ -195,6 +195,106 @@ def fake_execute(query: str, variables: dict | None = None) -> dict:
195195
}
196196

197197

198+
def test_create_embedding_posts_expected_payload(monkeypatch):
199+
seen: dict[str, object] = {}
200+
201+
def fake_post(url: str, json: dict, timeout: float):
202+
seen["url"] = url
203+
seen["json"] = json
204+
seen["timeout"] = timeout
205+
return FakeResponse(
206+
{
207+
"id": "emb-1",
208+
"entityId": "ent-1",
209+
"embeddingModel": "model-a",
210+
"vector": [0.1, 0.2, 0.3],
211+
"dimensions": 3,
212+
"properties": {"chunk": 1},
213+
"createdAt": "2026-01-01T00:00:00Z",
214+
}
215+
)
216+
217+
monkeypatch.setattr(base_module.httpx, "post", fake_post)
218+
219+
client = from_uri("splash://api:8080")
220+
embedding = client.create_embedding(
221+
entity_id="ent-1",
222+
vector=[0.1, 0.2, 0.3],
223+
embedding_model="model-a",
224+
properties={"chunk": 1},
225+
)
226+
227+
assert embedding.id == "emb-1"
228+
assert seen["url"] == "http://api:8080/splash_links/embeddings"
229+
assert seen["timeout"] == 30.0
230+
assert seen["json"] == {
231+
"entityId": "ent-1",
232+
"vector": [0.1, 0.2, 0.3],
233+
"embeddingModel": "model-a",
234+
"properties": {"chunk": 1},
235+
}
236+
237+
238+
def test_find_nearest_embeddings_posts_expected_payload(monkeypatch):
239+
seen: dict[str, object] = {}
240+
241+
def fake_execute(query: str, variables: dict | None = None) -> dict:
242+
seen["query"] = query
243+
seen["variables"] = variables
244+
return {
245+
"nearestEmbeddings": [
246+
{
247+
"distance": 0.01,
248+
"embedding": {
249+
"id": "emb-1",
250+
"entityId": "ent-1",
251+
"embeddingModel": "model-a",
252+
"vector": [0.1, 0.2],
253+
"dimensions": 2,
254+
"properties": None,
255+
"createdAt": "2026-01-01T00:00:00Z",
256+
},
257+
}
258+
]
259+
}
260+
261+
client = LinksClient("http://example.com")
262+
monkeypatch.setattr(client, "_execute", fake_execute)
263+
264+
matches = client.find_nearest_embeddings(
265+
vector=[0.1, 0.2],
266+
embedding_model="model-a",
267+
entity_id="ent-1",
268+
limit=5,
269+
offset=1,
270+
)
271+
272+
assert matches == [
273+
EmbeddingMatch.model_validate(
274+
{
275+
"distance": 0.01,
276+
"embedding": {
277+
"id": "emb-1",
278+
"entityId": "ent-1",
279+
"embeddingModel": "model-a",
280+
"vector": [0.1, 0.2],
281+
"dimensions": 2,
282+
"properties": None,
283+
"createdAt": "2026-01-01T00:00:00Z",
284+
},
285+
}
286+
)
287+
]
288+
assert seen["query"] == base_module._NEAREST_EMBEDDINGS_QUERY
289+
assert seen["variables"] == {
290+
"vector": [0.1, 0.2],
291+
"embeddingModel": "model-a",
292+
"entityId": "ent-1",
293+
"limit": 5,
294+
"offset": 1,
295+
}
296+
297+
198298
# ---------------------------------------------------------------------------
199299
# Tiled integration helpers
200300
# ---------------------------------------------------------------------------

_tests/test_cli.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
import splash_links.cli as cli_module
1010
from splash_links.cli import app
11-
from splash_links.store import EntityRecord, LinkRecord
11+
from splash_links.store import EmbeddingRecord, EntityRecord, LinkRecord
1212

1313
runner = CliRunner()
1414

@@ -44,10 +44,25 @@ def _make_link(**kw) -> LinkRecord:
4444
return LinkRecord(**defaults)
4545

4646

47+
def _make_embedding(**kw) -> EmbeddingRecord:
48+
defaults = dict(
49+
id="emb-1",
50+
entity_id="ent-1",
51+
embedding_model="model-a",
52+
vector=[0.1, 0.2, 0.3],
53+
dimensions=3,
54+
properties={},
55+
created_at=datetime.now(timezone.utc),
56+
)
57+
defaults.update(kw)
58+
return EmbeddingRecord(**defaults)
59+
60+
4761
class FakeStore:
48-
def __init__(self, entities=None, links=None):
62+
def __init__(self, entities=None, links=None, embeddings=None):
4963
self._entities = entities or []
5064
self._links = links or []
65+
self._embeddings = embeddings or []
5166

5267
def list_entities(self, entity_type=None, limit=50, offset=0):
5368
if entity_type:
@@ -57,6 +72,14 @@ def list_entities(self, entity_type=None, limit=50, offset=0):
5772
def find_links(self, subject_id=None, predicate=None, object_id=None, limit=50, offset=0):
5873
return self._links
5974

75+
def list_embeddings(self, entity_id=None, embedding_model=None, limit=50, offset=0):
76+
rows = self._embeddings
77+
if entity_id:
78+
rows = [embedding for embedding in rows if embedding.entity_id == entity_id]
79+
if embedding_model:
80+
rows = [embedding for embedding in rows if embedding.embedding_model == embedding_model]
81+
return rows
82+
6083
def close(self):
6184
pass
6285

@@ -124,6 +147,26 @@ def test_links_with_properties(monkeypatch):
124147
assert "confidence" in result.output
125148

126149

150+
# ---------------------------------------------------------------------------
151+
# embeddings command
152+
# ---------------------------------------------------------------------------
153+
154+
155+
def test_embeddings_shows_rows(monkeypatch):
156+
fake = FakeStore(embeddings=[_make_embedding()])
157+
monkeypatch.setattr(cli_module, "_open_store", lambda: fake)
158+
result = runner.invoke(app, ["embeddings"])
159+
assert result.exit_code == 0
160+
assert "model-a" in result.output
161+
162+
163+
def test_embeddings_no_rows_prints_message(monkeypatch):
164+
monkeypatch.setattr(cli_module, "_open_store", lambda: FakeStore())
165+
result = runner.invoke(app, ["embeddings"])
166+
assert result.exit_code == 0
167+
assert "No embeddings found" in result.output
168+
169+
127170
# ---------------------------------------------------------------------------
128171
# _open_store — missing DB
129172
# ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)