-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathessay.py
87 lines (65 loc) · 2.18 KB
/
essay.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from dataclasses import dataclass
from typing import Annotated
import httpx
from vechord.chunk import RegexChunker
from vechord.embedding import GeminiDenseEmbedding
from vechord.evaluate import GeminiEvaluator
from vechord.extract import SimpleExtractor
from vechord.registry import VechordRegistry
from vechord.spec import (
ForeignKey,
PrimaryKeyAutoIncrease,
Table,
Vector,
)
# URL template for Paul Graham essays; filled with the article slug below.
URL = "https://paulgraham.com/{}.html"
ARTICLE = "best"  # essay slug to fetch; also used as the registry namespace
TOP_K = 10  # number of chunks retrieved per query during evaluation
# 768-dimensional dense vector type — presumably matches the Gemini
# embedding output size; confirm against GeminiDenseEmbedding.
DenseVector = Vector[768]
emb = GeminiDenseEmbedding()  # produces chunk/query embeddings
evaluator = GeminiEvaluator()  # generates queries and scores retrieval
extractor = SimpleExtractor()  # HTML -> plain-text extraction
class Chunk(Table, kw_only=True):
    """One segment of the essay text together with its dense embedding."""
    # Auto-increment primary key; None until assigned by the database.
    uid: PrimaryKeyAutoIncrease | None = None
    text: str
    vector: DenseVector
class Query(Table, kw_only=True):
    """A generated evaluation query tied to the chunk it was derived from."""
    # Auto-increment primary key; None until assigned by the database.
    uid: PrimaryKeyAutoIncrease | None = None
    # Foreign key referencing the source Chunk's uid.
    cid: Annotated[int, ForeignKey[Chunk.uid]]
    text: str
    vector: DenseVector
@dataclass(frozen=True)
class Evaluation:
    """Retrieval-quality scores for a single query (from evaluator.evaluate_one)."""
    map: float  # "map" score key — presumably mean average precision; shadows the builtin, kept to match the score dict
    ndcg: float  # "ndcg" score key — normalized discounted cumulative gain
    recall: float  # recall at TOP_K (score key f"recall_{TOP_K}")
# Registry namespaced by the article slug, backed by a local Postgres instance.
vr = VechordRegistry(ARTICLE, "postgresql://postgres:[email protected]:5432/")
# Create/attach the tables for both record types.
vr.register([Chunk, Query])
# Download the essay and extract its plain text at import time; `doc` is
# read by segment_essay and create_query below.
with httpx.Client() as client:
    resp = client.get(URL.format(ARTICLE))
    doc = extractor.extract_html(resp.text)
@vr.inject(output=Chunk)
def segment_essay() -> list[Chunk]:
    """Split the downloaded essay into pieces and build an embedded Chunk per piece."""

    def _to_chunk(piece: str) -> Chunk:
        # Embed the piece and wrap it in a table row.
        return Chunk(text=piece, vector=DenseVector(emb.vectorize_chunk(piece)))

    pieces = RegexChunker().segment(doc)
    return [_to_chunk(piece) for piece in pieces]
@vr.inject(input=Chunk, output=Query)
def create_query(uid: int, text: str) -> Query:
    """Generate an evaluation query for one chunk and embed it.

    Runs once per Chunk row (injected by the registry); links the new
    Query back to its source chunk via `cid`.
    """
    generated = evaluator.produce_query(doc, text)
    embedding = DenseVector(emb.vectorize_chunk(generated))
    return Query(cid=uid, text=generated, vector=embedding)
@vr.inject(input=Query)
def evaluate(cid: int, vector: DenseVector) -> Evaluation:
    """Retrieve the top-K chunks for one query vector and score the ranking."""
    retrieved = vr.search_by_vector(Chunk, vector, topk=TOP_K)
    retrieved_ids = [str(item.uid) for item in retrieved]
    # The ground truth for a query is the chunk it was generated from (cid).
    score = evaluator.evaluate_one(str(cid), retrieved_ids)
    return Evaluation(
        map=score["map"],
        ndcg=score["ndcg"],
        recall=score[f"recall_{TOP_K}"],
    )
if __name__ == "__main__":
    # Pipeline: chunk+embed the essay, derive one query per chunk, then
    # evaluate retrieval quality over all queries.
    segment_essay()
    create_query()
    res: list[Evaluation] = evaluate()
    # Guard against ZeroDivisionError when no queries were produced.
    if not res:
        raise SystemExit("no evaluation results produced")
    n = len(res)
    # Report all three collected metrics (the original dropped `map`).
    print("map", sum(r.map for r in res) / n)
    print("ndcg", sum(r.ndcg for r in res) / n)
    print(f"recall@{TOP_K}", sum(r.recall for r in res) / n)