-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathaxiv-vector-query.py
50 lines (33 loc) · 1.97 KB
/
axiv-vector-query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# This is from the original file found here:
# https://alex.macrocosm.so/download
from InstructorEmbedding import INSTRUCTOR
import time
from pgvector.psycopg import register_vector
import psycopg
DB_NAME = "arxiv_abstracts"
# NOTE(review): the password is hard-coded in source; move it to configuration
# before using this script anywhere beyond a local setup.
connect_string = f"host=localhost user=postgres password='letmein' dbname='{DB_NAME}'"

# Rule printed above and below every section heading.
SEPARATOR = "-----------------------------------------------------------------\n"

# The `<=>` expression appears twice (once to report the distance, once to
# order by it), so the same embedding has to be bound to both placeholders.
NEAREST_QUERY = (
    "SELECT id, (embedding <=> %s) as distance, abstract "
    "FROM documents ORDER BY embedding <=> %s LIMIT 10"
)


def _print_nearest(conn, heading, embedding):
    """Print the ten ``documents`` rows closest to *embedding*.

    Args:
        conn: Open psycopg connection on which ``register_vector`` has
            already been called.
        heading: Title printed between two separator rules.
        embedding: Query vector bound to both placeholders of the query.
    """
    print(SEPARATOR)
    print(heading + "\n")
    print(SEPARATOR)
    rows = conn.execute(NEAREST_QUERY, (embedding, embedding)).fetchall()
    for doc_id, distance, abstract in rows:
        # Only the first 50 characters of the abstract are shown.
        print(f"id: {doc_id} || distance: {distance} || abstract: {abstract[:50]}")


# GPU variant kept from the original script:
#model = INSTRUCTOR('hkunlp/instructor-xl').cuda()
model = INSTRUCTOR('hkunlp/instructor-xl')

# CHANGE "sentence" HERE FOR DIFFERENT RESULTS
# read this to see examples of Instructor for information retrieval
# https://pypi.org/project/InstructorEmbedding/#use-customized-embeddings-for-information-retrieval
sentence = "Thames river pollution england"
instruction = "Represent the Research Paper abstract for retrieval; Input:"

# encode() takes a list of [instruction, text] pairs; a single pair is sent,
# so the first (and only) result is the query embedding.
embeddings = model.encode([[instruction, sentence]])
query_embedding = embeddings[0]

# The negated vector points the opposite way, so the rows nearest to it under
# cosine distance are the ones farthest from the original query.
dissimilar_embedding = -1 * query_embedding

# The context manager closes the connection when the block exits, including
# when a query raises.
with psycopg.connect(connect_string, autocommit=True) as conn:
    register_vector(conn)
    _print_nearest(conn, "Similar search for " + sentence, query_embedding)
    _print_nearest(conn, "Dissimilar search for " + sentence, dissimilar_embedding)

print("finished")