From db14c9f4a6b859653f9f4a4d0b4af8c718856ce3 Mon Sep 17 00:00:00 2001 From: Gleb Otochkin Date: Thu, 13 Feb 2025 14:21:16 -0500 Subject: [PATCH] feat: new pinecone API (#285) --- infrastructure/movie-search-app/README.md | 58 ++++++++++++++++++- .../movie-search-app/movie_search.py | 10 ++-- .../movie-search-app/pinecone_model.py | 6 +- 3 files changed, 63 insertions(+), 11 deletions(-) diff --git a/infrastructure/movie-search-app/README.md b/infrastructure/movie-search-app/README.md index 98f28aa1..8a463344 100644 --- a/infrastructure/movie-search-app/README.md +++ b/infrastructure/movie-search-app/README.md @@ -96,7 +96,59 @@ Here is the [link to the documentation for AlloyDB](https://cloud.google.com/all Create a database with the name movies and the user movies_owner. You can choose your own names for the database and the user. The application takes it from environment variables. Optionally you can modify the application to use secret manager in Google Cloud as more secured approach. 
### Migrate data from Pinecone to AlloyDB -- Move the data from Pinecone to AlloyDB +Move the data from Pinecone to AlloyDB +- Pinecone index structure consists primarily of 3 main parts: + ID - unique row ID + VALUES - vector embedding value (text-embedding-004 from Google) + METADATA - Supplemental information about the data in key/value format + +- The future AlloyDB/PostgreSQL table as it is defined in the app will have the following structure: + ``` + Table "public.alloydb_table" + Column | Type | Collation | Nullable | Default + --------------------+-------------+-----------+----------+--------- + langchain_id | uuid | | not null | + content | text | | not null | + embedding | vector(768) | | not null | + langchain_metadata | json | | | + Indexes: + "alloydb_table_pkey" PRIMARY KEY, btree (langchain_id) + ``` + And here are the JSON keys for the langchain_metadata column (from the movie dataset): + ``` + jsonb_object_keys + --------------------- + tags + genre + image + title + actors + poster + writer + runtime + summary + director + imdblink + boxoffice + imdbscore + imdbvotes + languages + viewrating + netflixlink + releasedate + tmdbtrailer + trailersite + seriesormovie + awardsreceived + hiddengemscore + metacriticscore + productionhouse + awardsnominatedfor + netflixreleasedate + countryavailability + rottentomatoesscore + ``` +- All the metadata keys are taken from the Pinecone metadata keeping the same structure. ### Enable virtual environment for Python You can use either your laptop or a virtual machnie for deployment. Using a VM deployed in the same Google Cloud project simplifies deployeent and network configuration. 
On a Debian Linux you can enable it in the shell using the following command: @@ -126,9 +178,9 @@ pip install -r requirements.txt export PINECONE_INDEX_NAME=netflix-index-01 export PORT=8080 export DB_USER=movies_owner -export DB_PASS=DatabasePassword +export DB_PASS={DATABASEPASSWORD} export DB_NAME=movies -export INSTANCE_HOST=ALLOYDB_IP +export INSTANCE_HOST={ALLOYDB_IP} export DB_PORT=5432 ``` - Here is the command used to start the application diff --git a/infrastructure/movie-search-app/movie_search.py b/infrastructure/movie-search-app/movie_search.py index ec05c181..b1f0cdc5 100644 --- a/infrastructure/movie-search-app/movie_search.py +++ b/infrastructure/movie-search-app/movie_search.py @@ -209,13 +209,13 @@ def get_movies(db: sqlalchemy.engine.base.Engine, embeddings: str) -> dict: stmt = sqlalchemy.text( """ SELECT - mj.metadata->'title' as title, - mj.metadata->'summary' as summary, - mj.metadata->'director' as director, - mj.metadata->'actors' as actors, + mj.langchain_metadata->'title' as title, + mj.langchain_metadata->'summary' as summary, + mj.langchain_metadata->'director' as director, + mj.langchain_metadata->'actors' as actors, (mj.embedding <=> (:embeddings)::vector) as distance FROM - movies_json mj + alloydb_table mj ORDER BY distance ASC LIMIT 5; diff --git a/infrastructure/movie-search-app/pinecone_model.py b/infrastructure/movie-search-app/pinecone_model.py index 13c52ce2..55cfa723 100644 --- a/infrastructure/movie-search-app/pinecone_model.py +++ b/infrastructure/movie-search-app/pinecone_model.py @@ -14,7 +14,7 @@ import google.generativeai as genai from typing import Iterable -from pinecone import Pinecone # as Pinecone +from pinecone.grpc import PineconeGRPC as Pinecone import logging import os from data_model import ChatMessage, State @@ -58,10 +58,10 @@ def get_movies(embedding: list[float]) -> dict: logging.warning("PINECONE_INDEX_NAME not set, using default: %s", PINECONE_INDEX_NAME) pc = Pinecone(api_key=state.pinecone_api_key) 
index = pc.Index(name=PINECONE_INDEX_NAME) - query_resp = index.query(vector=embedding, namespace="sandpaper", top_k=5) + query_resp = index.query(vector=embedding, namespace="sandpaper", top_k=5, include_metadata=True) movies_list = [] for match in query_resp.matches: - meta = index.fetch(ids=[match['id']], namespace="sandpaper")["vectors"][match['id']]["metadata"] + meta = match["metadata"] movies_list.append({"title":meta["title"],"summary":meta["summary"],"director":meta["director"],"genre": meta["genre"],"actors": meta["actors"]}) return movies_list