From db14c9f4a6b859653f9f4a4d0b4af8c718856ce3 Mon Sep 17 00:00:00 2001
From: Gleb Otochkin <otochkin@google.com>
Date: Thu, 13 Feb 2025 14:21:16 -0500
Subject: [PATCH] feat: new pinecone API (#285)

---
 infrastructure/movie-search-app/README.md     | 58 ++++++++++++++++++-
 .../movie-search-app/movie_search.py          | 10 ++--
 .../movie-search-app/pinecone_model.py        |  6 +-
 3 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/infrastructure/movie-search-app/README.md b/infrastructure/movie-search-app/README.md
index 98f28aa..8a46334 100644
--- a/infrastructure/movie-search-app/README.md
+++ b/infrastructure/movie-search-app/README.md
@@ -96,7 +96,59 @@ Here is the [link to the documentation for AlloyDB](https://cloud.google.com/all
 Create a database with the name movies and the user movies_owner. You can choose your own names for the database and the user. The application takes it from environment variables. Optionally you can modify the application to use secret manager in Google Cloud as more secured approach.
 
 ### Migrate data from Pinecone to AlloyDB
-- Move the data from Pinecone to AlloyDB
+Move the data from Pinecone to AlloyDB
+- Pinecone index structure consists primarily of 3 main parts:
+  ID - unique row ID
+  VALUES	- vector embedding value (text-embedding-004 from Google)
+  METADATA	- Supplemental information about the data in key/value format
+
+- The future AlloyDB/PostgreSQL table, as defined in the app, will have the following structure:
+   ```
+                     Table "public.alloydb_table"
+         Column       |    Type     | Collation | Nullable | Default
+   --------------------+-------------+-----------+----------+---------
+   langchain_id       | uuid        |           | not null |
+   content            | text        |           | not null |
+   embedding          | vector(768) |           | not null |
+   langchain_metadata | json        |           |          |
+   Indexes:
+      "alloydb_table_pkey" PRIMARY KEY, btree (langchain_id)
+   ```
+   And here are the JSON keys for the langchain_metadata column (from the movie dataset):
+   ```
+     jsonb_object_keys
+   ---------------------
+   tags
+   genre
+   image
+   title
+   actors
+   poster
+   writer
+   runtime
+   summary
+   director
+   imdblink
+   boxoffice
+   imdbscore
+   imdbvotes
+   languages
+   viewrating
+   netflixlink
+   releasedate
+   tmdbtrailer
+   trailersite
+   seriesormovie
+   awardsreceived
+   hiddengemscore
+   metacriticscore
+   productionhouse
+   awardsnominatedfor
+   netflixreleasedate
+   countryavailability
+   rottentomatoesscore
+   ```
+- All the metadata keys are taken from the Pinecone metadata keeping the same structure.
 
 ### Enable virtual environment for Python
 You can use either your laptop or a virtual machnie for deployment. Using a VM deployed in the same Google Cloud project simplifies deployeent and network configuration. On a Debian Linux you can enable it in the shell using the following command:
@@ -126,9 +178,9 @@ pip install -r requirements.txt
 export PINECONE_INDEX_NAME=netflix-index-01
 export PORT=8080
 export DB_USER=movies_owner
-export DB_PASS=DatabasePassword
+export DB_PASS={DATABASEPASSWORD}
 export DB_NAME=movies
-export INSTANCE_HOST=ALLOYDB_IP
+export INSTANCE_HOST={ALLOYDB_IP}
 export DB_PORT=5432
 ```
 - Here is the command used to start the application
diff --git a/infrastructure/movie-search-app/movie_search.py b/infrastructure/movie-search-app/movie_search.py
index ec05c18..b1f0cdc 100644
--- a/infrastructure/movie-search-app/movie_search.py
+++ b/infrastructure/movie-search-app/movie_search.py
@@ -209,13 +209,13 @@ def get_movies(db: sqlalchemy.engine.base.Engine, embeddings: str) -> dict:
     stmt = sqlalchemy.text(
         """
         SELECT
-                mj.metadata->'title' as title,
-                mj.metadata->'summary' as summary,
-                mj.metadata->'director' as director,
-                mj.metadata->'actors' as actors,
+                mj.langchain_metadata->'title' as title,
+                mj.langchain_metadata->'summary' as summary,
+                mj.langchain_metadata->'director' as director,
+                mj.langchain_metadata->'actors' as actors,
                 (mj.embedding <=> (:embeddings)::vector) as distance
         FROM
-                movies_json mj
+                alloydb_table mj
         ORDER BY
                 distance ASC
         LIMIT 5;
diff --git a/infrastructure/movie-search-app/pinecone_model.py b/infrastructure/movie-search-app/pinecone_model.py
index 13c52ce..55cfa72 100644
--- a/infrastructure/movie-search-app/pinecone_model.py
+++ b/infrastructure/movie-search-app/pinecone_model.py
@@ -14,7 +14,7 @@
 
 import google.generativeai as genai
 from typing import Iterable
-from pinecone import Pinecone # as Pinecone
+from pinecone.grpc import PineconeGRPC as Pinecone
 import logging
 import os
 from data_model import ChatMessage, State
@@ -58,10 +58,10 @@ def get_movies(embedding: list[float]) -> dict:
         logging.warning("PINECONE_INDEX_NAME not set, using default: %s", PINECONE_INDEX_NAME)
     pc = Pinecone(api_key=state.pinecone_api_key)
     index = pc.Index(name=PINECONE_INDEX_NAME)
-    query_resp = index.query(vector=embedding, namespace="sandpaper", top_k=5)
+    query_resp = index.query(vector=embedding, namespace="sandpaper", top_k=5, include_metadata=True)
     movies_list = []
     for match in query_resp.matches:
-        meta = index.fetch(ids=[match['id']], namespace="sandpaper")["vectors"][match['id']]["metadata"]
+        meta = match["metadata"]
         movies_list.append({"title":meta["title"],"summary":meta["summary"],"director":meta["director"],"genre": meta["genre"],"actors": meta["actors"]})
     return movies_list