diff --git a/application/core/settings.py b/application/core/settings.py index a7811ec78..fa728474e 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -10,7 +10,7 @@ class Settings(BaseSettings): LLM_NAME: str = "docsgpt" MODEL_NAME: Optional[str] = None # if LLM_NAME is openai, MODEL_NAME can be gpt-4 or gpt-3.5-turbo - EMBEDDINGS_NAME: str = "huggingface_sentence-transformers/all-mpnet-base-v2" + EMBEDDINGS_NAME: str = "openai/clip-vit-base-patch16" #"huggingface_sentence-transformers/all-mpnet-base-v2" CELERY_BROKER_URL: str = "redis://localhost:6379/0" CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1" MONGO_URI: str = "mongodb://localhost:27017/docsgpt" diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index 8201b3f22..317148aa7 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -1,5 +1,7 @@ """Simple reader that reads files of different formats from a directory.""" + import logging +import sys from pathlib import Path from typing import Callable, Dict, List, Optional, Union @@ -10,7 +12,7 @@ from application.parser.file.html_parser import HTMLParser from application.parser.file.markdown_parser import MarkdownParser from application.parser.file.rst_parser import RstParser -from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser +from application.parser.file.tabular_parser import PandasCSVParser, ExcelParser from application.parser.file.json_parser import JSONParser from application.parser.file.pptx_parser import PPTXParser from application.parser.file.image_parser import ImageParser @@ -20,14 +22,14 @@ ".pdf": PDFParser(), ".docx": DocxParser(), ".csv": PandasCSVParser(), - ".xlsx":ExcelParser(), + ".xlsx": ExcelParser(), ".epub": EpubParser(), ".md": MarkdownParser(), ".rst": RstParser(), ".html": HTMLParser(), ".mdx": MarkdownParser(), - ".json":JSONParser(), - ".pptx":PPTXParser(), + ".json": JSONParser(), + ".pptx": PPTXParser(), ".png": ImageParser(), ".jpg": ImageParser(), ".jpeg": ImageParser(), @@ -61,16 +63,16 @@ class SimpleDirectoryReader(BaseReader): """ def __init__( - self, - input_dir: Optional[str] = None, - input_files: Optional[List] = None, - exclude_hidden: bool = True, - errors: str = "ignore", - recursive: bool = True, - required_exts: Optional[List[str]] = None, - file_extractor: Optional[Dict[str, BaseParser]] = None, - num_files_limit: Optional[int] = None, - file_metadata: Optional[Callable[[str], Dict]] = None, + self, + input_dir: Optional[str] = None, + input_files: Optional[List] = None, + exclude_hidden: bool = True, + errors: str = "ignore", + recursive: bool = True, + required_exts: Optional[List[str]] = None, + file_extractor: Optional[Dict[str, BaseParser]] = None, + num_files_limit: Optional[int] = None, + file_metadata: Optional[Callable[[str], Dict]] = None, ) -> None: """Initialize with parameters.""" super().__init__() @@ -110,8 +112,8 @@ def _add_files(self, input_dir: Path) -> List[Path]: elif self.exclude_hidden and input_file.name.startswith("."): continue elif ( - self.required_exts is not None - and input_file.suffix not in self.required_exts + self.required_exts is not None + and input_file.suffix not in self.required_exts ): continue else: @@ -122,7 +124,7 @@ def _add_files(self, input_dir: Path) -> List[Path]: new_input_files.extend(sub_input_files) if self.num_files_limit is not None and self.num_files_limit > 0: - new_input_files = new_input_files[0: self.num_files_limit] + new_input_files = new_input_files[0 : 
self.num_files_limit]

         # print total number of files added
         logging.debug(
@@ -146,6 +148,7 @@ def load_data(self, concatenate: bool = False) -> List[Document]:
-        data: Union[str, List[str]] = ""
+        data: Union[str, Dict] = ""
         data_list: List[str] = []
         metadata_list = []
+        documents: List[Document] = []
         for input_file in self.input_files:
             if input_file.suffix in self.file_extractor:
                 parser = self.file_extractor[input_file.suffix]
@@ -155,29 +158,24 @@ def load_data(self, concatenate: bool = False) -> List[Document]:
             else:
                 # do standard read
                 with open(input_file, "r", errors=self.errors) as f:
-                    data = f.read()
+                    data = {"text": f.read(), "tables": [], "images": []}

             # Prepare metadata for this file
             if self.file_metadata is not None:
                 file_metadata = self.file_metadata(str(input_file))
             else:
                 # Provide a default empty metadata
-                file_metadata = {'title': '', 'store': ''}
-            # TODO: Find a case with no metadata and check if breaks anything
-
-            if isinstance(data, List):
-                # Extend data_list with each item in the data list
-                data_list.extend([str(d) for d in data])
-                # For each item in the data list, add the file's metadata to metadata_list
-                metadata_list.extend([file_metadata for _ in data])
-            else:
-                # Add the single piece of data to data_list
-                data_list.append(str(data))
-                # Add the file's metadata to metadata_list
-                metadata_list.append(file_metadata)
+                file_metadata = {"title": "", "store": ""}
+            # TODO: Find a case with no metadata and check if it breaks anything
+
+            # Normalize parsers that still return a plain string
+            if not isinstance(data, dict):
+                data = {"text": str(data), "tables": [], "images": []}
+            data_list.append(data.get("text") or "")
+            # Create one document per file, carrying its text, tables, and images
+            doc = Document(
+                text=data.get("text", None),
+                tables=data.get("tables", None),
+                images=data.get("images", None),
+                extra_info=file_metadata,
+            )
+            documents.append(doc)

         if concatenate:
             return [Document("\n".join(data_list))]
-        elif self.file_metadata is not None:
-            return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)]
-        else:
-            return [Document(d) for d in data_list]
+        return documents
diff --git a/application/parser/file/docs_parser.py b/application/parser/file/docs_parser.py
index 55d45a648..142bf60d3 100644
--- a/application/parser/file/docs_parser.py
+++ b/application/parser/file/docs_parser.py
@@ -3,12 +3,23 @@
 Contains parsers for docx, pdf files.
""" -from pathlib import Path -from typing import Dict +from pathlib import Path +from typing import Dict, List, Any, Union, Optional +from base64 import b64encode from application.parser.file.base_parser import BaseParser from application.core.settings import settings +from docx.enum.shape import WD_INLINE_SHAPE_TYPE import requests +import logging +import sys +from docx import Document +import base64 +from PIL import Image +import io + +logger = logging.getLogger(__name__) + class PDFParser(BaseParser): """PDF parser.""" @@ -23,9 +34,9 @@ def parse_file(self, file: Path, errors: str = "ignore") -> str: doc2md_service = "https://llm.arc53.com/doc2md" # alternatively you can use local vision capable LLM with open(file, "rb") as file_loaded: - files = {'file': file_loaded} - response = requests.post(doc2md_service, files=files) - data = response.json()["markdown"] + files = {"file": file_loaded} + response = requests.post(doc2md_service, files=files) + data = response.json()["markdown"] return data try: @@ -51,19 +62,82 @@ def parse_file(self, file: Path, errors: str = "ignore") -> str: class DocxParser(BaseParser): - """Docx parser.""" + """Docx parser to extract text, tables, and images.""" def _init_parser(self) -> Dict: """Init parser.""" return {} - def parse_file(self, file: Path, errors: str = "ignore") -> str: - """Parse file.""" + def parse_file(self, file: Path, errors: str = "ignore") -> Dict[str, Any]: + """Parse file and extract text, tables, and images.""" + document = Document(file) + text_content = [] + tables = [] + images = [] + + # Extract text from paragraphs + for para in document.paragraphs: + if para.text.strip(): + text_content.append(para.text.strip()) + + # Extract tables + for table in document.tables: + table_data = [] + for row in table.rows: + row_data = [cell.text.strip() for cell in row.cells] + table_data.append(row_data) + + # Flatten table into a string + flat_table = "\n".join([" | ".join(row) for row in table_data]) + tables.append(flat_table) + + # Extract images + for shape in document.inline_shapes: + if shape.type == WD_INLINE_SHAPE_TYPE.PICTURE: + image_data = self.extract_image_from_shape(shape, document=document) + if image_data: + images.append(image_data) + + return {"text": "\n".join(text_content), "tables": tables, "images": images} + + def extract_image_from_shape(self, shape, document) -> Optional[Dict[str, str]]: + """Extract image from an inline shape and encode it in Base64.""" try: - import docx2txt - except ImportError: - raise ValueError("docx2txt is required to read Microsoft Word files.") - - text = docx2txt.process(file) - - return text + rId = shape._inline.graphic.graphicData.pic.blipFill.blip.embed + image_part = document.part.related_parts[rId] + image_data = image_part.blob + image_filename = image_part.filename or f"image_{rId}.png" + + # Validate the image + if not self.is_valid_image(image_data): + print(f"Invalid image: {image_filename}") + return None + resized_image = self.resize_image(image_data) + image_base64 = base64.b64encode(resized_image).decode("utf-8") + return {"filename": image_filename, "image_base64": image_base64} + except Exception as e: + print(f"Error extracting image: {e}") + return None + + def is_valid_image(self, image_data: bytes) -> bool: + """Validate image data.""" + try: + image = Image.open(io.BytesIO(image_data)) + image.verify() + return True + except Exception: + return False + + def resize_image(self, image_data: bytes, width=200) -> bytes: + """Resize image to a specified maximum width, 
+        image = Image.open(io.BytesIO(image_data))
+        ratio = width / float(image.width)
+        height = int(image.height * ratio)
+        img = image.resize((width, height), Image.Resampling.LANCZOS)
+
+        # Save the resized image back to bytes
+        output = io.BytesIO()
+        # Use original format if possible, else default to PNG
+        img_format = image.format if image.format else "PNG"
+        img.save(output, format=img_format)
+        return output.getvalue()
diff --git a/application/parser/file/image_parser.py b/application/parser/file/image_parser.py
index fd800d91c..6f30cd343 100644
--- a/application/parser/file/image_parser.py
+++ b/application/parser/file/image_parser.py
@@ -3,10 +3,12 @@
 Contains parser for .png, .jpg, .jpeg files.
 """
+
 from pathlib import Path
 import requests
 from typing import Dict, Union

 from application.parser.file.base_parser import BaseParser
@@ -21,7 +23,7 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]
         doc2md_service = "https://llm.arc53.com/doc2md"
         # alternatively you can use local vision capable LLM
         with open(file, "rb") as file_loaded:
-            files = {'file': file_loaded}
-            response = requests.post(doc2md_service, files=files)
-            data = response.json()["markdown"]
+            files = {"file": file_loaded}
+            response = requests.post(doc2md_service, files=files)
+            data = response.json()["markdown"]
         return data
diff --git a/application/parser/open_ai_func.py b/application/parser/open_ai_func.py
index 3109f5839..2b4f1e1a5 100755
--- a/application/parser/open_ai_func.py
+++ b/application/parser/open_ai_func.py
@@ -1,6 +1,7 @@
 import os
+import sys

 from retry import retry

 from application.core.settings import settings
@@ -20,6 +21,27 @@ def store_add_texts_with_retry(store, i, id):
     # store_pine.add_texts([i.page_content], metadatas=[i.metadata])


+@retry(tries=10, delay=60)
+def store_add_images_with_retry(store, image_base64: str, metadata: dict, id: str):
+    try:
+        print("Starting to embed image", file=sys.stderr)
+        metadata["source_id"] = str(id)
+
+        # Embed the image
+        image_vector = store.embeddings.embed_image(image_base64=image_base64)
+        print("Image embedded successfully", file=sys.stderr)
+
+        # Call add_image to handle indexing and docstore insertion
+        doc_id = store.add_image(image_vector, metadata)
+        print(
+            f"Completed store_add_images_with_retry, doc_id={doc_id}", file=sys.stderr
+        )
+    except Exception as e:
+        print(f"Error in store_add_images_with_retry: {e}", file=sys.stderr)
+        print(f"error line number: {sys.exc_info()[-1].tb_lineno}", file=sys.stderr)
+        raise
+
+
 def call_openai_api(docs, folder_name, id, task_status):
     # Function to create a vector store from the documents and save it to disk

@@ -28,48 +50,147 @@ def call_openai_api(docs, folder_name, id, task_status):
     from tqdm import tqdm

-    c1 = 0
-    if settings.VECTOR_STORE == "faiss":
-        docs_init = [docs[0]]
-        docs.pop(0)
+    try:
+        text_docs = []
+        images_docs = []
+        for d in docs:
+            # Separate text documents from image documents: text is indexed
+            # first, and image documents are indexed separately afterwards.
+            tables = d.metadata.get("tables", None)
+            if tables and isinstance(tables, list):
+                combined_text = (d.page_content or "") + "\n\n" + "\n".join(tables)
+                d.page_content = combined_text.strip()
+                del d.metadata["tables"]
+
+            if d.page_content and d.page_content.strip():
+                text_docs.append(d)
+
+            images = d.metadata.get("images", None)
+            if images and isinstance(images, list):
+                for img in images:
+                    images_docs.append((d, img))
+                del d.metadata["images"]
+
+        store = None
+        if settings.VECTOR_STORE == "faiss":
+            if text_docs:
+                docs_init = [text_docs[0]]
+                rest_docs = text_docs[1:]
+
+                print(
+                    f"Number of text_docs: {len(text_docs)}",
+                    file=sys.stderr,
+                    flush=True,
+                )
+                print(
+                    f"Number of images_docs: {len(images_docs)}",
+                    file=sys.stderr,
+                    flush=True,
+                )
+
+                print("Creating vectorstore...", file=sys.stderr, flush=True)
+                store = VectorCreator.create_vectorstore(
+                    settings.VECTOR_STORE,
+                    docs_init=docs_init,
+                    source_id=f"{folder_name}",
+                    embeddings_key=os.getenv("EMBEDDINGS_KEY"),
+                )
+                print("Vectorstore created", file=sys.stderr, flush=True)
+                s1 = len(rest_docs)
+                c1 = 0
+                for i in tqdm(
+                    rest_docs,
+                    desc="Embedding 🦖",
+                    unit="docs",
+                    total=len(rest_docs),
+                    bar_format="{l_bar}{bar}| Time Left: {remaining}",
+                ):
+                    try:
+                        task_status.update_state(
+                            state="PROGRESS", meta={"current": int((c1 / s1) * 100)}
+                        )
+                        store_add_texts_with_retry(store, i, id)
+                    except Exception as e:
+                        print(e, file=sys.stderr)
+                        print("Error on ", i, file=sys.stderr)
+                        print("Saving progress", file=sys.stderr)
+                        print(f"stopped at {c1} out of {len(rest_docs)}", file=sys.stderr)
+                        store.save_local(f"{folder_name}")
+                        break
+                    c1 += 1
+                # Already inside the faiss branch, so persist directly
+                store.save_local(f"{folder_name}")
+            else:
+                store = VectorCreator.create_vectorstore(
+                    settings.VECTOR_STORE,
+                    source_id=str(id),
+                    embeddings_key=os.getenv("EMBEDDINGS_KEY"),
+                )
+                store.save_local(f"{folder_name}")
+
+        # Handle images separately, pushing them into the same text vector store
+        print("Handling Images 🩻", file=sys.stderr)
+        if images_docs and store is not None and settings.VECTOR_STORE == "faiss":
+            print(
+                "Embedding Images 🦖 🩻 - Using the same text vector store",
+                file=sys.stderr,
+            )
+            s2 = len(images_docs)
+            c2 = 0
+            print(f"Total images to embed: {s2}", file=sys.stderr)
+            for origin_doc, img in tqdm(
+                images_docs,
+                desc="Embedding Images 🦖 🩻",
+                unit="imgs",
+                total=s2,
+                bar_format="{l_bar}{bar}| Time Left: {remaining}",
+            ):
+                print(f"Processing image {c2 + 1}/{s2}", file=sys.stderr)
+                image_base64 = img.get("image_base64")
+                if not image_base64:
+                    print("No base64 found for this image", file=sys.stderr)
+                    continue
+
+                try:
+                    task_status.update_state(
+                        state="PROGRESS", meta={"current": int((c2 / s2) * 100)}
+                    )
+                    print("Calling store_add_images_with_retry", file=sys.stderr)
+                    store_add_images_with_retry(
+                        store, image_base64, origin_doc.metadata, id
+                    )
+                    print("Image stored successfully", file=sys.stderr)
+                except Exception as e:
+                    print(e, file=sys.stderr)
+                    print("Error on ", img.get("filename"), file=sys.stderr)
+                    print("Saving progress", file=sys.stderr)
+                    print(f"stopped at {c2} out of {len(images_docs)}", file=sys.stderr)
+                    store.save_local(f"{folder_name}")
+                    break
+                c2 += 1
+
+            print("Finished image embedding loop", file=sys.stderr)
+            store.save_local(f"{folder_name}")
+            print("Image store saved", file=sys.stderr)
+    except Exception as e:
+        print(f"Error in call_openai_api: {e}", file=sys.stderr)
+        print(f"error line number: {sys.exc_info()[-1].tb_lineno}", file=sys.stderr)
+        raise

-        store = VectorCreator.create_vectorstore(
-            settings.VECTOR_STORE,
-            docs_init=docs_init,
-            source_id=f"{folder_name}",
-            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
-        )
-    else:
-        store = VectorCreator.create_vectorstore(
-            settings.VECTOR_STORE,
-            source_id=str(id),
-            embeddings_key=os.getenv("EMBEDDINGS_KEY"),
-        )
-        store.delete_index()

     # Uncomment for MPNet embeddings
     # model_name = "sentence-transformers/all-mpnet-base-v2"
     # hf = HuggingFaceEmbeddings(model_name=model_name)
     # store = FAISS.from_documents(docs_test, hf)

-    s1 = len(docs)
-    for i in tqdm(
-        docs,
-        desc="Embedding 🦖",
-        unit="docs",
-        total=len(docs),
-        bar_format="{l_bar}{bar}| Time Left: {remaining}",
-    ):
-        try:
-            task_status.update_state(
-                state="PROGRESS", meta={"current": int((c1 / s1) * 100)}
-            )
-            store_add_texts_with_retry(store, i, id)
-        except Exception as e:
-            print(e)
-            print("Error on ", i)
-            print("Saving progress")
-            print(f"stopped at {c1} out of {len(docs)}")
-            store.save_local(f"{folder_name}")
-            break
-        c1 += 1
-    if settings.VECTOR_STORE == "faiss":
-        store.save_local(f"{folder_name}")
diff --git a/application/parser/schema/base.py b/application/parser/schema/base.py
index 61670f9a6..ddb8ebd54 100644
--- a/application/parser/schema/base.py
+++ b/application/parser/schema/base.py
@@ -1,4 +1,5 @@
 """Base schema for readers."""
+
 from dataclasses import dataclass

 from langchain.docstore.document import Document as LCDocument
@@ -26,9 +27,20 @@ def get_type(cls) -> str:
     def to_langchain_format(self) -> LCDocument:
         """Convert struct to LangChain document format."""
         metadata = self.extra_info or {}
+
+        if self.images is not None:
+            metadata["images"] = self.images
+        if self.tables is not None:
+            metadata["tables"] = self.tables
+
         return LCDocument(page_content=self.text, metadata=metadata)

     @classmethod
     def from_langchain_format(cls, doc: LCDocument) -> "Document":
         """Convert struct from LangChain document format."""
-        return cls(text=doc.page_content, extra_info=doc.metadata)
+        images = doc.metadata.pop("images", None)
+        tables = doc.metadata.pop("tables", None)
+
+        return cls(
+            text=doc.page_content, extra_info=doc.metadata, images=images, tables=tables
+        )
diff --git a/application/parser/schema/schema.py b/application/parser/schema/schema.py
index ec467e5a7..a2fb6c9ef 100644
--- a/application/parser/schema/schema.py
+++ b/application/parser/schema/schema.py
@@ -1,7 +1,7 @@
 """Base schema for data structures."""
 from abc import abstractmethod
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union

 from dataclasses_json import DataClassJsonMixin
@@ -23,6 +23,10 @@ class BaseDocument(DataClassJsonMixin):
     # extra fields
     extra_info: Optional[Dict[str, Any]] = None

+    # Optional multimodal payloads attached by the parsers
+    images: Optional[List[Union[str, Dict[str, Any]]]] = None
+    tables: Optional[List[Union[str, Dict[str, Any]]]] = None
+
     @classmethod
     @abstractmethod
     def get_type(cls) -> str:
@@ -55,6 +59,13 @@ def get_embedding(self) -> List[float]:
             raise ValueError("embedding not set.")
         return self.embedding

+    def get_tables(self) -> List[Union[str, Dict[str, Any]]]:
+        return self.tables or []
+
+    def get_images(self) -> List[Union[str, Dict[str, Any]]]:
+        return self.images or []
+
     @property
     def extra_info_str(self) -> Optional[str]:
         """Extra info string."""
diff --git a/application/parser/token_func.py b/application/parser/token_func.py
index 7511cde09..499a35e64 100644
--- a/application/parser/token_func.py
+++ b/application/parser/token_func.py
@@ -1,6 +1,7 @@
 import re
 from math import ceil
 from typing import List
+import sys

 import tiktoken
 from application.parser.schema.base import Document
@@ -9,71 +10,156 @@ def separate_header_and_body(text):
     header_pattern = r"^(.*?\n){3}"
     match = re.match(header_pattern, text)
-    header = match.group(0)
-    body = text[len(header):]
+    if match:
+        header = match.group(0)
+        body = text[len(header) :]
+    else:
+        header = ""
+        body = text
+    print(
+        f"Header: {header.strip()[:50]}... | Body length: {len(body)}", file=sys.stderr
+    )
     return header, body


+def is_text_document(doc: Document) -> bool:
+    return doc.text is not None and doc.text.strip() != ""
+
+
-def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
+def group_documents(
+    documents: List[Document], min_tokens: int, max_tokens: int
+) -> List[Document]:
     docs = []
     current_group = None

-    for doc in documents:
-        doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
+    print(
+        f"Starting to group documents. Total documents: {len(documents)}",
+        file=sys.stderr,
+    )
+    encoding = tiktoken.get_encoding("cl100k_base")
+
+    for idx, doc in enumerate(documents):
+        print(f"Processing document {idx + 1}/{len(documents)}", file=sys.stderr)
+        if not is_text_document(doc):
+            print(f"Skipping document {idx + 1} as it has no text", file=sys.stderr)
+            docs.append(doc)
+            continue
+
+        doc_tokens = encoding.encode(doc.text)
+        doc_len = len(doc_tokens)
+        print(f"Document length: {doc_len} tokens", file=sys.stderr)

         # Check if current group is empty or if the document can be added based on token count and matching metadata
-        if (current_group is None or
-                (len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and
-                 doc_len < min_tokens and
-                 current_group.extra_info == doc.extra_info)):
+        if current_group is None or (
+            len(encoding.encode(current_group.text)) + doc_len < max_tokens
+            and doc_len < min_tokens
+            and current_group.extra_info == doc.extra_info
+        ):
             if current_group is None:
                 current_group = doc  # Use the document directly to retain its metadata
+                print(f"Starting a new group with document {idx + 1}", file=sys.stderr)
             else:
                 current_group.text += " " + doc.text  # Append text to the current group
+                print(f"Added document {idx + 1} to current group", file=sys.stderr)
         else:
+            print(
+                f"Finalizing current group and starting a new group with document {idx + 1}",
+                file=sys.stderr,
+            )
             docs.append(current_group)
             current_group = doc  # Start a new group with the current document

     if current_group is not None:
+        print("Finalizing the last group", file=sys.stderr)
         docs.append(current_group)

+    print(f"Total groups created: {len(docs)}", file=sys.stderr)
     return docs


 def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
     docs = []
-    for doc in documents:
-        token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
+    print(
+        f"Starting to split documents. Total documents: {len(documents)}",
+        file=sys.stderr,
+    )
+    encoding = tiktoken.get_encoding("cl100k_base")
+
+    for idx, doc in enumerate(documents):
+        print(f"Processing document {idx + 1}/{len(documents)}", file=sys.stderr)
+        if not is_text_document(doc):
+            print("Skipping splitting for non-text document", file=sys.stderr)
+            docs.append(doc)
+            continue
+
+        token_length = len(encoding.encode(doc.text))
+        print(f"Document length: {token_length} tokens", file=sys.stderr)
         if token_length <= max_tokens:
+            print(
+                f"Document {idx + 1} fits within max tokens, no splitting needed",
+                file=sys.stderr,
+            )
             docs.append(doc)
         else:
             header, body = separate_header_and_body(doc.text)
-            if len(tiktoken.get_encoding("cl100k_base").encode(header)) > max_tokens:
+            if len(encoding.encode(header)) > max_tokens:
+                print(
+                    "Header exceeds max tokens. Treating entire document as body.",
+                    file=sys.stderr,
+                )
                 body = doc.text
                 header = ""
+
             num_body_parts = ceil(token_length / max_tokens)
             part_length = ceil(len(body) / num_body_parts)
-            body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
+            print(
+                f"Splitting document {idx + 1} into {num_body_parts} parts",
+                file=sys.stderr,
+            )
+            body_parts = [
+                body[i : i + part_length] for i in range(0, len(body), part_length)
+            ]
             for i, body_part in enumerate(body_parts):
-                new_doc = Document(text=header + body_part.strip(),
-                                   doc_id=f"{doc.doc_id}-{i}",
-                                   embedding=doc.embedding,
-                                   extra_info=doc.extra_info)
+                new_doc = Document(
+                    text=(header + body_part.strip()).strip(),
+                    doc_id=f"{doc.doc_id}-{i}" if doc.doc_id else None,
+                    embedding=doc.embedding,
+                    extra_info=doc.extra_info,
+                    tables=doc.tables,
+                    images=doc.images,
+                )
+                print(
+                    f"Created new document part {i + 1} for document {idx + 1}",
+                    file=sys.stderr,
+                )
                 docs.append(new_doc)

+    print(f"Total split documents created: {len(docs)}", file=sys.stderr)
     return docs


-def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
+def group_split(
+    documents: List[Document],
+    max_tokens: int = 2000,
+    min_tokens: int = 150,
+    token_check: bool = True,
+):
     if not token_check:
+        print("Token check is disabled. Returning original documents.", file=sys.stderr)
         return documents

-    print("Grouping small documents")
+    print("Grouping small documents", file=sys.stderr)
     try:
-        documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
-    except Exception:
-        print("Grouping failed, try running without token_check")
-    print("Separating large documents")
+        documents = group_documents(
+            documents=documents, min_tokens=min_tokens, max_tokens=max_tokens
+        )
+    except Exception as e:
+        print(f"Error during grouping: {e}", file=sys.stderr)
+
+    print("Separating large documents", file=sys.stderr)
     try:
         documents = split_documents(documents=documents, max_tokens=max_tokens)
-    except Exception:
-        print("Grouping failed, try running without token_check")
+    except Exception as e:
+        print(f"Error during splitting: {e}", file=sys.stderr)
+
+    print(f"Total documents after processing: {len(documents)}", file=sys.stderr)
     return documents
diff --git a/application/requirements.txt b/application/requirements.txt
index 2f28c2ea6..dd9ca1deb 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -67,6 +67,7 @@ pypdf2==3.0.1
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 python-pptx==1.0.2
+python-docx==1.1.2
 qdrant-client==1.11.0
 redis==5.0.1
 referencing==0.30.2
diff --git a/application/retriever/classic_rag.py b/application/retriever/classic_rag.py
index 42e318d20..99c0a1be2 100644
--- a/application/retriever/classic_rag.py
+++ b/application/retriever/classic_rag.py
@@ -4,7 +4,7 @@
 from application.llm.llm_creator import LLMCreator
 from application.utils import num_tokens_from_string
-
+import sys

 class ClassicRAG(BaseRetriever):
@@ -41,10 +41,13 @@ def _get_data(self):
         if self.chunks == 0:
             docs = []
         else:
+            print(f"Vector store: {self.vectorstore}", file=sys.stderr)
             docsearch = VectorCreator.create_vectorstore(
                 settings.VECTOR_STORE, self.vectorstore, settings.EMBEDDINGS_KEY
             )
+            print("Vector store created successfully.", file=sys.stderr)
             docs_temp = docsearch.search(self.question, k=self.chunks)
+            print(f"Search results: {docs_temp}", file=sys.stderr)
             docs = [
                 {
                     "title": i.metadata.get(
diff --git a/application/vectorstore/base.py b/application/vectorstore/base.py
index 9c76b89f5..4c78bee68 100644
--- a/application/vectorstore/base.py
+++ b/application/vectorstore/base.py
@@ -1,38 +1,85 @@
 from abc import ABC, abstractmethod
-import os
+import os, sys, base64, io
+import torch
 from sentence_transformers import SentenceTransformer
+from PIL import Image
+from transformers import CLIPProcessor, CLIPModel
 from langchain_openai import OpenAIEmbeddings
 from application.core.settings import settings
+

 class EmbeddingsWrapper:
     def __init__(self, model_name, *args, **kwargs):
-        self.model = SentenceTransformer(model_name, config_kwargs={'allow_dangerous_deserialization': True}, *args, **kwargs)
-        self.dimension = self.model.get_sentence_embedding_dimension()
+        print(f"Initializing EmbeddingsWrapper with model_name={model_name}", file=sys.stderr)
+        self.processor = CLIPProcessor.from_pretrained(model_name)
+        self.model = CLIPModel.from_pretrained(model_name)
+        print("EmbeddingsWrapper initialized successfully", file=sys.stderr)
+        # get_text_features / get_image_features both return projection_dim-sized vectors
+        if hasattr(self.model.config, "projection_dim"):
+            self.dimension = self.model.config.projection_dim
+        else:
+            raise AttributeError("'projection_dim' not found in model configuration")

     def embed_query(self, query: str):
-        return self.model.encode(query).tolist()
-
+        if not self.model or not self.processor:
+            raise ValueError(
"Model or processor not initialized properly for query embedding." + ) + input = self.processor(text=[query], return_tensors="pt", padding=True) + with torch.no_grad(): + query_embedding = self.model.get_text_features(**input) + return query_embedding.squeeze().detach().cpu().numpy() + def embed_documents(self, documents: list): - return self.model.encode(documents).tolist() - + try: + if not self.model or not self.processor: + raise ValueError("Model or processor not initialized properly for document embedding") + inputs = self.processor(text=documents, return_tensors="pt", padding=True, truncation=True) + with torch.no_grad(): + document_embeddings = self.model.get_text_features(**inputs) + return document_embeddings.detach().cpu().numpy() + except Exception as e: + print(f"Error in embed_documents: {e}", file=sys.stderr) + print(f"error line number: {sys.exc_info()[-1].tb_lineno}", file=sys.stderr) + raise e + + def embed_image(self, image_path: str = None, image_base64: str = None): + print(f"Image path: {image_path}", file=sys.stderr) + print(f"Image base64: {image_base64[:50]}....", file=sys.stderr) + if not self.model or not self.processor: + raise ValueError("Model or processor not initialized properly for image embedding") + if image_base64: + img_data = base64.b64decode(image_base64) + image = Image.open(io.BytesIO(img_data)).convert("RGB") + elif image_path: + image = Image.open(image_path).convert("RGB") + else: + raise ValueError("Image path or base64 data must be provided") + + inputs = self.processor(images=image, return_tensors="pt", padding=True) + with torch.no_grad(): + image_embedding = self.model.get_image_features(**inputs) + return image_embedding.squeeze().cpu().numpy() + def __call__(self, text): if isinstance(text, str): - return self.embed_query(text) + if text.endswith((".jpg", ".jpeg", ".png")): + return self.embed_image(text) + else: + return self.embed_query(text) elif isinstance(text, list): return self.embed_documents(text) else: raise ValueError("Input must be a string or a list of strings") - class EmbeddingsSingleton: _instances = {} @staticmethod def get_instance(embeddings_name, *args, **kwargs): if embeddings_name not in EmbeddingsSingleton._instances: - EmbeddingsSingleton._instances[embeddings_name] = EmbeddingsSingleton._create_instance( - embeddings_name, *args, **kwargs + EmbeddingsSingleton._instances[embeddings_name] = ( + EmbeddingsSingleton._create_instance(embeddings_name, *args, **kwargs) ) return EmbeddingsSingleton._instances[embeddings_name] @@ -40,9 +87,18 @@ def get_instance(embeddings_name, *args, **kwargs): def _create_instance(embeddings_name, *args, **kwargs): embeddings_factory = { "openai_text-embedding-ada-002": OpenAIEmbeddings, - "huggingface_sentence-transformers/all-mpnet-base-v2": lambda: EmbeddingsWrapper("sentence-transformers/all-mpnet-base-v2"), - "huggingface_sentence-transformers-all-mpnet-base-v2": lambda: EmbeddingsWrapper("sentence-transformers/all-mpnet-base-v2"), - "huggingface_hkunlp/instructor-large": lambda: EmbeddingsWrapper("hkunlp/instructor-large"), + "huggingface_sentence-transformers/all-mpnet-base-v2": lambda: EmbeddingsWrapper( + "sentence-transformers/all-mpnet-base-v2" + ), + "huggingface_sentence-transformers-all-mpnet-base-v2": lambda: EmbeddingsWrapper( + "sentence-transformers/all-mpnet-base-v2" + ), + "huggingface_hkunlp/instructor-large": lambda: EmbeddingsWrapper( + "hkunlp/instructor-large" + ), + "openai/clip-vit-base-patch16": lambda: EmbeddingsWrapper( + "openai/clip-vit-base-patch16" + ), 
} if embeddings_name in embeddings_factory: @@ -50,6 +106,7 @@ def _create_instance(embeddings_name, *args, **kwargs): else: return EmbeddingsWrapper(embeddings_name, *args, **kwargs) + class BaseVectorStore(ABC): def __init__(self): pass @@ -59,20 +116,22 @@ def search(self, *args, **kwargs): pass def is_azure_configured(self): - return settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME + return ( + settings.OPENAI_API_BASE + and settings.OPENAI_API_VERSION + and settings.AZURE_DEPLOYMENT_NAME + ) def _get_embeddings(self, embeddings_name, embeddings_key=None): if embeddings_name == "openai_text-embedding-ada-002": if self.is_azure_configured(): os.environ["OPENAI_API_TYPE"] = "azure" embedding_instance = EmbeddingsSingleton.get_instance( - embeddings_name, - model=settings.AZURE_EMBEDDINGS_DEPLOYMENT_NAME + embeddings_name, model=settings.AZURE_EMBEDDINGS_DEPLOYMENT_NAME ) else: embedding_instance = EmbeddingsSingleton.get_instance( - embeddings_name, - openai_api_key=embeddings_key + embeddings_name, openai_api_key=embeddings_key ) elif embeddings_name == "huggingface_sentence-transformers/all-mpnet-base-v2": if os.path.exists("./model/all-mpnet-base-v2"): @@ -83,7 +142,16 @@ def _get_embeddings(self, embeddings_name, embeddings_key=None): embedding_instance = EmbeddingsSingleton.get_instance( embeddings_name, ) + elif embeddings_name == "openai/clip-vit-base-patch16": + if os.path.exists("./model/clip-vit-base-patch16"): + embedding_instance = EmbeddingsSingleton.get_instance( + embeddings_name="./model/clip-vit-base-patch16", + ) + else: + embedding_instance = EmbeddingsSingleton.get_instance( + embeddings_name, + ) else: embedding_instance = EmbeddingsSingleton.get_instance(embeddings_name) - return embedding_instance \ No newline at end of file + return embedding_instance diff --git a/application/vectorstore/faiss.py b/application/vectorstore/faiss.py index afa55db95..bd6871cce 100644 --- a/application/vectorstore/faiss.py +++ b/application/vectorstore/faiss.py @@ -1,7 +1,12 @@ from langchain_community.vectorstores import FAISS from application.vectorstore.base import BaseVectorStore +import uuid +import numpy as np +from langchain.docstore.document import Document from application.core.settings import settings import os +import sys + def get_vectorstore(path: str) -> str: if path: @@ -10,24 +15,63 @@ def get_vectorstore(path: str) -> str: vectorstore = os.path.join("application") return vectorstore + class FaissStore(BaseVectorStore): def __init__(self, source_id: str, embeddings_key: str, docs_init=None): super().__init__() self.path = get_vectorstore(source_id) embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, embeddings_key) - + self.embeddings = embeddings + print(f"Embeddings: {embeddings}") + print(f"Dimension: {embeddings.dimension}") try: if docs_init: + print(f"Docs init on: {docs_init}", file=sys.stderr) + print(f"path: {self.path}", file=sys.stderr) self.docsearch = FAISS.from_documents(docs_init, embeddings) + print( + f"FAISS instance created with documents: {self.docsearch}", + file=sys.stderr, + ) + print( + f"FAISS Index Total Vectors: {self.docsearch.index.ntotal}", + file=sys.stderr, + ) + if self.docsearch.index.ntotal == 0: + print("FAISS index is empty. 
No data to search.", file=sys.stderr) else: - self.docsearch = FAISS.load_local(self.path, embeddings, allow_dangerous_deserialization=True) - except Exception: + print(f"Docs init off: {docs_init}", file=sys.stderr) + print( + f"Attempting to load FAISS index from: {self.path}", file=sys.stderr + ) + print("Directory contents:", os.listdir(self.path), file=sys.stderr) + self.docsearch = FAISS.load_local( + self.path, embeddings, allow_dangerous_deserialization=True + ) + print( + f"FAISS instance loaded from local path: {self.docsearch}", + file=sys.stderr, + ) + print( + f"FAISS Index Total Vectors: {self.docsearch.index.ntotal}", + file=sys.stderr, + ) + if self.docsearch.index.ntotal == 0: + print("FAISS index is empty. No data to search.", file=sys.stderr) + except Exception as e: + print(f"Error loading FAISS index: {e}", file=sys.stderr) raise self.assert_embedding_dimensions(embeddings) def search(self, *args, **kwargs): - return self.docsearch.similarity_search(*args, **kwargs) + print( + f"Performing FAISS search with args: {args}, kwargs: {kwargs}", + file=sys.stderr, + ) + results = self.docsearch.similarity_search(*args, **kwargs) + print(f"FAISS search results: {results}", file=sys.stderr) + return results def add_texts(self, *args, **kwargs): return self.docsearch.add_texts(*args, **kwargs) @@ -38,13 +82,54 @@ def save_local(self, *args, **kwargs): def delete_index(self, *args, **kwargs): return self.docsearch.delete(*args, **kwargs) + def add_image(self, image_vector: np.ndarray, metadata: dict) -> str: + # Generate a unique doc_id + try: + doc_id = str(uuid.uuid4()) + + # Create a Document + doc = Document(page_content="", metadata=metadata) + image_vector = np.atleast_2d(image_vector).astype(np.float32) + print("image_vector shape:", image_vector.shape, file=sys.stderr) + self.docsearch.index.add(image_vector) + + # Add doc to docstore + self.docsearch.docstore.add({doc_id: doc}) + + # Update index_to_docstore_id + starting_len = len(self.docsearch.index_to_docstore_id) + self.docsearch.index_to_docstore_id[starting_len] = doc_id + + print(f"Image added with doc_id={doc_id}", file=sys.stderr) + return doc_id + except Exception as e: + print(f"Error adding image: {e}", file=sys.stderr) + print(f"error line number: {sys.exc_info()[-1].tb_lineno}", file=sys.stderr) + raise + def assert_embedding_dimensions(self, embeddings): """Check that the word embedding dimension of the docsearch index matches the dimension of the word embeddings used.""" - if settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2": - word_embedding_dimension = getattr(embeddings, 'dimension', None) + if settings.EMBEDDINGS_NAME == "openai/clip-vit-base-patch16": + word_embedding_dimension = getattr(embeddings, "dimension", None) if word_embedding_dimension is None: - raise AttributeError("'dimension' attribute not found in embeddings instance.") - + raise AttributeError( + "'dimension' attribute not found in embeddings instance." 
+                )
             docsearch_index_dimension = self.docsearch.index.d
+            # Log dimensions for debugging
+            print(
+                f"Validating embedding dimensions: "
+                f"Model dimension = {word_embedding_dimension}, Index dimension = {docsearch_index_dimension}",
+                file=sys.stderr,
+            )
             if word_embedding_dimension != docsearch_index_dimension:
-                raise ValueError(f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) != docsearch index dimension ({docsearch_index_dimension})")
+                raise ValueError(
+                    f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) != docsearch index dimension ({docsearch_index_dimension})"
+                )
+            print(
+                "Dimension validation completed successfully", file=sys.stderr, flush=True
+            )
diff --git a/frontend/src/api/endpoints.ts b/frontend/src/api/endpoints.ts
index 05c8f786f..4e7112d04 100644
--- a/frontend/src/api/endpoints.ts
+++ b/frontend/src/api/endpoints.ts
@@ -17,7 +17,7 @@ const endpoints = {
     TOKEN_ANALYTICS: '/api/get_token_analytics',
     FEEDBACK_ANALYTICS: '/api/get_feedback_analytics',
     LOGS: `/api/get_user_logs`,
-    MANAGE_SYNC: '/api/manage_sync'
+    MANAGE_SYNC: '/api/manage_sync',
   },
   CONVERSATION: {
     ANSWER: '/api/answer',
diff --git a/frontend/src/components/SourceDropdown.tsx b/frontend/src/components/SourceDropdown.tsx
index f92173a08..2889eb834 100644
--- a/frontend/src/components/SourceDropdown.tsx
+++ b/frontend/src/components/SourceDropdown.tsx
@@ -27,8 +27,7 @@ function SourceDropdown({
   const { t } = useTranslation();
   const dropdownRef = React.useRef(null);
   const embeddingsName =
-    import.meta.env.VITE_EMBEDDINGS_NAME ||
-    'huggingface_sentence-transformers/all-mpnet-base-v2';
+    import.meta.env.VITE_EMBEDDINGS_NAME || 'openai/clip-vit-base-patch16';

   const handleEmptyDocumentSelect = () => {
     dispatch(setSelectedDocs(null));
diff --git a/frontend/src/conversation/Conversation.tsx b/frontend/src/conversation/Conversation.tsx
index ed69064ac..12e34f51e 100644
--- a/frontend/src/conversation/Conversation.tsx
+++ b/frontend/src/conversation/Conversation.tsx
@@ -15,7 +15,6 @@ import { useDarkTheme, useMediaQuery } from '../hooks';
 import { ShareConversationModal } from '../modals/ShareConversationModal';
 import { selectConversationId } from '../preferences/preferenceSlice';
 import { AppDispatch } from '../store';
-import conversationService from '../api/services/conversationService';
 import ConversationBubble from './ConversationBubble';
 import { handleSendFeedback } from './conversationHandlers';
 import { FEEDBACK, Query } from './conversationModels';
diff --git a/frontend/src/conversation/conversationSlice.ts b/frontend/src/conversation/conversationSlice.ts
index 5e1f9b27f..9298e7a52 100644
--- a/frontend/src/conversation/conversationSlice.ts
+++ b/frontend/src/conversation/conversationSlice.ts
@@ -157,8 +157,7 @@ export const fetchAnswer = createAsyncThunk<
     result: '',
     sources: [],
   };
-},
-);
+});

 export const conversationSlice = createSlice({
   name: 'conversation',
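
The common thread in the diff above is that openai/clip-vit-base-patch16 projects text and images into one shared vector space: embed_query and embed_documents go through get_text_features, embed_image goes through get_image_features, and FaissStore.add_image appends the image vector to the same index that holds the text chunks. Below is a minimal standalone sketch of that shared-space property, assuming only the transformers, torch, and Pillow packages from requirements; the solid red square is a hypothetical stand-in for a picture extracted from a .docx.

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model_name = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)

# Text path, mirroring EmbeddingsWrapper.embed_query
text_inputs = processor(text=["a red square"], return_tensors="pt", padding=True)
with torch.no_grad():
    text_vec = model.get_text_features(**text_inputs)

# Image path, mirroring EmbeddingsWrapper.embed_image
image = Image.new("RGB", (224, 224), color=(255, 0, 0))  # stand-in for an extracted picture
image_inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    image_vec = model.get_image_features(**image_inputs)

# Both vectors share the model's projection dimension (512 for this checkpoint),
# which is what allows one FAISS index to hold text and image embeddings together.
assert text_vec.shape[-1] == image_vec.shape[-1] == model.config.projection_dim
similarity = torch.nn.functional.cosine_similarity(text_vec, image_vec)
print(f"dim={text_vec.shape[-1]}, cosine similarity={similarity.item():.3f}")

One caveat the sketch surfaces: CLIP similarity is normally computed on L2-normalized vectors, while the index created by FAISS.from_documents defaults to raw L2 distance, so normalizing vectors before index.add (or constructing the store with normalize_L2=True) may be worth considering for retrieval quality.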