Skip to content

Commit cc84308

Browse files
committed
Refactor using llm and replace langchain
1 parent 751f8dd commit cc84308

File tree

3 files changed

+90
-88
lines changed

3 files changed

+90
-88
lines changed

.env.sample

+1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
OPENAI_API_KEY=sk-proj-xxxxx
2+
GEMINI_API_KEY=your-gemini-api-key-here

pythonkr_backend/curation/models.py

+12-88
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,7 @@
11
from django.db import models
22
from django.utils.text import slugify
3-
import requests
3+
from .utils import get_summary_from_url, translate_to_korean, categorize_summary
44
import readtime
5-
from langchain.chains.summarize import load_summarize_chain
6-
from langchain_openai import ChatOpenAI
7-
from langchain_community.document_loaders.web_base import WebBaseLoader
8-
from langchain_core.prompts import ChatPromptTemplate
9-
from langchain_core.output_parsers import StrOutputParser
10-
from langchain_core.documents import Document
115
import os
126

137

@@ -51,7 +45,9 @@ def __str__(self):
5145
return self.title or self.url
5246

5347
def calculate_reading_time(self, full_text: str):
54-
"""Calculates reading time based on the provided text."""
48+
"""
49+
Calculates reading time based on the provided text.
50+
"""
5551
if full_text:
5652
try:
5753
result = readtime.of_text(full_text)
@@ -70,34 +66,10 @@ def fetch_and_summarize(self) -> str:
7066
if not self.url:
7167
return "Error: No URL provided."
7268

73-
full_content_text = "" # Variable to hold the full text
74-
7569
try:
76-
# --- Step 1: Load Content ---
77-
loader = WebBaseLoader(self.url)
78-
docs = loader.load() # Load documents
79-
80-
if not docs or not docs[0].page_content:
81-
return "Error: No content could be loaded from the URL."
82-
83-
full_content_text = docs[0].page_content # Store full text
70+
summary_text = get_summary_from_url(self.url)
8471

85-
if not self.title and docs[0].metadata.get('title'):
86-
self.title = docs[0].metadata.get('title')
87-
88-
# --- Step 2: Calculate Reading Time (on full content) ---
89-
self.calculate_reading_time(full_content_text) # Call updated method
90-
91-
# --- Step 3: Generate Summary ---
92-
api_key = os.getenv("OPENAI_API_KEY")
93-
if not api_key:
94-
self.save(update_fields=['title', 'reading_time_minutes', 'updated_at']) # Save what we have
95-
return "Error: OpenAI API key not found. Title/Reading Time saved."
96-
97-
llm_summarize = ChatOpenAI(api_key=api_key, model_name="gpt-4o", temperature=0.2)
98-
chain_summarize = load_summarize_chain(llm_summarize, chain_type="map_reduce")
99-
summary_result = chain_summarize.invoke(docs)
100-
summary_text = summary_result.get('output_text', '')
72+
self.calculate_reading_time(summary_text) # Call updated method
10173

10274
if not summary_text:
10375
self.summary = ""
@@ -107,70 +79,46 @@ def fetch_and_summarize(self) -> str:
10779

10880
self.summary = summary_text # Set summary
10981

110-
# === Integrate category assignment ===
11182
categorization_status = "Categorization skipped (no summary)."
11283
if self.summary: # Only categorize if summary was successful
11384
categorization_status = self.assign_categories() # Call the revised method
11485
print(f"Categorization status for article {self.id}: {categorization_status}")
115-
# === End Integration ===
11686

117-
# --- Step 4: Translate Summary (immediately after generation) ---
11887
translation_status = self.translate_summary_to_korean() # Call translation
11988
print(f"Translation status for article {self.id}: {translation_status}")
12089
translation_failed = "Error" in translation_status
12190

122-
# === Adjust final save ===
123-
# ManyToMany fields are saved via .add()/.clear() within assign_categories.
124-
# Do NOT include 'categories' in update_fields.
12591
self.save(update_fields=[
12692
'title',
12793
'summary',
12894
'summary_ko',
12995
'reading_time_minutes',
13096
'updated_at'
131-
# 'categories' is NOT saved here
13297
])
133-
# === End Adjust save ===
13498

135-
# Update final message to include categorization status
13699
translation_failed = "Error" in translation_status # Re-evaluate this variable if needed
137-
categorization_failed = "Error" in categorization_status or "Warning" in categorization_status
138100

139101
final_message = "Fetch, Read Time, Summary completed."
140102
final_message += " Translation failed." if translation_failed else " Translation completed."
141103
final_message += f" {categorization_status}" # Include categorization status message
142104
return final_message
143105

144-
except requests.exceptions.RequestException as e:
145-
return f"Error fetching URL: {str(e)}"
146106
except ImportError as e:
147107
return f"Error with required libraries: {str(e)}"
148108
except Exception as e:
149109
print(f"Unexpected error during fetch/summarize/translate for {self.id}: {e}")
150-
# Optionally try saving minimal info on unexpected error:
151-
# self.save(update_fields=['title', 'reading_time_minutes', 'summary', 'summary_ko', 'updated_at'])
152110
return f"Unexpected error processing article: {str(e)}"
153111

154112
def translate_summary_to_korean(self):
155-
"""Translates the summary to Korean using the OpenAI API via Langchain."""
113+
"""
114+
Translates the summary to Korean using the OpenAI API via Langchain.
115+
"""
156116
if not self.summary:
157117
self.summary_ko = ""
158118
return "No summary to translate."
159119

160-
api_key = os.getenv("OPENAI_API_KEY")
161-
if not api_key:
162-
return "Error: OpenAI API key not found."
163-
164120
try:
165-
llm = ChatOpenAI(api_key=api_key, model_name="gpt-4o", temperature=0.2)
166-
prompt = ChatPromptTemplate.from_messages([
167-
("system", "You are a helpful assistant that translates English text to Korean."),
168-
("user", "Please translate the following English text accurately to Korean:\n\n{english_text}")
169-
])
170-
parser = StrOutputParser()
171-
chain = prompt | llm | parser
172-
173-
translated_text = chain.invoke({"english_text": self.summary})
121+
translated_text = translate_to_korean(self.summary)
174122

175123
self.summary_ko = translated_text.strip() if translated_text else ""
176124
self.save(update_fields=['summary_ko', 'updated_at'])
@@ -188,55 +136,31 @@ def assign_categories(self):
188136
self.categories.clear() # Clear existing categories if no summary
189137
return "Error: No summary available to categorize."
190138

191-
api_key = os.getenv("OPENAI_API_KEY")
192-
if not api_key:
193-
return "Error: OpenAI API key not found for categorization."
194-
195139
try:
196-
# Ensure defined categories exist in the DB (or create them)
197140
defined_category_names = [
198141
'Web Development', 'MLOps', 'Large Language Models',
199142
'Data Science', 'AI General', 'Software Engineering', 'Other'
200143
]
201-
# Use get_or_create to simplify existence check and creation
202144
category_objects = []
203145
created_names = []
204146
for name in defined_category_names:
205147
cat, created = Category.objects.get_or_create(name=name)
206148
category_objects.append(cat)
207149
if created:
208150
created_names.append(name)
209-
# Optionally save slug immediately if auto-generated
210151
cat.save()
152+
211153
if created_names:
212154
print(f"Ensured categories exist. Created new: {created_names}")
213155

214-
# Prepare for LLM call
215-
llm = ChatOpenAI(api_key=api_key, model_name="gpt-4o", temperature=0.1)
216-
category_list_str = ", ".join([f"'{name}'" for name in defined_category_names])
217-
218-
prompt = ChatPromptTemplate.from_messages([
219-
("system", f"You are a helpful assistant that categorizes technical articles based on their summary. "
220-
f"Assign one or more relevant categories from the following list: {category_list_str}. "
221-
f"Respond with ONLY the category names, separated by commas (e.g., 'Web Development, Large Language Models'). "
222-
f"If none fit well, respond with 'Other'."),
223-
("user", "Please categorize the following article summary:\n\n{summary_text}")
224-
])
225-
parser = StrOutputParser()
226-
chain = prompt | llm | parser
227-
228-
response_text = chain.invoke({"summary_text": self.summary}).strip()
229-
230-
# Parse LLM response and get Category objects
156+
response_text = categorize_summary(self.summary, defined_category_names).replace("'", "").replace('"', "")
231157
assigned_category_names = [name.strip() for name in response_text.split(',') if name.strip()]
232158

233-
# Validate against our defined list and get actual Category objects from the DB
234159
valid_categories = Category.objects.filter(name__in=assigned_category_names).filter(name__in=defined_category_names)
235160
valid_category_names = list(valid_categories.values_list('name', flat=True))
236161

237162
print(f"LLM suggested: {assigned_category_names}, Validated & Found: {valid_category_names}")
238163

239-
# Update the article's categories relationship
240164
self.categories.clear() # Remove old associations first
241165
if valid_categories:
242166
self.categories.add(*valid_categories) # Add the new set using the splat operator

pythonkr_backend/curation/utils.py

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import httpx
2+
import llm
3+
from pydantic import BaseModel
4+
5+
import os
6+
7+
class Result(BaseModel):
    """Pydantic schema holding a list of category-name strings."""
    # NOTE(review): not referenced anywhere else in this module — presumably
    # intended as a structured-output schema for categorize_summary; confirm
    # before removing.
    categories: list[str]
9+
10+
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
11+
12+
def fetch_content_from_url(url: str) -> str:
    """
    Fetch an LLM-friendly (markdown) rendering of a page via the r.jina.ai proxy.

    Args:
        url (str): The URL to fetch content from.

    Returns:
        str: The response body returned by the proxy.

    Raises:
        httpx.HTTPStatusError: If the proxy responds with a 4xx/5xx status.
        httpx.TimeoutException: If the request exceeds the timeout.
    """
    llm_friendly_jina_ai_url = f'https://r.jina.ai/{url}'
    # Explicit timeout so a stalled proxy cannot hang the caller indefinitely.
    response = httpx.get(llm_friendly_jina_ai_url, timeout=30.0)
    # Fail loudly on HTTP errors instead of silently returning an error page
    # body that downstream code would then try to summarize.
    response.raise_for_status()
    return response.text
25+
26+
27+
def parse_contents(contents: str):
    """
    Split a r.jina.ai proxy response into its header block and markdown body.

    The response begins with "Name: value" header lines (most commonly
    "Title" and "URL Source") followed by a "Markdown Content:" marker.

    Returns:
        tuple[dict, str]: Parsed headers and the raw markdown body text.

    Raises:
        ValueError: If the "Markdown Content:" marker is absent.
    """
    header_block, body = contents.split("Markdown Content:", 1)
    parsed = {}
    for raw_line in header_block.splitlines():
        if not raw_line.strip():
            continue
        key, value = raw_line.split(":", 1)
        parsed[key.strip()] = value.strip()
    return parsed, body
37+
38+
def get_summary_from_url(url: str):
    """
    Produce a Korean-language markdown title and summary for the page at *url*.

    The page is fetched through the Jina reader proxy and summarized by the
    Gemini model via the `llm` library.
    """
    page_text = fetch_content_from_url(url)
    gemini = llm.get_model("gemini-2.5-pro-exp-03-25")
    gemini.key = GEMINI_API_KEY
    result = gemini.prompt(
        page_text,
        system="""make readable title and summary in korean as markdown format,
summary should be list of minimum 3, maximum 5 items""",
    )
    return result.text()
49+
50+
def translate_to_korean(content: str):
    """
    Translate English text to Korean using the Gemini model.

    Args:
        content (str): English text to translate.

    Returns:
        str: The Korean translation produced by the model.
    """
    model = llm.get_model("gemini-2.5-pro-exp-03-25")
    model.key = GEMINI_API_KEY
    response = model.prompt(
        f"Please translate the following English text accurately to Korean.\n\n{content}",
        # Fixed typo in the system prompt: "Your are" -> "You are".
        system="You are a helpful assistant that translates English text to Korean.",
    )
    return response.text()
60+
61+
62+
def categorize_summary(summary: str, categories: list[str]):
    """
    Ask the Gemini model to assign one or more categories to an article summary.

    Args:
        summary (str): Summary text to categorize.
        categories (list[str]): Allowed category names.

    Returns:
        str: The model's response — category names separated by commas.
    """
    # Join the quoted names so the prompt reads "'A', 'B', 'C'". The original
    # interpolated the list object itself, leaking a Python list repr
    # (["'A'", "'B'"]) into the prompt.
    category_list_str = ", ".join(f"'{category}'" for category in categories)
    model = llm.get_model("gemini-2.5-pro-exp-03-25")
    model.key = GEMINI_API_KEY
    response = model.prompt(
        # Fixed typo: "Please categories" -> "Please categorize".
        f"Please categorize the following article summary:\n\n{summary}",
        system=f"""
- You are a helpful assistant that categorizes technical articles based on their summary.
- Assign one or more relevant categories from the following list: {category_list_str}.
- Respond with ONLY the category names, separated by commas (e.g., 'Web Development, Large Language Models').
- If none fit well, respond with 'Other'.
""",
    )
    return response.text()
77+

0 commit comments

Comments
 (0)