Skip to content

Commit cc84308

Browse files
committed
Refactor using llm and replace langchain
1 parent 751f8dd commit cc84308

File tree

3 files changed

+90
-88
lines changed

3 files changed

+90
-88
lines changed

.env.sample

+1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
OPENAI_API_KEY=sk-proj-xxxxx
2+
GEMINI_API_KEY=your-gemini-api-key-here

pythonkr_backend/curation/models.py

+12-88
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,7 @@
11
from django.db import models
22
from django.utils.text import slugify
3-
import requests
3+
from .utils import get_summary_from_url, translate_to_korean, categorize_summary
44
import readtime
5-
from langchain.chains.summarize import load_summarize_chain
6-
from langchain_openai import ChatOpenAI
7-
from langchain_community.document_loaders.web_base import WebBaseLoader
8-
from langchain_core.prompts import ChatPromptTemplate
9-
from langchain_core.output_parsers import StrOutputParser
10-
from langchain_core.documents import Document
115
import os
126

137

@@ -51,7 +45,9 @@ def __str__(self):
5145
return self.title or self.url
5246

5347
def calculate_reading_time(self, full_text: str):
54-
"""Calculates reading time based on the provided text."""
48+
"""
49+
Calculates reading time based on the provided text.
50+
"""
5551
if full_text:
5652
try:
5753
result = readtime.of_text(full_text)
@@ -70,34 +66,10 @@ def fetch_and_summarize(self) -> str:
7066
if not self.url:
7167
return "Error: No URL provided."
7268

73-
full_content_text = "" # Variable to hold the full text
74-
7569
try:
76-
# --- Step 1: Load Content ---
77-
loader = WebBaseLoader(self.url)
78-
docs = loader.load() # Load documents
79-
80-
if not docs or not docs[0].page_content:
81-
return "Error: No content could be loaded from the URL."
82-
83-
full_content_text = docs[0].page_content # Store full text
70+
summary_text = get_summary_from_url(self.url)
8471

85-
if not self.title and docs[0].metadata.get('title'):
86-
self.title = docs[0].metadata.get('title')
87-
88-
# --- Step 2: Calculate Reading Time (on full content) ---
89-
self.calculate_reading_time(full_content_text) # Call updated method
90-
91-
# --- Step 3: Generate Summary ---
92-
api_key = os.getenv("OPENAI_API_KEY")
93-
if not api_key:
94-
self.save(update_fields=['title', 'reading_time_minutes', 'updated_at']) # Save what we have
95-
return "Error: OpenAI API key not found. Title/Reading Time saved."
96-
97-
llm_summarize = ChatOpenAI(api_key=api_key, model_name="gpt-4o", temperature=0.2)
98-
chain_summarize = load_summarize_chain(llm_summarize, chain_type="map_reduce")
99-
summary_result = chain_summarize.invoke(docs)
100-
summary_text = summary_result.get('output_text', '')
72+
self.calculate_reading_time(summary_text) # Call updated method
10173

10274
if not summary_text:
10375
self.summary = ""
@@ -107,70 +79,46 @@ def fetch_and_summarize(self) -> str:
10779

10880
self.summary = summary_text # Set summary
10981

110-
# === Integrate category assignment ===
11182
categorization_status = "Categorization skipped (no summary)."
11283
if self.summary: # Only categorize if summary was successful
11384
categorization_status = self.assign_categories() # Call the revised method
11485
print(f"Categorization status for article {self.id}: {categorization_status}")
115-
# === End Integration ===
11686

117-
# --- Step 4: Translate Summary (immediately after generation) ---
11887
translation_status = self.translate_summary_to_korean() # Call translation
11988
print(f"Translation status for article {self.id}: {translation_status}")
12089
translation_failed = "Error" in translation_status
12190

122-
# === Adjust final save ===
123-
# ManyToMany fields are saved via .add()/.clear() within assign_categories.
124-
# Do NOT include 'categories' in update_fields.
12591
self.save(update_fields=[
12692
'title',
12793
'summary',
12894
'summary_ko',
12995
'reading_time_minutes',
13096
'updated_at'
131-
# 'categories' is NOT saved here
13297
])
133-
# === End Adjust save ===
13498

135-
# Update final message to include categorization status
13699
translation_failed = "Error" in translation_status # Re-evaluate this variable if needed
137-
categorization_failed = "Error" in categorization_status or "Warning" in categorization_status
138100

139101
final_message = "Fetch, Read Time, Summary completed."
140102
final_message += " Translation failed." if translation_failed else " Translation completed."
141103
final_message += f" {categorization_status}" # Include categorization status message
142104
return final_message
143105

144-
except requests.exceptions.RequestException as e:
145-
return f"Error fetching URL: {str(e)}"
146106
except ImportError as e:
147107
return f"Error with required libraries: {str(e)}"
148108
except Exception as e:
149109
print(f"Unexpected error during fetch/summarize/translate for {self.id}: {e}")
150-
# Optionally try saving minimal info on unexpected error:
151-
# self.save(update_fields=['title', 'reading_time_minutes', 'summary', 'summary_ko', 'updated_at'])
152110
return f"Unexpected error processing article: {str(e)}"
153111

154112
def translate_summary_to_korean(self):
155-
"""Translates the summary to Korean using the OpenAI API via Langchain."""
113+
"""
114+
Translates the summary to Korean using the OpenAI API via Langchain.
115+
"""
156116
if not self.summary:
157117
self.summary_ko = ""
158118
return "No summary to translate."
159119

160-
api_key = os.getenv("OPENAI_API_KEY")
161-
if not api_key:
162-
return "Error: OpenAI API key not found."
163-
164120
try:
165-
llm = ChatOpenAI(api_key=api_key, model_name="gpt-4o", temperature=0.2)
166-
prompt = ChatPromptTemplate.from_messages([
167-
("system", "You are a helpful assistant that translates English text to Korean."),
168-
("user", "Please translate the following English text accurately to Korean:\n\n{english_text}")
169-
])
170-
parser = StrOutputParser()
171-
chain = prompt | llm | parser
172-
173-
translated_text = chain.invoke({"english_text": self.summary})
121+
translated_text = translate_to_korean(self.summary)
174122

175123
self.summary_ko = translated_text.strip() if translated_text else ""
176124
self.save(update_fields=['summary_ko', 'updated_at'])
@@ -188,55 +136,31 @@ def assign_categories(self):
188136
self.categories.clear() # Clear existing categories if no summary
189137
return "Error: No summary available to categorize."
190138

191-
api_key = os.getenv("OPENAI_API_KEY")
192-
if not api_key:
193-
return "Error: OpenAI API key not found for categorization."
194-
195139
try:
196-
# Ensure defined categories exist in the DB (or create them)
197140
defined_category_names = [
198141
'Web Development', 'MLOps', 'Large Language Models',
199142
'Data Science', 'AI General', 'Software Engineering', 'Other'
200143
]
201-
# Use get_or_create to simplify existence check and creation
202144
category_objects = []
203145
created_names = []
204146
for name in defined_category_names:
205147
cat, created = Category.objects.get_or_create(name=name)
206148
category_objects.append(cat)
207149
if created:
208150
created_names.append(name)
209-
# Optionally save slug immediately if auto-generated
210151
cat.save()
152+
211153
if created_names:
212154
print(f"Ensured categories exist. Created new: {created_names}")
213155

214-
# Prepare for LLM call
215-
llm = ChatOpenAI(api_key=api_key, model_name="gpt-4o", temperature=0.1)
216-
category_list_str = ", ".join([f"'{name}'" for name in defined_category_names])
217-
218-
prompt = ChatPromptTemplate.from_messages([
219-
("system", f"You are a helpful assistant that categorizes technical articles based on their summary. "
220-
f"Assign one or more relevant categories from the following list: {category_list_str}. "
221-
f"Respond with ONLY the category names, separated by commas (e.g., 'Web Development, Large Language Models'). "
222-
f"If none fit well, respond with 'Other'."),
223-
("user", "Please categorize the following article summary:\n\n{summary_text}")
224-
])
225-
parser = StrOutputParser()
226-
chain = prompt | llm | parser
227-
228-
response_text = chain.invoke({"summary_text": self.summary}).strip()
229-
230-
# Parse LLM response and get Category objects
156+
response_text = categorize_summary(self.summary, defined_category_names).replace("'", "").replace('"', "")
231157
assigned_category_names = [name.strip() for name in response_text.split(',') if name.strip()]
232158

233-
# Validate against our defined list and get actual Category objects from the DB
234159
valid_categories = Category.objects.filter(name__in=assigned_category_names).filter(name__in=defined_category_names)
235160
valid_category_names = list(valid_categories.values_list('name', flat=True))
236161

237162
print(f"LLM suggested: {assigned_category_names}, Validated & Found: {valid_category_names}")
238163

239-
# Update the article's categories relationship
240164
self.categories.clear() # Remove old associations first
241165
if valid_categories:
242166
self.categories.add(*valid_categories) # Add the new set using the splat operator

pythonkr_backend/curation/utils.py

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
import httpx
2+
import llm
3+
from pydantic import BaseModel
4+
5+
import os
6+
7+
class Result(BaseModel):
    """Pydantic schema holding a list of category-name strings."""
    # NOTE(review): not referenced anywhere else in this module — presumably
    # intended as a structured-output schema for categorize_summary; confirm
    # before removing.
    categories: list[str]
9+
10+
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
11+
12+
def fetch_content_from_url(url: str) -> str:
    """
    Fetch an LLM-friendly (markdown) rendering of a page via the r.jina.ai proxy.

    Args:
        url (str): The URL to fetch content from.

    Returns:
        str: The response body returned by the proxy.

    Raises:
        httpx.HTTPStatusError: If the proxy responds with a 4xx/5xx status.
        httpx.TimeoutException: If the request exceeds the timeout.
    """
    llm_friendly_jina_ai_url = f'https://r.jina.ai/{url}'
    # Explicit timeout so a stalled proxy cannot hang the caller indefinitely.
    response = httpx.get(llm_friendly_jina_ai_url, timeout=30.0)
    # Fail loudly on HTTP errors instead of silently returning an error page
    # body that downstream code would then try to summarize.
    response.raise_for_status()
    return response.text
25+
26+
27+
def parse_contents(contents: str):
    """
    Split a r.jina.ai proxy response into its header block and markdown body.

    The response begins with "Name: value" header lines (most commonly
    "Title" and "URL Source") followed by a "Markdown Content:" marker.

    Returns:
        tuple[dict, str]: Parsed headers and the raw markdown body text.

    Raises:
        ValueError: If the "Markdown Content:" marker is absent.
    """
    header_block, body = contents.split("Markdown Content:", 1)
    parsed = {}
    for raw_line in header_block.splitlines():
        if not raw_line.strip():
            continue
        key, value = raw_line.split(":", 1)
        parsed[key.strip()] = value.strip()
    return parsed, body
37+
38+
def get_summary_from_url(url: str):
    """
    Produce a Korean-language markdown title and summary for the page at *url*.

    The page is fetched through the Jina reader proxy and summarized by the
    Gemini model via the `llm` library.
    """
    page_text = fetch_content_from_url(url)
    gemini = llm.get_model("gemini-2.5-pro-exp-03-25")
    gemini.key = GEMINI_API_KEY
    result = gemini.prompt(
        page_text,
        system="""make readable title and summary in korean as markdown format,
summary should be list of minimum 3, maximum 5 items""",
    )
    return result.text()
49+
50+
def translate_to_korean(content: str):
    """
    Translate English text to Korean using the Gemini model.

    Args:
        content (str): English text to translate.

    Returns:
        str: The Korean translation produced by the model.
    """
    model = llm.get_model("gemini-2.5-pro-exp-03-25")
    model.key = GEMINI_API_KEY
    response = model.prompt(
        f"Please translate the following English text accurately to Korean.\n\n{content}",
        # Fixed typo in the system prompt: "Your are" -> "You are".
        system="You are a helpful assistant that translates English text to Korean.",
    )
    return response.text()
60+
61+
62+
def categorize_summary(summary: str, categories: list[str]):
    """
    Ask the Gemini model to assign one or more categories to an article summary.

    Args:
        summary (str): Summary text to categorize.
        categories (list[str]): Allowed category names.

    Returns:
        str: The model's response — category names separated by commas.
    """
    # Join the quoted names so the prompt reads "'A', 'B', 'C'". The original
    # interpolated the list object itself, leaking a Python list repr
    # (["'A'", "'B'"]) into the prompt.
    category_list_str = ", ".join(f"'{category}'" for category in categories)
    model = llm.get_model("gemini-2.5-pro-exp-03-25")
    model.key = GEMINI_API_KEY
    response = model.prompt(
        # Fixed typo: "Please categories" -> "Please categorize".
        f"Please categorize the following article summary:\n\n{summary}",
        system=f"""
- You are a helpful assistant that categorizes technical articles based on their summary.
- Assign one or more relevant categories from the following list: {category_list_str}.
- Respond with ONLY the category names, separated by commas (e.g., 'Web Development, Large Language Models').
- If none fit well, respond with 'Other'.
""",
    )
    return response.text()
77+

0 commit comments

Comments
 (0)