|
1 | | -import csv |
2 | | -import io |
3 | 1 | import logging |
4 | 2 |
|
5 | | -from fastapi import HTTPException |
6 | 3 | from sqlmodel import Session, select |
7 | 4 |
|
8 | 5 | from app.core.util import now |
9 | | -from app.models import EvaluationRun, UserProjectOrg |
10 | | -from app.models.evaluation import DatasetUploadResponse |
11 | | -from app.utils import get_langfuse_client |
| 6 | +from app.models import EvaluationRun |
12 | 7 |
|
13 | 8 | logger = logging.getLogger(__name__) |
14 | 9 |
|
15 | 10 |
|
async def upload_dataset_to_langfuse(
    csv_content: bytes,
    dataset_name: str,
    dataset_id: int,
    duplication_factor: int,
    _session: Session,
    _current_user: UserProjectOrg,
) -> tuple[bool, DatasetUploadResponse | None, str | None]:
    """
    Upload a CSV dataset to Langfuse, duplicating each row for flakiness testing.

    Args:
        csv_content: Raw CSV file content as bytes (expected UTF-8).
        dataset_name: Name for the dataset in Langfuse.
        dataset_id: Database ID of the created dataset.
        duplication_factor: Number of copies to create for each CSV row.
        _session: Database session used to resolve the Langfuse client.
        _current_user: Current user/org context used to resolve the client.

    Returns:
        Tuple of (success, dataset_response, error_message). On success the
        response is populated and the error is None; on failure the response
        is None and the error message describes the problem.
    """
    try:
        # Resolve a Langfuse client for this org/project; surface credential
        # problems as a (False, None, detail) result instead of raising.
        try:
            langfuse = get_langfuse_client(
                session=_session,
                org_id=_current_user.organization_id,
                project_id=_current_user.project_id,
            )
        except HTTPException as http_exc:
            return False, None, http_exc.detail

        # Decode explicitly so a non-UTF-8 upload yields a clear message
        # rather than falling through to the generic handler below.
        try:
            csv_text = csv_content.decode("utf-8")
        except UnicodeDecodeError:
            return False, None, "CSV file must be UTF-8 encoded."

        csv_reader = csv.DictReader(io.StringIO(csv_text))

        # fieldnames is None for an empty file; guard before membership tests
        # (the original would raise TypeError on `in None`).
        fieldnames = csv_reader.fieldnames or []
        if "question" not in fieldnames or "answer" not in fieldnames:
            return (
                False,
                None,
                "CSV must contain 'question' and 'answer' columns. "
                f"Found columns: {csv_reader.fieldnames}",
            )

        # Collect valid rows. DictReader fills short rows with None (restval),
        # so coalesce to "" before stripping to avoid AttributeError.
        original_items = []
        for row in csv_reader:
            question = (row.get("question") or "").strip()
            answer = (row.get("answer") or "").strip()

            if not question or not answer:
                logger.warning(f"Skipping row with empty question or answer: {row}")
                continue

            original_items.append({"question": question, "answer": answer})

        if not original_items:
            return False, None, "No valid items found in CSV file."

        logger.info(
            f"Parsed {len(original_items)} items from CSV. "
            f"Will duplicate {duplication_factor}x for a total of {len(original_items) * duplication_factor} items."
        )

        # Create or get the dataset in Langfuse.
        dataset = langfuse.create_dataset(name=dataset_name)

        # Upload each item duplication_factor times; individual item failures
        # are logged and skipped so one bad item doesn't abort the upload.
        total_uploaded = 0
        for item in original_items:
            for duplicate_num in range(duplication_factor):
                try:
                    langfuse.create_dataset_item(
                        dataset_name=dataset_name,
                        input={"question": item["question"]},
                        expected_output={"answer": item["answer"]},
                        metadata={
                            "original_question": item["question"],
                            "duplicate_number": duplicate_num + 1,
                            "duplication_factor": duplication_factor,
                        },
                    )
                    total_uploaded += 1
                except Exception as e:
                    logger.error(
                        f"Failed to upload item (duplicate {duplicate_num + 1}): {item['question'][:50]}... Error: {e}"
                    )

        # Flush to ensure all buffered items are actually sent.
        langfuse.flush()

        logger.info(
            f"Successfully uploaded {total_uploaded} items to dataset '{dataset_name}' "
            f"({len(original_items)} original × {duplication_factor} duplicates)"
        )

        return (
            True,
            DatasetUploadResponse(
                dataset_id=dataset_id,
                dataset_name=dataset_name,
                total_items=total_uploaded,
                original_items=len(original_items),
                duplication_factor=duplication_factor,
                # Not every client version exposes .id on the dataset object.
                langfuse_dataset_id=getattr(dataset, "id", None),
            ),
            None,
        )

    except Exception as e:
        # Boundary handler: convert any unexpected failure into an error
        # result for the caller instead of propagating.
        logger.error(f"Error uploading dataset: {str(e)}", exc_info=True)
        return False, None, f"Failed to upload dataset: {str(e)}"
134 | | - |
135 | | - |
136 | 11 | def create_evaluation_run( |
137 | 12 | session: Session, |
138 | 13 | run_name: str, |
@@ -170,8 +45,13 @@ def create_evaluation_run( |
170 | 45 | ) |
171 | 46 |
|
172 | 47 | session.add(eval_run) |
173 | | - session.commit() |
174 | | - session.refresh(eval_run) |
| 48 | + try: |
| 49 | + session.commit() |
| 50 | + session.refresh(eval_run) |
| 51 | + except Exception as e: |
| 52 | + session.rollback() |
| 53 | + logger.error(f"Failed to create EvaluationRun: {e}", exc_info=True) |
| 54 | + raise |
175 | 55 |
|
176 | 56 | logger.info(f"Created EvaluationRun record: id={eval_run.id}, run_name={run_name}") |
177 | 57 |
|
@@ -214,7 +94,7 @@ def list_evaluation_runs( |
214 | 94 | f"project_id={project_id}" |
215 | 95 | ) |
216 | 96 |
|
217 | | - return list(runs) |
| 97 | + return runs |
218 | 98 |
|
219 | 99 |
|
220 | 100 | def get_evaluation_run_by_id( |
@@ -302,7 +182,12 @@ def update_evaluation_run( |
302 | 182 |
|
303 | 183 | # Persist to database |
304 | 184 | session.add(eval_run) |
305 | | - session.commit() |
306 | | - session.refresh(eval_run) |
| 185 | + try: |
| 186 | + session.commit() |
| 187 | + session.refresh(eval_run) |
| 188 | + except Exception as e: |
| 189 | + session.rollback() |
| 190 | + logger.error(f"Failed to update EvaluationRun: {e}", exc_info=True) |
| 191 | + raise |
307 | 192 |
|
308 | 193 | return eval_run |
0 commit comments