-
Notifications
You must be signed in to change notification settings - Fork 1
Implement MVP podcast generation #34
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
72a94ee
b3b3373
d9a78a1
922f6fa
a46b4f2
c71e460
1b5498c
b7a6b65
62e60e6
0bbe907
6d03127
c7657d6
6809309
c204ea5
bd74fda
71a8c0c
5699040
cbc3797
d6061fe
7582c05
74102ba
53caf8a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| """add podcast table | ||
|
|
||
| Revision ID: 2042a1f0c0a1 | ||
| Revises: 2cde6f094a4e | ||
| Create Date: 2025-10-05 06:00:00.000000 | ||
|
|
||
| """ | ||
| from alembic import op | ||
| import sqlalchemy as sa | ||
| import sqlmodel.sql.sqltypes | ||
|
|
||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = '2042a1f0c0a1' | ||
| down_revision = '2cde6f094a4e' | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
|
|
||
def upgrade():
    """Create the ``podcast`` table.

    Every podcast row belongs to a course: ``course_id`` references
    ``course.id`` and rows are removed with their course (ON DELETE CASCADE).
    """
    auto_string = sqlmodel.sql.sqltypes.AutoString

    podcast_columns = [
        sa.Column('id', sa.Uuid(), nullable=False),
        sa.Column('course_id', sa.Uuid(), nullable=False),
        sa.Column('title', auto_string(length=255), nullable=False),
        sa.Column('transcript', sa.Text(), nullable=False),
        sa.Column('audio_path', auto_string(length=1024), nullable=False),
        sa.Column('storage_backend', auto_string(length=50), nullable=False),
        # The only nullable column in the table.
        sa.Column('duration_seconds', sa.Float(), nullable=True),
        sa.Column(
            'created_at',
            sa.DateTime(timezone=True),
            server_default=sa.text('now()'),
            nullable=False,
        ),
        # NOTE(review): unlike created_at, updated_at is timezone-naive and has
        # no server default, so every insert must supply it — confirm this
        # matches the Podcast model.
        sa.Column('updated_at', sa.DateTime(), nullable=False),
    ]
    table_constraints = [
        sa.ForeignKeyConstraint(['course_id'], ['course.id'], ondelete='CASCADE'),
        sa.PrimaryKeyConstraint('id'),
    ]

    op.create_table('podcast', *podcast_columns, *table_constraints)
|
|
||
|
|
||
def downgrade():
    """Revert this migration by dropping the ``podcast`` table."""
    podcast_table = 'podcast'
    op.drop_table(podcast_table)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| """merge heads: podcast + dev | ||
|
|
||
| Revision ID: a9b7c6d5e4f3 | ||
| Revises: ('2042a1f0c0a1', '64343f21e9a8') | ||
| Create Date: 2025-10-06 00:00:00 | ||
|
|
||
| """ | ||
| from alembic import op | ||
| import sqlalchemy as sa | ||
|
|
||
|
|
||
| # revision identifiers, used by Alembic. | ||
| revision = 'a9b7c6d5e4f3' | ||
| down_revision = ('2042a1f0c0a1', '64343f21e9a8') | ||
| branch_labels = None | ||
| depends_on = None | ||
|
|
||
|
|
||
def upgrade():
    """Merge point for two migration heads; there is no schema change to apply."""
    return None
|
|
||
|
|
||
def downgrade():
    """Merge point for two migration heads; there is no schema change to revert."""
    return None
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,7 @@ | |
| import tempfile | ||
| import uuid | ||
| from asyncio.log import logger | ||
| import logging | ||
| from datetime import datetime, timezone | ||
| from typing import Any | ||
|
|
||
|
|
@@ -21,7 +22,7 @@ | |
| from app.models.course import Course | ||
| from app.models.document import Document | ||
| from app.models.embeddings import Chunk | ||
| from app.schemas.public import DocumentStatus | ||
| from app.schemas.public import DocumentStatus, DocumentPublic | ||
| from app.tasks import generate_quizzes_task | ||
|
|
||
| router = APIRouter(prefix="/documents", tags=["documents"]) | ||
|
|
@@ -37,6 +38,7 @@ | |
| MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024 | ||
|
|
||
| pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV_NAME) | ||
| log = logging.getLogger(__name__) | ||
|
|
||
| task_status: dict[str, str] = {} | ||
|
|
||
|
|
@@ -48,6 +50,12 @@ def ensure_index_exists(): | |
| if pc.has_index(index_name): | ||
| existing = pc.describe_index(index_name) | ||
| if existing.dimension != EXPECTED_DIMENSION: | ||
| log.warning( | ||
| "[DOCS] Index dimension mismatch | name=%s | have=%s want=%s — recreating", | ||
| index_name, | ||
| existing.dimension, | ||
| EXPECTED_DIMENSION, | ||
| ) | ||
| pc.delete_index(index_name) | ||
| pc.create_index( | ||
| name=index_name, | ||
|
|
@@ -165,6 +173,7 @@ async def process_pdf_task(file_path: str, document_id: uuid.UUID, session: Sess | |
| "id": embedding_uuid, | ||
| "values": embedding, | ||
| "metadata": { | ||
| "course_id": str(document.course_id), | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for fixing this. I was curious why it wasn't working. |
||
| "document_id": str(document_id), | ||
| "chunk_id": str(record.id), | ||
| "text": record.text_content, | ||
|
|
@@ -290,8 +299,8 @@ async def process_multiple_documents( | |
| return {"message": "Processing started for multiple files", "documents": results} | ||
|
|
||
|
|
||
| @router.get("/{id}", response_model=Document) | ||
| def read_document(session: SessionDep, current_user: CurrentUser, id: uuid.UUID) -> Any: | ||
| @router.get("/{id}", response_model=DocumentPublic) | ||
| def read_document(session: SessionDep, current_user: CurrentUser, id: uuid.UUID) -> DocumentPublic: | ||
| """Get a document by its ID, ensuring the user has permissions.""" | ||
| statement = ( | ||
| select(Document) | ||
|
|
@@ -308,7 +317,7 @@ def read_document(session: SessionDep, current_user: CurrentUser, id: uuid.UUID) | |
| detail="Document not found or you do not have permission to access it.", | ||
| ) | ||
|
|
||
| return document | ||
| return DocumentPublic.model_validate(document) | ||
|
|
||
|
|
||
| def delete_embeddings_task(document_id: uuid.UUID): | ||
|
|
@@ -321,13 +330,13 @@ def delete_embeddings_task(document_id: uuid.UUID): | |
| logger.error(f"Failed to delete embeddings for document {document_id}: {e}") | ||
|
|
||
|
|
||
| @router.delete("/{id}") | ||
| @router.delete("/{id}", response_model=Message) | ||
| def delete_document( | ||
| session: SessionDep, | ||
| current_user: CurrentUser, | ||
| id: uuid.UUID, | ||
| background_tasks: BackgroundTasks, | ||
| ) -> Any: | ||
| ) -> Message: | ||
| """Delete a document by its ID, ensuring the user has permissions.""" | ||
|
|
||
| document = session.exec( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,140 @@ | ||
| import os | ||
| import uuid | ||
| from typing import Any | ||
|
|
||
| from fastapi import APIRouter, HTTPException | ||
| from fastapi.responses import JSONResponse, StreamingResponse | ||
| from sqlalchemy.orm import selectinload | ||
| from sqlmodel import select | ||
|
|
||
| from app.api.deps import CurrentUser, SessionDep | ||
| from app.core.config import settings | ||
| from app.models.podcast import Podcast | ||
| from app.schemas.internal import GeneratePodcastRequest | ||
| from app.schemas.public import PodcastPublic, PodcastsPublic | ||
| from app.services.podcast_service import generate_podcast_for_course | ||
|
|
||
| router = APIRouter(prefix="/podcasts", tags=["podcasts"]) | ||
|
|
||
|
|
||
@router.get("/course/{course_id}", response_model=PodcastsPublic)
def list_podcasts(
    course_id: uuid.UUID,
    session: SessionDep,
    _current_user: CurrentUser,
    skip: int = 0,
    limit: int = 50,
) -> PodcastsPublic:
    """Return one page of a course's podcasts, newest first.

    ``skip`` and ``limit`` are applied as the query's OFFSET and LIMIT.
    ``_current_user`` is only declared as a dependency; it is not used to
    filter the results.
    """
    statement = (
        select(Podcast)
        .where(Podcast.course_id == course_id)
        .order_by(Podcast.created_at.desc())
        .offset(skip)
        .limit(limit)
    )
    rows = session.exec(statement).all()

    public_items = []
    for row in rows:
        public_items.append(PodcastPublic.model_validate(row))
    return PodcastsPublic(data=public_items)
|
|
||
|
|
||
|
|
||
@router.post("/course/{course_id}/generate", response_model=PodcastPublic)
async def generate_podcast(
    course_id: uuid.UUID,
    session: SessionDep,
    _current_user: CurrentUser,
    body: GeneratePodcastRequest,
) -> PodcastPublic:
    """Generate a podcast for a course and return its public representation.

    The title is stripped of surrounding whitespace. Any voice left unset in
    the request falls back to a configured default; the narrator falls back
    to the configured teacher voice.
    """
    # Positional order must match generate_podcast_for_course's call contract:
    # session, course, title, teacher, student, narrator, mode, topics, documents.
    generated = await generate_podcast_for_course(
        session,
        course_id,
        body.title.strip(),
        body.teacher_voice or settings.PODCAST_TEACHER_VOICE,
        body.student_voice or settings.PODCAST_STUDENT_VOICE,
        body.narrator_voice or settings.PODCAST_TEACHER_VOICE,
        body.mode,
        body.topics,
        body.document_ids,
    )
    return PodcastPublic.model_validate(generated)
|
|
||
|
|
||
@router.get("/{podcast_id}", response_model=PodcastPublic)
def get_podcast(
    podcast_id: uuid.UUID,
    session: SessionDep,
    _current_user: CurrentUser,
) -> PodcastPublic:
    """Fetch a single podcast by primary key.

    Raises:
        HTTPException: 404 when no podcast has the given id.
    """
    record = session.get(Podcast, podcast_id)
    if record:
        return PodcastPublic.model_validate(record)
    raise HTTPException(status_code=404, detail="Podcast not found")
|
|
||
|
|
||
@router.get("/{podcast_id}/audio")
def stream_audio(podcast_id: uuid.UUID, session: SessionDep, _current_user: CurrentUser):
    """Serve a podcast's audio.

    For the ``"local"`` storage backend the file is streamed back as
    ``audio/mpeg`` in 8 KiB chunks. For any other backend the audio is assumed
    to live in S3 and a JSON body ``{"url": <presigned URL>}`` valid for one
    hour is returned so the client can fetch it directly.

    Raises:
        HTTPException: 404 if the podcast does not exist or its local audio
            file is missing; 500 if the S3 URL cannot be generated.
    """
    pod = session.get(Podcast, podcast_id)
    if not pod:
        raise HTTPException(status_code=404, detail="Podcast not found")

    if pod.storage_backend == "local":
        file_path = pod.audio_path
        # isfile rather than exists: a directory at this path would pass an
        # existence check and then fail inside the generator, after the
        # response has already started.
        if not os.path.isfile(file_path):
            raise HTTPException(status_code=404, detail="Audio file missing")

        def iterfile():
            with open(file_path, "rb") as f:
                while chunk := f.read(8192):
                    yield chunk

        return StreamingResponse(iterfile(), media_type="audio/mpeg")

    # For S3, return a presigned URL to let client fetch directly
    try:
        import boto3

        # NOTE(review): a reviewer asked that the S3 credentials be shared with
        # the deployment owner so they can be added to the Render Backend API
        # service.
        s3 = boto3.client(
            "s3",
            aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
            aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
            region_name=settings.AWS_REGION,
        )
        bucket = settings.S3_BUCKET_NAME
        if not bucket:
            raise ValueError("S3 bucket not configured")
        # audio_path may be a bare key or a full s3://bucket/key URI; strip the
        # URI prefix only when it is actually the prefix.
        key = pod.audio_path.removeprefix(f"s3://{bucket}/")
        url = s3.generate_presigned_url(
            ClientMethod='get_object',
            Params={'Bucket': bucket, 'Key': key},
            ExpiresIn=3600,
        )
        return JSONResponse({"url": url})
    except Exception as e:
        # Boundary handler: surface any failure as a 500, keeping the cause chained.
        raise HTTPException(status_code=500, detail=f"Failed to generate S3 URL: {e}") from e
|
|
||
|
|
||
@router.delete("/{podcast_id}")
def delete_podcast(podcast_id: uuid.UUID, session: SessionDep, current_user: CurrentUser) -> dict[str, str]:
    """Delete a podcast and, on a best-effort basis, its stored audio.

    Only a superuser or the owner of the podcast's course may delete it.
    Failure to remove the underlying media (local file or S3 object) is logged
    and does not prevent the database row from being deleted.

    Raises:
        HTTPException: 404 if the podcast does not exist; 403 if the caller is
            neither a superuser nor the owner of the podcast's course.
    """
    # Function-scope import: this module has no module-level logger.
    import logging

    log = logging.getLogger(__name__)

    pod = session.exec(
        select(Podcast).where(Podcast.id == podcast_id).options(selectinload(Podcast.course))  # type: ignore
    ).first()

    if not pod:
        raise HTTPException(status_code=404, detail="Podcast not found")

    # Permission: owner or superuser. Fail closed: when the course cannot be
    # resolved, ownership cannot be proven, so only a superuser may proceed.
    if not current_user.is_superuser:
        course = getattr(pod, "course", None)
        if course is None or course.owner_id != current_user.id:
            raise HTTPException(status_code=403, detail="Not enough permissions to delete this podcast")

    # Best-effort delete of underlying media
    try:
        if pod.storage_backend == "local" and pod.audio_path:
            try:
                os.remove(pod.audio_path)
            except FileNotFoundError:
                # Already gone — that is the desired end state.
                pass
        elif pod.storage_backend == "s3" and pod.audio_path:
            import boto3

            bucket = settings.S3_BUCKET_NAME
            if bucket:
                # audio_path may be a bare key or a full s3://bucket/key URI.
                key = pod.audio_path.removeprefix(f"s3://{bucket}/")
                s3 = boto3.client(
                    "s3",
                    aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
                    aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
                    region_name=settings.AWS_REGION,
                )
                s3.delete_object(Bucket=bucket, Key=key)
            else:
                log.warning(
                    "S3 bucket not configured; leaving media in place for podcast %s",
                    podcast_id,
                )
    except Exception:
        # Media cleanup must never block the row delete, but leave a trace so
        # orphaned files/objects can be found later.
        log.warning("Failed to delete media for podcast %s", podcast_id, exc_info=True)

    session.delete(pod)
    session.commit()
    return {"message": "Podcast deleted successfully"}
Uh oh!
There was an error while loading. Please reload this page.