Skip to content

Commit 6fb5cb2

Browse files
feat: search backend (#537)
* docs: transient docs * chore: cleanup * webvtt WIP * webvtt field * chore: webvtt tests comments * chore: remove useless tests * feat: search TASK.md * feat: full text search by title/webvtt * chore: search api task * feat: search api * feat: search API * chore: rm task md * chore: roll back unnecessary validators * chore: pr review WIP * chore: pr review WIP * chore: pr review * chore: top imports * feat: better lint + ci * feat: better lint + ci * feat: better lint + ci * feat: better lint + ci * chore: lint * chore: lint * fix: db datetime definitions * fix: flush() params * fix: update transcript mutability expectation / test * fix: update transcript mutability expectation / test * chore: auto review * chore: new controller extraction * chore: new controller extraction * chore: cleanup * chore: review WIP * chore: pr WIP * chore: remove ci lint * chore: openapi regeneration * chore: openapi regeneration * chore: postgres test doc * fix: .dockerignore for arm binaries * fix: .dockerignore for arm binaries * fix: cap test loops * fix: cap test loops * fix: cap test loops * fix: get_transcript_topics * chore: remove flow.md docs and claude guidance * chore: remove claude.md db doc * chore: remove claude.md db doc * chore: remove claude.md db doc * chore: remove claude.md db doc
1 parent a42ed12 commit 6fb5cb2

29 files changed

+3240
-1520
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ data/
1414
www/REFACTOR.md
1515
www/reload-frontend
1616
server/test.sqlite
17+
CLAUDE.local.md

.pre-commit-config.yaml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
repos:
44
- repo: local
55
hooks:
6-
- id: yarn-format
7-
name: run yarn format
6+
- id: format
7+
name: run format
88
language: system
9-
entry: bash -c 'cd www && yarn format'
9+
entry: bash -c 'cd www && npx prettier --write .'
1010
pass_filenames: false
1111
files: ^www/
1212

@@ -23,8 +23,7 @@ repos:
2323
- id: ruff
2424
args:
2525
- --fix
26-
- --select
27-
- I,F401
26+
# Uses select rules from server/pyproject.toml
2827
files: ^server/
2928
- id: ruff-format
3029
files: ^server/

compose.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ services:
4444
working_dir: /app
4545
volumes:
4646
- ./www:/app/
47+
- /app/node_modules
4748
env_file:
4849
- ./www/.env.local
4950

server/migrations/README

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
Generic single-database configuration.
1+
Generic single-database configuration.
2+
3+
Both data migrations and schema migrations must be in migrations.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
"""add_webvtt_field_to_transcript
2+
3+
Revision ID: 0bc0f3ff0111
4+
Revises: b7df9609542c
5+
Create Date: 2025-08-05 19:36:41.740957
6+
7+
"""
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
import sqlalchemy as sa
12+
13+
14+
# Revision identifiers, used by Alembic.
revision: str = "0bc0f3ff0111"
down_revision: Union[str, None] = "b7df9609542c"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
18+
19+
20+
def upgrade() -> None:
    """Add a nullable ``webvtt`` text column to the ``transcript`` table."""
    webvtt_column = sa.Column("webvtt", sa.Text(), nullable=True)
    op.add_column("transcript", webvtt_column)
24+
25+
26+
def downgrade() -> None:
    """Drop the ``webvtt`` column from the ``transcript`` table."""
    table_name, column_name = "transcript", "webvtt"
    op.drop_column(table_name, column_name)
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""add_full_text_search
2+
3+
Revision ID: 116b2f287eab
4+
Revises: 0bc0f3ff0111
5+
Create Date: 2025-08-07 11:27:38.473517
6+
7+
"""
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
import sqlalchemy as sa
12+
13+
14+
# Revision identifiers, used by Alembic.
revision: str = "116b2f287eab"
down_revision: Union[str, None] = "0bc0f3ff0111"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
18+
19+
20+
def upgrade() -> None:
    """Add the English full-text search column and its GIN index.

    The migration is a no-op unless the bound connection's dialect is
    PostgreSQL. On PostgreSQL it adds ``search_vector_en``, a stored
    generated ``tsvector`` built from ``title`` (weight A) and ``webvtt``
    (weight B), then indexes that column with GIN.
    """
    dialect_name = op.get_bind().dialect.name
    if dialect_name != "postgresql":
        return

    add_search_vector_sql = """
        ALTER TABLE transcript ADD COLUMN search_vector_en tsvector
        GENERATED ALWAYS AS (
            setweight(to_tsvector('english', coalesce(title, '')), 'A') ||
            setweight(to_tsvector('english', coalesce(webvtt, '')), 'B')
        ) STORED
    """
    op.execute(add_search_vector_sql)

    indexed_columns = ["search_vector_en"]
    op.create_index(
        "idx_transcript_search_vector_en",
        "transcript",
        indexed_columns,
        postgresql_using="gin",
    )
39+
40+
41+
def downgrade() -> None:
    """Remove the full-text search index and column.

    The migration is a no-op unless the bound connection's dialect is
    PostgreSQL. The index is dropped before the column it covers.
    """
    dialect_name = op.get_bind().dialect.name
    if dialect_name != "postgresql":
        return

    index_name = "idx_transcript_search_vector_en"
    op.drop_index(index_name, table_name="transcript")
    op.drop_column("transcript", "search_vector_en")
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
"""populate_webvtt_from_topics
2+
3+
Revision ID: 8120ebc75366
4+
Revises: 116b2f287eab
5+
Create Date: 2025-08-11 19:11:01.316947
6+
7+
"""
8+
import json
9+
from typing import Sequence, Union
10+
11+
from alembic import op
12+
import sqlalchemy as sa
13+
from sqlalchemy import text
14+
15+
16+
# Revision identifiers, used by Alembic.
revision: str = "8120ebc75366"
down_revision: Union[str, None] = "116b2f287eab"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
21+
22+
23+
def topics_to_webvtt(topics):
    """Convert a list of topic mappings to a WebVTT document string.

    Each topic is read with ``.get`` for the keys ``"start"``, ``"end"``
    and ``"text"``. A topic produces a cue only when both timestamps
    format to a non-empty value and the stripped text is non-empty;
    other topics are skipped.

    Args:
        topics: Iterable of dict-like topics, or a falsy value.

    Returns:
        ``None`` when ``topics`` is falsy; otherwise the WebVTT text,
        which is just the ``WEBVTT`` header if no topic yielded a cue.
    """
    if not topics:
        return None

    lines = ["WEBVTT", ""]

    for topic in topics:
        start_time = format_timestamp(topic.get("start"))
        end_time = format_timestamp(topic.get("end"))
        # `or ""` also covers a "text" key that is present but null, which
        # `.get("text", "")` would hand back as None and crash on `.strip()`.
        # The local is not called `text` so it cannot be confused with the
        # SQLAlchemy `text` imported at module level.
        cue_text = (topic.get("text") or "").strip()

        if start_time and end_time and cue_text:
            lines.extend((f"{start_time} --> {end_time}", cue_text, ""))

    return "\n".join(lines).strip()
41+
42+
43+
def format_timestamp(seconds):
    """Format seconds to WebVTT timestamp format (HH:MM:SS.mmm).

    The value is rounded to whole milliseconds *before* it is split into
    fields, so rounding carries into the larger units. Formatting the
    seconds field on its own would turn 59.9996 into ``00:00:60.000``,
    which is not a valid timestamp.

    Args:
        seconds: Offset in seconds (int or float), or ``None``.

    Returns:
        The formatted timestamp string, or ``None`` when ``seconds`` is
        ``None``.
    """
    if seconds is None:
        return None

    total_millis = round(seconds * 1000)
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
53+
54+
55+
def upgrade() -> None:
    """Populate WebVTT field for all transcripts with topics.

    Reads every transcript whose ``topics`` is not NULL, converts the
    topics to WebVTT with ``topics_to_webvtt`` and stores the result in
    ``webvtt``. Rows whose topics cannot be parsed or converted are
    counted, reported and skipped. Database errors are NOT swallowed:
    the UPDATE runs outside the ``try`` so a failing statement stops the
    migration with its real cause instead of being logged per row.
    """
    connection = op.get_bind()

    rows = connection.execute(
        text("SELECT id, topics FROM transcript WHERE topics IS NOT NULL")
    ).fetchall()
    print(f"Found {len(rows)} transcripts with topics")

    # Built once; only the bound parameters change between rows.
    update_stmt = text("UPDATE transcript SET webvtt = :webvtt WHERE id = :id")

    updated_count = 0
    error_count = 0

    for row in rows:
        transcript_id = row[0]
        topics_data = row[1]

        if not topics_data:
            continue

        # Only the pure-Python conversion is best-effort: malformed topic
        # data in one row must not block the others.
        try:
            # Parse JSON if the driver returned it as a string.
            if isinstance(topics_data, str):
                topics_data = json.loads(topics_data)

            webvtt_content = topics_to_webvtt(topics_data)
        except Exception as e:
            error_count += 1
            print(f"✗ Error updating transcript {transcript_id}: {e}")
            continue

        if not webvtt_content:
            continue

        connection.execute(
            update_stmt,
            {"webvtt": webvtt_content, "id": transcript_id},
        )
        updated_count += 1
        print(f"✓ Updated transcript {transcript_id}")

    print("\nMigration complete!")
    print(f"  Updated: {updated_count}")
    print(f"  Errors: {error_count}")
103+
104+
105+
def downgrade() -> None:
    """Reset ``webvtt`` to NULL on every transcript row."""
    clear_webvtt = text("UPDATE transcript SET webvtt = NULL")
    op.execute(clear_webvtt)

server/pyproject.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ dependencies = [
4040
"llama-index>=0.12.52",
4141
"llama-index-llms-openai-like>=0.4.0",
4242
"pytest-env>=1.1.5",
43+
"webvtt-py>=0.5.0",
4344
]
4445

4546
[dependency-groups]
@@ -92,5 +93,12 @@ addopts = "-ra -q --disable-pytest-warnings --cov --cov-report html -v"
9293
testpaths = ["tests"]
9394
asyncio_mode = "auto"
9495

96+
[tool.ruff.lint]
97+
select = [
98+
"I", # isort - import sorting
99+
"F401", # unused imports
100+
"PLC0415", # import-outside-top-level - detect inline imports
101+
]
102+
95103
[tool.ruff.lint.per-file-ignores]
96104
"reflector/processors/summary/summary_builder.py" = ["E501"]

0 commit comments

Comments
 (0)