pythoncode-tutorials/machine-learning/speech-recognition/speech_to_text_2026.py at master · x4nth055/pythoncode-tutorials · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
from __future__ import annotations

import argparse
import os
import shutil
import subprocess
import tempfile
import wave
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Literal


@dataclass(slots=True)
class Segment:
    """One transcribed audio segment.

    Built by ``transcribe_with_faster_whisper`` from the model's raw segments
    and consumed by ``write_srt`` to emit subtitle cues. ``slots=True`` means
    instances carry no per-instance ``__dict__``.
    """

    # Start offset of the segment, in seconds (write_srt passes it to
    # seconds_to_srt_time).
    start: float
    # End offset of the segment, in seconds.
    end: float
    # Transcribed text, stored exactly as the engine returned it; write_srt
    # strips surrounding whitespace before writing.
    text: str


def seconds_to_srt_time(seconds: float) -> str:
    """Format a duration in seconds as an SRT timestamp.

    The value is rounded to the nearest millisecond and rendered as
    ``HH:MM:SS,mmm``. The hour field is at least two digits wide and grows
    as needed for durations of 100 hours or more.
    """
    total_ms = round(seconds * 1000)
    # Peel units off from the smallest to the largest.
    total_secs, ms_part = divmod(total_ms, 1000)
    total_mins, sec_part = divmod(total_secs, 60)
    hour_part, min_part = divmod(total_mins, 60)
    return f"{hour_part:02d}:{min_part:02d}:{sec_part:02d},{ms_part:03d}"


def write_srt(segments: Iterable[Segment], output_path: str | Path) -> None:
    """Serialize transcript segments as an SRT subtitle file.

    Segments whose text is empty after stripping whitespace are skipped and
    do not consume a cue number. Cues are numbered from 1 in iteration order.
    The file is written as UTF-8; an empty input produces an empty file.
    """
    cues: list[str] = []
    for segment in segments:
        caption = segment.text.strip()
        if caption:
            number = len(cues) + 1
            begin = seconds_to_srt_time(segment.start)
            finish = seconds_to_srt_time(segment.end)
            # Each cue ends with a newline; joining on "\n" then yields the
            # blank separator line between consecutive cues.
            cues.append(f"{number}\n{begin} --> {finish}\n{caption}\n")
    Path(output_path).write_text("\n".join(cues), encoding="utf-8")


def convert_to_wav(input_path: str | Path, output_path: str | Path, sample_rate: int = 16_000) -> None:
    """Convert any FFmpeg-readable audio/video file to mono 16 kHz WAV."""
    if not shutil.which("ffmpeg"):
        raise RuntimeError("FFmpeg is required. Install it from https://ffmpeg.org/.")

    subprocess.run(
        [
            "ffmpeg",
            "-y",
            "-i",
            str(input_path),
            "-ac",
            "1",
            "-ar",
            str(sample_rate),
            "-vn",
            str(output_path),
        ],
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )


def chunk_wav(input_wav: str | Path, chunk_seconds: int = 600) -> list[Path]:
    """Split a WAV file into fixed-size chunks without loading the whole file in memory."""
    input_wav = Path(input_wav)
    if chunk_seconds <= 0:
        raise ValueError("chunk_seconds must be positive")

    output_dir = input_wav.parent / f"{input_wav.stem}_chunks"
    output_dir.mkdir(parents=True, exist_ok=True)

    chunks: list[Path] = []
    with wave.open(str(input_wav), "rb") as reader:
        params = reader.getparams()
        frames_per_chunk = int(params.framerate * chunk_seconds)
        index = 1
        while True:
            frames = reader.readframes(frames_per_chunk)
            if not frames:
                break
            chunk_path = output_dir / f"chunk_{index:04d}.wav"
            with wave.open(str(chunk_path), "wb") as writer:
                writer.setparams(params)
                writer.writeframes(frames)
            chunks.append(chunk_path)
            index += 1
    return chunks


def transcribe_with_openai(
    audio_path: str | Path,
    *,
    model: str = "gpt-4o-transcribe",
    language: str | None = None,
    prompt: str | None = None,
) -> str:
    """Send one audio file to OpenAI's transcription endpoint and return the text.

    ``language`` and ``prompt`` are forwarded only when they are non-empty.
    The OpenAI SDK is imported lazily; a missing SDK raises ``RuntimeError``
    with install instructions. The client is created with no arguments.
    """
    try:
        from openai import OpenAI
    except ImportError as exc:
        raise RuntimeError("Install the OpenAI SDK first: pip install openai") from exc

    request: dict[str, object] = {"model": model}
    # Forward the optional hints only when the caller actually supplied them.
    for option, value in (("language", language), ("prompt", prompt)):
        if value:
            request[option] = value

    api = OpenAI()
    with Path(audio_path).open("rb") as stream:
        response = api.audio.transcriptions.create(file=stream, **request)
    return response.text


def transcribe_large_file_with_openai(
    input_path: str | Path,
    *,
    model: str = "gpt-4o-transcribe",
    language: str | None = None,
    prompt: str | None = None,
    chunk_seconds: int = 600,
) -> str:
    """Transcribe a long recording with OpenAI by splitting it into chunks.

    The input is converted to WAV with ``convert_to_wav``, split with
    ``chunk_wav`` and each chunk is sent through ``transcribe_with_openai``
    with the same model, language and prompt. All intermediate files live in
    a temporary directory that is removed before returning.

    Returns the non-empty chunk transcripts, stripped and joined by newlines.
    """
    pieces: list[str] = []
    with tempfile.TemporaryDirectory() as workdir:
        wav_file = Path(workdir) / "audio.wav"
        convert_to_wav(input_path, wav_file)
        # Every chunk must be transcribed before the temp directory goes away.
        for chunk in chunk_wav(wav_file, chunk_seconds=chunk_seconds):
            piece = transcribe_with_openai(chunk, model=model, language=language, prompt=prompt)
            pieces.append(piece)
    cleaned = (piece.strip() for piece in pieces)
    return "\n".join(piece for piece in cleaned if piece)


def transcribe_with_groq(
    audio_path: str | Path,
    *,
    model: str = "whisper-large-v3-turbo",
    language: str | None = None,
    prompt: str | None = None,
) -> str:
    """Send one audio file to Groq's transcription endpoint and return the text.

    The request always uses ``temperature=0.0``; ``language`` and ``prompt``
    are forwarded only when non-empty. The API key is read from the
    ``GROQ_API_KEY`` environment variable. A missing Groq SDK raises
    ``RuntimeError`` with install instructions.
    """
    try:
        from groq import Groq
    except ImportError as exc:
        raise RuntimeError("Install the Groq SDK first: pip install groq") from exc

    request: dict[str, object] = {"model": model, "temperature": 0.0}
    # Forward the optional hints only when the caller actually supplied them.
    for option, value in (("language", language), ("prompt", prompt)):
        if value:
            request[option] = value

    api = Groq(api_key=os.environ.get("GROQ_API_KEY"))
    with Path(audio_path).open("rb") as stream:
        response = api.audio.transcriptions.create(file=stream, **request)
    return response.text


def transcribe_with_faster_whisper(
    audio_path: str | Path,
    *,
    model_size: str = "large-v3",
    device: Literal["auto", "cpu", "cuda"] = "auto",
    compute_type: str = "auto",
    language: str | None = None,
) -> tuple[str, list[Segment]]:
    """Transcribe an audio file on this machine using Faster-Whisper.

    ``device="auto"`` picks ``"cuda"`` when ``_cuda_is_available()`` reports a
    GPU and ``"cpu"`` otherwise. ``compute_type="auto"`` picks ``"float16"``
    on CUDA and ``"int8"`` elsewhere. Decoding uses beam size 5 with the VAD
    filter enabled (500 ms minimum silence). ``language`` is passed through
    only when non-empty.

    Returns ``(text, segments)``: the concatenated, stripped transcript and
    the per-segment timings as ``Segment`` objects. A missing Faster-Whisper
    install raises ``RuntimeError``.
    """
    try:
        from faster_whisper import WhisperModel
    except ImportError as exc:
        raise RuntimeError("Install Faster-Whisper first: pip install faster-whisper") from exc

    # Resolve the "auto" placeholders; the compute type depends on the device.
    if device == "auto":
        if _cuda_is_available():
            device = "cuda"
        else:
            device = "cpu"
    if compute_type == "auto":
        if device == "cuda":
            compute_type = "float16"
        else:
            compute_type = "int8"

    whisper = WhisperModel(model_size, device=device, compute_type=compute_type)
    options: dict[str, object] = {
        "beam_size": 5,
        "vad_filter": True,
        "vad_parameters": {"min_silence_duration_ms": 500},
        **({"language": language} if language else {}),
    }

    raw_segments, _info = whisper.transcribe(str(audio_path), **options)
    collected: list[Segment] = []
    pieces: list[str] = []
    for raw in raw_segments:
        item = Segment(start=raw.start, end=raw.end, text=raw.text)
        collected.append(item)
        pieces.append(item.text)
    return "".join(pieces).strip(), collected


def record_microphone(output_path: str | Path = "microphone.wav", seconds: int = 8, sample_rate: int = 16_000) -> Path:
    """Capture mono 16-bit audio from the microphone and save it as a WAV file.

    Blocks until ``seconds`` of audio have been recorded at ``sample_rate`` Hz,
    printing a message before recording and after saving. ``sounddevice`` and
    ``scipy`` are imported lazily; if either is missing a ``RuntimeError`` with
    install instructions is raised.

    Returns the path the recording was written to.
    """
    try:
        import sounddevice as sd
        from scipy.io.wavfile import write as save_wav
    except ImportError as exc:
        raise RuntimeError("Install microphone dependencies: pip install sounddevice scipy") from exc

    destination = Path(output_path)
    print(f"Recording for {seconds} seconds...")
    frame_count = int(seconds * sample_rate)
    samples = sd.rec(frame_count, samplerate=sample_rate, channels=1, dtype="int16")
    # rec() returns immediately; wait() blocks until the buffer is filled.
    sd.wait()
    save_wav(destination, sample_rate, samples)
    print(f"Saved recording to {destination}")
    return destination


def _cuda_is_available() -> bool:
    """Return True when PyTorch sees a CUDA GPU, without requiring torch at install time."""
    try:
        import torch

        return bool(torch.cuda.is_available())
    except Exception:
        return False


def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(description="Transcribe audio to text in Python.")
    parser.add_argument("audio", nargs="?", help="Path to an audio/video file")
    parser.add_argument("--engine", choices=("openai", "groq", "faster-whisper"), default="faster-whisper")
    parser.add_argument("--model", default=None, help="Model name. Defaults depend on the engine.")
    parser.add_argument("--language", default=None, help="Optional ISO-639-1 language hint, e.g. en, fr, es")
    parser.add_argument("--prompt", default=None, help="Optional context prompt for API transcription")
    parser.add_argument("--srt", default=None, help="Optional .srt output path (Faster-Whisper engine)")
    parser.add_argument("--long", action="store_true", help="Convert/chunk long files before OpenAI transcription")
    parser.add_argument("--chunk-seconds", type=int, default=600, help="Chunk size for --long, default: 600")
    parser.add_argument("--record", type=int, metavar="SECONDS", help="Record from microphone first")
    args = parser.parse_args(argv)

    audio_path: Path
    if args.record:
        audio_path = record_microphone(seconds=args.record)
    else:
        if not args.audio:
            parser.error("provide an audio file or use --record SECONDS")
        audio_path = Path(args.audio)
        if not audio_path.exists():
            parser.error(f"File not found: {audio_path}")

    if args.engine == "openai":
        if args.long:
            print(transcribe_large_file_with_openai(
                audio_path,
                model=args.model or "gpt-4o-transcribe",
                language=args.language,
                prompt=args.prompt,
                chunk_seconds=args.chunk_seconds,
            ))
        else:
            print(transcribe_with_openai(
                audio_path,
                model=args.model or "gpt-4o-transcribe",
                language=args.language,
                prompt=args.prompt,
            ))
        return 0

    if args.engine == "groq":
        print(transcribe_with_groq(
            audio_path,
            model=args.model or "whisper-large-v3-turbo",
            language=args.language,
            prompt=args.prompt,
        ))
        return 0

    text, segments = transcribe_with_faster_whisper(
        audio_path,
        model_size=args.model or "large-v3",
        language=args.language,
    )
    print(text)
    if args.srt:
        write_srt(segments, args.srt)
    return 0


# Run the CLI only when this file is executed as a script (not on import);
# main()'s integer return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())