# transcribe-interview/transcriber/asr/whisper_engine.py
# (94 lines, 2.6 KiB, Python)

import logging
from dataclasses import dataclass, field
from faster_whisper import WhisperModel
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass
class WordInfo:
    """One recognized word together with its timing and confidence.

    Instances are built by ``WhisperEngine.transcribe`` from the word
    entries of each ASR segment.
    """

    # Word text (the engine strips surrounding whitespace before storing it).
    word: str
    # Start time of the word as reported by the ASR model.
    start: float
    # End time of the word as reported by the ASR model.
    end: float
    # Confidence value reported by the ASR model for this word.
    probability: float
@dataclass
class Segment:
    """One ASR segment: its text, its time span, and optional per-word detail."""

    # Start time of the segment as reported by the ASR model.
    start: float
    # End time of the segment as reported by the ASR model.
    end: float
    # Segment text.
    text: str
    # Word-level entries; every instance gets its own fresh empty list
    # when none is supplied.
    words: list[WordInfo] = field(default_factory=list)
class WhisperEngine:
    """Speech-to-text engine that wraps a faster-whisper ``WhisperModel``."""

    def __init__(self, model_name: str, device: str, compute_type: str):
        """Load the underlying model.

        Args:
            model_name: Model identifier handed to ``WhisperModel``.
            device: Forwarded to ``WhisperModel`` as ``device``.
            compute_type: Forwarded to ``WhisperModel`` as ``compute_type``.
        """
        logger.info("Loading Whisper model: %s on %s (%s)", model_name, device, compute_type)
        self._model = WhisperModel(model_name, device=device, compute_type=compute_type)

    @staticmethod
    def _build_segment(raw) -> Segment:
        """Convert one raw model segment into a ``Segment`` with stripped text."""
        # The raw segment's ``words`` may be None/empty; treat that as no words.
        word_infos = [
            WordInfo(
                word=raw_word.word.strip(),
                start=raw_word.start,
                end=raw_word.end,
                probability=raw_word.probability,
            )
            for raw_word in (raw.words or [])
        ]
        return Segment(
            start=raw.start,
            end=raw.end,
            text=raw.text.strip(),
            words=word_infos,
        )

    def transcribe(
        self,
        audio_path: str,
        language: str | None = None,
        beam_size: int = 5,
        vad_filter: bool = True,
    ) -> list[Segment]:
        """Run recognition on an audio file and collect every segment.

        Word timestamps are always requested. Decoding uses a fixed
        temperature of 0.0 and does not condition on previous text.

        Args:
            audio_path: Path to WAV file.
            language: Language code or None for auto-detection.
            beam_size: Beam search width.
            vad_filter: Whether to enable VAD filtering.

        Returns:
            List of transcription segments with word-level timestamps;
            segment and word texts are stripped of surrounding whitespace.
        """
        logger.info("Transcribing: %s", audio_path)

        # Fixed decoding configuration plus the caller-selected options.
        decode_options = dict(
            language=language,
            beam_size=beam_size,
            word_timestamps=True,
            vad_filter=vad_filter,
            vad_parameters={"min_silence_duration_ms": 500},
            temperature=0.0,
            condition_on_previous_text=False,
            no_speech_threshold=0.6,
            log_prob_threshold=-1.0,
        )
        raw_segments, info = self._model.transcribe(audio_path, **decode_options)

        logger.info(
            "Detected language: %s (%.2f), duration: %.1fs",
            info.language, info.language_probability, info.duration,
        )

        # Iterating the returned segments is what fills the result list.
        collected = [self._build_segment(raw) for raw in raw_segments]

        logger.info("Transcription complete: %d segments", len(collected))
        return collected