import logging
from dataclasses import dataclass, field

from faster_whisper import WhisperModel

logger = logging.getLogger(__name__)


# slots=True: WordInfo is created once per recognized word, so dropping the
# per-instance __dict__ is a meaningful memory saving. Requires Python 3.10+,
# which this file already assumes (PEP 604 `str | None` unions below).
@dataclass(slots=True)
class WordInfo:
    """Single word with timing and confidence."""

    word: str           # word text, surrounding whitespace stripped
    start: float        # start time in seconds
    end: float          # end time in seconds
    probability: float  # model confidence for this word


@dataclass(slots=True)
class Segment:
    """ASR segment with text, timing, and optional word-level detail."""

    start: float
    end: float
    text: str
    words: list[WordInfo] = field(default_factory=list)


class WhisperEngine:
    """Speech recognition engine based on faster-whisper."""

    def __init__(self, model_name: str, device: str, compute_type: str) -> None:
        """Load the Whisper model into memory.

        Args:
            model_name: faster-whisper model identifier (e.g. "base", "large-v3").
            device: Inference device passed through to faster-whisper
                (e.g. "cpu", "cuda").
            compute_type: Precision/quantization mode passed through to
                faster-whisper (e.g. "int8", "float16").
        """
        logger.info("Loading Whisper model: %s on %s (%s)", model_name, device, compute_type)
        self._model = WhisperModel(model_name, device=device, compute_type=compute_type)

    def transcribe(
        self,
        audio_path: str,
        language: str | None = None,
        beam_size: int = 5,
        vad_filter: bool = True,
    ) -> list[Segment]:
        """Transcribe audio file and return list of segments.

        Args:
            audio_path: Path to WAV file.
            language: Language code or None for auto-detection.
            beam_size: Beam search width.
            vad_filter: Whether to enable VAD filtering.

        Returns:
            List of transcription segments with word-level timestamps.
        """
        logger.info("Transcribing: %s", audio_path)
        segments_gen, info = self._model.transcribe(
            audio_path,
            language=language,
            beam_size=beam_size,
            word_timestamps=True,
            vad_filter=vad_filter,
            vad_parameters={"min_silence_duration_ms": 500},
            # Greedy/deterministic decoding; do not let previously decoded
            # text bias later segments (reduces hallucination loops).
            temperature=0.0,
            condition_on_previous_text=False,
            no_speech_threshold=0.6,
            log_prob_threshold=-1.0,
        )
        logger.info(
            "Detected language: %s (%.2f), duration: %.1fs",
            info.language,
            info.language_probability,
            info.duration,
        )
        # faster-whisper returns a lazy generator; the actual decoding work
        # happens while this comprehension consumes it.
        results = [self._to_segment(seg) for seg in segments_gen]
        logger.info("Transcription complete: %d segments", len(results))
        return results

    @staticmethod
    def _to_segment(seg) -> Segment:
        """Convert one faster-whisper segment into a plain Segment dataclass."""
        words = [
            WordInfo(
                word=w.word.strip(),
                start=w.start,
                end=w.end,
                probability=w.probability,
            )
            # seg.words may be None when the backend yields no word detail.
            for w in (seg.words or [])
        ]
        return Segment(
            start=seg.start,
            end=seg.end,
            text=seg.text.strip(),
            words=words,
        )