94 lines
2.6 KiB
Python
94 lines
2.6 KiB
Python
import logging
|
|
from dataclasses import dataclass, field
|
|
|
|
from faster_whisper import WhisperModel
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class WordInfo:
    """Single word with timing and confidence.

    Attributes:
        word: Text of the word.
        start: Start time of the word (presumably seconds from the start
            of the audio -- confirm against the recognizer's output).
        end: End time of the word, in the same units as ``start``.
        probability: Confidence value reported for the word.
    """

    word: str
    start: float
    end: float
    probability: float
|
|
|
|
|
|
@dataclass
class Segment:
    """ASR segment with text, timing, and optional word-level detail.

    Attributes:
        start: Start time of the segment (presumably seconds -- confirm
            against the recognizer's output).
        end: End time of the segment, in the same units as ``start``.
        text: Transcribed text of the segment.
        words: Per-word details; defaults to a fresh empty list for each
            instance when not supplied.
    """

    start: float
    end: float
    text: str
    # default_factory gives every instance its own list rather than a
    # single shared mutable default.
    words: list[WordInfo] = field(default_factory=list)
|
|
|
|
|
|
class WhisperEngine:
    """Speech recognition engine based on faster-whisper."""

    def __init__(self, model_name: str, device: str, compute_type: str):
        """Load the Whisper model.

        Args:
            model_name: Model identifier, passed straight to ``WhisperModel``.
            device: Device string, forwarded as ``device=``.
            compute_type: Compute type string, forwarded as ``compute_type=``.
        """
        logger.info(
            "Loading Whisper model: %s on %s (%s)",
            model_name, device, compute_type,
        )
        self._model = WhisperModel(model_name, device=device, compute_type=compute_type)

    def transcribe(
        self,
        audio_path: str,
        language: str | None = None,
        beam_size: int = 5,
        vad_filter: bool = True,
    ) -> list[Segment]:
        """Transcribe audio file and return list of segments.

        Args:
            audio_path: Path to WAV file.
            language: Language code or None for auto-detection.
            beam_size: Beam search width.
            vad_filter: Whether to enable VAD filtering.

        Returns:
            List of transcription segments with word-level timestamps.
        """
        logger.info("Transcribing: %s", audio_path)

        # Apart from the caller-supplied arguments, the decoding options are
        # fixed here rather than exposed as parameters.
        segments_gen, info = self._model.transcribe(
            audio_path,
            language=language,
            beam_size=beam_size,
            word_timestamps=True,
            vad_filter=vad_filter,
            vad_parameters={"min_silence_duration_ms": 500},
            temperature=0.0,
            condition_on_previous_text=False,
            no_speech_threshold=0.6,
            log_prob_threshold=-1.0,
        )

        logger.info(
            "Detected language: %s (%.2f), duration: %.1fs",
            info.language, info.language_probability, info.duration,
        )

        # NOTE(review): faster-whisper documents the returned segments as a
        # lazy generator, so the decoding work presumably happens while this
        # list is being built -- confirm against the library docs.
        results = [self._to_segment(seg) for seg in segments_gen]

        logger.info("Transcription complete: %d segments", len(results))
        return results

    @staticmethod
    def _to_segment(seg) -> Segment:
        """Convert one recognizer segment into a ``Segment`` with its words."""
        words = [
            WordInfo(
                word=w.word.strip(),
                start=w.start,
                end=w.end,
                probability=w.probability,
            )
            # seg.words may be falsy (e.g. None); treat that as "no words".
            for w in (seg.words or [])
        ]
        return Segment(
            start=seg.start,
            end=seg.end,
            text=seg.text.strip(),
            words=words,
        )
|