import os
from dataclasses import dataclass, field
from pathlib import Path

from dotenv import load_dotenv


@dataclass
class TranscriberConfig:
    """Settings for the transcription pipeline.

    Creating an instance is not side-effect free; ``__post_init__``:

    * calls ``load_dotenv()``;
    * fills ``hf_token`` from the ``HF_TOKEN`` environment variable when no
      token was supplied;
    * replaces ``compute_type`` with ``"int8"`` when ``device`` is ``"cpu"``;
    * creates ``output_dir`` (including missing parents) on disk.

    Raises:
        ValueError: If no Hugging Face token is available from either the
            ``hf_token`` argument or the ``HF_TOKEN`` environment variable.
    """

    input_path: str = ""
    # Created on disk (with parents) during __post_init__.
    output_dir: str = "./output"
    model: str = "large-v3"
    device: str = "cuda"
    # Overwritten with "int8" in __post_init__ whenever device == "cpu".
    compute_type: str = "float16"
    language: str = "ru"
    beam_size: int = 5
    vad: bool = True
    max_speakers: int | None = None
    min_speakers: int | None = None
    # default_factory gives every instance its own list.
    formats: list[str] = field(default_factory=lambda: ["txt", "json"])
    pause_threshold: float = 1.5  # presumably seconds -- TODO confirm
    chunk_duration: int = 1800  # presumably seconds -- TODO confirm
    # Falls back to the HF_TOKEN environment variable when left empty.
    hf_token: str = ""

    def __post_init__(self) -> None:
        """Resolve the token, adjust CPU settings and create the output dir."""
        load_dotenv()

        # An explicitly supplied token wins; otherwise consult the environment.
        resolved_token = self.hf_token or os.getenv("HF_TOKEN", "")
        self.hf_token = resolved_token
        if not resolved_token:
            message = (
                "HF_TOKEN is required for pyannote diarization. "
                "Set it in .env or pass via --hf-token"
            )
            raise ValueError(message)

        running_on_cpu = self.device == "cpu"
        if running_on_cpu:
            self.compute_type = "int8"

        output_root = Path(self.output_dir)
        output_root.mkdir(parents=True, exist_ok=True)