"""Command-line entry point for audio transcription with speaker diarization."""

import argparse
import logging
import sys
from typing import Optional, Sequence

from transcriber.config import TranscriberConfig
from transcriber.pipeline import TranscriptionPipeline

logger = logging.getLogger(__name__)


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, which
            keeps the original zero-argument call working unchanged.

    Returns:
        Namespace holding one attribute per option defined below.

    Raises:
        SystemExit: Raised by argparse (status 2) on invalid arguments,
            including ``--min-speakers`` greater than ``--max-speakers``.
    """
    parser = argparse.ArgumentParser(
        description="Transcribe audio with speaker diarization",
    )
    parser.add_argument(
        "input",
        help="Path to audio file (.m4a, .mp3, .wav, .aac)",
    )
    parser.add_argument(
        "--output",
        default="./output",
        help="Output directory (default: ./output)",
    )
    parser.add_argument(
        "--model",
        default="large-v3",
        help="Whisper model name (default: large-v3)",
    )
    parser.add_argument(
        "--device",
        default="cuda",
        choices=["cuda", "cpu"],
        help="Device (default: cuda)",
    )
    parser.add_argument(
        "--compute-type",
        default="float16",
        help="Compute type (default: float16)",
    )
    parser.add_argument(
        "--language",
        default="ru",
        help="Language code (default: ru)",
    )
    parser.add_argument(
        "--beam-size",
        type=int,
        default=5,
        help="Beam search size (default: 5)",
    )
    parser.add_argument(
        "--vad",
        default="on",
        choices=["on", "off"],
        help="VAD filter (default: on)",
    )
    parser.add_argument(
        "--max-speakers",
        type=int,
        default=None,
        help="Maximum number of speakers",
    )
    parser.add_argument(
        "--min-speakers",
        type=int,
        default=None,
        help="Minimum number of speakers",
    )
    parser.add_argument(
        "--format",
        nargs="+",
        default=["txt", "json"],
        help="Output formats (default: txt json)",
    )
    parser.add_argument(
        "--pause-threshold",
        type=float,
        default=1.5,
        help="Max pause for merging (default: 1.5s)",
    )
    parser.add_argument(
        "--chunk-duration",
        type=int,
        default=1800,
        help="Max chunk duration in sec (default: 1800)",
    )
    parser.add_argument(
        "--hf-token",
        default="",
        help="HuggingFace token (default: from .env)",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )

    args = parser.parse_args(argv)

    # A lower bound above the upper bound is contradictory; reject it here
    # with a usage error instead of forwarding it to the pipeline.
    if (
        args.min_speakers is not None
        and args.max_speakers is not None
        and args.min_speakers > args.max_speakers
    ):
        parser.error(
            f"--min-speakers ({args.min_speakers}) must not exceed "
            f"--max-speakers ({args.max_speakers})"
        )

    return args


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Entry point for the transcription CLI.

    Parses arguments, configures logging, builds the config and pipeline,
    runs the pipeline and prints one ``Saved: <path>`` line per item it
    returns.

    Args:
        argv: Argument strings forwarded to :func:`parse_args`; ``None``
            means "use ``sys.argv[1:]``".

    Raises:
        SystemExit: With status 1 when ``pipeline.run()`` raises, or with
            status 2 from argparse on invalid arguments.
    """
    args = parse_args(argv)

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    config = TranscriberConfig(
        input_path=args.input,
        output_dir=args.output,
        model=args.model,
        device=args.device,
        compute_type=args.compute_type,
        language=args.language,
        beam_size=args.beam_size,
        # The CLI exposes VAD as "on"/"off"; the config receives a bool.
        vad=args.vad == "on",
        max_speakers=args.max_speakers,
        min_speakers=args.min_speakers,
        formats=args.format,
        pause_threshold=args.pause_threshold,
        chunk_duration=args.chunk_duration,
        # Empty string when the flag is omitted. NOTE(review): the help text
        # says the token then comes from .env -- presumably resolved by the
        # config; confirm.
        hf_token=args.hf_token,
    )
    pipeline = TranscriptionPipeline(config)

    # Only the pipeline run is guarded, so a failure while printing results
    # is not misreported as a pipeline failure.
    try:
        exported = pipeline.run()
    except Exception:
        logger.exception("Pipeline failed")
        sys.exit(1)

    for path in exported:
        print(f"Saved: {path}")


if __name__ == "__main__":
    main()