# transcribe-interview/transcriber/main.py
#
# 71 lines
# 2.9 KiB
# Python

import argparse
import logging
import sys
from transcriber.config import TranscriberConfig
from transcriber.pipeline import TranscriptionPipeline
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse command-line arguments for the transcription CLI.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, so
            existing ``parse_args()`` call sites behave as before.

    Returns:
        Namespace with one attribute per option declared below; dashes in
        option names become underscores (``--beam-size`` -> ``beam_size``).

    Raises:
        SystemExit: Raised by argparse with status 2 on invalid arguments
            (including ``--min-speakers`` greater than ``--max-speakers``),
            and with status 0 after ``--help``.
    """
    parser = argparse.ArgumentParser(
        description="Transcribe audio with speaker diarization",
    )
    parser.add_argument("input", help="Path to audio file (.m4a, .mp3, .wav, .aac)")
    parser.add_argument("--output", default="./output", help="Output directory (default: ./output)")
    parser.add_argument("--model", default="large-v3", help="Whisper model name (default: large-v3)")
    parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"], help="Device (default: cuda)")
    parser.add_argument("--compute-type", default="float16", help="Compute type (default: float16)")
    parser.add_argument("--language", default="ru", help="Language code (default: ru)")
    parser.add_argument("--beam-size", type=int, default=5, help="Beam search size (default: 5)")
    parser.add_argument("--vad", default="on", choices=["on", "off"], help="VAD filter (default: on)")
    parser.add_argument("--max-speakers", type=int, default=None, help="Maximum number of speakers")
    parser.add_argument("--min-speakers", type=int, default=None, help="Minimum number of speakers")
    parser.add_argument("--format", nargs="+", default=["txt", "json"], help="Output formats (default: txt json)")
    parser.add_argument("--pause-threshold", type=float, default=1.5, help="Max pause for merging (default: 1.5s)")
    parser.add_argument("--chunk-duration", type=int, default=1800, help="Max chunk duration in sec (default: 1800)")
    parser.add_argument("--hf-token", default="", help="HuggingFace token (default: from .env)")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable debug logging")

    args = parser.parse_args(argv)

    # Fail fast on contradictory speaker bounds: a usage error here is far
    # clearer than whatever the pipeline would do with them later.
    if (
        args.min_speakers is not None
        and args.max_speakers is not None
        and args.min_speakers > args.max_speakers
    ):
        parser.error(
            f"--min-speakers ({args.min_speakers}) must not exceed "
            f"--max-speakers ({args.max_speakers})"
        )

    return args
def main() -> None:
    """Run the transcription CLI end to end.

    Parses the command line, configures root logging (DEBUG when
    ``--verbose`` is given, INFO otherwise), builds a ``TranscriberConfig``
    from the parsed options and runs a ``TranscriptionPipeline`` with it.
    One ``Saved: <path>`` line is printed per item returned by ``run()``.

    If ``run()`` or the printing loop raises ``Exception``, the traceback is
    logged and the process exits with status 1.
    """
    cli = parse_args()

    log_level = logging.INFO
    if cli.verbose:
        log_level = logging.DEBUG
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    # Map CLI option names onto the config's keyword names in one place.
    settings = {
        "input_path": cli.input,
        "output_dir": cli.output,
        "model": cli.model,
        "device": cli.device,
        "compute_type": cli.compute_type,
        "language": cli.language,
        "beam_size": cli.beam_size,
        # The CLI takes "on"/"off"; the config receives a bool.
        "vad": cli.vad == "on",
        "max_speakers": cli.max_speakers,
        "min_speakers": cli.min_speakers,
        "formats": cli.format,
        "pause_threshold": cli.pause_threshold,
        "chunk_duration": cli.chunk_duration,
        "hf_token": cli.hf_token,
    }
    runner = TranscriptionPipeline(TranscriberConfig(**settings))

    try:
        for saved_path in runner.run():
            print(f"Saved: {saved_path}")
    except Exception:
        # Top-level boundary: record the traceback, then signal failure.
        logging.exception("Pipeline failed")
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()