71 lines
2.9 KiB
Python
71 lines
2.9 KiB
Python
import argparse
|
|
import logging
|
|
import sys
|
|
|
|
from transcriber.config import TranscriberConfig
|
|
from transcriber.pipeline import TranscriptionPipeline
|
|
|
|
|
|
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse command-line arguments for the transcription CLI.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, so
            zero-argument callers behave exactly as before. Passing an
            explicit list lets the parser be driven programmatically.

    Returns:
        The populated namespace. Dashes in option names become underscores
        in attribute names (``--compute-type`` -> ``compute_type``).

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(
        description="Transcribe audio with speaker diarization",
    )
    parser.add_argument("input", help="Path to audio file (.m4a, .mp3, .wav, .aac)")
    parser.add_argument("--output", default="./output", help="Output directory (default: ./output)")
    parser.add_argument("--model", default="large-v3", help="Whisper model name (default: large-v3)")
    parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"], help="Device (default: cuda)")
    parser.add_argument("--compute-type", default="float16", help="Compute type (default: float16)")
    parser.add_argument("--language", default="ru", help="Language code (default: ru)")
    parser.add_argument("--beam-size", type=int, default=5, help="Beam search size (default: 5)")
    parser.add_argument("--vad", default="on", choices=["on", "off"], help="VAD filter (default: on)")
    parser.add_argument("--max-speakers", type=int, default=None, help="Maximum number of speakers")
    parser.add_argument("--min-speakers", type=int, default=None, help="Minimum number of speakers")
    parser.add_argument("--format", nargs="+", default=["txt", "json"], help="Output formats (default: txt json)")
    parser.add_argument("--pause-threshold", type=float, default=1.5, help="Max pause for merging (default: 1.5s)")
    parser.add_argument("--chunk-duration", type=int, default=1800, help="Max chunk duration in sec (default: 1800)")
    parser.add_argument("--hf-token", default="", help="HuggingFace token (default: from .env)")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable debug logging")
    # argparse falls back to sys.argv[1:] when argv is None.
    return parser.parse_args(argv)
|
|
|
|
|
|
def main() -> None:
    """Run the transcription CLI: parse arguments, configure logging, run the pipeline.

    Prints one ``Saved: <path>`` line for every item produced by
    ``pipeline.run()``. If the run (or the printing) raises, the traceback
    is logged and the process exits with status 1.
    """
    cli = parse_args()

    # -v/--verbose lowers the root logger threshold from INFO to DEBUG.
    if cli.verbose:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    settings = {
        "input_path": cli.input,
        "output_dir": cli.output,
        "model": cli.model,
        "device": cli.device,
        "compute_type": cli.compute_type,
        "language": cli.language,
        "beam_size": cli.beam_size,
        # The CLI accepts "on"/"off"; the config receives a boolean.
        "vad": cli.vad == "on",
        "max_speakers": cli.max_speakers,
        "min_speakers": cli.min_speakers,
        "formats": cli.format,
        "pause_threshold": cli.pause_threshold,
        "chunk_duration": cli.chunk_duration,
        "hf_token": cli.hf_token,
    }
    runner = TranscriptionPipeline(TranscriberConfig(**settings))

    try:
        for saved in runner.run():
            print(f"Saved: {saved}")
    except Exception:
        # Top-level boundary: record the traceback, then signal failure.
        logging.exception("Pipeline failed")
        sys.exit(1)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|