transcribe-interview/transcriber/audio/preprocess.py

55 lines
1.6 KiB
Python

import logging
import subprocess
from pathlib import Path
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Input file extensions accepted by preprocess_audio. Entries are lower-case
# and include the leading dot; the caller's suffix is lower-cased before lookup.
SUPPORTED_FORMATS = {".m4a", ".mp3", ".wav", ".aac"}
def preprocess_audio(input_path: str, output_dir: str) -> str:
    """Convert audio to mono 16kHz PCM WAV with normalization and DC offset removal.

    Runs ffmpeg with a 10 Hz high-pass filter followed by ``loudnorm``
    (I=-16, TP=-1.5, LRA=11) and writes 16-bit samples to
    ``<output_dir>/<input stem>_processed.wav``. An existing file at that
    path is overwritten, and ``output_dir`` is created if it is missing.

    Args:
        input_path: Path to the source audio file.
        output_dir: Directory for the processed file.

    Returns:
        Path to the processed WAV file.

    Raises:
        FileNotFoundError: If ``input_path`` is not an existing regular file.
        ValueError: If file format is not supported.
        RuntimeError: If the ffmpeg executable cannot be found or ffmpeg
            exits with a non-zero status.
        subprocess.TimeoutExpired: If ffmpeg runs longer than 600 seconds.
    """
    src = Path(input_path)
    # is_file() rather than exists(): a directory named e.g. "clip.wav" must
    # be rejected here instead of failing later inside ffmpeg.
    if not src.is_file():
        raise FileNotFoundError(f"Audio file not found: {input_path}")
    if src.suffix.lower() not in SUPPORTED_FORMATS:
        raise ValueError(
            f"Unsupported format: {src.suffix}. Supported: {SUPPORTED_FORMATS}"
        )

    out = Path(output_dir) / f"{src.stem}_processed.wav"
    out.parent.mkdir(parents=True, exist_ok=True)

    cmd = [
        "ffmpeg",
        "-hide_banner",  # keep the version/config banner out of stderr
        "-nostdin",  # never read from (or block on) the parent's stdin
        "-y",
        "-i", str(src),
        "-ac", "1",
        "-ar", "16000",
        "-sample_fmt", "s16",
        "-af", "highpass=f=10,loudnorm=I=-16:TP=-1.5:LRA=11",
        str(out),
    ]

    logger.info("Preprocessing: %s -> %s", src.name, out.name)
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # ffmpeg may echo metadata that is not valid in the locale
            # encoding; replace undecodable bytes instead of raising.
            errors="replace",
            timeout=600,
        )
    except FileNotFoundError as err:
        # Raised when the ffmpeg binary itself is missing. Re-raise as the
        # documented RuntimeError so callers do not mistake it for a missing
        # input file.
        raise RuntimeError("ffmpeg failed: ffmpeg executable not found") from err

    if result.returncode != 0:
        # ffmpeg prints the actual error last, so report the tail of stderr.
        raise RuntimeError(f"ffmpeg failed: {result.stderr[-500:]}")

    logger.info("Preprocessing complete: %s", out.name)
    return str(out)