# transcribe-interview/transcriber/audio/preprocess.py

import logging
import subprocess
from pathlib import Path

logger = logging.getLogger(__name__)

# Input file extensions (lower-case, leading dot) accepted by preprocess_audio.
SUPPORTED_FORMATS = {".m4a", ".mp3", ".wav", ".aac"}

# Upper bound, in seconds, on a single ffmpeg invocation.
_FFMPEG_TIMEOUT_SECONDS = 600

# Number of trailing stderr characters included in error messages.
_STDERR_TAIL_CHARS = 500


def preprocess_audio(input_path: str, output_dir: str) -> str:
    """Convert audio to mono 16kHz PCM WAV with normalization and DC offset removal.

    The work is delegated to the ``ffmpeg`` command-line tool, which must be
    available on ``PATH``. The output is written to
    ``<output_dir>/<input stem>_processed.wav``; an existing file with that
    name is overwritten.

    Args:
        input_path: Path to the source audio file.
        output_dir: Directory for the processed file (created if missing).

    Returns:
        Path to the processed WAV file.

    Raises:
        FileNotFoundError: If input file does not exist.
        ValueError: If file format is not supported.
        RuntimeError: If ffmpeg cannot be started, times out, or exits with a
            non-zero status.
    """
    src = Path(input_path)
    if not src.exists():
        raise FileNotFoundError(f"Audio file not found: {input_path}")
    if src.suffix.lower() not in SUPPORTED_FORMATS:
        # Sort so the message is stable across runs (set order is not).
        supported = ", ".join(sorted(SUPPORTED_FORMATS))
        raise ValueError(
            f"Unsupported format: {src.suffix}. Supported: {supported}"
        )

    out = Path(output_dir) / f"{src.stem}_processed.wav"
    out.parent.mkdir(parents=True, exist_ok=True)

    cmd = [
        "ffmpeg",
        "-hide_banner",  # keep stderr focused on the actual diagnostics
        "-nostdin",      # never read interactive commands from our stdin
        "-y",            # overwrite an existing output file
        "-i", str(src),
        "-ac", "1",
        "-ar", "16000",
        "-sample_fmt", "s16",
        "-af", "highpass=f=10,loudnorm=I=-16:TP=-1.5:LRA=11",
        str(out),
    ]

    logger.info("Preprocessing: %s -> %s", src.name, out.name)

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # ffmpeg echoes input metadata, which may not be valid in the
            # locale encoding; never let decoding mask the real outcome.
            errors="replace",
            timeout=_FFMPEG_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(
            f"ffmpeg timed out after {_FFMPEG_TIMEOUT_SECONDS}s "
            f"while processing {src.name}"
        ) from exc
    except OSError as exc:
        # Typically the ffmpeg binary is missing or not executable. Re-raise
        # as RuntimeError so it is not confused with a missing *input* file.
        raise RuntimeError(f"ffmpeg could not be started: {exc}") from exc

    if result.returncode != 0:
        # The decisive error line is at the end of ffmpeg's stderr.
        tail = result.stderr.strip()[-_STDERR_TAIL_CHARS:]
        raise RuntimeError(f"ffmpeg failed: {tail}")

    logger.info("Preprocessing complete: %s", out.name)
    return str(out)