diff --git a/requirements.txt b/requirements.txt
index 481f457..6947b7a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 faster-whisper
 pyannote.audio
 python-dotenv
+soundfile
 tqdm
diff --git a/transcriber/diarization/pyannote_engine.py b/transcriber/diarization/pyannote_engine.py
index ec95676..a5bf992 100644
--- a/transcriber/diarization/pyannote_engine.py
+++ b/transcriber/diarization/pyannote_engine.py
@@ -1,6 +1,7 @@
 import logging
 from dataclasses import dataclass
 
+import soundfile as sf
 import torch
 from pyannote.audio import Pipeline
 from pyannote.audio.pipelines.utils.hook import ProgressHook
@@ -47,6 +48,10 @@ class DiarizationEngine:
         """
         logger.info("Diarizing: %s", audio_path)
 
+        data, sample_rate = sf.read(audio_path, dtype="float32")
+        waveform = torch.from_numpy(data).unsqueeze(0)
+        audio_input = {"waveform": waveform, "sample_rate": sample_rate}
+
         kwargs = {}
         if min_speakers is not None:
             kwargs["min_speakers"] = min_speakers
@@ -54,7 +59,7 @@ class DiarizationEngine:
             kwargs["max_speakers"] = max_speakers
 
         with ProgressHook() as hook:
-            diarization = self._pipeline(audio_path, hook=hook, **kwargs)
+            diarization = self._pipeline(audio_input, hook=hook, **kwargs)
 
         turns = []
         for turn, speaker in diarization.exclusive_speaker_diarization:
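
Note: the patch switches the pipeline input from a file path to an in-memory {"waveform", "sample_rate"} dict. pyannote pipelines expect the waveform as a (channel, time) float tensor, while soundfile.read returns a (frames,) array for mono files and (frames, channels) for multichannel audio, so the unsqueeze(0) above assumes mono input. A minimal sketch of a loader that covers both cases (the helper name load_audio_for_pyannote is illustrative, not part of the patch):

import soundfile as sf
import torch


def load_audio_for_pyannote(audio_path: str) -> dict:
    """Read an audio file into the in-memory dict format used above.

    pyannote expects a (channel, time) float tensor; soundfile returns
    (frames,) for mono files and (frames, channels) otherwise.
    """
    data, sample_rate = sf.read(audio_path, dtype="float32")
    waveform = torch.from_numpy(data)
    if waveform.ndim == 1:
        # mono: (time,) -> (1, time)
        waveform = waveform.unsqueeze(0)
    else:
        # multichannel: (time, channels) -> (channels, time)
        waveform = waveform.transpose(0, 1)
    return {"waveform": waveform, "sample_rate": sample_rate}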