SummitAI/audio_preprocess.py at master · chriswilder3/SummitAI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from pydub import AudioSegment, effects, silence
import noisereduce as nr
import librosa
import soundfile as sf
import os

def preprocess_audio(input_wav: str,
                             output_wav: str,
                             silence_thresh: int = -40,
                             min_silence_len: int = 700):
    """
    Preprocess meeting audio for Whisper:
    1. Normalize volume
    2. Remove long silences
    3. Reduce background noise
    4. Save as 16kHz mono WAV
    """

    if not os.path.exists(input_wav):
        raise FileNotFoundError(f"Input file not found: {input_wav}")

    # --- Step 1: Normalize & remove silences with pydub ---
    sound = AudioSegment.from_file(input_wav, format="wav")

    # Normalize volume
    normalized_sound = effects.normalize(sound)

    # Split on silence
    chunks = silence.split_on_silence(
        normalized_sound,
        min_silence_len=min_silence_len,  # ms
        silence_thresh=silence_thresh     # dBFS
    )

    # Recombine chunks with small padding (keeps flow natural)
    processed = AudioSegment.silent(duration=200)
    for chunk in chunks:
        processed += chunk + AudioSegment.silent(duration=200)

    # Save cleaned file
    processed.export(output_wav, format="wav")

    # # --- Step 2: Noise reduction with noisereduce ---
    # y, sr = librosa.load(temp_file, sr=16000, mono=True)  # resample to 16kHz mono
    # reduced = nr.reduce_noise(y=y, sr=sr)

    # # Save final output
    # sf.write(output_wav, reduced, sr, subtype="PCM_16")

    # # Cleanup
    # os.remove(temp_file)

def main():
    input_audio_file  = "E:/downloads/meeting_16k_mono.wav"
    output_audio_file = "E:/downloads/meeting_preprocessed.wav"
    preprocess_audio(input_audio_file,output_audio_file )
    print("Saved cleaned audio → meeting_processed")

if __name__ == "__main__":
    main()