CUDA-Programming/audio_equaliser.cu at main · shivanibhat24/CUDA-Programming · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#include <iostream>
#include <vector>
#include <cmath>
#include <fstream>
#include <numeric>
#include <algorithm> // For std::min and std::max

// CUDA error checking macro
// This macro simplifies error checking for CUDA API calls.
#define CUDA_CHECK(call)                                                                      \
    do {                                                                                      \
        cudaError_t err = call;                                                               \
        if (err != cudaSuccess) {                                                             \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(EXIT_FAILURE);                                                               \
        }                                                                                     \
    } while (0)

// --- Constants ---
const int SAMPLE_RATE = 44100; // Audio sample rate (samples per second)
const int NUM_BANDS = 3;       // Number of equalizer bands (e.g., Bass, Mid, Treble)
const int BUFFER_SIZE = 1024;  // Size of audio chunks to process at a time
const float PI = 3.14159265358979323846f; // Value of Pi for calculations

// --- Structs for BiQuad Filter ---

// Structure to hold BiQuad filter coefficients.
// We use the Direct Form II Transposed implementation for numerical stability.
typedef struct {
    float b0, b1, b2; // Numerator coefficients
    float a1, a2;     // Denominator coefficients (a0 is implicitly 1, after normalization)
} BiQuadCoefficients;

// Structure to hold BiQuad filter state variables.
// These variables (w1, w2) store the filter's memory from previous samples.
typedef struct {
    float w1, w2; // State variables for Direct Form II Transposed
} BiQuadState;

// --- Host Functions ---

/**
 * @brief Calculates coefficients for a peaking equalizer filter.
 *
 * This function computes the b0, b1, b2, a1, a2 coefficients for a
 * second-order peaking filter, based on Robert Bristow-Johnson's Audio EQ Cookbook.
 *
 * @param coeffs Pointer to a BiQuadCoefficients struct to store the calculated coefficients.
 * @param center_freq The center frequency of the peaking filter in Hz.
 * @param Q The Q factor (quality factor) of the filter, controlling its bandwidth.
 * @param gain_db The gain applied by the filter in decibels (dB).
 */
void calculatePeakingCoeffs(
    BiQuadCoefficients* coeffs,
    float center_freq,
    float Q,
    float gain_db
) {
    float A = powf(10.0f, gain_db / 40.0f); // Convert dB gain to linear amplitude gain
    float omega0 = 2.0f * PI * center_freq / SAMPLE_RATE; // Angular frequency
    float sin_omega0 = sinf(omega0);
    float cos_omega0 = cosf(omega0);
    float alpha = sin_omega0 / (2.0f * Q); // Bandwidth parameter

    // Numerator coefficients
    coeffs->b0 = 1.0f + alpha * A;
    coeffs->b1 = -2.0f * cos_omega0;
    coeffs->b2 = 1.0f - alpha * A;

    // Denominator coefficients (a0 is used for normalization)
    float a0 = 1.0f + alpha / A;
    coeffs->a1 = -2.0f * cos_omega0;
    coeffs->a2 = 1.0f - alpha / A;

    // Normalize all coefficients by a0 to ensure a0 becomes 1
    coeffs->b0 /= a0;
    coeffs->b1 /= a0;
    coeffs->b2 /= a0;
    coeffs->a1 /= a0;
    coeffs->a2 /= a0;
}

/**
 * @brief Generates a simple sine wave for testing purposes.
 *
 * @param audio_data Reference to a vector that will store the generated audio samples.
 * @param num_samples The total number of samples to generate.
 * @param frequency The frequency of the sine wave in Hz.
 * @param amplitude The amplitude of the sine wave (0.0 to 1.0).
 */
void generateSineWave(std::vector<float>& audio_data, int num_samples, float frequency, float amplitude) {
    audio_data.resize(num_samples);
    for (int i = 0; i < num_samples; ++i) {
        audio_data[i] = amplitude * sinf(2.0f * PI * frequency * i / SAMPLE_RATE);
    }
}

/**
 * @brief Writes audio data (float array) to a mono, 16-bit PCM WAV file.
 *
 * @param filename The name of the WAV file to create.
 * @param audio_data The vector containing the audio samples (float, normalized -1.0 to 1.0).
 * @param sample_rate The sample rate of the audio.
 */
void writeWavFile(const std::string& filename, const std::vector<float>& audio_data, int sample_rate) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Error: Could not open WAV file " << filename << std::endl;
        return;
    }

    // WAV header structure
    // This defines the necessary fields for a standard WAV file header.
    struct WavHeader {
        char riff_id[4];        // "RIFF" chunk ID
        uint32_t file_size;     // Total file size in bytes
        char wave_id[4];        // "WAVE" format ID
        char fmt_id[4];         // "fmt " subchunk ID
        uint32_t fmt_size;      // Size of the fmt subchunk (16 for PCM)
        uint16_t audio_format;  // Audio format (1 for PCM)
        uint16_t num_channels;  // Number of audio channels (1 for mono)
        uint32_t sample_rate;   // Sample rate (samples per second)
        uint32_t byte_rate;     // Byte rate (sample_rate * num_channels * bits_per_sample / 8)
        uint16_t block_align;   // Block align (num_channels * bits_per_sample / 8)
        uint16_t bits_per_sample; // Bits per sample (16 for 16-bit PCM)
        char data_id[4];        // "data" subchunk ID
        uint32_t data_size;     // Size of the data subchunk (actual audio data size)
    } header;

    // Fill the WAV header fields with appropriate values
    memcpy(header.riff_id, "RIFF", 4);
    memcpy(header.wave_id, "WAVE", 4);
    memcpy(header.fmt_id, "fmt ", 4);
    header.fmt_size = 16;
    header.audio_format = 1; // PCM (Pulse Code Modulation)
    header.num_channels = 1; // Mono audio
    header.sample_rate = sample_rate;
    header.bits_per_sample = 16; // 16 bits per sample
    header.byte_rate = header.sample_rate * header.num_channels * header.bits_per_sample / 8;
    header.block_align = header.num_channels * header.bits_per_sample / 8;
    memcpy(header.data_id, "data", 4);
    header.data_size = audio_data.size() * header.bits_per_sample / 8; // Total bytes in audio data
    header.file_size = 36 + header.data_size; // Total file size (header + data)

    // Write the WAV header to the file
    file.write(reinterpret_cast<const char*>(&header), sizeof(WavHeader));

    // Write audio data (convert float samples to 16-bit signed integers)
    // Samples are clamped to [-1.0, 1.0] and scaled to the 16-bit range.
    for (float sample : audio_data) {
        int16_t pcm_sample = static_cast<int16_t>(std::max(-1.0f, std::min(1.0f, sample)) * 32767.0f);
        file.write(reinterpret_cast<const char*>(&pcm_sample), sizeof(int16_t));
    }

    file.close();
    std::cout << "WAV file '" << filename << "' written successfully." << std::endl;
}

// --- CUDA Kernel ---

/**
 * @brief CUDA kernel to apply a single BiQuad filter to an entire audio buffer.
 *
 * This kernel is designed to be launched with 1 block and 1 thread.
 * The single thread processes all samples sequentially within the given buffer.
 * This sequential processing is necessary to correctly maintain the state of
 * an IIR (Infinite Impulse Response) filter, as each output sample depends
 * on previous output samples.
 *
 * The filter's state variables (w1, w2) are passed as a pointer to device memory,
 * allowing them to be updated by the kernel and persist across subsequent kernel
 * calls (e.g., for different audio chunks or filter bands).
 *
 * @param d_input Pointer to the input audio buffer on the device.
 * @param d_output Pointer to the output audio buffer on the device.
 * @param coeffs The BiQuadCoefficients struct for the current filter band.
 * @param d_state_ptr Pointer to the BiQuadState struct for this specific filter band on the device.
 * @param num_samples The number of samples in the current audio chunk to process.
 */
__global__ void biquadFilterKernel(
    float* d_input,
    float* d_output,
    BiQuadCoefficients coeffs,
    BiQuadState* d_state_ptr, // Pointer to the state for this specific filter band
    int num_samples
) {
    // Load the current state for this filter band from global memory.
    // These are temporary variables for the kernel's execution.
    float w1 = d_state_ptr->w1;
    float w2 = d_state_ptr->w2;

    // Iterate through all samples in the chunk sequentially
    for (int i = 0; i < num_samples; ++i) {
        float input_sample = d_input[i];

        // Apply Direct Form II Transposed BiQuad filter equation
        // y[n] = b0*x[n] + w1[n-1]
        // w1[n] = b1*x[n] + w2[n-1] - a1*y[n]
        // w2[n] = b2*x[n] - a2*y[n]
        float output_sample = coeffs.b0 * input_sample + w1;
        w1 = coeffs.b1 * input_sample + w2 - coeffs.a1 * output_sample;
        w2 = coeffs.b2 * input_sample - coeffs.a2 * output_sample;

        d_output[i] = output_sample; // Store the processed sample
    }

    // Store the updated state back to global memory.
    // This ensures the filter's memory persists for the next chunk or next filter band.
    d_state_ptr->w1 = w1;
    d_state_ptr->w2 = w2;
}


// --- Main Function ---

int main() {
    std::cout << "Starting GPU Audio Equalizer Demo..." << std::endl;

    // --- 1. Define EQ Band Parameters ---
    // These arrays define the center frequencies, Q factors, and gains for each band.
    // Example: 3 bands (Bass, Mid, Treble) with specific settings.
    float center_freqs[NUM_BANDS] = {100.0f, 1000.0f, 8000.0f}; // Center frequencies in Hz
    float Q_factors[NUM_BANDS] = {0.707f, 1.0f, 0.707f};       // Q factor (controls bandwidth)
    float gains_db[NUM_BANDS] = {6.0f, -3.0f, 9.0f};           // Gain in decibels (dB)

    // --- 2. Initialize BiQuad Coefficients and States on Host ---
    // h_coeffs stores the coefficients for all filter bands.
    // h_states stores the state variables (w1, w2) for all filter bands.
    // Each band has its own set of coefficients and state variables.
    std::vector<BiQuadCoefficients> h_coeffs(NUM_BANDS);
    std::vector<BiQuadState> h_states(NUM_BANDS); // One state struct per band

    for (int i = 0; i < NUM_BANDS; ++i) {
        // Calculate coefficients for each peaking filter band
        calculatePeakingCoeffs(&h_coeffs[i], center_freqs[i], Q_factors[i], gains_db[i]);
        h_states[i] = {0.0f, 0.0f}; // Initialize filter states to zero
    }

    // --- 3. Generate Synthetic Input Audio ---
    const int total_samples = SAMPLE_RATE * 5; // Generate 5 seconds of audio data
    std::vector<float> h_input_audio;
    // Generate a 440 Hz sine wave as a base
    generateSineWave(h_input_audio, total_samples, 440.0f, 0.5f);
    // Add a higher frequency component (5000 Hz) to better demonstrate filtering effects
    for (int i = 0; i < total_samples; ++i) {
        h_input_audio[i] += 0.3f * sinf(2.0f * PI * 5000.0f * i / SAMPLE_RATE);
    }
    std::cout << "Generated " << total_samples << " input samples." << std::endl;

    // --- 4. Allocate Device Memory ---
    // d_audio_buffer1 and d_audio_buffer2 are used for ping-ponging audio data
    // between successive filter stages on the GPU, avoiding host transfers.
    // d_coeffs_single_filter is a small buffer to transfer one filter's coefficients at a time.
    // d_states stores the state variables for all filter bands on the device.
    float *d_audio_buffer1, *d_audio_buffer2;
    BiQuadCoefficients *d_coeffs_single_filter;
    BiQuadState *d_states;

    CUDA_CHECK(cudaMalloc((void**)&d_audio_buffer1, BUFFER_SIZE * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_audio_buffer2, BUFFER_SIZE * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_coeffs_single_filter, sizeof(BiQuadCoefficients)));
    CUDA_CHECK(cudaMalloc((void**)&d_states, NUM_BANDS * sizeof(BiQuadState)));

    // --- 5. Main Processing Loop (Chunk by Chunk) ---
    // The total output audio will be stored here.
    std::vector<float> h_output_audio(total_samples);
    int current_sample_idx = 0; // Tracks progress through the total audio samples

    while (current_sample_idx < total_samples) {
        // Determine how many samples to process in the current chunk
        int samples_to_process = std::min(BUFFER_SIZE, total_samples - current_sample_idx);
        if (samples_to_process == 0) break; // No more samples to process

        // Copy the current input audio chunk from host to d_audio_buffer1 on device
        CUDA_CHECK(cudaMemcpy(
            d_audio_buffer1,
            h_input_audio.data() + current_sample_idx,
            samples_to_process * sizeof(float),
            cudaMemcpyHostToDevice
        ));

        // Copy the current filter states for all bands from host to device
        CUDA_CHECK(cudaMemcpy(
            d_states,
            h_states.data(),
            NUM_BANDS * sizeof(BiQuadState),
            cudaMemcpyHostToDevice
        ));

        // Pointers to manage the ping-pong buffers for current input and output on device
        float* d_current_input = d_audio_buffer1;
        float* d_current_output = d_audio_buffer2;

        // Apply each filter band sequentially to the current audio chunk
        for (int i = 0; i < NUM_BANDS; ++i) {
            // Copy the coefficients for the current filter band from host to device
            CUDA_CHECK(cudaMemcpy(
                d_coeffs_single_filter,
                &h_coeffs[i],
                sizeof(BiQuadCoefficients),
                cudaMemcpyHostToDevice
            ));

            // Launch the biquadFilterKernel.
            // It's launched with 1 block and 1 thread because IIR filters are sequential.
            // d_current_input is the input for this stage, d_current_output is where results go.
            // *d_coeffs_single_filter passes the coefficients by value.
            // &d_states[i] passes a pointer to the state for this specific filter band.
            biquadFilterKernel<<<1, 1>>>(
                d_current_input,
                d_current_output,
                *d_coeffs_single_filter,
                &d_states[i],
                samples_to_process
            );
            CUDA_CHECK(cudaGetLastError()); // Check for any errors during kernel launch
            CUDA_CHECK(cudaDeviceSynchronize()); // Wait for the kernel to finish execution

            // Swap the input and output pointers for the next filter stage.
            // The output of the current stage becomes the input for the next stage.
            std::swap(d_current_input, d_current_output);
        }

        // After all bands are applied, the final processed output for this chunk
        // will be in d_current_input (due to the final swap).
        // Copy this processed chunk back from device to host.
        CUDA_CHECK(cudaMemcpy(
            h_output_audio.data() + current_sample_idx,
            d_current_input, // This now holds the final processed data
            samples_to_process * sizeof(float),
            cudaMemcpyDeviceToHost
        ));

        // Copy the updated filter states for all bands back from device to host.
        // These updated states will be used as initial states for the next audio chunk.
        CUDA_CHECK(cudaMemcpy(
            h_states.data(),
            d_states,
            NUM_BANDS * sizeof(BiQuadState),
            cudaMemcpyDeviceToHost
        ));

        current_sample_idx += samples_to_process; // Advance the sample index
        // Print progress to console (using \r for in-place update)
        std::cout << "Processed " << current_sample_idx << "/" << total_samples << " samples...\r" << std::flush;
    }
    std::cout << std::endl << "Finished processing all samples." << std::endl;

    // --- 6. Save Processed Audio to WAV File ---
    // Write the complete processed audio data to a WAV file.
    writeWavFile("output_equalized_audio.wav", h_output_audio, SAMPLE_RATE);

    // --- 7. Clean up Device Memory ---
    // Free all allocated device memory to prevent memory leaks.
    CUDA_CHECK(cudaFree(d_audio_buffer1));
    CUDA_CHECK(cudaFree(d_audio_buffer2));
    CUDA_CHECK(cudaFree(d_coeffs_single_filter));
    CUDA_CHECK(cudaFree(d_states));

    std::cout << "GPU Audio Equalizer Demo Finished." << std::endl;

    return 0;
}