Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ name = "interleave"
harness = false
required-features = ["complex-nums"]

[[bench]]
name = "heuristic_vs_tune"
harness = false

[profile.profiling]
inherits = "release"
debug = true
Expand Down
146 changes: 146 additions & 0 deletions benches/heuristic_vs_tune.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion, Throughput};
use num_traits::Float;
use phastft::options::Options;
use phastft::planner::{Direction, PlannerDit32, PlannerDit64, PlannerMode};
use phastft::{fft_32_dit_with_planner_and_opts, fft_64_dit_with_planner_and_opts};
use rand::distr::StandardUniform;
use rand::prelude::Distribution;
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};

/// Base-2 exponents of the FFT sizes to benchmark: each entry `n` is run with
/// `1 << n` points, i.e. 32 (2^5) through 262144 (2^18).
const LENGTHS: &[usize] = &[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18];

/// Builds `n` random complex samples in split (structure-of-arrays) form.
///
/// Returns `(reals, imags)`, each of length `n`. Values are drawn from
/// `StandardUniform` using a `SmallRng` seeded from the operating system, so
/// every call yields different data. Draws are interleaved: the real part of
/// element `i` is sampled immediately before its imaginary part.
fn generate_numbers<T: Float>(n: usize) -> (Vec<T>, Vec<T>)
where
    StandardUniform: Distribution<T>,
{
    let mut rng = SmallRng::from_os_rng();

    (0..n)
        .map(|_| {
            // Keep the re-then-im draw order explicit; tuple-field evaluation
            // order would also work but named bindings make the intent obvious.
            let re: T = rng.sample(StandardUniform);
            let im: T = rng.sample(StandardUniform);
            (re, im)
        })
        .unzip()
}

/// Benchmarks the `f64` DIT FFT with a default-constructed planner
/// ("Heuristic") against a planner built with `PlannerMode::Tune` ("Tune").
///
/// For every exponent in `LENGTHS` the transform size is `1 << n`. Both
/// variants share the same `Options::guess_options(len)` and are fed fresh
/// random input for every iteration via `iter_batched`.
///
/// The timed routine returns its buffers instead of letting them fall out of
/// scope: Criterion collects routine outputs and drops them after the timer
/// has stopped, so the measurement covers the FFT only and not the
/// deallocation of two `len`-element vectors. Handing the result back also
/// keeps the transformed data observable to Criterion's `black_box`.
fn benchmark_heuristic_vs_tune_f64(c: &mut Criterion) {
    let mut group = c.benchmark_group("Heuristic vs Tune f64");
    group.plot_config(
        criterion::PlotConfiguration::default().summary_scale(criterion::AxisScale::Logarithmic),
    );
    group.sample_size(20);

    for &log_len in LENGTHS {
        let len = 1usize << log_len;
        // NOTE(review): `bytes` counts a single `len`-element array although
        // both `reals` and `imags` are transformed — confirm this is intended.
        group.throughput(Throughput::ElementsAndBytes {
            elements: len as u64,
            bytes: (len * size_of::<f64>()) as u64,
        });

        let options = Options::guess_options(len);

        let planner_heuristic = PlannerDit64::new(len, Direction::Forward);
        group.bench_function(BenchmarkId::new("Heuristic", len), |b| {
            b.iter_batched(
                || generate_numbers::<f64>(len),
                |(mut reals, mut imags)| {
                    fft_64_dit_with_planner_and_opts(
                        &mut reals,
                        &mut imags,
                        &planner_heuristic,
                        &options,
                    );
                    // Returned so the drop happens outside the timed region.
                    (reals, imags)
                },
                BatchSize::SmallInput,
            );
        });

        // Built only after the heuristic run, matching the original ordering,
        // so planner construction cannot disturb the heuristic measurement.
        let planner_tune = PlannerDit64::with_mode(len, Direction::Forward, PlannerMode::Tune);
        group.bench_function(BenchmarkId::new("Tune", len), |b| {
            b.iter_batched(
                || generate_numbers::<f64>(len),
                |(mut reals, mut imags)| {
                    fft_64_dit_with_planner_and_opts(
                        &mut reals,
                        &mut imags,
                        &planner_tune,
                        &options,
                    );
                    // Returned so the drop happens outside the timed region.
                    (reals, imags)
                },
                BatchSize::SmallInput,
            );
        });
    }
    group.finish();
}

/// Benchmarks the `f32` DIT FFT with a default-constructed planner
/// ("Heuristic") against a planner built with `PlannerMode::Tune` ("Tune").
///
/// For every exponent in `LENGTHS` the transform size is `1 << n`. Both
/// variants share the same `Options::guess_options(len)` and are fed fresh
/// random input for every iteration via `iter_batched`.
///
/// The timed routine returns its buffers instead of letting them fall out of
/// scope: Criterion collects routine outputs and drops them after the timer
/// has stopped, so the measurement covers the FFT only and not the
/// deallocation of two `len`-element vectors. Handing the result back also
/// keeps the transformed data observable to Criterion's `black_box`.
fn benchmark_heuristic_vs_tune_f32(c: &mut Criterion) {
    let mut group = c.benchmark_group("Heuristic vs Tune f32");
    group.plot_config(
        criterion::PlotConfiguration::default().summary_scale(criterion::AxisScale::Logarithmic),
    );
    group.sample_size(20);

    for &log_len in LENGTHS {
        let len = 1usize << log_len;
        // NOTE(review): `bytes` counts a single `len`-element array although
        // both `reals` and `imags` are transformed — confirm this is intended.
        group.throughput(Throughput::ElementsAndBytes {
            elements: len as u64,
            bytes: (len * size_of::<f32>()) as u64,
        });

        let options = Options::guess_options(len);

        let planner_heuristic = PlannerDit32::new(len, Direction::Forward);
        group.bench_function(BenchmarkId::new("Heuristic", len), |b| {
            b.iter_batched(
                || generate_numbers::<f32>(len),
                |(mut reals, mut imags)| {
                    fft_32_dit_with_planner_and_opts(
                        &mut reals,
                        &mut imags,
                        &planner_heuristic,
                        &options,
                    );
                    // Returned so the drop happens outside the timed region.
                    (reals, imags)
                },
                BatchSize::SmallInput,
            );
        });

        // Built only after the heuristic run, matching the original ordering,
        // so planner construction cannot disturb the heuristic measurement.
        let planner_tune = PlannerDit32::with_mode(len, Direction::Forward, PlannerMode::Tune);
        group.bench_function(BenchmarkId::new("Tune", len), |b| {
            b.iter_batched(
                || generate_numbers::<f32>(len),
                |(mut reals, mut imags)| {
                    fft_32_dit_with_planner_and_opts(
                        &mut reals,
                        &mut imags,
                        &planner_tune,
                        &options,
                    );
                    // Returned so the drop happens outside the timed region.
                    (reals, imags)
                },
                BatchSize::SmallInput,
            );
        });
    }
    group.finish();
}

// Register the f64 and f32 benchmark functions under the `benches` group and
// hand that group to `criterion_main!`, which supplies the entry point for
// this bench target (declared with `harness = false` in Cargo.toml).
criterion_group!(
benches,
benchmark_heuristic_vs_tune_f64,
benchmark_heuristic_vs_tune_f32,
);
criterion_main!(benches);
4 changes: 1 addition & 3 deletions src/algorithms/bravo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
/// by Lokhmotov and Mycroft (SPAA'07).
///
/// The algorithm uses vector interleaving operations to perform bit-reversal permutation.
/// For N = 2^n elements with W-element vectors, the algorithm performs log₂(N) rounds
/// For `N = 2^n` elements with `W`-element vectors, the algorithm performs `log_2(N)` rounds
/// of in-place interleave operations on pairs of vectors.
///
/// The initial implementation was translated from mathematical notation in the paper
/// to Rust by Claude 4.5 Opus.
use fearless_simd::prelude::*;
use fearless_simd::{f32x4, f32x8, f64x4, f64x8, Simd};

Expand Down
23 changes: 21 additions & 2 deletions src/algorithms/dit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
use fearless_simd::{dispatch, Simd};

use crate::algorithms::bravo::{bit_rev_bravo_f32, bit_rev_bravo_f64};
use crate::kernels::codelets::{fft_dit_codelet_32_f32, fft_dit_codelet_32_f64};
use crate::kernels::dit::*;
use crate::options::Options;
use crate::parallel::run_maybe_in_parallel;
Expand All @@ -41,7 +42,16 @@ fn recursive_dit_fft_f64<S: Simd>(
let log_size = size.ilog2() as usize;

if size <= L1_BLOCK_SIZE {
for stage in 0..log_size {
// Use FFT-32 codelet to fuse stages 0-4 into a single pass per 32-element chunk
let start_stage = if planner.use_codelet_32 {
fft_dit_codelet_32_f64(simd, &mut reals[..size], &mut imags[..size]);
5
} else {
0
};

// Remaining stages use per-stage kernels
for stage in start_stage..log_size {
stage_twiddle_idx = execute_dit_stage_f64(
simd,
&mut reals[..size],
Expand Down Expand Up @@ -98,7 +108,16 @@ fn recursive_dit_fft_f32<S: Simd>(
let log_size = size.ilog2() as usize;

if size <= L1_BLOCK_SIZE {
for stage in 0..log_size {
// Use FFT-32 codelet to fuse stages 0-4 into a single pass per 32-element chunk
let start_stage = if planner.use_codelet_32 {
fft_dit_codelet_32_f32(simd, &mut reals[..size], &mut imags[..size]);
5
} else {
0
};

// Remaining stages use per-stage kernels
for stage in start_stage..log_size {
stage_twiddle_idx = execute_dit_stage_f32(
simd,
&mut reals[..size],
Expand Down
Loading
Loading