QuState · smu160 · Apr 5, 2026 · Mar 29, 2026 · Apr 4, 2026 · Apr 4, 2026
diff --git a/Cargo.toml b/Cargo.toml
@@ -44,6 +44,10 @@ name = "interleave"
 harness = false
 required-features = ["complex-nums"]
 
+[[bench]]
+name = "heuristic_vs_tune"
+harness = false
+
 [profile.profiling]
 inherits = "release"
 debug = true

diff --git a/benches/heuristic_vs_tune.rs b/benches/heuristic_vs_tune.rs
@@ -0,0 +1,146 @@
+use criterion::{criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion, Throughput};
+use num_traits::Float;
+use phastft::options::Options;
+use phastft::planner::{Direction, PlannerDit32, PlannerDit64, PlannerMode};
+use phastft::{fft_32_dit_with_planner_and_opts, fft_64_dit_with_planner_and_opts};
+use rand::distr::StandardUniform;
+use rand::prelude::Distribution;
+use rand::rngs::SmallRng;
+use rand::{Rng, SeedableRng};
+
+const LENGTHS: &[usize] = &[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]; // log2 of FFT size
+
+/// Produces `n` random complex values in split layout, returned as `(reals, imags)`.
+///
+/// `2 * n` samples are drawn from `StandardUniform` through a `SmallRng` obtained from
+/// `SmallRng::from_os_rng()`. Draws are consumed pairwise: draw `2 * i` becomes the real
+/// part and draw `2 * i + 1` the imaginary part of element `i`.
+fn generate_numbers<T: Float>(n: usize) -> (Vec<T>, Vec<T>)
+where
+    StandardUniform: Distribution<T>,
+{
+    let mut rng = SmallRng::from_os_rng();
+
+    // Materialise every draw first; the split below is then a plain slice walk.
+    let draws: Vec<T> = (&mut rng)
+        .sample_iter(StandardUniform)
+        .take(2 * n)
+        .collect();
+
+    // `chunks_exact(2)` yields exactly `n` pairs because `draws` holds `2 * n` values.
+    let (reals, imags): (Vec<T>, Vec<T>) = draws
+        .chunks_exact(2)
+        .map(|pair| (pair[0], pair[1]))
+        .unzip();
+
+    (reals, imags)
+}
+
+/// Criterion benchmark: f64 DIT FFT with a `PlannerDit64::new` planner (reported as
+/// "Heuristic") versus a planner built with `PlannerMode::Tune` (reported as "Tune").
+///
+/// One pair of measurements is taken per exponent `n` in `LENGTHS`, with transform length
+/// `2^n`. Each planner is constructed once per length, outside the timed routine, and both
+/// measurements share the same `Options::guess_options(len)` value.
+fn benchmark_heuristic_vs_tune_f64(c: &mut Criterion) {
+    let mut group = c.benchmark_group("Heuristic vs Tune f64");
+    group.plot_config(
+        criterion::PlotConfiguration::default().summary_scale(criterion::AxisScale::Logarithmic),
+    );
+    group.sample_size(20);
+
+    for &n in LENGTHS {
+        let len = 1usize << n;
+        group.throughput(Throughput::ElementsAndBytes {
+            elements: len as u64,
+            // One transform covers `len` reals plus `len` imags, hence the factor of two.
+            bytes: (2 * len * size_of::<f64>()) as u64,
+        });
+
+        let options = Options::guess_options(len);
+
+        // `iter_batched_ref` lends the buffers to the routine, so the two `Vec`s are freed
+        // after the measurement rather than inside it.
+        let planner_heuristic = PlannerDit64::new(len, Direction::Forward);
+        group.bench_function(BenchmarkId::new("Heuristic", len), |b| {
+            b.iter_batched_ref(
+                || generate_numbers::<f64>(len),
+                |(reals, imags)| {
+                    fft_64_dit_with_planner_and_opts(reals, imags, &planner_heuristic, &options);
+                },
+                BatchSize::SmallInput,
+            );
+        });
+
+        // Same workload, now driven by the planner created in `PlannerMode::Tune`.
+        let planner_tune = PlannerDit64::with_mode(len, Direction::Forward, PlannerMode::Tune);
+        group.bench_function(BenchmarkId::new("Tune", len), |b| {
+            b.iter_batched_ref(
+                || generate_numbers::<f64>(len),
+                |(reals, imags)| {
+                    fft_64_dit_with_planner_and_opts(reals, imags, &planner_tune, &options);
+                },
+                BatchSize::SmallInput,
+            );
+        });
+    }
+    group.finish();
+}
+
+/// Criterion benchmark: f32 DIT FFT with a `PlannerDit32::new` planner (reported as
+/// "Heuristic") versus a planner built with `PlannerMode::Tune` (reported as "Tune").
+///
+/// One pair of measurements is taken per exponent `n` in `LENGTHS`, with transform length
+/// `2^n`. Each planner is constructed once per length, outside the timed routine, and both
+/// measurements share the same `Options::guess_options(len)` value.
+fn benchmark_heuristic_vs_tune_f32(c: &mut Criterion) {
+    let mut group = c.benchmark_group("Heuristic vs Tune f32");
+    group.plot_config(
+        criterion::PlotConfiguration::default().summary_scale(criterion::AxisScale::Logarithmic),
+    );
+    group.sample_size(20);
+
+    for &n in LENGTHS {
+        let len = 1usize << n;
+        group.throughput(Throughput::ElementsAndBytes {
+            elements: len as u64,
+            // One transform covers `len` reals plus `len` imags, hence the factor of two.
+            bytes: (2 * len * size_of::<f32>()) as u64,
+        });
+
+        let options = Options::guess_options(len);
+
+        // `iter_batched_ref` lends the buffers to the routine, so the two `Vec`s are freed
+        // after the measurement rather than inside it.
+        let planner_heuristic = PlannerDit32::new(len, Direction::Forward);
+        group.bench_function(BenchmarkId::new("Heuristic", len), |b| {
+            b.iter_batched_ref(
+                || generate_numbers::<f32>(len),
+                |(reals, imags)| {
+                    fft_32_dit_with_planner_and_opts(reals, imags, &planner_heuristic, &options);
+                },
+                BatchSize::SmallInput,
+            );
+        });
+
+        // Same workload, now driven by the planner created in `PlannerMode::Tune`.
+        let planner_tune = PlannerDit32::with_mode(len, Direction::Forward, PlannerMode::Tune);
+        group.bench_function(BenchmarkId::new("Tune", len), |b| {
+            b.iter_batched_ref(
+                || generate_numbers::<f32>(len),
+                |(reals, imags)| {
+                    fft_32_dit_with_planner_and_opts(reals, imags, &planner_tune, &options);
+                },
+                BatchSize::SmallInput,
+            );
+        });
+    }
+    group.finish();
+}
+
+criterion_group!(
+    benches, // group name; passed to `criterion_main!` below
+    benchmark_heuristic_vs_tune_f64,
+    benchmark_heuristic_vs_tune_f32,
+);
+criterion_main!(benches); // Criterion generates `main`; the bench sets `harness = false`
diff --git a/src/algorithms/bravo.rs b/src/algorithms/bravo.rs
@@ -4,11 +4,9 @@
 /// by Lokhmotov and Mycroft (SPAA'07).
 ///
 /// The algorithm uses vector interleaving operations to perform bit-reversal permutation.
-/// For N = 2^n elements with W-element vectors, the algorithm performs log₂(N) rounds
+/// For `N = 2^n` elements with `W`-element vectors, the algorithm performs `log_2(N)` rounds
 /// of in-place interleave operations on pairs of vectors.
 ///
-/// The initial implementation was translated from mathematical notation in the paper
-/// to Rust by Claude 4.5 Opus.
 use fearless_simd::prelude::*;
 use fearless_simd::{f32x4, f32x8, f64x4, f64x8, Simd};
 

diff --git a/src/algorithms/dit.rs b/src/algorithms/dit.rs
@@ -17,6 +17,7 @@
 use fearless_simd::{dispatch, Simd};
 
 use crate::algorithms::bravo::{bit_rev_bravo_f32, bit_rev_bravo_f64};
+use crate::kernels::codelets::{fft_dit_codelet_32_f32, fft_dit_codelet_32_f64};
 use crate::kernels::dit::*;
 use crate::options::Options;
 use crate::parallel::run_maybe_in_parallel;
@@ -41,7 +42,16 @@ fn recursive_dit_fft_f64<S: Simd>(
     let log_size = size.ilog2() as usize;
 
     if size <= L1_BLOCK_SIZE {
-        for stage in 0..log_size {
+        // Use FFT-32 codelet to fuse stages 0-4 into a single pass per 32-element chunk
+        let start_stage = if planner.use_codelet_32 {
+            fft_dit_codelet_32_f64(simd, &mut reals[..size], &mut imags[..size]);
+            5
+        } else {
+            0
+        };
+
+        // Remaining stages use per-stage kernels
+        for stage in start_stage..log_size {
             stage_twiddle_idx = execute_dit_stage_f64(
                 simd,
                 &mut reals[..size],
@@ -98,7 +108,16 @@ fn recursive_dit_fft_f32<S: Simd>(
     let log_size = size.ilog2() as usize;
 
     if size <= L1_BLOCK_SIZE {
-        for stage in 0..log_size {
+        // Use FFT-32 codelet to fuse stages 0-4 into a single pass per 32-element chunk
+        let start_stage = if planner.use_codelet_32 {
+            fft_dit_codelet_32_f32(simd, &mut reals[..size], &mut imags[..size]);
+            5
+        } else {
+            0
+        };
+
+        // Remaining stages use per-stage kernels
+        for stage in start_stage..log_size {
             stage_twiddle_idx = execute_dit_stage_f32(
                 simd,
                 &mut reals[..size],