diff --git a/examples/04-simd-vectorization/bench/simd_bench.cpp b/examples/04-simd-vectorization/bench/simd_bench.cpp
index 863a7a9..9192c0b 100644
--- a/examples/04-simd-vectorization/bench/simd_bench.cpp
+++ b/examples/04-simd-vectorization/bench/simd_bench.cpp
@@ -14,8 +14,7 @@
 #include <random>
 #include <vector>
 
-#include "simd_utils.hpp"
-#include "simd_wrapper.hpp"
+#include <hpc/simd.hpp>
 
 namespace {
 
diff --git a/examples/04-simd-vectorization/include/simd_utils.hpp b/examples/04-simd-vectorization/include/simd_utils.hpp
index 11ef953..8025b4c 100644
--- a/examples/04-simd-vectorization/include/simd_utils.hpp
+++ b/examples/04-simd-vectorization/include/simd_utils.hpp
@@ -1,353 +1,2 @@
-/**
- * @file simd_utils.hpp
- * @brief SIMD utility functions and feature detection
- *
- * This header provides common utilities for SIMD programming including
- * feature detection, alignment helpers, and basic SIMD operations.
- *
- * All functionality is header-only for ease of integration.
- *
- * Validates:
- *   - Requirement 4.1: Automatic Vectorization Patterns
- *   - Requirement 4.2: SIMD Intrinsics Introduction
- *   - Requirement 4.3: SIMD Abstraction Wrappers
- *   - Requirement 4.4: CPU Capability Detection
- *   - Requirement 4.5: Scalar vs Vectorized Benchmark
- *   - Requirement 4.6: Vectorization Reports
- */
-
 #pragma once
-
-#include <cstddef>
-#include <cstdint>
-#include <cstdlib>
-#include <limits>
-#include <memory>
-#include <new>
-#include <vector>
-
-// Intel SIMD intrinsics - always include on x86 for target attribute dispatch
-// The target attribute controls which instructions are actually used
-#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
-#include <immintrin.h>
-#endif
-
-// Feature detection macros for compile-time checks
-#ifdef __SSE2__
-#define HPC_HAS_SSE2 1
-#endif
-
-#ifdef __AVX__
-#define HPC_HAS_AVX 1
-#endif
-
-#ifdef __AVX2__
-#define HPC_HAS_AVX2 1
-#endif
-
-#ifdef __AVX512F__
-#define HPC_HAS_AVX512 1
-#endif
-
-namespace hpc::simd {
-
-/**
- * @brief Check if a pointer is aligned to the specified boundary
- */
-inline bool is_aligned(const void* ptr, size_t alignment) {
-    return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
-}
-
-/**
- * @brief Align a size up to the next multiple of alignment
- */
-inline size_t align_up(size_t size, size_t alignment) {
-    return (size + alignment - 1) & ~(alignment - 1);
-}
-
-/**
- * @brief Get the optimal SIMD alignment for the current platform
- */
-inline size_t get_simd_alignment() {
-#ifdef HPC_HAS_AVX512
-    return 64;  // AVX-512 uses 64-byte alignment
-#elif defined(HPC_HAS_AVX) || defined(HPC_HAS_AVX2)
-    return 32;  // AVX/AVX2 uses 32-byte alignment
-#elif defined(HPC_HAS_SSE2)
-    return 16;  // SSE uses 16-byte alignment
-#else
-    return sizeof(void*);  // Fallback to pointer alignment
-#endif
-}
-
-/**
- * @brief SIMD-width aligned allocator for SIMD operations
- *
- * Uses runtime CPU feature detection to pick the optimal alignment
- * (16/32/64 bytes) for the current platform's SIMD width.
- *
- * See CONTEXT.md: SIMD-width allocator for the domain rationale.
- * For cache-line alignment, see hpc::memory::AlignedAllocator in memory_utils.hpp.
- */
-template <typename T>
-class AlignedAllocator {
-public:
-    using value_type = T;
-    using size_type = std::size_t;
-    using difference_type = std::ptrdiff_t;
-
-    template <typename U>
-    struct rebind {
-        using other = AlignedAllocator<U>;
-    };
-
-    AlignedAllocator() = default;
-
-    template <typename U>
-    AlignedAllocator(const AlignedAllocator<U>&) {}
-
-    T* allocate(size_type n) {
-        // Overflow protection
-        if (n > std::numeric_limits<size_type>::max() / sizeof(T)) {
-            throw std::bad_alloc();
-        }
-
-        if (n == 0) {
-            return nullptr;
-        }
-
-        const size_t alignment = get_simd_alignment();
-        const size_t size = n * sizeof(T);
-
-        void* ptr = nullptr;
-#if defined(_MSC_VER)
-        ptr = _aligned_malloc(size, alignment);
-#else
-        if (posix_memalign(&ptr, alignment, size) != 0) {
-            ptr = nullptr;
-        }
-#endif
-        if (!ptr) {
-            throw std::bad_alloc();
-        }
-        return static_cast<T*>(ptr);
-    }
-
-    void deallocate(T* p, size_type) {
-        if (p == nullptr) {
-            return;
-        }
-#if defined(_MSC_VER)
-        _aligned_free(p);
-#else
-        free(p);
-#endif
-    }
-
-    template <typename U>
-    bool operator==(const AlignedAllocator<U>&) const {
-        return true;
-    }
-
-    template <typename U>
-    bool operator!=(const AlignedAllocator<U>&) const {
-        return false;
-    }
-};
-
-/**
- * @brief Backward-compatible alias for AlignedAllocator
- * @deprecated Use AlignedAllocator<T> directly
- */
-template <typename T>
-using aligned_allocator [[deprecated("Use AlignedAllocator<T> directly")]] = AlignedAllocator<T>;
-
-/**
- * @brief Alias for AlignedAllocator with SIMD-specific naming
- */
-template <typename T>
-using simd_allocator = AlignedAllocator<T>;
-
-/**
- * @brief Aligned vector type for SIMD operations
- */
-template <typename T>
-using aligned_vector = std::vector<T, AlignedAllocator<T>>;
-
-/**
- * @brief Aligned buffer type alias for compatibility
- */
-template <typename T>
-using AlignedBuffer = aligned_vector<T>;
-
-/**
- * @brief Create an aligned vector with the specified size
- */
-template <typename T>
-aligned_vector<T> make_aligned_vector(size_t size) {
-    return aligned_vector<T>(size);
-}
-
-/**
- * @brief Create an aligned vector with the specified size and initial value
- */
-template <typename T>
-aligned_vector<T> make_aligned_vector(size_t size, const T& value) {
-    return aligned_vector<T>(size, value);
-}
-
-/**
- * @brief SIMD capability levels
- */
-enum class SIMDLevel { Scalar, SSE2, AVX, AVX2, AVX512 };
-
-/**
- * @brief Detect the highest available SIMD level
- */
-inline SIMDLevel detect_simd_level() {
-#ifdef HPC_HAS_AVX512
-    return SIMDLevel::AVX512;
-#elif defined(HPC_HAS_AVX2)
-    return SIMDLevel::AVX2;
-#elif defined(HPC_HAS_AVX)
-    return SIMDLevel::AVX;
-#elif defined(HPC_HAS_SSE2)
-    return SIMDLevel::SSE2;
-#else
-    return SIMDLevel::Scalar;
-#endif
-}
-
-/**
- * @brief Get the name of a SIMD level
- */
-inline const char* simd_level_name(SIMDLevel level) {
-    switch (level) {
-        case SIMDLevel::AVX512:
-            return "AVX-512";
-        case SIMDLevel::AVX2:
-            return "AVX2";
-        case SIMDLevel::AVX:
-            return "AVX";
-        case SIMDLevel::SSE2:
-            return "SSE2";
-        case SIMDLevel::Scalar:
-            return "Scalar";
-        default:
-            return "Unknown";
-    }
-}
-
-/**
- * @brief Get the vector width in bytes for a SIMD level
- */
-inline size_t simd_vector_width(SIMDLevel level) {
-    switch (level) {
-        case SIMDLevel::AVX512:
-            return 64;
-        case SIMDLevel::AVX2:
-            return 32;
-        case SIMDLevel::AVX:
-            return 32;
-        case SIMDLevel::SSE2:
-            return 16;
-        case SIMDLevel::Scalar:
-            return sizeof(float);
-        default:
-            return sizeof(float);
-    }
-}
-
-//------------------------------------------------------------------------------
-// Runtime SIMD Dispatch
-//------------------------------------------------------------------------------
-
-/**
- * @brief Generic CPU capability resolver for multi-version functions.
- *
- * Given scalar, SSE2, AVX2 and AVX-512 function pointers, returns the
- * best available one based on runtime CPU feature detection.
- *
- * @tparam Func Function pointer type (must be identical for all arguments)
- * @return Best available implementation pointer
- */
-template <typename Func>
-Func resolve_best(Func scalar, Func sse2, Func avx2, Func avx512) {
-#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
-    __builtin_cpu_init();
-    if (avx512 && __builtin_cpu_supports("avx512f"))
-        return avx512;
-    if (avx2 && __builtin_cpu_supports("avx2"))
-        return avx2;
-    if (sse2 && __builtin_cpu_supports("sse2"))
-        return sse2;
-#else
-    (void)sse2;
-    (void)avx2;
-    (void)avx512;
-#endif
-    return scalar;
-}
-
-namespace detail {
-
-using AddArraysFn = void (*)(const float* a, const float* b, float* c, size_t n);
-
-inline void add_arrays_scalar(const float* a, const float* b, float* c, size_t n) {
-    for (size_t i = 0; i < n; ++i) {
-        c[i] = a[i] + b[i];
-    }
-}
-
-#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
-
-__attribute__((target("sse2"))) inline void add_arrays_sse2(const float* a, const float* b,
-                                                            float* c, size_t n) {
-    size_t i = 0;
-    for (; i + 4 <= n; i += 4) {
-        const __m128 va = _mm_loadu_ps(&a[i]);
-        const __m128 vb = _mm_loadu_ps(&b[i]);
-        const __m128 vc = _mm_add_ps(va, vb);
-        _mm_storeu_ps(&c[i], vc);
-    }
-    for (; i < n; ++i) {
-        c[i] = a[i] + b[i];
-    }
-}
-
-__attribute__((target("avx2,avx"))) inline void add_arrays_avx2(const float* a, const float* b,
-                                                                float* c, size_t n) {
-    size_t i = 0;
-    for (; i + 8 <= n; i += 8) {
-        const __m256 va = _mm256_loadu_ps(&a[i]);
-        const __m256 vb = _mm256_loadu_ps(&b[i]);
-        const __m256 vc = _mm256_add_ps(va, vb);
-        _mm256_storeu_ps(&c[i], vc);
-    }
-    add_arrays_sse2(a + i, b + i, c + i, n - i);
-}
-
-#endif
-
-}  // namespace detail
-
-/**
- * @brief Add two arrays using the best available SIMD path at runtime.
- *
- * Automatically selects AVX2, SSE2, or scalar implementation based on
- * CPU capabilities. The resolved function pointer is cached in a
- * static local for thread-safe, single-shot initialization.
- */
-inline void dispatch_add_arrays(const float* a, const float* b, float* c, size_t n) {
-    using Fn = detail::AddArraysFn;
-    static const Fn dispatch = resolve_best<Fn>(&detail::add_arrays_scalar,
-#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
-                                                &detail::add_arrays_sse2, &detail::add_arrays_avx2,
-#else
-                                                nullptr, nullptr,
-#endif
-                                                nullptr);
-    dispatch(a, b, c, n);
-}
-
-}  // namespace hpc::simd
+#include <hpc/simd.hpp>
diff --git a/examples/04-simd-vectorization/include/simd_wrapper.hpp b/examples/04-simd-vectorization/include/simd_wrapper.hpp
index d540a68..c8bb5d7 100644
--- a/examples/04-simd-vectorization/include/simd_wrapper.hpp
+++ b/examples/04-simd-vectorization/include/simd_wrapper.hpp
@@ -1,491 +1,3 @@
 #pragma once
 
-/**
- * @file simd_wrapper.hpp
- * @brief C++ wrapper for SIMD intrinsics providing a clean, portable interface
- *
- * This wrapper provides:
- * 1. Type-safe SIMD vector types
- * 2. Operator overloading for natural syntax
- * 3. Automatic fallback to scalar operations
- * 4. Compile-time SIMD level selection
- */
-
-#include <algorithm>
-#include <cmath>
-
-#include "simd_utils.hpp"
-
-#ifdef HPC_HAS_SSE2
-#include <emmintrin.h>
-#endif
-
-#ifdef HPC_HAS_AVX
-#include <immintrin.h>
-#endif
-
-namespace hpc::simd {
-
-// ============================================================================
-// Forward declarations
-// ============================================================================
-
-template <typename T, size_t Width>
-class SimdVec;
-
-// ============================================================================
-// Scalar fallback implementation
-// ============================================================================
-
-template <typename T, size_t Width>
-class SimdVecScalar {
-public:
-    static constexpr size_t width = Width;
-    using value_type = T;
-
-    T data[Width];
-
-    SimdVecScalar() = default;
-
-    explicit SimdVecScalar(T val) {
-        for (size_t i = 0; i < Width; ++i)
-            data[i] = val;
-    }
-
-    SimdVecScalar(const T* ptr) {
-        for (size_t i = 0; i < Width; ++i)
-            data[i] = ptr[i];
-    }
-
-    void store(T* ptr) const {
-        for (size_t i = 0; i < Width; ++i)
-            ptr[i] = data[i];
-    }
-
-    T operator[](size_t i) const { return data[i]; }
-    T& operator[](size_t i) { return data[i]; }
-
-    SimdVecScalar operator+(const SimdVecScalar& other) const {
-        SimdVecScalar result;
-        for (size_t i = 0; i < Width; ++i)
-            result.data[i] = data[i] + other.data[i];
-        return result;
-    }
-
-    SimdVecScalar operator-(const SimdVecScalar& other) const {
-        SimdVecScalar result;
-        for (size_t i = 0; i < Width; ++i)
-            result.data[i] = data[i] - other.data[i];
-        return result;
-    }
-
-    SimdVecScalar operator*(const SimdVecScalar& other) const {
-        SimdVecScalar result;
-        for (size_t i = 0; i < Width; ++i)
-            result.data[i] = data[i] * other.data[i];
-        return result;
-    }
-
-    SimdVecScalar operator/(const SimdVecScalar& other) const {
-        SimdVecScalar result;
-        for (size_t i = 0; i < Width; ++i)
-            result.data[i] = data[i] / other.data[i];
-        return result;
-    }
-
-    SimdVecScalar& operator+=(const SimdVecScalar& other) {
-        for (size_t i = 0; i < Width; ++i)
-            data[i] += other.data[i];
-        return *this;
-    }
-
-    SimdVecScalar& operator-=(const SimdVecScalar& other) {
-        for (size_t i = 0; i < Width; ++i)
-            data[i] -= other.data[i];
-        return *this;
-    }
-
-    SimdVecScalar& operator*=(const SimdVecScalar& other) {
-        for (size_t i = 0; i < Width; ++i)
-            data[i] *= other.data[i];
-        return *this;
-    }
-
-    T horizontal_sum() const {
-        T sum = data[0];
-        for (size_t i = 1; i < Width; ++i)
-            sum += data[i];
-        return sum;
-    }
-
-    static SimdVecScalar fmadd(const SimdVecScalar& a, const SimdVecScalar& b,
-                               const SimdVecScalar& c) {
-        SimdVecScalar result;
-        for (size_t i = 0; i < Width; ++i)
-            result.data[i] = a.data[i] * b.data[i] + c.data[i];
-        return result;
-    }
-
-    SimdVecScalar sqrt() const {
-        SimdVecScalar result;
-        for (size_t i = 0; i < Width; ++i)
-            result.data[i] = std::sqrt(data[i]);
-        return result;
-    }
-
-    SimdVecScalar min(const SimdVecScalar& other) const {
-        SimdVecScalar result;
-        for (size_t i = 0; i < Width; ++i)
-            result.data[i] = std::min(data[i], other.data[i]);
-        return result;
-    }
-
-    SimdVecScalar max(const SimdVecScalar& other) const {
-        SimdVecScalar result;
-        for (size_t i = 0; i < Width; ++i)
-            result.data[i] = std::max(data[i], other.data[i]);
-        return result;
-    }
-};
-
-// ============================================================================
-// SSE Implementation (128-bit, 4 floats)
-// ============================================================================
-
-#ifdef HPC_HAS_SSE2
-
-template <>
-class SimdVec<float, 4> {
-public:
-    static constexpr size_t width = 4;
-    using value_type = float;
-
-    __m128 data;
-
-    SimdVec() : data(_mm_setzero_ps()) {}
-
-    explicit SimdVec(__m128 v) : data(v) {}
-
-    explicit SimdVec(float val) : data(_mm_set1_ps(val)) {}
-
-    SimdVec(const float* ptr) : data(_mm_loadu_ps(ptr)) {}
-
-    static SimdVec load_aligned(const float* ptr) { return SimdVec(_mm_load_ps(ptr)); }
-
-    void store(float* ptr) const { _mm_storeu_ps(ptr, data); }
-
-    void store_aligned(float* ptr) const { _mm_store_ps(ptr, data); }
-
-    float operator[](size_t i) const {
-        alignas(16) float tmp[4];
-        _mm_store_ps(tmp, data);
-        return tmp[i];
-    }
-
-    SimdVec operator+(const SimdVec& other) const { return SimdVec(_mm_add_ps(data, other.data)); }
-
-    SimdVec operator-(const SimdVec& other) const { return SimdVec(_mm_sub_ps(data, other.data)); }
-
-    SimdVec operator*(const SimdVec& other) const { return SimdVec(_mm_mul_ps(data, other.data)); }
-
-    SimdVec operator/(const SimdVec& other) const { return SimdVec(_mm_div_ps(data, other.data)); }
-
-    SimdVec& operator+=(const SimdVec& other) {
-        data = _mm_add_ps(data, other.data);
-        return *this;
-    }
-
-    SimdVec& operator-=(const SimdVec& other) {
-        data = _mm_sub_ps(data, other.data);
-        return *this;
-    }
-
-    SimdVec& operator*=(const SimdVec& other) {
-        data = _mm_mul_ps(data, other.data);
-        return *this;
-    }
-
-    float horizontal_sum() const {
-        __m128 shuf = _mm_shuffle_ps(data, data, _MM_SHUFFLE(2, 3, 0, 1));
-        __m128 sums = _mm_add_ps(data, shuf);
-        shuf = _mm_movehl_ps(shuf, sums);
-        sums = _mm_add_ss(sums, shuf);
-        return _mm_cvtss_f32(sums);
-    }
-
-    static SimdVec fmadd(const SimdVec& a, const SimdVec& b, const SimdVec& c) {
-#ifdef HPC_HAS_AVX2
-        return SimdVec(_mm_fmadd_ps(a.data, b.data, c.data));
-#else
-        return SimdVec(_mm_add_ps(_mm_mul_ps(a.data, b.data), c.data));
-#endif
-    }
-
-    SimdVec sqrt() const { return SimdVec(_mm_sqrt_ps(data)); }
-
-    SimdVec min(const SimdVec& other) const { return SimdVec(_mm_min_ps(data, other.data)); }
-
-    SimdVec max(const SimdVec& other) const { return SimdVec(_mm_max_ps(data, other.data)); }
-};
-
-#endif  // HPC_HAS_SSE2
-
-// ============================================================================
-// AVX2 Implementation (256-bit, 8 floats)
-// ============================================================================
-
-#ifdef HPC_HAS_AVX2
-
-template <>
-class SimdVec<float, 8> {
-public:
-    static constexpr size_t width = 8;
-    using value_type = float;
-
-    __m256 data;
-
-    SimdVec() : data(_mm256_setzero_ps()) {}
-
-    explicit SimdVec(__m256 v) : data(v) {}
-
-    explicit SimdVec(float val) : data(_mm256_set1_ps(val)) {}
-
-    SimdVec(const float* ptr) : data(_mm256_loadu_ps(ptr)) {}
-
-    static SimdVec load_aligned(const float* ptr) { return SimdVec(_mm256_load_ps(ptr)); }
-
-    void store(float* ptr) const { _mm256_storeu_ps(ptr, data); }
-
-    void store_aligned(float* ptr) const { _mm256_store_ps(ptr, data); }
-
-    float operator[](size_t i) const {
-        alignas(32) float tmp[8];
-        _mm256_store_ps(tmp, data);
-        return tmp[i];
-    }
-
-    SimdVec operator+(const SimdVec& other) const {
-        return SimdVec(_mm256_add_ps(data, other.data));
-    }
-
-    SimdVec operator-(const SimdVec& other) const {
-        return SimdVec(_mm256_sub_ps(data, other.data));
-    }
-
-    SimdVec operator*(const SimdVec& other) const {
-        return SimdVec(_mm256_mul_ps(data, other.data));
-    }
-
-    SimdVec operator/(const SimdVec& other) const {
-        return SimdVec(_mm256_div_ps(data, other.data));
-    }
-
-    SimdVec& operator+=(const SimdVec& other) {
-        data = _mm256_add_ps(data, other.data);
-        return *this;
-    }
-
-    SimdVec& operator-=(const SimdVec& other) {
-        data = _mm256_sub_ps(data, other.data);
-        return *this;
-    }
-
-    SimdVec& operator*=(const SimdVec& other) {
-        data = _mm256_mul_ps(data, other.data);
-        return *this;
-    }
-
-    float horizontal_sum() const {
-        __m128 hi = _mm256_extractf128_ps(data, 1);
-        __m128 lo = _mm256_castps256_ps128(data);
-        __m128 sum128 = _mm_add_ps(hi, lo);
-        __m128 shuf = _mm_shuffle_ps(sum128, sum128, _MM_SHUFFLE(2, 3, 0, 1));
-        sum128 = _mm_add_ps(sum128, shuf);
-        shuf = _mm_movehl_ps(shuf, sum128);
-        sum128 = _mm_add_ss(sum128, shuf);
-        return _mm_cvtss_f32(sum128);
-    }
-
-    static SimdVec fmadd(const SimdVec& a, const SimdVec& b, const SimdVec& c) {
-        return SimdVec(_mm256_fmadd_ps(a.data, b.data, c.data));
-    }
-
-    SimdVec sqrt() const { return SimdVec(_mm256_sqrt_ps(data)); }
-
-    SimdVec min(const SimdVec& other) const { return SimdVec(_mm256_min_ps(data, other.data)); }
-
-    SimdVec max(const SimdVec& other) const { return SimdVec(_mm256_max_ps(data, other.data)); }
-};
-
-#endif  // HPC_HAS_AVX2
-
-// ============================================================================
-// AVX-512 Implementation (512-bit, 16 floats)
-// ============================================================================
-
-#ifdef HPC_HAS_AVX512
-
-template <>
-class SimdVec<float, 16> {
-public:
-    static constexpr size_t width = 16;
-    using value_type = float;
-
-    __m512 data;
-
-    SimdVec() : data(_mm512_setzero_ps()) {}
-
-    explicit SimdVec(__m512 v) : data(v) {}
-
-    explicit SimdVec(float val) : data(_mm512_set1_ps(val)) {}
-
-    SimdVec(const float* ptr) : data(_mm512_loadu_ps(ptr)) {}
-
-    static SimdVec load_aligned(const float* ptr) { return SimdVec(_mm512_load_ps(ptr)); }
-
-    void store(float* ptr) const { _mm512_storeu_ps(ptr, data); }
-
-    void store_aligned(float* ptr) const { _mm512_store_ps(ptr, data); }
-
-    float operator[](size_t i) const {
-        alignas(64) float tmp[16];
-        _mm512_store_ps(tmp, data);
-        return tmp[i];
-    }
-
-    SimdVec operator+(const SimdVec& other) const {
-        return SimdVec(_mm512_add_ps(data, other.data));
-    }
-
-    SimdVec operator-(const SimdVec& other) const {
-        return SimdVec(_mm512_sub_ps(data, other.data));
-    }
-
-    SimdVec operator*(const SimdVec& other) const {
-        return SimdVec(_mm512_mul_ps(data, other.data));
-    }
-
-    SimdVec operator/(const SimdVec& other) const {
-        return SimdVec(_mm512_div_ps(data, other.data));
-    }
-
-    SimdVec& operator+=(const SimdVec& other) {
-        data = _mm512_add_ps(data, other.data);
-        return *this;
-    }
-
-    SimdVec& operator-=(const SimdVec& other) {
-        data = _mm512_sub_ps(data, other.data);
-        return *this;
-    }
-
-    SimdVec& operator*=(const SimdVec& other) {
-        data = _mm512_mul_ps(data, other.data);
-        return *this;
-    }
-
-    float horizontal_sum() const { return _mm512_reduce_add_ps(data); }
-
-    static SimdVec fmadd(const SimdVec& a, const SimdVec& b, const SimdVec& c) {
-        return SimdVec(_mm512_fmadd_ps(a.data, b.data, c.data));
-    }
-
-    SimdVec sqrt() const { return SimdVec(_mm512_sqrt_ps(data)); }
-
-    SimdVec min(const SimdVec& other) const { return SimdVec(_mm512_min_ps(data, other.data)); }
-
-    SimdVec max(const SimdVec& other) const { return SimdVec(_mm512_max_ps(data, other.data)); }
-};
-
-#endif  // HPC_HAS_AVX512
-
-// ============================================================================
-// Type aliases for convenience
-// ============================================================================
-
-#ifdef HPC_HAS_AVX512
-using FloatVec = SimdVec<float, 16>;
-constexpr size_t FLOAT_VEC_WIDTH = 16;
-#elif defined(HPC_HAS_AVX2)
-using FloatVec = SimdVec<float, 8>;
-constexpr size_t FLOAT_VEC_WIDTH = 8;
-#elif defined(HPC_HAS_SSE2)
-using FloatVec = SimdVec<float, 4>;
-constexpr size_t FLOAT_VEC_WIDTH = 4;
-#else
-using FloatVec = SimdVecScalar<float, 4>;
-constexpr size_t FLOAT_VEC_WIDTH = 4;
-#endif
-
-// ============================================================================
-// High-level operations using the wrapper
-// ============================================================================
-
-/// Add two arrays using SIMD wrapper
-inline void add_arrays_wrapped(const float* a, const float* b, float* c, size_t n) {
-    size_t i = 0;
-    for (; i + FLOAT_VEC_WIDTH <= n; i += FLOAT_VEC_WIDTH) {
-        FloatVec va(&a[i]);
-        FloatVec vb(&b[i]);
-        FloatVec vc = va + vb;
-        vc.store(&c[i]);
-    }
-    for (; i < n; ++i) {
-        c[i] = a[i] + b[i];
-    }
-}
-
-/// Dot product using SIMD wrapper
-inline float dot_product_wrapped(const float* a, const float* b, size_t n) {
-    FloatVec sum(0.0f);
-    size_t i = 0;
-
-    for (; i + FLOAT_VEC_WIDTH <= n; i += FLOAT_VEC_WIDTH) {
-        FloatVec va(&a[i]);
-        FloatVec vb(&b[i]);
-        sum = FloatVec::fmadd(va, vb, sum);
-    }
-
-    float result = sum.horizontal_sum();
-
-    for (; i < n; ++i) {
-        result += a[i] * b[i];
-    }
-
-    return result;
-}
-
-/// Scale array by scalar using SIMD wrapper
-inline void scale_array_wrapped(float* arr, float scalar, size_t n) {
-    FloatVec vscalar(scalar);
-    size_t i = 0;
-
-    for (; i + FLOAT_VEC_WIDTH <= n; i += FLOAT_VEC_WIDTH) {
-        FloatVec v(&arr[i]);
-        v *= vscalar;
-        v.store(&arr[i]);
-    }
-
-    for (; i < n; ++i) {
-        arr[i] *= scalar;
-    }
-}
-
-/// Clamp array values using SIMD wrapper
-inline void clamp_array_wrapped(float* arr, float min_val, float max_val, size_t n) {
-    FloatVec vmin(min_val);
-    FloatVec vmax(max_val);
-    size_t i = 0;
-
-    for (; i + FLOAT_VEC_WIDTH <= n; i += FLOAT_VEC_WIDTH) {
-        FloatVec v(&arr[i]);
-        v = v.max(vmin).min(vmax);
-        v.store(&arr[i]);
-    }
-
-    for (; i < n; ++i) {
-        arr[i] = std::max(min_val, std::min(max_val, arr[i]));
-    }
-}
-
-}  // namespace hpc::simd
+#include <hpc/simd.hpp>
diff --git a/examples/04-simd-vectorization/src/auto_vectorize.cpp b/examples/04-simd-vectorization/src/auto_vectorize.cpp
index 77a4d23..a6671ee 100644
--- a/examples/04-simd-vectorization/src/auto_vectorize.cpp
+++ b/examples/04-simd-vectorization/src/auto_vectorize.cpp
@@ -15,7 +15,7 @@
 #include <numeric>
 #include <vector>
 
-#include "simd_utils.hpp"
+#include <hpc/simd.hpp>
 
 namespace hpc::simd {
 
diff --git a/examples/04-simd-vectorization/src/dispatch_example_main.cpp b/examples/04-simd-vectorization/src/dispatch_example_main.cpp
index aa698e8..1cf93af 100644
--- a/examples/04-simd-vectorization/src/dispatch_example_main.cpp
+++ b/examples/04-simd-vectorization/src/dispatch_example_main.cpp
@@ -1,6 +1,6 @@
 #include <iostream>
 
-#include "simd_utils.hpp"
+#include <hpc/simd.hpp>
 
 int main() {
     constexpr size_t kSize = 8;
diff --git a/examples/04-simd-vectorization/src/intrinsics_intro.cpp b/examples/04-simd-vectorization/src/intrinsics_intro.cpp
index 51530a1..b18fb45 100644
--- a/examples/04-simd-vectorization/src/intrinsics_intro.cpp
+++ b/examples/04-simd-vectorization/src/intrinsics_intro.cpp
@@ -12,7 +12,7 @@
 #include <cmath>
 #include <iostream>
 
-#include "simd_utils.hpp"
+#include <hpc/simd.hpp>
 
 // Include SIMD headers based on availability
 #ifdef HPC_HAS_SSE2
@@ -253,7 +253,7 @@ float dot_product_avx512(const float* a, const float* b, size_t n) {
 // Unified interface with runtime dispatch
 // ============================================================================
 
-void add_arrays(const float* a, const float* b, float* c, size_t n) {
+void demo_add_arrays(const float* a, const float* b, float* c, size_t n) {
 #ifdef HPC_HAS_AVX512
     add_arrays_avx512(a, b, c, n);
 #elif defined(HPC_HAS_AVX2)
@@ -265,7 +265,7 @@ void add_arrays(const float* a, const float* b, float* c, size_t n) {
 #endif
 }
 
-void multiply_arrays(const float* a, const float* b, float* c, size_t n) {
+void demo_multiply_arrays(const float* a, const float* b, float* c, size_t n) {
 #ifdef HPC_HAS_AVX512
     multiply_arrays_avx512(a, b, c, n);
 #elif defined(HPC_HAS_AVX2)
@@ -277,7 +277,7 @@ void multiply_arrays(const float* a, const float* b, float* c, size_t n) {
 #endif
 }
 
-float dot_product(const float* a, const float* b, size_t n) {
+float demo_dot_product(const float* a, const float* b, size_t n) {
 #ifdef HPC_HAS_AVX512
     return dot_product_avx512(a, b, n);
 #elif defined(HPC_HAS_AVX2)
@@ -347,7 +347,7 @@ void demonstrate_intrinsics() {
 
     // Verify correctness
     float scalar_result = dot_product_scalar(a.data(), b.data(), N);
-    float simd_result = dot_product(a.data(), b.data(), N);
+    float simd_result = demo_dot_product(a.data(), b.data(), N);
     std::cout << std::endl << "Correctness check:" << std::endl;
     std::cout << "Scalar dot product: " << scalar_result << std::endl;
     std::cout << "SIMD dot product:   " << simd_result << std::endl;
diff --git a/include/hpc/core.hpp b/include/hpc/core.hpp
index 5c2ddd5..a66a549 100644
--- a/include/hpc/core.hpp
+++ b/include/hpc/core.hpp
@@ -14,6 +14,10 @@
 #include <cstddef>
 #include <thread>
 
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
 // POSIX headers for sysconf
 #if defined(__unix__) || defined(__APPLE__)
 #include <unistd.h>
diff --git a/include/hpc/simd.hpp b/include/hpc/simd.hpp
new file mode 100644
index 0000000..734e971
--- /dev/null
+++ b/include/hpc/simd.hpp
@@ -0,0 +1,739 @@
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <new>
+#include <vector>
+
+#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
+#include <immintrin.h>
+#endif
+
+namespace hpc::simd {
+
+#ifdef __SSE2__
+#define HPC_SIMD_HAS_SSE2 1
+#define HPC_HAS_SSE2 1
+#endif
+
+#ifdef __AVX__
+#define HPC_SIMD_HAS_AVX 1
+#define HPC_HAS_AVX 1
+#endif
+
+#ifdef __AVX2__
+#define HPC_SIMD_HAS_AVX2 1
+#define HPC_HAS_AVX2 1
+#endif
+
+#ifdef __AVX512F__
+#define HPC_SIMD_HAS_AVX512 1
+#define HPC_HAS_AVX512 1
+#endif
+
+enum class SIMDLevel { Scalar, SSE2, AVX, AVX2, AVX512 };
+
+inline SIMDLevel detect_simd_level() {
+#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
+    __builtin_cpu_init();
+    if (__builtin_cpu_supports("avx512f")) {
+        return SIMDLevel::AVX512;
+    }
+    if (__builtin_cpu_supports("avx2")) {
+        return SIMDLevel::AVX2;
+    }
+    if (__builtin_cpu_supports("avx")) {
+        return SIMDLevel::AVX;
+    }
+    if (__builtin_cpu_supports("sse2")) {
+        return SIMDLevel::SSE2;
+    }
+    return SIMDLevel::Scalar;
+#elif defined(HPC_SIMD_HAS_AVX512)
+    return SIMDLevel::AVX512;
+#elif defined(HPC_SIMD_HAS_AVX2)
+    return SIMDLevel::AVX2;
+#elif defined(HPC_SIMD_HAS_AVX)
+    return SIMDLevel::AVX;
+#elif defined(HPC_SIMD_HAS_SSE2)
+    return SIMDLevel::SSE2;
+#else
+    return SIMDLevel::Scalar;
+#endif
+}
+
+inline const char* simd_level_name(SIMDLevel level) {
+    switch (level) {
+        case SIMDLevel::AVX512:
+            return "AVX-512";
+        case SIMDLevel::AVX2:
+            return "AVX2";
+        case SIMDLevel::AVX:
+            return "AVX";
+        case SIMDLevel::SSE2:
+            return "SSE2";
+        case SIMDLevel::Scalar:
+        default:
+            return "Scalar";
+    }
+}
+
+inline std::size_t simd_vector_width(SIMDLevel level) {
+    switch (level) {
+        case SIMDLevel::AVX512:
+            return 64;
+        case SIMDLevel::AVX2:
+        case SIMDLevel::AVX:
+            return 32;
+        case SIMDLevel::SSE2:
+            return 16;
+        case SIMDLevel::Scalar:
+        default:
+            return sizeof(float);
+    }
+}
+
+inline std::size_t get_simd_alignment() {
+    switch (detect_simd_level()) {
+        case SIMDLevel::AVX512:
+            return 64;
+        case SIMDLevel::AVX2:
+        case SIMDLevel::AVX:
+            return 32;
+        case SIMDLevel::SSE2:
+            return 16;
+        case SIMDLevel::Scalar:
+        default:
+            return sizeof(void*);
+    }
+}
+
+inline bool is_aligned(const void* ptr, std::size_t alignment) {
+    return reinterpret_cast<std::uintptr_t>(ptr) % alignment == 0;
+}
+
+inline std::size_t align_up(std::size_t size, std::size_t alignment) {
+    return (size + alignment - 1) & ~(alignment - 1);
+}
+
+template <typename T>
+class AlignedAllocator {
+public:
+    using value_type = T;
+    using size_type = std::size_t;
+    using difference_type = std::ptrdiff_t;
+
+    template <typename U>
+    struct rebind {
+        using other = AlignedAllocator<U>;
+    };
+
+    AlignedAllocator() = default;
+
+    template <typename U>
+    AlignedAllocator(const AlignedAllocator<U>&) {}
+
+    T* allocate(size_type n) {
+        if (n > std::numeric_limits<size_type>::max() / sizeof(T)) {
+            throw std::bad_alloc();
+        }
+        if (n == 0) {
+            return nullptr;
+        }
+
+        void* ptr = nullptr;
+        const std::size_t alignment = get_simd_alignment();
+        const std::size_t size = n * sizeof(T);
+#if defined(_MSC_VER)
+        ptr = _aligned_malloc(size, alignment);
+#else
+        if (posix_memalign(&ptr, alignment, size) != 0) {
+            ptr = nullptr;
+        }
+#endif
+        if (ptr == nullptr) {
+            throw std::bad_alloc();
+        }
+        return static_cast<T*>(ptr);
+    }
+
+    void deallocate(T* ptr, size_type) {
+        if (ptr == nullptr) {
+            return;
+        }
+#if defined(_MSC_VER)
+        _aligned_free(ptr);
+#else
+        free(ptr);
+#endif
+    }
+
+    template <typename U>
+    bool operator==(const AlignedAllocator<U>&) const {
+        return true;
+    }
+
+    template <typename U>
+    bool operator!=(const AlignedAllocator<U>&) const {
+        return false;
+    }
+};
+
+template <typename T>
+using aligned_vector = std::vector<T, AlignedAllocator<T>>;
+
+template <typename T>
+using AlignedBuffer = aligned_vector<T>;
+
+template <typename T>
+using aligned_allocator [[deprecated("Use AlignedAllocator<T> directly")]] = AlignedAllocator<T>;
+
+template <typename T>
+using simd_allocator = AlignedAllocator<T>;
+
+template <typename T>
+aligned_vector<T> make_aligned_vector(std::size_t size) {
+    return aligned_vector<T>(size);
+}
+
+template <typename T>
+aligned_vector<T> make_aligned_vector(std::size_t size, const T& value) {
+    return aligned_vector<T>(size, value);
+}
+
+template <typename T, std::size_t Width>
+class SimdVec;
+
+template <typename T, std::size_t Width>
+class SimdVecScalar {
+public:
+    static constexpr std::size_t width = Width;
+    using value_type = T;
+
+    T data[Width];
+
+    SimdVecScalar() = default;
+
+    explicit SimdVecScalar(T value) {
+        for (std::size_t i = 0; i < Width; ++i) {
+            data[i] = value;
+        }
+    }
+
+    explicit SimdVecScalar(const T* ptr) {
+        for (std::size_t i = 0; i < Width; ++i) {
+            data[i] = ptr[i];
+        }
+    }
+
+    void store(T* ptr) const {
+        for (std::size_t i = 0; i < Width; ++i) {
+            ptr[i] = data[i];
+        }
+    }
+
+    T operator[](std::size_t i) const { return data[i]; }
+    T& operator[](std::size_t i) { return data[i]; }
+
+    SimdVecScalar operator+(const SimdVecScalar& other) const {
+        SimdVecScalar result;
+        for (std::size_t i = 0; i < Width; ++i) {
+            result.data[i] = data[i] + other.data[i];
+        }
+        return result;
+    }
+
+    SimdVecScalar operator-(const SimdVecScalar& other) const {
+        SimdVecScalar result;
+        for (std::size_t i = 0; i < Width; ++i) {
+            result.data[i] = data[i] - other.data[i];
+        }
+        return result;
+    }
+
+    SimdVecScalar operator*(const SimdVecScalar& other) const {
+        SimdVecScalar result;
+        for (std::size_t i = 0; i < Width; ++i) {
+            result.data[i] = data[i] * other.data[i];
+        }
+        return result;
+    }
+
+    SimdVecScalar operator/(const SimdVecScalar& other) const {
+        SimdVecScalar result;
+        for (std::size_t i = 0; i < Width; ++i) {
+            result.data[i] = data[i] / other.data[i];
+        }
+        return result;
+    }
+
+    SimdVecScalar& operator+=(const SimdVecScalar& other) {
+        for (std::size_t i = 0; i < Width; ++i) {
+            data[i] += other.data[i];
+        }
+        return *this;
+    }
+
+    SimdVecScalar& operator-=(const SimdVecScalar& other) {
+        for (std::size_t i = 0; i < Width; ++i) {
+            data[i] -= other.data[i];
+        }
+        return *this;
+    }
+
+    SimdVecScalar& operator*=(const SimdVecScalar& other) {
+        for (std::size_t i = 0; i < Width; ++i) {
+            data[i] *= other.data[i];
+        }
+        return *this;
+    }
+
+    T horizontal_sum() const {
+        T sum = data[0];
+        for (std::size_t i = 1; i < Width; ++i) {
+            sum += data[i];
+        }
+        return sum;
+    }
+
+    static SimdVecScalar fmadd(const SimdVecScalar& a, const SimdVecScalar& b,
+                               const SimdVecScalar& c) {
+        SimdVecScalar result;
+        for (std::size_t i = 0; i < Width; ++i) {
+            result.data[i] = a.data[i] * b.data[i] + c.data[i];
+        }
+        return result;
+    }
+
+    SimdVecScalar sqrt() const {
+        SimdVecScalar result;
+        for (std::size_t i = 0; i < Width; ++i) {
+            result.data[i] = std::sqrt(data[i]);
+        }
+        return result;
+    }
+
+    SimdVecScalar min(const SimdVecScalar& other) const {
+        SimdVecScalar result;
+        for (std::size_t i = 0; i < Width; ++i) {
+            result.data[i] = std::min(data[i], other.data[i]);
+        }
+        return result;
+    }
+
+    SimdVecScalar max(const SimdVecScalar& other) const {
+        SimdVecScalar result;
+        for (std::size_t i = 0; i < Width; ++i) {
+            result.data[i] = std::max(data[i], other.data[i]);
+        }
+        return result;
+    }
+};
+
+#ifdef HPC_HAS_SSE2
+
+template <>
+class SimdVec<float, 4> {
+public:
+    static constexpr std::size_t width = 4;
+    using value_type = float;
+
+    __m128 data;
+
+    SimdVec() : data(_mm_setzero_ps()) {}
+    explicit SimdVec(__m128 value) : data(value) {}
+    explicit SimdVec(float value) : data(_mm_set1_ps(value)) {}
+    explicit SimdVec(const float* ptr) : data(_mm_loadu_ps(ptr)) {}
+
+    static SimdVec load_aligned(const float* ptr) { return SimdVec(_mm_load_ps(ptr)); }
+
+    void store(float* ptr) const { _mm_storeu_ps(ptr, data); }
+    void store_aligned(float* ptr) const { _mm_store_ps(ptr, data); }
+
+    float operator[](std::size_t i) const {
+        alignas(16) float tmp[4];
+        _mm_store_ps(tmp, data);
+        return tmp[i];
+    }
+
+    SimdVec operator+(const SimdVec& other) const { return SimdVec(_mm_add_ps(data, other.data)); }
+    SimdVec operator-(const SimdVec& other) const { return SimdVec(_mm_sub_ps(data, other.data)); }
+    SimdVec operator*(const SimdVec& other) const { return SimdVec(_mm_mul_ps(data, other.data)); }
+    SimdVec operator/(const SimdVec& other) const { return SimdVec(_mm_div_ps(data, other.data)); }
+
+    SimdVec& operator+=(const SimdVec& other) {
+        data = _mm_add_ps(data, other.data);
+        return *this;
+    }
+
+    SimdVec& operator-=(const SimdVec& other) {
+        data = _mm_sub_ps(data, other.data);
+        return *this;
+    }
+
+    SimdVec& operator*=(const SimdVec& other) {
+        data = _mm_mul_ps(data, other.data);
+        return *this;
+    }
+
+    float horizontal_sum() const {
+        __m128 shuf = _mm_shuffle_ps(data, data, _MM_SHUFFLE(2, 3, 0, 1));
+        __m128 sums = _mm_add_ps(data, shuf);
+        shuf = _mm_movehl_ps(shuf, sums);
+        sums = _mm_add_ss(sums, shuf);
+        return _mm_cvtss_f32(sums);
+    }
+
+    static SimdVec fmadd(const SimdVec& a, const SimdVec& b, const SimdVec& c) {
+#ifdef HPC_HAS_AVX2
+        return SimdVec(_mm_fmadd_ps(a.data, b.data, c.data));
+#else
+        return SimdVec(_mm_add_ps(_mm_mul_ps(a.data, b.data), c.data));
+#endif
+    }
+
+    SimdVec sqrt() const { return SimdVec(_mm_sqrt_ps(data)); }
+    SimdVec min(const SimdVec& other) const { return SimdVec(_mm_min_ps(data, other.data)); }
+    SimdVec max(const SimdVec& other) const { return SimdVec(_mm_max_ps(data, other.data)); }
+};
+
+#endif
+
+#ifdef HPC_HAS_AVX2
+
+template <>
+class SimdVec<float, 8> {
+public:
+    static constexpr std::size_t width = 8;
+    using value_type = float;
+
+    __m256 data;
+
+    SimdVec() : data(_mm256_setzero_ps()) {}
+    explicit SimdVec(__m256 value) : data(value) {}
+    explicit SimdVec(float value) : data(_mm256_set1_ps(value)) {}
+    explicit SimdVec(const float* ptr) : data(_mm256_loadu_ps(ptr)) {}
+
+    static SimdVec load_aligned(const float* ptr) { return SimdVec(_mm256_load_ps(ptr)); }
+
+    void store(float* ptr) const { _mm256_storeu_ps(ptr, data); }
+    void store_aligned(float* ptr) const { _mm256_store_ps(ptr, data); }
+
+    float operator[](std::size_t i) const {
+        alignas(32) float tmp[8];
+        _mm256_store_ps(tmp, data);
+        return tmp[i];
+    }
+
+    SimdVec operator+(const SimdVec& other) const {
+        return SimdVec(_mm256_add_ps(data, other.data));
+    }
+
+    SimdVec operator-(const SimdVec& other) const {
+        return SimdVec(_mm256_sub_ps(data, other.data));
+    }
+
+    SimdVec operator*(const SimdVec& other) const {
+        return SimdVec(_mm256_mul_ps(data, other.data));
+    }
+
+    SimdVec operator/(const SimdVec& other) const {
+        return SimdVec(_mm256_div_ps(data, other.data));
+    }
+
+    SimdVec& operator+=(const SimdVec& other) {
+        data = _mm256_add_ps(data, other.data);
+        return *this;
+    }
+
+    SimdVec& operator-=(const SimdVec& other) {
+        data = _mm256_sub_ps(data, other.data);
+        return *this;
+    }
+
+    SimdVec& operator*=(const SimdVec& other) {
+        data = _mm256_mul_ps(data, other.data);
+        return *this;
+    }
+
+    float horizontal_sum() const {
+        __m128 hi = _mm256_extractf128_ps(data, 1);
+        __m128 lo = _mm256_castps256_ps128(data);
+        __m128 sum128 = _mm_add_ps(hi, lo);
+        __m128 shuf = _mm_shuffle_ps(sum128, sum128, _MM_SHUFFLE(2, 3, 0, 1));
+        sum128 = _mm_add_ps(sum128, shuf);
+        shuf = _mm_movehl_ps(shuf, sum128);
+        sum128 = _mm_add_ss(sum128, shuf);
+        return _mm_cvtss_f32(sum128);
+    }
+
+    static SimdVec fmadd(const SimdVec& a, const SimdVec& b, const SimdVec& c) {
+        return SimdVec(_mm256_fmadd_ps(a.data, b.data, c.data));
+    }
+
+    SimdVec sqrt() const { return SimdVec(_mm256_sqrt_ps(data)); }
+    SimdVec min(const SimdVec& other) const { return SimdVec(_mm256_min_ps(data, other.data)); }
+    SimdVec max(const SimdVec& other) const { return SimdVec(_mm256_max_ps(data, other.data)); }
+};
+
+#endif
+
+#ifdef HPC_HAS_AVX512
+
+template <>
+class SimdVec<float, 16> {
+public:
+    static constexpr std::size_t width = 16;
+    using value_type = float;
+
+    __m512 data;
+
+    SimdVec() : data(_mm512_setzero_ps()) {}
+    explicit SimdVec(__m512 value) : data(value) {}
+    explicit SimdVec(float value) : data(_mm512_set1_ps(value)) {}
+    explicit SimdVec(const float* ptr) : data(_mm512_loadu_ps(ptr)) {}
+
+    static SimdVec load_aligned(const float* ptr) { return SimdVec(_mm512_load_ps(ptr)); }
+
+    void store(float* ptr) const { _mm512_storeu_ps(ptr, data); }
+    void store_aligned(float* ptr) const { _mm512_store_ps(ptr, data); }
+
+    float operator[](std::size_t i) const {
+        alignas(64) float tmp[16];
+        _mm512_store_ps(tmp, data);
+        return tmp[i];
+    }
+
+    SimdVec operator+(const SimdVec& other) const {
+        return SimdVec(_mm512_add_ps(data, other.data));
+    }
+
+    SimdVec operator-(const SimdVec& other) const {
+        return SimdVec(_mm512_sub_ps(data, other.data));
+    }
+
+    SimdVec operator*(const SimdVec& other) const {
+        return SimdVec(_mm512_mul_ps(data, other.data));
+    }
+
+    SimdVec operator/(const SimdVec& other) const {
+        return SimdVec(_mm512_div_ps(data, other.data));
+    }
+
+    SimdVec& operator+=(const SimdVec& other) {
+        data = _mm512_add_ps(data, other.data);
+        return *this;
+    }
+
+    SimdVec& operator-=(const SimdVec& other) {
+        data = _mm512_sub_ps(data, other.data);
+        return *this;
+    }
+
+    SimdVec& operator*=(const SimdVec& other) {
+        data = _mm512_mul_ps(data, other.data);
+        return *this;
+    }
+
+    float horizontal_sum() const { return _mm512_reduce_add_ps(data); }
+
+    static SimdVec fmadd(const SimdVec& a, const SimdVec& b, const SimdVec& c) {
+        return SimdVec(_mm512_fmadd_ps(a.data, b.data, c.data));
+    }
+
+    SimdVec sqrt() const { return SimdVec(_mm512_sqrt_ps(data)); }
+    SimdVec min(const SimdVec& other) const { return SimdVec(_mm512_min_ps(data, other.data)); }
+    SimdVec max(const SimdVec& other) const { return SimdVec(_mm512_max_ps(data, other.data)); }
+};
+
+#endif
+
+#ifdef HPC_HAS_AVX512
+using FloatVec = SimdVec<float, 16>;
+constexpr std::size_t FLOAT_VEC_WIDTH = 16;
+#elif defined(HPC_HAS_AVX2)
+using FloatVec = SimdVec<float, 8>;
+constexpr std::size_t FLOAT_VEC_WIDTH = 8;
+#elif defined(HPC_HAS_SSE2)
+using FloatVec = SimdVec<float, 4>;
+constexpr std::size_t FLOAT_VEC_WIDTH = 4;
+#else
+using FloatVec = SimdVecScalar<float, 4>;
+constexpr std::size_t FLOAT_VEC_WIDTH = 4;
+#endif
+
+namespace detail {
+
+using AddArraysFn = void (*)(const float* a, const float* b, float* c, std::size_t n);
+
+inline void add_arrays_scalar(const float* a, const float* b, float* c, std::size_t n) {
+    for (std::size_t i = 0; i < n; ++i) {
+        c[i] = a[i] + b[i];
+    }
+}
+
+#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
+
+__attribute__((target("sse2"))) inline void add_arrays_sse2(const float* a, const float* b,
+                                                            float* c, std::size_t n) {
+    std::size_t i = 0;
+    for (; i + 4 <= n; i += 4) {
+        const __m128 va = _mm_loadu_ps(a + i);
+        const __m128 vb = _mm_loadu_ps(b + i);
+        _mm_storeu_ps(c + i, _mm_add_ps(va, vb));
+    }
+    add_arrays_scalar(a + i, b + i, c + i, n - i);
+}
+
+__attribute__((target("avx2,avx"))) inline void add_arrays_avx2(const float* a, const float* b,
+                                                                float* c, std::size_t n) {
+    std::size_t i = 0;
+    for (; i + 8 <= n; i += 8) {
+        const __m256 va = _mm256_loadu_ps(a + i);
+        const __m256 vb = _mm256_loadu_ps(b + i);
+        _mm256_storeu_ps(c + i, _mm256_add_ps(va, vb));
+    }
+    add_arrays_sse2(a + i, b + i, c + i, n - i);
+}
+
+__attribute__((target("avx512f,avx2,avx"))) inline void add_arrays_avx512(const float* a,
+                                                                           const float* b,
+                                                                           float* c,
+                                                                           std::size_t n) {
+    std::size_t i = 0;
+    for (; i + 16 <= n; i += 16) {
+        const __m512 va = _mm512_loadu_ps(a + i);
+        const __m512 vb = _mm512_loadu_ps(b + i);
+        _mm512_storeu_ps(c + i, _mm512_add_ps(va, vb));
+    }
+    add_arrays_avx2(a + i, b + i, c + i, n - i);
+}
+
+template <typename Func>
+inline Func resolve_best(Func scalar, Func sse2, Func avx2, Func avx512) {
+    __builtin_cpu_init();
+    if (avx512 && __builtin_cpu_supports("avx512f")) {
+        return avx512;
+    }
+    if (avx2 && __builtin_cpu_supports("avx2")) {
+        return avx2;
+    }
+    if (sse2 && __builtin_cpu_supports("sse2")) {
+        return sse2;
+    }
+    return scalar;
+}
+
+#endif
+
+}  // namespace detail
+
+inline void add_arrays(const float* a, const float* b, float* c, std::size_t n) {
+    using Fn = detail::AddArraysFn;
+#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
+    static const Fn dispatch =
+        detail::resolve_best<Fn>(&detail::add_arrays_scalar, &detail::add_arrays_sse2,
+                                 &detail::add_arrays_avx2, &detail::add_arrays_avx512);
+    dispatch(a, b, c, n);
+#else
+    detail::add_arrays_scalar(a, b, c, n);
+#endif
+}
+
+inline void dispatch_add_arrays(const float* a, const float* b, float* c, std::size_t n) {
+    add_arrays(a, b, c, n);
+}
+
+inline void multiply_arrays(const float* a, const float* b, float* c, std::size_t n) {
+    for (std::size_t i = 0; i < n; ++i) {
+        c[i] = a[i] * b[i];
+    }
+}
+
+inline float dot_product(const float* a, const float* b, std::size_t n) {
+    float sum = 0.0f;
+    for (std::size_t i = 0; i < n; ++i) {
+        sum += a[i] * b[i];
+    }
+    return sum;
+}
+
+inline void scale_array(float* arr, float scalar, std::size_t n) {
+    for (std::size_t i = 0; i < n; ++i) {
+        arr[i] *= scalar;
+    }
+}
+
+inline void clamp_array(float* arr, float min_val, float max_val, std::size_t n) {
+    for (std::size_t i = 0; i < n; ++i) {
+        arr[i] = std::max(min_val, std::min(max_val, arr[i]));
+    }
+}
+
+inline void add_arrays_wrapped(const float* a, const float* b, float* c, std::size_t n) {
+    std::size_t i = 0;
+    for (; i + FLOAT_VEC_WIDTH <= n; i += FLOAT_VEC_WIDTH) {
+        FloatVec va(a + i);
+        FloatVec vb(b + i);
+        FloatVec vc = va + vb;
+        vc.store(c + i);
+    }
+    for (; i < n; ++i) {
+        c[i] = a[i] + b[i];
+    }
+}
+
+inline float dot_product_wrapped(const float* a, const float* b, std::size_t n) {
+    FloatVec sum(0.0f);
+    std::size_t i = 0;
+
+    for (; i + FLOAT_VEC_WIDTH <= n; i += FLOAT_VEC_WIDTH) {
+        FloatVec va(a + i);
+        FloatVec vb(b + i);
+        sum = FloatVec::fmadd(va, vb, sum);
+    }
+
+    float result = sum.horizontal_sum();
+    for (; i < n; ++i) {
+        result += a[i] * b[i];
+    }
+    return result;
+}
+
+inline void scale_array_wrapped(float* arr, float scalar, std::size_t n) {
+    FloatVec vscalar(scalar);
+    std::size_t i = 0;
+
+    for (; i + FLOAT_VEC_WIDTH <= n; i += FLOAT_VEC_WIDTH) {
+        FloatVec value(arr + i);
+        value *= vscalar;
+        value.store(arr + i);
+    }
+
+    for (; i < n; ++i) {
+        arr[i] *= scalar;
+    }
+}
+
+inline void clamp_array_wrapped(float* arr, float min_val, float max_val, std::size_t n) {
+    FloatVec vmin(min_val);
+    FloatVec vmax(max_val);
+    std::size_t i = 0;
+
+    for (; i + FLOAT_VEC_WIDTH <= n; i += FLOAT_VEC_WIDTH) {
+        FloatVec value(arr + i);
+        value = value.max(vmin).min(vmax);
+        value.store(arr + i);
+    }
+
+    for (; i < n; ++i) {
+        arr[i] = std::max(min_val, std::min(max_val, arr[i]));
+    }
+}
+
+}  // namespace hpc::simd
diff --git a/openspec/changes/deepen-runtime-simd-module/design.md b/openspec/changes/deepen-runtime-simd-module/design.md
new file mode 100644
index 0000000..a0ec0ba
--- /dev/null
+++ b/openspec/changes/deepen-runtime-simd-module/design.md
@@ -0,0 +1,98 @@
+# Design: Deepen Runtime SIMD Module
+
+## Overview
+
+This change creates a deeper SIMD module by moving the reusable caller-facing interface into `include/hpc/simd.hpp`. The module owns:
+
+- runtime SIMD capability detection
+- runtime alignment selection
+- aligned SIMD buffer aliases
+- vector wrapper types (`SimdVec`, `FloatVec`)
+- reusable vector kernels (`add`, `multiply`, `dot`, `scale`, `clamp`)
+
+The examples remain teaching entry points, but they stop being the seam that tests and benchmarks must cross to reach reusable behavior.
+
+## Architectural problem
+
+Today the SIMD area makes maintainers bounce across multiple shallow modules:
+
+- `examples/04-simd-vectorization/include/simd_utils.hpp`
+- `examples/04-simd-vectorization/include/simd_wrapper.hpp`
+- `examples/04-simd-vectorization/src/intrinsics_intro.cpp`
+- `examples/04-simd-vectorization/bench/simd_bench.cpp`
+
+The interface is scattered almost as widely as the implementation. That yields low depth, low leverage, and poor locality. The deletion test fails: deleting any one file would mostly move the same logic to another caller rather than remove complexity.
+
+## New module shape
+
+### Public module
+
+`include/hpc/simd.hpp`
+
+Responsibilities:
+
+1. expose a single SIMD interface for callers and tests
+2. detect runtime SIMD capability on supported compilers/architectures
+3. derive the alignment policy from that capability
+4. provide the wrapper types and high-level kernels
+5. hide the target-attribute dispatch details behind the module interface
+
+### Adapters and seams
+
+- The module seam is `hpc::simd`
+- Runtime dispatch is internal to the module implementation
+- Example executables and benchmarks become adapters that demonstrate or measure the module
+- Tests validate behavior through the same public seam callers use
+
+## Runtime capability design
+
+### Detection
+
+`detect_simd_level()` will use `__builtin_cpu_init()` / `__builtin_cpu_supports()` on GCC/Clang x86 builds to report the highest runtime-supported level among:
+
+- `AVX512`
+- `AVX2`
+- `AVX`
+- `SSE2`
+- `Scalar`
+
+On unsupported compilers or architectures, the function falls back to a conservative compile-time answer.
+
+### Alignment
+
+`get_simd_alignment()` will derive the alignment from `detect_simd_level()` rather than from compile-time macros. That keeps the interface consistent with the actual runtime-dispatched implementation path.
+
+## Kernel design
+
+The public module will own these reusable kernels:
+
+- `add_arrays`
+- `multiply_arrays`
+- `dot_product`
+- `scale_array`
+- `clamp_array`
+
+Compatibility alias:
+
+- `dispatch_add_arrays` forwards to `add_arrays`
+
+This retains the existing teaching name where it is already useful while collapsing the actual implementation behind one deeper module.
+
+## Wrapper design
+
+`SimdVec`, `SimdVecScalar`, `FloatVec`, and `FLOAT_VEC_WIDTH` move into `include/hpc/simd.hpp`. The wrapper remains part of the teaching surface because it provides leverage for examples and benchmarks, but it is no longer isolated in a separate shallow header.
+
+## Migration plan
+
+1. Add tests for the new public seam first.
+2. Add `include/hpc/simd.hpp`.
+3. Refactor SIMD tests to include the public module.
+4. Refactor examples and benchmarks to include the public module and delete duplicated scalar/SIMD kernels where the shared module now owns them.
+5. Keep `examples/04-simd-vectorization/include/simd_utils.hpp` as a thin forwarding header only if needed during transition. Delete or minimize `simd_wrapper.hpp`.
+6. Fix `include/hpc/core.hpp` Windows include bug because the new module depends on platform utilities remaining portable.
+
+## Validation strategy
+
+- Focused red/green runs for the SIMD unit tests while iterating
+- Full debug preset validation before completion
+- Sanitizer preset validation on the touched surface after the refactor is green
diff --git a/openspec/changes/deepen-runtime-simd-module/proposal.md b/openspec/changes/deepen-runtime-simd-module/proposal.md
new file mode 100644
index 0000000..d83c8bd
--- /dev/null
+++ b/openspec/changes/deepen-runtime-simd-module/proposal.md
@@ -0,0 +1,39 @@
+# Proposal: Deepen Runtime SIMD Module
+
+## Summary
+
+Create one canonical SIMD module in `include/hpc/simd.hpp` that owns runtime capability detection, alignment policy, wrapper types, and reusable vector kernels. Refactor the example, benchmark, and test surfaces to use that seam, and fix the coupled platform bug in `include/hpc/core.hpp`.
+
+## Why
+
+The current SIMD code is architecturally shallow:
+
+1. Runtime feature detection is split between `simd_utils.hpp` and ad-hoc dispatch helpers, so the interface a caller learns is not the interface the implementation actually uses.
+2. Reusable kernels (`add`, `dot`, `scale`, `clamp`, `multiply`) are duplicated across `simd_utils.hpp`, `simd_wrapper.hpp`, `intrinsics_intro.cpp`, and `simd_bench.cpp`, which destroys locality.
+3. The public `hpc` surface has no canonical SIMD module even though SIMD is a first-class capability in the repository.
+4. `include/hpc/core.hpp` has a Windows compile-path bug (`SYSTEM_INFO` / `GetSystemInfo` without the required include), which is tightly coupled to the refactor because the new public module depends on platform utilities being trustworthy.
+
+## In scope
+
+- Add `include/hpc/simd.hpp` as the public SIMD module
+- Concentrate runtime SIMD level detection and alignment selection there
+- Centralize reusable SIMD wrapper types and high-level kernels there
+- Refactor example, benchmark, and test code to use the new seam
+- Keep or reduce total code duplication in the SIMD area
+- Fix the coupled `core.hpp` Windows include bug
+
+## Out of scope
+
+- Repository-wide allocator unification across memory/concurrency/SIMD
+- New docs-site information architecture
+- New benchmark dashboards or CI jobs
+- Reworking the memory or concurrency teaching modules in this change
+
+## Success criteria
+
+- `include/hpc/simd.hpp` becomes the single caller-facing SIMD module
+- Runtime SIMD level reporting and alignment selection are consistent on supported x86 GCC/Clang builds
+- SIMD tests validate the new public seam instead of example-internal headers
+- Existing SIMD examples and benchmarks still build and run through the refactored module
+- `cmake --preset=debug && cmake --build build/debug && ctest --preset=debug` passes
+- Sanitizer validation for the touched surface passes without new failures
diff --git a/openspec/changes/deepen-runtime-simd-module/specs/simd-vectorization/spec.md b/openspec/changes/deepen-runtime-simd-module/specs/simd-vectorization/spec.md
new file mode 100644
index 0000000..d551c97
--- /dev/null
+++ b/openspec/changes/deepen-runtime-simd-module/specs/simd-vectorization/spec.md
@@ -0,0 +1,44 @@
+# SIMD Vectorization
+
+## ADDED Requirements
+
+### Requirement: Canonical SIMD Module
+
+THE Example_Module SHALL expose a single canonical SIMD module for reusable runtime detection, alignment selection, wrapper types, and vector kernels.
+
+#### Scenario: Callers use one SIMD seam
+
+- **WHEN** tests, examples, or benchmarks need reusable SIMD behavior
+- **THEN** they include `include/hpc/simd.hpp` instead of reconstructing the same logic across multiple example-local modules
+
+---
+
+### Requirement: Runtime capability reporting matches kernel selection
+
+THE Example_Module SHALL report runtime SIMD capability in a way that is consistent with the runtime-dispatched kernel path on supported x86 GCC/Clang builds.
+
+#### Scenario: Runtime level reflects runtime CPU support
+
+- **WHEN** `detect_simd_level()` runs on a supported x86 GCC/Clang build
+- **THEN** it reports the highest supported runtime level among AVX-512, AVX2, AVX, SSE2, or Scalar
+
+#### Scenario: Runtime alignment follows runtime level
+
+- **WHEN** `get_simd_alignment()` is called
+- **THEN** it returns the alignment implied by the reported runtime SIMD level
+
+---
+
+### Requirement: Shared SIMD kernels
+
+THE Example_Module SHALL provide reusable vector kernels through the canonical SIMD module rather than duplicating them across demos and benchmarks.
+
+#### Scenario: Shared add and dot kernels stay correct
+
+- **WHEN** callers execute shared SIMD kernels such as array add or dot product
+- **THEN** the results match the scalar reference within floating-point tolerance
+
+#### Scenario: Shared scalar-transform kernels stay correct
+
+- **WHEN** callers execute shared SIMD kernels such as multiply, scale, or clamp
+- **THEN** the results match the scalar reference for aligned and tail-length inputs
diff --git a/openspec/changes/deepen-runtime-simd-module/tasks.md b/openspec/changes/deepen-runtime-simd-module/tasks.md
new file mode 100644
index 0000000..fb5aeb7
--- /dev/null
+++ b/openspec/changes/deepen-runtime-simd-module/tasks.md
@@ -0,0 +1,25 @@
+# Tasks: Deepen Runtime SIMD Module
+
+## 1. Spec and red tests
+
+- [x] 1.1 Add the OpenSpec proposal, design, tasks, and SIMD spec delta for the runtime SIMD module deepening change
+- [x] 1.2 Add failing tests for the new public SIMD seam in `tests/unit/simd/`
+- [x] 1.3 Run the focused SIMD test target and confirm the new tests fail for the expected reason
+
+## 2. Public module implementation
+
+- [x] 2.1 Add `include/hpc/simd.hpp` with runtime detection, alignment helpers, wrapper types, and reusable kernels
+- [x] 2.2 Refactor the SIMD tests to include the new public header
+- [x] 2.3 Refactor the SIMD examples and benchmark to use the new public seam and delete duplicated kernel logic
+- [x] 2.4 Minimize or remove obsolete example-local SIMD forwarding headers
+
+## 3. Coupled platform hardening
+
+- [x] 3.1 Fix the Windows `page_size()` compile path in `include/hpc/core.hpp`
+- [x] 3.2 Re-run the focused SIMD and core tests
+
+## 4. Full validation
+
+- [x] 4.1 Run `cmake --preset=debug && cmake --build build/debug && ctest --preset=debug`
+- [x] 4.2 Run sanitizer validation for the touched surface
+- [x] 4.3 Summarize the architectural deepening and any deferred opportunities
diff --git a/tests/property/simd_properties.cpp b/tests/property/simd_properties.cpp
index 07737f6..bf5d263 100644
--- a/tests/property/simd_properties.cpp
+++ b/tests/property/simd_properties.cpp
@@ -20,8 +20,7 @@
 #include <random>
 #include <vector>
 
-// Include SIMD wrapper
-#include "simd_wrapper.hpp"
+#include <hpc/simd.hpp>
 
 namespace {
 
diff --git a/tests/unit/simd/CMakeLists.txt b/tests/unit/simd/CMakeLists.txt
index 6615278..d2d0b2c 100644
--- a/tests/unit/simd/CMakeLists.txt
+++ b/tests/unit/simd/CMakeLists.txt
@@ -27,6 +27,20 @@ endif()
 
 gtest_add_tests(TARGET simd_utils_test)
 
+add_executable(hpc_simd_test
+    hpc_simd_test.cpp
+)
+
+target_link_libraries(hpc_simd_test PRIVATE
+    GTest::gtest
+    GTest::gtest_main
+)
+
+hpc_set_compiler_options(hpc_simd_test)
+hpc_enable_sanitizers(hpc_simd_test)
+
+gtest_add_tests(TARGET hpc_simd_test)
+
 # simd_dispatch_test - now uses header-only implementation
 add_executable(simd_dispatch_test
     simd_dispatch_test.cpp
diff --git a/tests/unit/simd/hpc_simd_test.cpp b/tests/unit/simd/hpc_simd_test.cpp
new file mode 100644
index 0000000..7c95b07
--- /dev/null
+++ b/tests/unit/simd/hpc_simd_test.cpp
@@ -0,0 +1,120 @@
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cmath>
+
+#include <hpc/simd.hpp>
+
+namespace hpc::simd::test {
+
+namespace {
+
+SIMDLevel expected_runtime_level() {
+#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
+    __builtin_cpu_init();
+    if (__builtin_cpu_supports("avx512f")) {
+        return SIMDLevel::AVX512;
+    }
+    if (__builtin_cpu_supports("avx2")) {
+        return SIMDLevel::AVX2;
+    }
+    if (__builtin_cpu_supports("avx")) {
+        return SIMDLevel::AVX;
+    }
+    if (__builtin_cpu_supports("sse2")) {
+        return SIMDLevel::SSE2;
+    }
+#endif
+    return SIMDLevel::Scalar;
+}
+
+size_t expected_alignment(SIMDLevel level) {
+    switch (level) {
+        case SIMDLevel::AVX512:
+            return 64;
+        case SIMDLevel::AVX2:
+        case SIMDLevel::AVX:
+            return 32;
+        case SIMDLevel::SSE2:
+            return 16;
+        case SIMDLevel::Scalar:
+        default:
+            return sizeof(void*);
+    }
+}
+
+}  // namespace
+
+TEST(HpcSimdTest, RuntimeLevelMatchesSupportedCapability) {
+    EXPECT_EQ(detect_simd_level(), expected_runtime_level());
+}
+
+TEST(HpcSimdTest, RuntimeAlignmentMatchesReportedLevel) {
+    const SIMDLevel level = detect_simd_level();
+    EXPECT_EQ(get_simd_alignment(), expected_alignment(level));
+}
+
+TEST(HpcSimdTest, AddArraysHandlesTailLength) {
+    constexpr size_t kSize = 21;
+    aligned_vector<float> a(kSize), b(kSize), actual(kSize), expected(kSize);
+
+    for (size_t i = 0; i < kSize; ++i) {
+        a[i] = static_cast<float>(i) * 1.25f;
+        b[i] = static_cast<float>(kSize - i) * 0.75f;
+        expected[i] = a[i] + b[i];
+    }
+
+    add_arrays(a.data(), b.data(), actual.data(), kSize);
+
+    for (size_t i = 0; i < kSize; ++i) {
+        EXPECT_FLOAT_EQ(actual[i], expected[i]);
+    }
+}
+
+TEST(HpcSimdTest, MultiplyDotScaleAndClampShareTheCanonicalModule) {
+    constexpr size_t kSize = 37;
+    aligned_vector<float> a(kSize), b(kSize), multiplied(kSize);
+
+    for (size_t i = 0; i < kSize; ++i) {
+        a[i] = static_cast<float>(i) - 10.0f;
+        b[i] = static_cast<float>((i % 5) + 1);
+    }
+
+    multiply_arrays(a.data(), b.data(), multiplied.data(), kSize);
+    for (size_t i = 0; i < kSize; ++i) {
+        EXPECT_FLOAT_EQ(multiplied[i], a[i] * b[i]);
+    }
+
+    const float dot = dot_product(a.data(), b.data(), kSize);
+    float expected_dot = 0.0f;
+    for (size_t i = 0; i < kSize; ++i) {
+        expected_dot += a[i] * b[i];
+    }
+    EXPECT_NEAR(dot, expected_dot, 1e-4f);
+
+    scale_array(multiplied.data(), 0.5f, kSize);
+    for (size_t i = 0; i < kSize; ++i) {
+        EXPECT_FLOAT_EQ(multiplied[i], a[i] * b[i] * 0.5f);
+    }
+
+    clamp_array(multiplied.data(), -6.0f, 6.0f, kSize);
+    for (size_t i = 0; i < kSize; ++i) {
+        EXPECT_GE(multiplied[i], -6.0f);
+        EXPECT_LE(multiplied[i], 6.0f);
+    }
+}
+
+TEST(HpcSimdTest, DispatchAliasMatchesCanonicalAddKernel) {
+    constexpr size_t kSize = 13;
+    aligned_vector<float> a(kSize, 2.0f);
+    aligned_vector<float> b(kSize, -0.5f);
+    aligned_vector<float> actual(kSize, 0.0f);
+
+    dispatch_add_arrays(a.data(), b.data(), actual.data(), kSize);
+
+    for (float value : actual) {
+        EXPECT_FLOAT_EQ(value, 1.5f);
+    }
+}
+
+}  // namespace hpc::simd::test
diff --git a/tests/unit/simd/simd_dispatch_test.cpp b/tests/unit/simd/simd_dispatch_test.cpp
index 6294e4a..96673f4 100644
--- a/tests/unit/simd/simd_dispatch_test.cpp
+++ b/tests/unit/simd/simd_dispatch_test.cpp
@@ -1,6 +1,6 @@
 #include <gtest/gtest.h>
 
-#include "simd_utils.hpp"
+#include <hpc/simd.hpp>
 
 namespace hpc::simd {
 
diff --git a/tests/unit/simd/simd_utils_test.cpp b/tests/unit/simd/simd_utils_test.cpp
index 75a87c5..8402656 100644
--- a/tests/unit/simd/simd_utils_test.cpp
+++ b/tests/unit/simd/simd_utils_test.cpp
@@ -3,15 +3,13 @@
  * @brief Unit tests for simd_utils.hpp and simd_wrapper.hpp
  */
 
-#include "simd_utils.hpp"
+#include <hpc/simd.hpp>
 
 #include <gtest/gtest.h>
 
 #include <cmath>
 #include <numeric>
 
-#include "simd_wrapper.hpp"
-
 namespace hpc::simd::test {
 
 // ---------------------------------------------------------------------------