diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 87ef251..de0ca8e 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -17,6 +17,10 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Install BLAS (Ubuntu) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y libopenblas-dev + - name: Configure CMake run: cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_VERBOSE_MAKEFILE=ON diff --git a/.github/workflows/sanitizer.yml b/.github/workflows/sanitizer.yml index 7fb13d3..86f3bbc 100644 --- a/.github/workflows/sanitizer.yml +++ b/.github/workflows/sanitizer.yml @@ -16,6 +16,10 @@ jobs: steps: - uses: actions/checkout@v5 + - name: Install BLAS (Ubuntu) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y libopenblas-dev + # Configure CMake with ASan + UBSan - name: Configure with sanitizers run: cmake -B build -S . \ diff --git a/.github/workflows/valgrind.yml b/.github/workflows/valgrind.yml index b381c8f..9eec318 100644 --- a/.github/workflows/valgrind.yml +++ b/.github/workflows/valgrind.yml @@ -11,7 +11,7 @@ jobs: - name: Install dependencies run: | sudo apt-get update - sudo apt-get install -y valgrind + sudo apt-get install -y valgrind libopenblas-dev - name: Configure Debug build run: cmake -B build -S . 
-DCMAKE_BUILD_TYPE=Debug diff --git a/CMakeLists.txt b/CMakeLists.txt index 3611755..91e3fc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,10 @@ include_directories(${PROJECT_SOURCE_DIR}/include) # Source files - automatically gather all .c files from src/ file(GLOB_RECURSE SOURCES "src/*.c") +# Exclude BLAS-dependent files on Windows (no BLAS available) +if(MSVC) + list(FILTER SOURCES EXCLUDE REGEX "dense_left_matmul\\.c$|dense_right_matmul\\.c$") +endif() # Create core library add_library(dnlp_diff ${SOURCES}) @@ -53,6 +57,14 @@ if(NOT MSVC) target_link_libraries(dnlp_diff m) endif() +# Link BLAS (needed for dense_left_matmul / dense_right_matmul) +if(APPLE) + target_link_libraries(dnlp_diff "-framework Accelerate") +elseif(NOT MSVC) + find_package(BLAS REQUIRED) + target_link_libraries(dnlp_diff ${BLAS_LIBRARIES}) +endif() + # Config-specific compile options (compiler-specific) if(MSVC) target_compile_options(dnlp_diff PRIVATE diff --git a/include/bivariate.h b/include/bivariate.h index aa005ed..358b185 100644 --- a/include/bivariate.h +++ b/include/bivariate.h @@ -36,6 +36,14 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A); /* Right matrix multiplication: f(x) @ A where A is a constant matrix */ expr *new_right_matmul(expr *u, const CSR_Matrix *A); +/* Dense left matrix multiplication: A @ f(x) where A is a dense row-major matrix. + Uses BLAS for all operations — much faster than sparse path for dense A. */ +expr *new_dense_left_matmul(expr *child, const double *A_dense, int m, int n); + +/* Dense right matrix multiplication: f(x) @ A where A is a dense row-major matrix. + Implemented as (A^T @ f(x)^T)^T using dense_left_matmul. 
*/ +expr *new_dense_right_matmul(expr *u, const double *A_dense, int m, int n); + /* Constant scalar multiplication: a * f(x) where a is a constant double */ expr *new_const_scalar_mult(double a, expr *child); diff --git a/include/subexpr.h b/include/subexpr.h index b3f4170..19bcd3e 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -115,6 +115,25 @@ typedef struct left_matmul_expr int *csc_to_csr_workspace; } left_matmul_expr; +/* Dense left matrix multiplication: y = A * f(x) where A is a dense matrix. + * Performance-optimized variant that uses BLAS (cblas_dgemv/dgemm) instead of + * sparse operations. Avoids the per-row overlap checks in sparsity detection and + * sparse dot products in value computation. */ +typedef struct dense_left_matmul_expr +{ + expr base; + double *A_dense; /* row-major m×n */ + double *AT_dense; /* row-major n×m (for wsum_hess) */ + int m, n, n_blocks; + CSC_Matrix *Jchild_CSC; + CSC_Matrix *J_CSC; + int *csc_to_csr_workspace; + double *J_block_work; /* n × BATCH col-major workspace for dgemm */ + double *C_block_work; /* m × BATCH col-major workspace for dgemm */ + int *col_work; /* 2 × n_vars ints for column indices/offsets */ + bool affine_cached; /* true if Jacobian was pre-computed in init */ +} dense_left_matmul_expr; + /* Right matrix multiplication: y = f(x) * A where f(x) is an expression. * f(x) has shape p x n, A has shape n x q, output y has shape p x q. * Uses vec(y) = B * vec(f(x)) where B = A^T kron I_p. */ diff --git a/include/utils/blas_compat.h b/include/utils/blas_compat.h new file mode 100644 index 0000000..3c4a562 --- /dev/null +++ b/include/utils/blas_compat.h @@ -0,0 +1,28 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the DNLP-differentiation-engine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef BLAS_COMPAT_H +#define BLAS_COMPAT_H + +#ifdef __APPLE__ +#define ACCELERATE_NEW_LAPACK +#include <Accelerate/Accelerate.h> +#else +#include <cblas.h> +#endif + +#endif /* BLAS_COMPAT_H */ diff --git a/src/bivariate/dense_left_matmul.c b/src/bivariate/dense_left_matmul.c new file mode 100644 index 0000000..bfa2054 --- /dev/null +++ b/src/bivariate/dense_left_matmul.c @@ -0,0 +1,403 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the DNLP-differentiation-engine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "bivariate.h" +#include "subexpr.h" +#include "utils/blas_compat.h" +#include "utils/linalg_sparse_matmuls.h" +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define DGEMM_BATCH_SIZE 256 + +/* This file implements 'dense_left_matmul' for the operation y = A @ f(x), + where A is a DENSE matrix (stored row-major) and f(x) is an arbitrary + expression. The dimensions are A - m x n, f(x) - n x p, y - m x p.
+ + This is a performance-optimized variant of left_matmul that avoids + sparse overhead when A is dense: + - Forward pass uses cblas_dgemv instead of sparse matvec + - Jacobian sparsity: if ANY entry in a block column of J_child is nonzero, + ALL m rows get entries (since A is dense), avoiding per-row overlap checks + - Jacobian values use cblas_dgemv per block column instead of sparse dot products + - Weighted sum Hessian uses cblas_dgemv with AT for A^T @ w +*/ + +static void forward(expr *node, const double *u) +{ + dense_left_matmul_expr *dn = (dense_left_matmul_expr *) node; + + /* child's forward pass */ + node->left->forward(node->left, u); + + /* y = (I_p ⊗ A) @ vec(f(x)), computed as p independent A @ x_block */ + for (int b = 0; b < dn->n_blocks; b++) + { + cblas_dgemv(CblasRowMajor, CblasNoTrans, dn->m, dn->n, 1.0, dn->A_dense, + dn->n, node->left->value + b * dn->n, 1, 0.0, + node->value + b * dn->m, 1); + } +} + +static bool is_affine(const expr *node) +{ + return node->left->is_affine(node->left); +} + +static void free_type_data(expr *node) +{ + dense_left_matmul_expr *dn = (dense_left_matmul_expr *) node; + free(dn->A_dense); + free(dn->AT_dense); + free_csc_matrix(dn->Jchild_CSC); + free_csc_matrix(dn->J_CSC); + free(dn->csc_to_csr_workspace); + free(dn->J_block_work); + free(dn->C_block_work); + free(dn->col_work); + dn->A_dense = NULL; + dn->AT_dense = NULL; + dn->Jchild_CSC = NULL; + dn->J_CSC = NULL; + dn->csc_to_csr_workspace = NULL; + dn->J_block_work = NULL; + dn->C_block_work = NULL; + dn->col_work = NULL; +} + +/* Compute sparsity pattern of C = (I_p ⊗ A_dense) @ J where A_dense is dense m×n. + Since A is dense, if ANY entry in a block of J's column is nonzero, + ALL m rows in the corresponding block of C get entries. + Two-pass approach: count nnz first, then fill — avoids O(m*p*k) temp buffer. 
*/ +static CSC_Matrix *dense_block_left_multiply_fill_sparsity(const CSC_Matrix *J, + int m, int n, int p) +{ + int k = J->n; + int *Cp = (int *) malloc((k + 1) * sizeof(int)); + Cp[0] = 0; + + /* Pass 1: count nnz per column */ + for (int j = 0; j < k; j++) + { + if (J->p[j] == J->p[j + 1]) + { + Cp[j + 1] = Cp[j]; + continue; + } + + int col_nnz = 0; + int jj = J->p[j]; + for (int block = 0; block < p; block++) + { + int block_start = block * n; + int block_end = block_start + n; + + while (jj < J->p[j + 1] && J->i[jj] < block_start) jj++; + bool has_entry = (jj < J->p[j + 1] && J->i[jj] < block_end); + while (jj < J->p[j + 1] && J->i[jj] < block_end) jj++; + + if (has_entry) col_nnz += m; + } + Cp[j + 1] = Cp[j] + col_nnz; + } + + int total_nnz = Cp[k]; + CSC_Matrix *C = new_csc_matrix(m * p, k, total_nnz); + memcpy(C->p, Cp, (k + 1) * sizeof(int)); + free(Cp); + + /* Pass 2: fill row indices directly into C */ + for (int j = 0; j < k; j++) + { + if (C->p[j] == C->p[j + 1]) continue; + + int ci = C->p[j]; + int jj = J->p[j]; + for (int block = 0; block < p; block++) + { + int block_start = block * n; + int block_end = block_start + n; + + while (jj < J->p[j + 1] && J->i[jj] < block_start) jj++; + bool has_entry = (jj < J->p[j + 1] && J->i[jj] < block_end); + while (jj < J->p[j + 1] && J->i[jj] < block_end) jj++; + + if (has_entry) + { + int row_offset = block * m; + for (int i = 0; i < m; i++) + { + C->i[ci++] = row_offset + i; + } + } + } + } + + return C; +} + +/* Fill values of C = (I_p ⊗ A_dense) @ J using batched BLAS dgemm. + For each block, gathers all active columns into a dense matrix and + uses a single dgemm instead of per-column dgemv calls. + Columns with a single entry in the block use a fast scaled-column-copy + path (O(m) vs O(m*n) for dgemm), which is critical when child J is + an identity or permutation matrix. 
*/ +static void dense_block_left_multiply_fill_values( + const double *A_dense, const CSC_Matrix *J, CSC_Matrix *C, int m, int n, int p, + double *J_block, double *C_block, int *col_indices, int *col_C_offsets) +{ + int k = J->n; + + for (int block = 0; block < p; block++) + { + int block_start = block * n; + int block_end = block_start + n; + int row_offset = block * m; + + /* Gather columns that have entries in this block, classify as + single-entry (fast path) or multi-entry (dgemm path). */ + int n_gemm = 0; /* columns needing dgemm */ + for (int j = 0; j < k; j++) + { + if (C->p[j] == C->p[j + 1]) continue; + + /* Find offset of this block in C's column j. + Each present block contributes exactly m consecutive entries. */ + int ci = C->p[j]; + while (ci < C->p[j + 1] && C->i[ci] < row_offset) ci += m; + if (ci >= C->p[j + 1] || C->i[ci] != row_offset) continue; + + /* Count entries in this block */ + int count = 0; + int single_row = -1; + double single_val = 0.0; + for (int jj = J->p[j]; jj < J->p[j + 1]; jj++) + { + int row = J->i[jj]; + if (row >= block_start && row < block_end) + { + count++; + single_row = row - block_start; + single_val = J->x[jj]; + } + } + + if (count == 1) + { + /* Fast path: C[:, j] = A[:, single_row] * single_val. + A is row-major so column access has stride n. 
*/ + cblas_dcopy(m, A_dense + single_row, n, C->x + ci, 1); + if (single_val != 1.0) + { + cblas_dscal(m, single_val, C->x + ci, 1); + } + } + else + { + /* Queue for dgemm batch */ + col_indices[n_gemm] = j; + col_C_offsets[n_gemm] = ci; + n_gemm++; + } + } + + /* Process multi-entry columns in batches via dgemm */ + for (int bs = 0; bs < n_gemm; bs += DGEMM_BATCH_SIZE) + { + int batch = n_gemm - bs; + if (batch > DGEMM_BATCH_SIZE) batch = DGEMM_BATCH_SIZE; + + /* Zero J_block (n × batch, column-major) */ + memset(J_block, 0, (size_t) n * batch * sizeof(double)); + + /* Scatter J entries into J_block */ + for (int bi = 0; bi < batch; bi++) + { + int j = col_indices[bs + bi]; + for (int jj = J->p[j]; jj < J->p[j + 1]; jj++) + { + int row = J->i[jj]; + if (row >= block_start && row < block_end) + { + J_block[(row - block_start) + (size_t) bi * n] = J->x[jj]; + } + } + } + + /* C_block(m × batch) = A(m×n) × J_block(n × batch) + A_dense is row-major m×n = col-major n×m, so CblasTrans gives m×n. 
*/ + cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, batch, n, 1.0, + A_dense, n, J_block, n, 0.0, C_block, m); + + /* Scatter C_block → C->x */ + for (int bi = 0; bi < batch; bi++) + { + memcpy(C->x + col_C_offsets[bs + bi], C_block + (size_t) bi * m, + m * sizeof(double)); + } + } + } +} + +static void jacobian_init(expr *node) +{ + expr *x = node->left; + dense_left_matmul_expr *dn = (dense_left_matmul_expr *) node; + + /* initialize child's jacobian and convert to CSC */ + x->jacobian_init(x); + dn->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork); + + /* compute sparsity of this node's jacobian in CSC and CSR */ + dn->J_CSC = dense_block_left_multiply_fill_sparsity(dn->Jchild_CSC, dn->m, dn->n, + dn->n_blocks); + node->jacobian = csc_to_csr_fill_sparsity(dn->J_CSC, dn->csc_to_csr_workspace); + + /* Allocate dgemm batch workspaces */ + int batch = DGEMM_BATCH_SIZE; + dn->J_block_work = (double *) malloc((size_t) dn->n * batch * sizeof(double)); + dn->C_block_work = (double *) malloc((size_t) dn->m * batch * sizeof(double)); + dn->col_work = (int *) malloc(2 * node->n_vars * sizeof(int)); + + /* For affine children, the Jacobian is constant — compute once and cache. */ + if (x->is_affine(x)) + { + x->eval_jacobian(x); + csr_to_csc_fill_values(x->jacobian, dn->Jchild_CSC, node->iwork); + dense_block_left_multiply_fill_values( + dn->A_dense, dn->Jchild_CSC, dn->J_CSC, dn->m, dn->n, dn->n_blocks, + dn->J_block_work, dn->C_block_work, dn->col_work, + dn->col_work + node->n_vars); + csc_to_csr_fill_values(dn->J_CSC, node->jacobian, dn->csc_to_csr_workspace); + dn->affine_cached = true; + } +} + +static void eval_jacobian(expr *node) +{ + dense_left_matmul_expr *dn = (dense_left_matmul_expr *) node; + + /* Affine children have constant Jacobians — already computed in jacobian_init. 
+ */ + if (dn->affine_cached) return; + + expr *x = node->left; + + /* evaluate child's jacobian and convert to CSC */ + x->eval_jacobian(x); + csr_to_csc_fill_values(x->jacobian, dn->Jchild_CSC, node->iwork); + + /* compute this node's jacobian using batched dgemm */ + dense_block_left_multiply_fill_values(dn->A_dense, dn->Jchild_CSC, dn->J_CSC, + dn->m, dn->n, dn->n_blocks, + dn->J_block_work, dn->C_block_work, + dn->col_work, dn->col_work + node->n_vars); + csc_to_csr_fill_values(dn->J_CSC, node->jacobian, dn->csc_to_csr_workspace); +} + +static void wsum_hess_init(expr *node) +{ + /* initialize child's hessian */ + expr *x = node->left; + x->wsum_hess_init(x); + + /* allocate this node's hessian with the same sparsity as child's */ + node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, x->wsum_hess->nnz); + memcpy(node->wsum_hess->p, x->wsum_hess->p, (node->n_vars + 1) * sizeof(int)); + memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int)); + + /* node->dwork is already allocated in constructor (size n * n_blocks) */ +} + +static void eval_wsum_hess(expr *node, const double *w) +{ + dense_left_matmul_expr *dn = (dense_left_matmul_expr *) node; + + /* compute A^T @ w, block-wise: for each block b, + dwork[b*n .. (b+1)*n-1] = AT @ w[b*m .. 
(b+1)*m-1] */ + for (int b = 0; b < dn->n_blocks; b++) + { + cblas_dgemv(CblasRowMajor, CblasNoTrans, dn->n, dn->m, 1.0, dn->AT_dense, + dn->m, w + b * dn->m, 1, 0.0, node->dwork + b * dn->n, 1); + } + + node->left->eval_wsum_hess(node->left, node->dwork); + memcpy(node->wsum_hess->x, node->left->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); +} + +expr *new_dense_left_matmul(expr *child, const double *A_dense, int m, int n) +{ + /* Dimension logic — same as left_matmul */ + int d1, d2, n_blocks; + if (child->d1 == n) + { + d1 = m; + d2 = child->d2; + n_blocks = child->d2; + } + else if (child->d2 == n && child->d1 == 1) + { + d1 = 1; + d2 = m; + n_blocks = 1; + } + else + { + fprintf(stderr, "Error in new_dense_left_matmul: dimension mismatch\n"); + exit(1); + } + + /* Allocate the type-specific struct */ + dense_left_matmul_expr *dn = + (dense_left_matmul_expr *) calloc(1, sizeof(dense_left_matmul_expr)); + expr *node = &dn->base; + init_expr(node, d1, d2, child->n_vars, forward, jacobian_init, eval_jacobian, + is_affine, wsum_hess_init, eval_wsum_hess, free_type_data); + node->left = child; + expr_retain(child); + + /* Store dense A (row-major m x n) and AT (row-major n x m) */ + dn->m = m; + dn->n = n; + dn->n_blocks = n_blocks; + + dn->A_dense = (double *) malloc(m * n * sizeof(double)); + memcpy(dn->A_dense, A_dense, m * n * sizeof(double)); + + dn->AT_dense = (double *) malloc(n * m * sizeof(double)); + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + dn->AT_dense[j * m + i] = A_dense[i * n + j]; + } + } + + /* Allocate workspaces */ + node->iwork = (int *) malloc(node->n_vars * sizeof(int)); + dn->csc_to_csr_workspace = (int *) malloc(node->size * sizeof(int)); + + /* dwork: used for AT @ w in hess eval. Needs n * n_blocks doubles. 
 */ + int dwork_size = n * n_blocks; + if (dwork_size < n) dwork_size = n; + node->dwork = (double *) malloc(dwork_size * sizeof(double)); + + return node; +} diff --git a/src/bivariate/dense_right_matmul.c b/src/bivariate/dense_right_matmul.c new file mode 100644 index 0000000..8ef7a0f --- /dev/null +++ b/src/bivariate/dense_right_matmul.c @@ -0,0 +1,46 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the DNLP-differentiation-engine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "affine.h" +#include "bivariate.h" +#include "subexpr.h" +#include <stdlib.h> + +/* This file implements the atom 'dense_right_matmul' corresponding to the + operation y = f(x) @ A, where A is a given dense matrix. + We implement this by expressing right matmul in terms of dense left matmul + and transpose: f(x) @ A = (A^T @ f(x)^T)^T. */ +expr *new_dense_right_matmul(expr *u, const double *A_dense, int m, int n) +{ + /* We express: u @ A = (A^T @ u^T)^T + A is m x n, so A^T is n x m.
*/ + double *AT = (double *) malloc(n * m * sizeof(double)); + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + AT[j * m + i] = A_dense[i * n + j]; + } + } + + expr *u_transpose = new_transpose(u); + expr *left_matmul_node = new_dense_left_matmul(u_transpose, AT, n, m); + expr *node = new_transpose(left_matmul_node); + + free(AT); + return node; +} diff --git a/tests/all_tests.c b/tests/all_tests.c index fece8de..d8df839 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -25,6 +25,9 @@ #include "jacobian_tests/test_elementwise_mult.h" #include "jacobian_tests/test_hstack.h" #include "jacobian_tests/test_index.h" +#ifndef _MSC_VER +#include "jacobian_tests/test_dense_left_matmul.h" +#endif #include "jacobian_tests/test_left_matmul.h" #include "jacobian_tests/test_log.h" #include "jacobian_tests/test_matmul.h" @@ -61,6 +64,9 @@ #include "wsum_hess/test_const_vector_mult.h" #include "wsum_hess/test_hstack.h" #include "wsum_hess/test_index.h" +#ifndef _MSC_VER +#include "wsum_hess/test_dense_left_matmul.h" +#endif #include "wsum_hess/test_left_matmul.h" #include "wsum_hess/test_matmul.h" #include "wsum_hess/test_multiply.h" @@ -168,6 +174,11 @@ int main(void) mu_run_test(test_wsum_hess_multiply_2, tests_run); mu_run_test(test_jacobian_trace_variable, tests_run); mu_run_test(test_jacobian_trace_composite, tests_run); +#ifndef _MSC_VER + mu_run_test(test_jacobian_dense_left_matmul_log, tests_run); + mu_run_test(test_jacobian_dense_left_matmul_log_matrix, tests_run); + mu_run_test(test_jacobian_dense_left_matmul_log_composite, tests_run); +#endif mu_run_test(test_jacobian_left_matmul_log, tests_run); mu_run_test(test_jacobian_left_matmul_log_matrix, tests_run); mu_run_test(test_jacobian_left_matmul_log_composite, tests_run); @@ -226,6 +237,11 @@ int main(void) mu_run_test(test_wsum_hess_multiply_sparse_random, tests_run); mu_run_test(test_wsum_hess_multiply_1, tests_run); mu_run_test(test_wsum_hess_multiply_2, tests_run); +#ifndef _MSC_VER + 
mu_run_test(test_wsum_hess_dense_left_matmul, tests_run); + mu_run_test(test_wsum_hess_dense_left_matmul_matrix, tests_run); + mu_run_test(test_wsum_hess_dense_left_matmul_composite, tests_run); +#endif mu_run_test(test_wsum_hess_left_matmul, tests_run); mu_run_test(test_wsum_hess_left_matmul_matrix, tests_run); mu_run_test(test_wsum_hess_left_matmul_composite, tests_run); diff --git a/tests/jacobian_tests/test_dense_left_matmul.h b/tests/jacobian_tests/test_dense_left_matmul.h new file mode 100644 index 0000000..8f82690 --- /dev/null +++ b/tests/jacobian_tests/test_dense_left_matmul.h @@ -0,0 +1,154 @@ +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "bivariate.h" +#include "elementwise_univariate.h" +#include "expr.h" +#include "minunit.h" +#include "test_helpers.h" + +const char *test_jacobian_dense_left_matmul_log(void) +{ + /* Test Jacobian of A @ log(x) where A is dense, identical to the sparse test. + * x is 3x1 variable at x = [1, 2, 3] + * A is 4x3 dense matrix [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * Expected Jacobian = A @ diag(1/x): + * [1, 0, 2/3; 3, 0, 4/3; 5, 0, 2; 7, 0, 0] + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + expr *x = new_variable(3, 1, 0, 3); + + /* A is 4x3 dense row-major */ + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *log_x = new_log(x); + expr *A_log_x = new_dense_left_matmul(log_x, A_dense, 4, 3); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->eval_jacobian(A_log_x); + + /* Since A is dense, the jacobian may have entries where A has zeros. + * For dense_left_matmul, ALL m rows get entries if any child jacobian + * entry exists in the block. Since child is a variable (identity jacobian), + * all 3 columns have entries, so all 4 rows get all 3 columns. + * The values at zero positions of A will be 0.0.
*/ + double expected_Ax[12] = { + 1.0, 0.0, 2.0 / 3.0, /* row 0: A[0,:] @ diag(1/x) */ + 3.0, 0.0, 4.0 / 3.0, /* row 1 */ + 5.0, 0.0, 2.0, /* row 2 */ + 7.0, 0.0, 0.0 /* row 3 */ + }; + int expected_Ai[12] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2}; + int expected_Ap[5] = {0, 3, 6, 9, 12}; + + mu_assert("dense_left_matmul vals fail", + cmp_double_array(A_log_x->jacobian->x, expected_Ax, 12)); + mu_assert("dense_left_matmul cols fail", + cmp_int_array(A_log_x->jacobian->i, expected_Ai, 12)); + mu_assert("dense_left_matmul rows fail", + cmp_int_array(A_log_x->jacobian->p, expected_Ap, 5)); + + free_expr(A_log_x); + return 0; +} + +const char *test_jacobian_dense_left_matmul_log_matrix(void) +{ + /* x is 3x2, vectorized column-wise: [1,2,3 | 4,5,6] + * A is 4x3 dense: [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * Jacobian = block-diag(A @ diag(1/x_col0), A @ diag(1/x_col1)) + * = 8x6 block-diagonal with two 4x3 blocks */ + double x_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + expr *x = new_variable(3, 2, 0, 6); + + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *log_x = new_log(x); + expr *A_log_x = new_dense_left_matmul(log_x, A_dense, 4, 3); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->eval_jacobian(A_log_x); + + /* 8x6 Jacobian, block-diagonal with 4x3 dense blocks */ + double expected_Ax[24] = { + /* block 0: A @ diag(1/[1,2,3]) */ + 1.0, 0.0, 2.0 / 3.0, 3.0, 0.0, 4.0 / 3.0, 5.0, 0.0, 2.0, 7.0, 0.0, 0.0, + /* block 1: A @ diag(1/[4,5,6]) */ + 0.25, 0.0, 1.0 / 3.0, 0.75, 0.0, 2.0 / 3.0, 1.25, 0.0, 1.0, 1.75, 0.0, 0.0}; + int expected_Ai[24] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, + 3, 4, 5, 3, 4, 5, 3, 4, 5, 3, 4, 5}; + int expected_Ap[9] = {0, 3, 6, 9, 12, 15, 18, 21, 24}; + + mu_assert("dense_left_matmul_matrix vals fail", + cmp_double_array(A_log_x->jacobian->x, expected_Ax, 24)); + mu_assert("dense_left_matmul_matrix cols fail", + cmp_int_array(A_log_x->jacobian->i, expected_Ai, 24)); + 
mu_assert("dense_left_matmul_matrix rows fail", + cmp_int_array(A_log_x->jacobian->p, expected_Ap, 9)); + + free_expr(A_log_x); + return 0; +} + +const char *test_jacobian_dense_left_matmul_log_composite(void) +{ + /* Test Jacobian of A @ log(B @ x) where both A and B are dense. + * x is 3x1 at x = [1, 2, 3] + * B is 3x3 all ones, A is 4x3 dense: [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * + * B @ x = [6, 6, 6]^T + * Jacobian = A @ diag(1/(B@x)) @ B + * = A @ diag([1/6, 1/6, 1/6]) @ all_ones(3x3) + * Row 0: (1+0+2)/6 * [1,1,1] = [1/2, 1/2, 1/2] + * Row 1: (3+0+4)/6 * [1,1,1] = [7/6, 7/6, 7/6] + * Row 2: (5+0+6)/6 * [1,1,1] = [11/6, 11/6, 11/6] + * Row 3: (7+0+0)/6 * [1,1,1] = [7/6, 7/6, 7/6] + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + expr *x = new_variable(3, 1, 0, 3); + + /* B as CSR (for new_linear) */ + CSR_Matrix *B = new_csr_matrix(3, 3, 9); + int B_p[4] = {0, 3, 6, 9}; + int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + double B_x[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + memcpy(B->p, B_p, 4 * sizeof(int)); + memcpy(B->i, B_i, 9 * sizeof(int)); + memcpy(B->x, B_x, 9 * sizeof(double)); + + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *Bx = new_linear(x, B, NULL); + expr *log_Bx = new_log(Bx); + expr *A_log_Bx = new_dense_left_matmul(log_Bx, A_dense, 4, 3); + + A_log_Bx->forward(A_log_Bx, x_vals); + A_log_Bx->jacobian_init(A_log_Bx); + A_log_Bx->eval_jacobian(A_log_Bx); + + double expected_Ax[12] = { + 0.5, 0.5, 0.5, /* row 0 */ + 7.0 / 6.0, 7.0 / 6.0, 7.0 / 6.0, /* row 1 */ + 11.0 / 6.0, 11.0 / 6.0, 11.0 / 6.0, /* row 2 */ + 7.0 / 6.0, 7.0 / 6.0, 7.0 / 6.0 /* row 3 */ + }; + int expected_Ai[12] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2}; + int expected_Ap[5] = {0, 3, 6, 9, 12}; + + mu_assert("dense composite vals fail", + cmp_double_array(A_log_Bx->jacobian->x, expected_Ax, 12)); + mu_assert("dense composite cols fail", + cmp_int_array(A_log_Bx->jacobian->i, expected_Ai, 12)); + mu_assert("dense composite rows 
fail", + cmp_int_array(A_log_Bx->jacobian->p, expected_Ap, 5)); + + free_csr_matrix(B); + free_expr(A_log_Bx); + return 0; +} diff --git a/tests/wsum_hess/test_dense_left_matmul.h b/tests/wsum_hess/test_dense_left_matmul.h new file mode 100644 index 0000000..a6a29ca --- /dev/null +++ b/tests/wsum_hess/test_dense_left_matmul.h @@ -0,0 +1,133 @@ +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "affine.h" +#include "bivariate.h" +#include "elementwise_univariate.h" +#include "expr.h" +#include "minunit.h" +#include "test_helpers.h" + +const char *test_wsum_hess_dense_left_matmul(void) +{ + /* Test weighted sum of Hessian of A @ log(x) with dense A. + * Same math as the sparse test: + * x = [1, 2, 3], A = [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0], w = [1, 2, 3, 4] + * A^T @ w = [50, 0, 28] + * wsum_hess = diag(-50/1, -0/4, -28/9) + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + double w[4] = {1.0, 2.0, 3.0, 4.0}; + + expr *x = new_variable(3, 1, 0, 3); + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *log_x = new_log(x); + expr *A_log_x = new_dense_left_matmul(log_x, A_dense, 4, 3); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->wsum_hess_init(A_log_x); + A_log_x->eval_wsum_hess(A_log_x, w); + + double expected_x[3] = {-50.0, -0.0, -28.0 / 9.0}; + int expected_i[3] = {0, 1, 2}; + int expected_p[4] = {0, 1, 2, 3}; + + mu_assert("dense wsum_hess vals incorrect", + cmp_double_array(A_log_x->wsum_hess->x, expected_x, 3)); + mu_assert("dense wsum_hess cols incorrect", + cmp_int_array(A_log_x->wsum_hess->i, expected_i, 3)); + mu_assert("dense wsum_hess rows incorrect", + cmp_int_array(A_log_x->wsum_hess->p, expected_p, 4)); + + free_expr(A_log_x); + return 0; +} + +const char *test_wsum_hess_dense_left_matmul_matrix(void) +{ + /* Test with matrix-valued x (3x2), same math as sparse test.
+ * x = [1,2,3,4,5,6], A same as above, w = [1,2,3,4,5,6,7,8] + */ + double x_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double w[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + + expr *x = new_variable(3, 2, 0, 6); + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *log_x = new_log(x); + expr *A_log_x = new_dense_left_matmul(log_x, A_dense, 4, 3); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->wsum_hess_init(A_log_x); + A_log_x->eval_wsum_hess(A_log_x, w); + + double expected_x[6] = {-50.0, -0.0, -28.0 / 9.0, + -57.0 / 8.0, -0.0, -19.0 / 9.0}; + int expected_i[6] = {0, 1, 2, 3, 4, 5}; + int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; + + mu_assert("dense wsum_hess matrix vals incorrect", + cmp_double_array(A_log_x->wsum_hess->x, expected_x, 6)); + mu_assert("dense wsum_hess matrix cols incorrect", + cmp_int_array(A_log_x->wsum_hess->i, expected_i, 6)); + mu_assert("dense wsum_hess matrix rows incorrect", + cmp_int_array(A_log_x->wsum_hess->p, expected_p, 7)); + + free_expr(A_log_x); + return 0; +} + +const char *test_wsum_hess_dense_left_matmul_composite(void) +{ + /* Test with A @ log(B @ x), same math as sparse test. 
+ * x = [1,2,3], B = 3x3 all ones, A = [1,0,2;3,0,4;5,0,6;7,0,0], w = [1,2,3,4] + * Expected: 3x3 dense matrix, all entries = -13/6 + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + double w[4] = {1.0, 2.0, 3.0, 4.0}; + + expr *x = new_variable(3, 1, 0, 3); + + CSR_Matrix *B = new_csr_matrix(3, 3, 9); + int B_p[4] = {0, 3, 6, 9}; + int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + double B_x[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + memcpy(B->p, B_p, 4 * sizeof(int)); + memcpy(B->i, B_i, 9 * sizeof(int)); + memcpy(B->x, B_x, 9 * sizeof(double)); + + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *Bx = new_linear(x, B, NULL); + expr *log_Bx = new_log(Bx); + expr *A_log_Bx = new_dense_left_matmul(log_Bx, A_dense, 4, 3); + + A_log_Bx->forward(A_log_Bx, x_vals); + A_log_Bx->jacobian_init(A_log_Bx); + A_log_Bx->wsum_hess_init(A_log_Bx); + A_log_Bx->eval_wsum_hess(A_log_Bx, w); + + double expected_x[9] = {-13.0 / 6.0, -13.0 / 6.0, -13.0 / 6.0, + -13.0 / 6.0, -13.0 / 6.0, -13.0 / 6.0, + -13.0 / 6.0, -13.0 / 6.0, -13.0 / 6.0}; + int expected_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + int expected_p[4] = {0, 3, 6, 9}; + + mu_assert("dense composite wsum_hess vals incorrect", + cmp_double_array(A_log_Bx->wsum_hess->x, expected_x, 9)); + mu_assert("dense composite wsum_hess cols incorrect", + cmp_int_array(A_log_Bx->wsum_hess->i, expected_i, 9)); + mu_assert("dense composite wsum_hess rows incorrect", + cmp_int_array(A_log_Bx->wsum_hess->p, expected_p, 4)); + + free_csr_matrix(B); + free_expr(A_log_Bx); + return 0; +}