diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 87ef251..de0ca8e 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -17,6 +17,10 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Install BLAS (Ubuntu) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y libopenblas-dev + - name: Configure CMake run: cmake -B build -S . -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_VERBOSE_MAKEFILE=ON diff --git a/.github/workflows/sanitizer.yml b/.github/workflows/sanitizer.yml index 7fb13d3..86f3bbc 100644 --- a/.github/workflows/sanitizer.yml +++ b/.github/workflows/sanitizer.yml @@ -16,6 +16,10 @@ jobs: steps: - uses: actions/checkout@v5 + - name: Install BLAS (Ubuntu) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y libopenblas-dev + # Configure CMake with ASan + UBSan - name: Configure with sanitizers run: cmake -B build -S . \ diff --git a/.github/workflows/valgrind.yml b/.github/workflows/valgrind.yml index b381c8f..9eec318 100644 --- a/.github/workflows/valgrind.yml +++ b/.github/workflows/valgrind.yml @@ -11,7 +11,7 @@ jobs: - name: Install dependencies run: | sudo apt-get update - sudo apt-get install -y valgrind + sudo apt-get install -y valgrind libopenblas-dev - name: Configure Debug build run: cmake -B build -S . 
-DCMAKE_BUILD_TYPE=Debug diff --git a/CMakeLists.txt b/CMakeLists.txt index 3611755..91e3fc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,10 @@ include_directories(${PROJECT_SOURCE_DIR}/include) # Source files - automatically gather all .c files from src/ file(GLOB_RECURSE SOURCES "src/*.c") +# Exclude BLAS-dependent files on Windows (no BLAS available) +if(MSVC) + list(FILTER SOURCES EXCLUDE REGEX "dense_left_matmul\\.c$|dense_right_matmul\\.c$") +endif() # Create core library add_library(dnlp_diff ${SOURCES}) @@ -53,6 +57,14 @@ if(NOT MSVC) target_link_libraries(dnlp_diff m) endif() +# Link BLAS (needed for dense_left_matmul / dense_right_matmul) +if(APPLE) + target_link_libraries(dnlp_diff "-framework Accelerate") +elseif(NOT MSVC) + find_package(BLAS REQUIRED) + target_link_libraries(dnlp_diff ${BLAS_LIBRARIES}) +endif() + # Config-specific compile options (compiler-specific) if(MSVC) target_compile_options(dnlp_diff PRIVATE diff --git a/include/bivariate.h b/include/bivariate.h index aa005ed..358b185 100644 --- a/include/bivariate.h +++ b/include/bivariate.h @@ -36,6 +36,14 @@ expr *new_left_matmul(expr *u, const CSR_Matrix *A); /* Right matrix multiplication: f(x) @ A where A is a constant matrix */ expr *new_right_matmul(expr *u, const CSR_Matrix *A); +/* Dense left matrix multiplication: A @ f(x) where A is a dense row-major matrix. + Uses BLAS for all operations — much faster than sparse path for dense A. */ +expr *new_dense_left_matmul(expr *child, const double *A_dense, int m, int n); + +/* Dense right matrix multiplication: f(x) @ A where A is a dense row-major matrix. + Implemented as (A^T @ f(x)^T)^T using dense_left_matmul. 
*/ +expr *new_dense_right_matmul(expr *u, const double *A_dense, int m, int n); + /* Constant scalar multiplication: a * f(x) where a is a constant double */ expr *new_const_scalar_mult(double a, expr *child); diff --git a/include/subexpr.h b/include/subexpr.h index b3f4170..19bcd3e 100644 --- a/include/subexpr.h +++ b/include/subexpr.h @@ -115,6 +115,25 @@ typedef struct left_matmul_expr int *csc_to_csr_workspace; } left_matmul_expr; +/* Dense left matrix multiplication: y = A * f(x) where A is a dense matrix. + * Performance-optimized variant that uses BLAS (cblas_dgemv/dgemm) instead of + * sparse operations. Avoids the per-row overlap checks in sparsity detection and + * sparse dot products in value computation. */ +typedef struct dense_left_matmul_expr +{ + expr base; + double *A_dense; /* row-major m×n */ + double *AT_dense; /* row-major n×m (for wsum_hess) */ + int m, n, n_blocks; + CSC_Matrix *Jchild_CSC; + CSC_Matrix *J_CSC; + int *csc_to_csr_workspace; + double *J_block_work; /* n × BATCH col-major workspace for dgemm */ + double *C_block_work; /* m × BATCH col-major workspace for dgemm */ + int *col_work; /* 2 × n_vars ints for column indices/offsets */ + bool affine_cached; /* true if Jacobian was pre-computed in init */ +} dense_left_matmul_expr; + /* Right matrix multiplication: y = f(x) * A where f(x) is an expression. * f(x) has shape p x n, A has shape n x q, output y has shape p x q. * Uses vec(y) = B * vec(f(x)) where B = A^T kron I_p. */ diff --git a/include/utils/blas_compat.h b/include/utils/blas_compat.h new file mode 100644 index 0000000..3c4a562 --- /dev/null +++ b/include/utils/blas_compat.h @@ -0,0 +1,28 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the DNLP-differentiation-engine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef BLAS_COMPAT_H +#define BLAS_COMPAT_H + +#ifdef __APPLE__ +#define ACCELERATE_NEW_LAPACK +#include <Accelerate/Accelerate.h> +#else +#include <cblas.h> +#endif + +#endif /* BLAS_COMPAT_H */ diff --git a/src/bivariate/dense_left_matmul.c b/src/bivariate/dense_left_matmul.c new file mode 100644 index 0000000..bfa2054 --- /dev/null +++ b/src/bivariate/dense_left_matmul.c @@ -0,0 +1,403 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the DNLP-differentiation-engine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "bivariate.h" +#include "subexpr.h" +#include "utils/blas_compat.h" +#include "utils/linalg_sparse_matmuls.h" +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define DGEMM_BATCH_SIZE 256 + +/* This file implements 'dense_left_matmul' for the operation y = A @ f(x), + where A is a DENSE matrix (stored row-major) and f(x) is an arbitrary + expression. The dimensions are A - m x n, f(x) - n x p, y - m x p.
+ + This is a performance-optimized variant of left_matmul that avoids + sparse overhead when A is dense: + - Forward pass uses cblas_dgemv instead of sparse matvec + - Jacobian sparsity: if ANY entry in a block column of J_child is nonzero, + ALL m rows get entries (since A is dense), avoiding per-row overlap checks + - Jacobian values use cblas_dgemv per block column instead of sparse dot products + - Weighted sum Hessian uses cblas_dgemv with AT for A^T @ w +*/ + +static void forward(expr *node, const double *u) +{ + dense_left_matmul_expr *dn = (dense_left_matmul_expr *) node; + + /* child's forward pass */ + node->left->forward(node->left, u); + + /* y = (I_p ⊗ A) @ vec(f(x)), computed as p independent A @ x_block */ + for (int b = 0; b < dn->n_blocks; b++) + { + cblas_dgemv(CblasRowMajor, CblasNoTrans, dn->m, dn->n, 1.0, dn->A_dense, + dn->n, node->left->value + b * dn->n, 1, 0.0, + node->value + b * dn->m, 1); + } +} + +static bool is_affine(const expr *node) +{ + return node->left->is_affine(node->left); +} + +static void free_type_data(expr *node) +{ + dense_left_matmul_expr *dn = (dense_left_matmul_expr *) node; + free(dn->A_dense); + free(dn->AT_dense); + free_csc_matrix(dn->Jchild_CSC); + free_csc_matrix(dn->J_CSC); + free(dn->csc_to_csr_workspace); + free(dn->J_block_work); + free(dn->C_block_work); + free(dn->col_work); + dn->A_dense = NULL; + dn->AT_dense = NULL; + dn->Jchild_CSC = NULL; + dn->J_CSC = NULL; + dn->csc_to_csr_workspace = NULL; + dn->J_block_work = NULL; + dn->C_block_work = NULL; + dn->col_work = NULL; +} + +/* Compute sparsity pattern of C = (I_p ⊗ A_dense) @ J where A_dense is dense m×n. + Since A is dense, if ANY entry in a block of J's column is nonzero, + ALL m rows in the corresponding block of C get entries. + Two-pass approach: count nnz first, then fill — avoids O(m*p*k) temp buffer. 
*/ +static CSC_Matrix *dense_block_left_multiply_fill_sparsity(const CSC_Matrix *J, + int m, int n, int p) +{ + int k = J->n; + int *Cp = (int *) malloc((k + 1) * sizeof(int)); + Cp[0] = 0; + + /* Pass 1: count nnz per column */ + for (int j = 0; j < k; j++) + { + if (J->p[j] == J->p[j + 1]) + { + Cp[j + 1] = Cp[j]; + continue; + } + + int col_nnz = 0; + int jj = J->p[j]; + for (int block = 0; block < p; block++) + { + int block_start = block * n; + int block_end = block_start + n; + + while (jj < J->p[j + 1] && J->i[jj] < block_start) jj++; + bool has_entry = (jj < J->p[j + 1] && J->i[jj] < block_end); + while (jj < J->p[j + 1] && J->i[jj] < block_end) jj++; + + if (has_entry) col_nnz += m; + } + Cp[j + 1] = Cp[j] + col_nnz; + } + + int total_nnz = Cp[k]; + CSC_Matrix *C = new_csc_matrix(m * p, k, total_nnz); + memcpy(C->p, Cp, (k + 1) * sizeof(int)); + free(Cp); + + /* Pass 2: fill row indices directly into C */ + for (int j = 0; j < k; j++) + { + if (C->p[j] == C->p[j + 1]) continue; + + int ci = C->p[j]; + int jj = J->p[j]; + for (int block = 0; block < p; block++) + { + int block_start = block * n; + int block_end = block_start + n; + + while (jj < J->p[j + 1] && J->i[jj] < block_start) jj++; + bool has_entry = (jj < J->p[j + 1] && J->i[jj] < block_end); + while (jj < J->p[j + 1] && J->i[jj] < block_end) jj++; + + if (has_entry) + { + int row_offset = block * m; + for (int i = 0; i < m; i++) + { + C->i[ci++] = row_offset + i; + } + } + } + } + + return C; +} + +/* Fill values of C = (I_p ⊗ A_dense) @ J using batched BLAS dgemm. + For each block, gathers all active columns into a dense matrix and + uses a single dgemm instead of per-column dgemv calls. + Columns with a single entry in the block use a fast scaled-column-copy + path (O(m) vs O(m*n) for dgemm), which is critical when child J is + an identity or permutation matrix. 
*/ +static void dense_block_left_multiply_fill_values( + const double *A_dense, const CSC_Matrix *J, CSC_Matrix *C, int m, int n, int p, + double *J_block, double *C_block, int *col_indices, int *col_C_offsets) +{ + int k = J->n; + + for (int block = 0; block < p; block++) + { + int block_start = block * n; + int block_end = block_start + n; + int row_offset = block * m; + + /* Gather columns that have entries in this block, classify as + single-entry (fast path) or multi-entry (dgemm path). */ + int n_gemm = 0; /* columns needing dgemm */ + for (int j = 0; j < k; j++) + { + if (C->p[j] == C->p[j + 1]) continue; + + /* Find offset of this block in C's column j. + Each present block contributes exactly m consecutive entries. */ + int ci = C->p[j]; + while (ci < C->p[j + 1] && C->i[ci] < row_offset) ci += m; + if (ci >= C->p[j + 1] || C->i[ci] != row_offset) continue; + + /* Count entries in this block */ + int count = 0; + int single_row = -1; + double single_val = 0.0; + for (int jj = J->p[j]; jj < J->p[j + 1]; jj++) + { + int row = J->i[jj]; + if (row >= block_start && row < block_end) + { + count++; + single_row = row - block_start; + single_val = J->x[jj]; + } + } + + if (count == 1) + { + /* Fast path: C[:, j] = A[:, single_row] * single_val. + A is row-major so column access has stride n. 
*/ + cblas_dcopy(m, A_dense + single_row, n, C->x + ci, 1); + if (single_val != 1.0) + { + cblas_dscal(m, single_val, C->x + ci, 1); + } + } + else + { + /* Queue for dgemm batch */ + col_indices[n_gemm] = j; + col_C_offsets[n_gemm] = ci; + n_gemm++; + } + } + + /* Process multi-entry columns in batches via dgemm */ + for (int bs = 0; bs < n_gemm; bs += DGEMM_BATCH_SIZE) + { + int batch = n_gemm - bs; + if (batch > DGEMM_BATCH_SIZE) batch = DGEMM_BATCH_SIZE; + + /* Zero J_block (n × batch, column-major) */ + memset(J_block, 0, (size_t) n * batch * sizeof(double)); + + /* Scatter J entries into J_block */ + for (int bi = 0; bi < batch; bi++) + { + int j = col_indices[bs + bi]; + for (int jj = J->p[j]; jj < J->p[j + 1]; jj++) + { + int row = J->i[jj]; + if (row >= block_start && row < block_end) + { + J_block[(row - block_start) + (size_t) bi * n] = J->x[jj]; + } + } + } + + /* C_block(m × batch) = A(m×n) × J_block(n × batch) + A_dense is row-major m×n = col-major n×m, so CblasTrans gives m×n. 
*/ + cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, batch, n, 1.0, + A_dense, n, J_block, n, 0.0, C_block, m); + + /* Scatter C_block → C->x */ + for (int bi = 0; bi < batch; bi++) + { + memcpy(C->x + col_C_offsets[bs + bi], C_block + (size_t) bi * m, + m * sizeof(double)); + } + } + } +} + +static void jacobian_init(expr *node) +{ + expr *x = node->left; + dense_left_matmul_expr *dn = (dense_left_matmul_expr *) node; + + /* initialize child's jacobian and convert to CSC */ + x->jacobian_init(x); + dn->Jchild_CSC = csr_to_csc_fill_sparsity(x->jacobian, node->iwork); + + /* compute sparsity of this node's jacobian in CSC and CSR */ + dn->J_CSC = dense_block_left_multiply_fill_sparsity(dn->Jchild_CSC, dn->m, dn->n, + dn->n_blocks); + node->jacobian = csc_to_csr_fill_sparsity(dn->J_CSC, dn->csc_to_csr_workspace); + + /* Allocate dgemm batch workspaces */ + int batch = DGEMM_BATCH_SIZE; + dn->J_block_work = (double *) malloc((size_t) dn->n * batch * sizeof(double)); + dn->C_block_work = (double *) malloc((size_t) dn->m * batch * sizeof(double)); + dn->col_work = (int *) malloc(2 * node->n_vars * sizeof(int)); + + /* For affine children, the Jacobian is constant — compute once and cache. */ + if (x->is_affine(x)) + { + x->eval_jacobian(x); + csr_to_csc_fill_values(x->jacobian, dn->Jchild_CSC, node->iwork); + dense_block_left_multiply_fill_values( + dn->A_dense, dn->Jchild_CSC, dn->J_CSC, dn->m, dn->n, dn->n_blocks, + dn->J_block_work, dn->C_block_work, dn->col_work, + dn->col_work + node->n_vars); + csc_to_csr_fill_values(dn->J_CSC, node->jacobian, dn->csc_to_csr_workspace); + dn->affine_cached = true; + } +} + +static void eval_jacobian(expr *node) +{ + dense_left_matmul_expr *dn = (dense_left_matmul_expr *) node; + + /* Affine children have constant Jacobians — already computed in jacobian_init. 
+ */ + if (dn->affine_cached) return; + + expr *x = node->left; + + /* evaluate child's jacobian and convert to CSC */ + x->eval_jacobian(x); + csr_to_csc_fill_values(x->jacobian, dn->Jchild_CSC, node->iwork); + + /* compute this node's jacobian using batched dgemm */ + dense_block_left_multiply_fill_values(dn->A_dense, dn->Jchild_CSC, dn->J_CSC, + dn->m, dn->n, dn->n_blocks, + dn->J_block_work, dn->C_block_work, + dn->col_work, dn->col_work + node->n_vars); + csc_to_csr_fill_values(dn->J_CSC, node->jacobian, dn->csc_to_csr_workspace); +} + +static void wsum_hess_init(expr *node) +{ + /* initialize child's hessian */ + expr *x = node->left; + x->wsum_hess_init(x); + + /* allocate this node's hessian with the same sparsity as child's */ + node->wsum_hess = new_csr_matrix(node->n_vars, node->n_vars, x->wsum_hess->nnz); + memcpy(node->wsum_hess->p, x->wsum_hess->p, (node->n_vars + 1) * sizeof(int)); + memcpy(node->wsum_hess->i, x->wsum_hess->i, x->wsum_hess->nnz * sizeof(int)); + + /* node->dwork is already allocated in constructor (size n * n_blocks) */ +} + +static void eval_wsum_hess(expr *node, const double *w) +{ + dense_left_matmul_expr *dn = (dense_left_matmul_expr *) node; + + /* compute A^T @ w, block-wise: for each block b, + dwork[b*n .. (b+1)*n-1] = AT @ w[b*m .. 
(b+1)*m-1] */ + for (int b = 0; b < dn->n_blocks; b++) + { + cblas_dgemv(CblasRowMajor, CblasNoTrans, dn->n, dn->m, 1.0, dn->AT_dense, + dn->m, w + b * dn->m, 1, 0.0, node->dwork + b * dn->n, 1); + } + + node->left->eval_wsum_hess(node->left, node->dwork); + memcpy(node->wsum_hess->x, node->left->wsum_hess->x, + node->wsum_hess->nnz * sizeof(double)); +} + +expr *new_dense_left_matmul(expr *child, const double *A_dense, int m, int n) +{ + /* Dimension logic — same as left_matmul */ + int d1, d2, n_blocks; + if (child->d1 == n) + { + d1 = m; + d2 = child->d2; + n_blocks = child->d2; + } + else if (child->d2 == n && child->d1 == 1) + { + d1 = 1; + d2 = m; + n_blocks = 1; + } + else + { + fprintf(stderr, "Error in new_dense_left_matmul: dimension mismatch\n"); + exit(1); + } + + /* Allocate the type-specific struct */ + dense_left_matmul_expr *dn = + (dense_left_matmul_expr *) calloc(1, sizeof(dense_left_matmul_expr)); + expr *node = &dn->base; + init_expr(node, d1, d2, child->n_vars, forward, jacobian_init, eval_jacobian, + is_affine, wsum_hess_init, eval_wsum_hess, free_type_data); + node->left = child; + expr_retain(child); + + /* Store dense A (row-major m x n) and AT (row-major n x m) */ + dn->m = m; + dn->n = n; + dn->n_blocks = n_blocks; + + dn->A_dense = (double *) malloc(m * n * sizeof(double)); + memcpy(dn->A_dense, A_dense, m * n * sizeof(double)); + + dn->AT_dense = (double *) malloc(n * m * sizeof(double)); + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + dn->AT_dense[j * m + i] = A_dense[i * n + j]; + } + } + + /* Allocate workspaces */ + node->iwork = (int *) malloc(node->n_vars * sizeof(int)); + dn->csc_to_csr_workspace = (int *) malloc(node->size * sizeof(int)); + + /* dwork: used for AT @ w in hess eval. Needs n * n_blocks doubles. 
 */ + int dwork_size = n * n_blocks; + if (dwork_size < n) dwork_size = n; + node->dwork = (double *) malloc(dwork_size * sizeof(double)); + + return node; +} diff --git a/src/bivariate/dense_right_matmul.c b/src/bivariate/dense_right_matmul.c new file mode 100644 index 0000000..8ef7a0f --- /dev/null +++ b/src/bivariate/dense_right_matmul.c @@ -0,0 +1,46 @@ +/* + * Copyright 2026 Daniel Cederberg and William Zhang + * + * This file is part of the DNLP-differentiation-engine project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "affine.h" +#include "bivariate.h" +#include "subexpr.h" +#include <stdlib.h> + +/* This file implements the atom 'dense_right_matmul' corresponding to the + operation y = f(x) @ A, where A is a given dense matrix. + We implement this by expressing right matmul in terms of dense left matmul + and transpose: f(x) @ A = (A^T @ f(x)^T)^T. */ +expr *new_dense_right_matmul(expr *u, const double *A_dense, int m, int n) +{ + /* We express: u @ A = (A^T @ u^T)^T + A is m x n, so A^T is n x m.
*/ + double *AT = (double *) malloc(n * m * sizeof(double)); + for (int i = 0; i < m; i++) + { + for (int j = 0; j < n; j++) + { + AT[j * m + i] = A_dense[i * n + j]; + } + } + + expr *u_transpose = new_transpose(u); + expr *left_matmul_node = new_dense_left_matmul(u_transpose, AT, n, m); + expr *node = new_transpose(left_matmul_node); + + free(AT); + return node; +} diff --git a/tests/all_tests.c b/tests/all_tests.c index fece8de..d8df839 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -25,6 +25,9 @@ #include "jacobian_tests/test_elementwise_mult.h" #include "jacobian_tests/test_hstack.h" #include "jacobian_tests/test_index.h" +#ifndef _MSC_VER +#include "jacobian_tests/test_dense_left_matmul.h" +#endif #include "jacobian_tests/test_left_matmul.h" #include "jacobian_tests/test_log.h" #include "jacobian_tests/test_matmul.h" @@ -61,6 +64,9 @@ #include "wsum_hess/test_const_vector_mult.h" #include "wsum_hess/test_hstack.h" #include "wsum_hess/test_index.h" +#ifndef _MSC_VER +#include "wsum_hess/test_dense_left_matmul.h" +#endif #include "wsum_hess/test_left_matmul.h" #include "wsum_hess/test_matmul.h" #include "wsum_hess/test_multiply.h" @@ -168,6 +174,11 @@ int main(void) mu_run_test(test_wsum_hess_multiply_2, tests_run); mu_run_test(test_jacobian_trace_variable, tests_run); mu_run_test(test_jacobian_trace_composite, tests_run); +#ifndef _MSC_VER + mu_run_test(test_jacobian_dense_left_matmul_log, tests_run); + mu_run_test(test_jacobian_dense_left_matmul_log_matrix, tests_run); + mu_run_test(test_jacobian_dense_left_matmul_log_composite, tests_run); +#endif mu_run_test(test_jacobian_left_matmul_log, tests_run); mu_run_test(test_jacobian_left_matmul_log_matrix, tests_run); mu_run_test(test_jacobian_left_matmul_log_composite, tests_run); @@ -226,6 +237,11 @@ int main(void) mu_run_test(test_wsum_hess_multiply_sparse_random, tests_run); mu_run_test(test_wsum_hess_multiply_1, tests_run); mu_run_test(test_wsum_hess_multiply_2, tests_run); +#ifndef _MSC_VER + 
mu_run_test(test_wsum_hess_dense_left_matmul, tests_run); + mu_run_test(test_wsum_hess_dense_left_matmul_matrix, tests_run); + mu_run_test(test_wsum_hess_dense_left_matmul_composite, tests_run); +#endif mu_run_test(test_wsum_hess_left_matmul, tests_run); mu_run_test(test_wsum_hess_left_matmul_matrix, tests_run); mu_run_test(test_wsum_hess_left_matmul_composite, tests_run); diff --git a/tests/jacobian_tests/test_dense_left_matmul.h b/tests/jacobian_tests/test_dense_left_matmul.h new file mode 100644 index 0000000..8f82690 --- /dev/null +++ b/tests/jacobian_tests/test_dense_left_matmul.h @@ -0,0 +1,154 @@ +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "bivariate.h" +#include "elementwise_univariate.h" +#include "expr.h" +#include "minunit.h" +#include "test_helpers.h" + +const char *test_jacobian_dense_left_matmul_log(void) +{ + /* Test Jacobian of A @ log(x) where A is dense, identical to the sparse test. + * x is 3x1 variable at x = [1, 2, 3] + * A is 4x3 dense matrix [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * Expected Jacobian = A @ diag(1/x): + * [1, 0, 2/3; 3, 0, 4/3; 5, 0, 2; 7, 0, 0] + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + expr *x = new_variable(3, 1, 0, 3); + + /* A is 4x3 dense row-major */ + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *log_x = new_log(x); + expr *A_log_x = new_dense_left_matmul(log_x, A_dense, 4, 3); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->eval_jacobian(A_log_x); + + /* Since A is dense, the jacobian may have entries where A has zeros. + * For dense_left_matmul, ALL m rows get entries if any child jacobian + * entry exists in the block. Since child is a variable (identity jacobian), + * all 3 columns have entries, so all 4 rows get all 3 columns. + * The values at zero positions of A will be 0.0.
*/ + double expected_Ax[12] = { + 1.0, 0.0, 2.0 / 3.0, /* row 0: A[0,:] @ diag(1/x) */ + 3.0, 0.0, 4.0 / 3.0, /* row 1 */ + 5.0, 0.0, 2.0, /* row 2 */ + 7.0, 0.0, 0.0 /* row 3 */ + }; + int expected_Ai[12] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2}; + int expected_Ap[5] = {0, 3, 6, 9, 12}; + + mu_assert("dense_left_matmul vals fail", + cmp_double_array(A_log_x->jacobian->x, expected_Ax, 12)); + mu_assert("dense_left_matmul cols fail", + cmp_int_array(A_log_x->jacobian->i, expected_Ai, 12)); + mu_assert("dense_left_matmul rows fail", + cmp_int_array(A_log_x->jacobian->p, expected_Ap, 5)); + + free_expr(A_log_x); + return 0; +} + +const char *test_jacobian_dense_left_matmul_log_matrix(void) +{ + /* x is 3x2, vectorized column-wise: [1,2,3 | 4,5,6] + * A is 4x3 dense: [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * Jacobian = block-diag(A @ diag(1/x_col0), A @ diag(1/x_col1)) + * = 8x6 block-diagonal with two 4x3 blocks */ + double x_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + expr *x = new_variable(3, 2, 0, 6); + + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *log_x = new_log(x); + expr *A_log_x = new_dense_left_matmul(log_x, A_dense, 4, 3); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->eval_jacobian(A_log_x); + + /* 8x6 Jacobian, block-diagonal with 4x3 dense blocks */ + double expected_Ax[24] = { + /* block 0: A @ diag(1/[1,2,3]) */ + 1.0, 0.0, 2.0 / 3.0, 3.0, 0.0, 4.0 / 3.0, 5.0, 0.0, 2.0, 7.0, 0.0, 0.0, + /* block 1: A @ diag(1/[4,5,6]) */ + 0.25, 0.0, 1.0 / 3.0, 0.75, 0.0, 2.0 / 3.0, 1.25, 0.0, 1.0, 1.75, 0.0, 0.0}; + int expected_Ai[24] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, + 3, 4, 5, 3, 4, 5, 3, 4, 5, 3, 4, 5}; + int expected_Ap[9] = {0, 3, 6, 9, 12, 15, 18, 21, 24}; + + mu_assert("dense_left_matmul_matrix vals fail", + cmp_double_array(A_log_x->jacobian->x, expected_Ax, 24)); + mu_assert("dense_left_matmul_matrix cols fail", + cmp_int_array(A_log_x->jacobian->i, expected_Ai, 24)); + 
mu_assert("dense_left_matmul_matrix rows fail", + cmp_int_array(A_log_x->jacobian->p, expected_Ap, 9)); + + free_expr(A_log_x); + return 0; +} + +const char *test_jacobian_dense_left_matmul_log_composite(void) +{ + /* Test Jacobian of A @ log(B @ x) where both A and B are dense. + * x is 3x1 at x = [1, 2, 3] + * B is 3x3 all ones, A is 4x3 dense: [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0] + * + * B @ x = [6, 6, 6]^T + * Jacobian = A @ diag(1/(B@x)) @ B + * = A @ diag([1/6, 1/6, 1/6]) @ all_ones(3x3) + * Row 0: (1+0+2)/6 * [1,1,1] = [1/2, 1/2, 1/2] + * Row 1: (3+0+4)/6 * [1,1,1] = [7/6, 7/6, 7/6] + * Row 2: (5+0+6)/6 * [1,1,1] = [11/6, 11/6, 11/6] + * Row 3: (7+0+0)/6 * [1,1,1] = [7/6, 7/6, 7/6] + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + expr *x = new_variable(3, 1, 0, 3); + + /* B as CSR (for new_linear) */ + CSR_Matrix *B = new_csr_matrix(3, 3, 9); + int B_p[4] = {0, 3, 6, 9}; + int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + double B_x[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + memcpy(B->p, B_p, 4 * sizeof(int)); + memcpy(B->i, B_i, 9 * sizeof(int)); + memcpy(B->x, B_x, 9 * sizeof(double)); + + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *Bx = new_linear(x, B, NULL); + expr *log_Bx = new_log(Bx); + expr *A_log_Bx = new_dense_left_matmul(log_Bx, A_dense, 4, 3); + + A_log_Bx->forward(A_log_Bx, x_vals); + A_log_Bx->jacobian_init(A_log_Bx); + A_log_Bx->eval_jacobian(A_log_Bx); + + double expected_Ax[12] = { + 0.5, 0.5, 0.5, /* row 0 */ + 7.0 / 6.0, 7.0 / 6.0, 7.0 / 6.0, /* row 1 */ + 11.0 / 6.0, 11.0 / 6.0, 11.0 / 6.0, /* row 2 */ + 7.0 / 6.0, 7.0 / 6.0, 7.0 / 6.0 /* row 3 */ + }; + int expected_Ai[12] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2}; + int expected_Ap[5] = {0, 3, 6, 9, 12}; + + mu_assert("dense composite vals fail", + cmp_double_array(A_log_Bx->jacobian->x, expected_Ax, 12)); + mu_assert("dense composite cols fail", + cmp_int_array(A_log_Bx->jacobian->i, expected_Ai, 12)); + mu_assert("dense composite rows 
fail", + cmp_int_array(A_log_Bx->jacobian->p, expected_Ap, 5)); + + free_csr_matrix(B); + free_expr(A_log_Bx); + return 0; +} diff --git a/tests/wsum_hess/test_dense_left_matmul.h b/tests/wsum_hess/test_dense_left_matmul.h new file mode 100644 index 0000000..a6a29ca --- /dev/null +++ b/tests/wsum_hess/test_dense_left_matmul.h @@ -0,0 +1,133 @@ +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "affine.h" +#include "bivariate.h" +#include "elementwise_univariate.h" +#include "expr.h" +#include "minunit.h" +#include "test_helpers.h" + +const char *test_wsum_hess_dense_left_matmul(void) +{ + /* Test weighted sum of Hessian of A @ log(x) with dense A. + * Same math as the sparse test: + * x = [1, 2, 3], A = [1, 0, 2; 3, 0, 4; 5, 0, 6; 7, 0, 0], w = [1, 2, 3, 4] + * A^T @ w = [50, 0, 28] + * wsum_hess = diag(-50/1, -0/4, -28/9) + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + double w[4] = {1.0, 2.0, 3.0, 4.0}; + + expr *x = new_variable(3, 1, 0, 3); + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *log_x = new_log(x); + expr *A_log_x = new_dense_left_matmul(log_x, A_dense, 4, 3); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->wsum_hess_init(A_log_x); + A_log_x->eval_wsum_hess(A_log_x, w); + + double expected_x[3] = {-50.0, -0.0, -28.0 / 9.0}; + int expected_i[3] = {0, 1, 2}; + int expected_p[4] = {0, 1, 2, 3}; + + mu_assert("dense wsum_hess vals incorrect", + cmp_double_array(A_log_x->wsum_hess->x, expected_x, 3)); + mu_assert("dense wsum_hess cols incorrect", + cmp_int_array(A_log_x->wsum_hess->i, expected_i, 3)); + mu_assert("dense wsum_hess rows incorrect", + cmp_int_array(A_log_x->wsum_hess->p, expected_p, 4)); + + free_expr(A_log_x); + return 0; +} + +const char *test_wsum_hess_dense_left_matmul_matrix(void) +{ + /* Test with matrix-valued x (3x2), same math as sparse test.
+ * x = [1,2,3,4,5,6], A same as above, w = [1,2,3,4,5,6,7,8] + */ + double x_vals[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double w[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + + expr *x = new_variable(3, 2, 0, 6); + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *log_x = new_log(x); + expr *A_log_x = new_dense_left_matmul(log_x, A_dense, 4, 3); + + A_log_x->forward(A_log_x, x_vals); + A_log_x->jacobian_init(A_log_x); + A_log_x->wsum_hess_init(A_log_x); + A_log_x->eval_wsum_hess(A_log_x, w); + + double expected_x[6] = {-50.0, -0.0, -28.0 / 9.0, + -57.0 / 8.0, -0.0, -19.0 / 9.0}; + int expected_i[6] = {0, 1, 2, 3, 4, 5}; + int expected_p[7] = {0, 1, 2, 3, 4, 5, 6}; + + mu_assert("dense wsum_hess matrix vals incorrect", + cmp_double_array(A_log_x->wsum_hess->x, expected_x, 6)); + mu_assert("dense wsum_hess matrix cols incorrect", + cmp_int_array(A_log_x->wsum_hess->i, expected_i, 6)); + mu_assert("dense wsum_hess matrix rows incorrect", + cmp_int_array(A_log_x->wsum_hess->p, expected_p, 7)); + + free_expr(A_log_x); + return 0; +} + +const char *test_wsum_hess_dense_left_matmul_composite(void) +{ + /* Test with A @ log(B @ x), same math as sparse test. 
+ * x = [1,2,3], B = 3x3 all ones, A = [1,0,2;3,0,4;5,0,6;7,0,0], w = [1,2,3,4] + * Expected: 3x3 dense matrix, all entries = -13/6 + */ + double x_vals[3] = {1.0, 2.0, 3.0}; + double w[4] = {1.0, 2.0, 3.0, 4.0}; + + expr *x = new_variable(3, 1, 0, 3); + + CSR_Matrix *B = new_csr_matrix(3, 3, 9); + int B_p[4] = {0, 3, 6, 9}; + int B_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + double B_x[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + memcpy(B->p, B_p, 4 * sizeof(int)); + memcpy(B->i, B_i, 9 * sizeof(int)); + memcpy(B->x, B_x, 9 * sizeof(double)); + + double A_dense[12] = {1.0, 0.0, 2.0, 3.0, 0.0, 4.0, + 5.0, 0.0, 6.0, 7.0, 0.0, 0.0}; + + expr *Bx = new_linear(x, B, NULL); + expr *log_Bx = new_log(Bx); + expr *A_log_Bx = new_dense_left_matmul(log_Bx, A_dense, 4, 3); + + A_log_Bx->forward(A_log_Bx, x_vals); + A_log_Bx->jacobian_init(A_log_Bx); + A_log_Bx->wsum_hess_init(A_log_Bx); + A_log_Bx->eval_wsum_hess(A_log_Bx, w); + + double expected_x[9] = {-13.0 / 6.0, -13.0 / 6.0, -13.0 / 6.0, + -13.0 / 6.0, -13.0 / 6.0, -13.0 / 6.0, + -13.0 / 6.0, -13.0 / 6.0, -13.0 / 6.0}; + int expected_i[9] = {0, 1, 2, 0, 1, 2, 0, 1, 2}; + int expected_p[4] = {0, 3, 6, 9}; + + mu_assert("dense composite wsum_hess vals incorrect", + cmp_double_array(A_log_Bx->wsum_hess->x, expected_x, 9)); + mu_assert("dense composite wsum_hess cols incorrect", + cmp_int_array(A_log_Bx->wsum_hess->i, expected_i, 9)); + mu_assert("dense composite wsum_hess rows incorrect", + cmp_int_array(A_log_Bx->wsum_hess->p, expected_p, 4)); + + free_csr_matrix(B); + free_expr(A_log_Bx); + return 0; +}