From cf6a4e9889818143496a7f1882f660e4afa2536e Mon Sep 17 00:00:00 2001 From: dance858 Date: Sat, 23 May 2026 06:49:41 -0700 Subject: [PATCH 1/3] Polymorphic sum vtable on matrix base (axis = -1, 0, 1) --- include/utils/matrix.h | 19 ++++ src/atoms/affine/sum.c | 46 +------- src/utils/permuted_dense.c | 143 +++++++++++++++++++++++++ src/utils/sparse_matrix.c | 39 +++++++ src/utils/stacked_pd.c | 91 ++++++++++++++++ tests/all_tests.c | 6 +- tests/jacobian_tests/affine/test_sum.h | 35 ++++++ tests/utils/test_permuted_dense.h | 96 +++++++++++++++++ 8 files changed, 433 insertions(+), 42 deletions(-) diff --git a/include/utils/matrix.h b/include/utils/matrix.h index a1c4420..a2c46dc 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -106,6 +106,24 @@ typedef void (*matrix_broadcast_fill_values_fn)(matrix *A, broadcast_type type, typedef matrix *(*matrix_diag_vec_alloc_fn)(matrix *A); typedef void (*matrix_diag_vec_fill_values_fn)(matrix *A, matrix *out); +/* Allocate C as a row-wise reduction of A. The reduction pattern is chosen by + axis: + - axis = -1: sum all rows of A. C has shape (1, A->n). + C[0, k] = sum_{i in [0, A->m)} A[i, k]. + - axis = 0: block-sum rows in consecutive groups of d1 (requires + A->m % d1 == 0). C has shape (A->m / d1, A->n). + C[j, k] = sum_{i in [j*d1, (j+1)*d1)} A[i, k]. + - axis = 1: stride-sum rows at modular spacing d1 (requires + A->m % d1 == 0). C has shape (d1, A->n). + C[j, k] = sum_{i : i % d1 == j} A[i, k]. + d1 is ignored when axis == -1. + + Caller pre-allocates idx_map of size A->nnz. On return idx_map[ii] holds the + position in C's base.x where A's base.x[ii] scatter-adds. A values-fill pass + can compute C->x by zeroing it and accumulating A->x[ii] into + C->x[idx_map[ii]] for ii in [0, A->nnz). */ +typedef matrix *(*matrix_sum_alloc_fn)(matrix *A, int axis, int d1, int *idx_map); + typedef void (*matrix_free_fn)(matrix *self); struct matrix @@ -141,6 +159,7 @@ struct matrix matrix_broadcast_fill_values_fn broadcast_fill_values; matrix_diag_vec_alloc_fn diag_vec_alloc; matrix_diag_vec_fill_values_fn diag_vec_fill_values; + matrix_sum_alloc_fn sum_alloc; /* Lifecycle */ matrix_free_fn free_fn; diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index 754f262..4ce60fb 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -85,49 +85,13 @@ static void jacobian_init_impl(expr *node) { expr *x = node->left; sum_expr *snode = (sum_expr *) node; - int axis = snode->axis; - - /* initialize child's jacobian */ jacobian_init(x); - CSR_matrix *Jx = x->jacobian->to_csr(x->jacobian); - - /* we never have to store more than the child's nnz, nor more than the - output's cell count */ - int max_nnz = MIN(Jx->nnz, node->size * node->n_vars); - CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, max_nnz); - node->work->iwork = sp_malloc(MAX(jac->n, Jx->nnz) * sizeof(int)); - snode->idx_map = sp_malloc(Jx->nnz * sizeof(int)); - - /* the idx_map array maps each nonzero entry j in x->jacobian - to the corresponding index in the output row matrix C. Specifically, for - each nonzero entry j in A, idx_map[j] gives the position in C->x where - the value from x->jacobian->x[j] should be accumulated. */ - - if (axis == -1) - { - sum_all_rows_csr_alloc(Jx, jac, node->work->iwork, snode->idx_map); - } - else if (axis == 0) - { - sum_block_of_rows_csr_alloc(Jx, jac, x->d1, node->work->iwork, - snode->idx_map); - } - else if (axis == 1) - { - sum_evenly_spaced_rows_csr_alloc(Jx, jac, node->size, node->work->iwork, - snode->idx_map); - } - - /* For stacked_pd children, child->jacobian->base.x is block-major while - csr->x is row-major sorted. Re-index idx_map so it can be applied - directly to base.x in eval_jacobian. */ - if (x->jacobian->is_stacked_pd) - { - compose_csr_idx_map_for_spd((const stacked_pd *) x->jacobian, Jx, - snode->idx_map); - } - node->jacobian = new_sparse_matrix(jac); + /* sum_alloc fills idx_map so eval_jacobian can accumulate from + child->jacobian->x. */ + snode->idx_map = sp_malloc(x->jacobian->nnz * sizeof(int)); + node->jacobian = + x->jacobian->sum_alloc(x->jacobian, snode->axis, x->d1, snode->idx_map); } static void eval_jacobian(expr *node) diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index eadf9b6..4907dea 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -397,6 +397,148 @@ static void permuted_dense_vtable_block_left_mult_values(const matrix *A, I_kron_A_fill_values(A, J, C, pd->kernel_dwork); } +/* C = sum-all-rows of A. */ +static matrix *sum_all_rows_pd_alloc(matrix *A, int *idx_map) +{ + permuted_dense *pd = (permuted_dense *) A; + + /* allocate C */ + int zero = 0; + matrix *C = new_permuted_dense(1, A->n, 1, pd->n0, &zero, pd->col_perm, NULL); + + /* fill idx_map */ + for (int i = 0; i < pd->m0; i++) + { + int *idx_base = idx_map + i * pd->n0; + for (int j = 0; j < pd->n0; j++) + { + idx_base[j] = j; + } + } + return C; +} + +/* C = block-sum of A's rows in consecutive groups of d1. */ +static matrix *sum_block_of_rows_pd_alloc(matrix *A_matrix, int d1, int *idx_map) +{ + permuted_dense *A = (permuted_dense *) A_matrix; + int C_m0 = 0; + int last_bucket = -1; + int *C_row_perm = (int *) sp_malloc(A->m0 * sizeof(int)); + + /* per input dense row ii, the index of its bucket within C_row_perm */ + int *row_to_out = (int *) sp_malloc(A->m0 * sizeof(int)); + + // --------------------------------------------------------------------------- + // determine C's row_perm + // --------------------------------------------------------------------------- + + /* for every row in the dense block */ + for (int ii = 0; ii < A->m0; ii++) + { + /* find the bucket to which this row belongs */ + int bucket = A->row_perm[ii] / d1; + + /* add the bucket to C if it's new */ + if (bucket != last_bucket) + { + C_row_perm[C_m0++] = bucket; + last_bucket = bucket; + } + + /* map the input row of A to its row in C */ + row_to_out[ii] = C_m0 - 1; + } + + matrix *C = new_permuted_dense(A_matrix->m / d1, A_matrix->n, C_m0, A->n0, + C_row_perm, A->col_perm, NULL); + + // --------------------------------------------------------------------------- + // fill idx_map + // --------------------------------------------------------------------------- + for (int ii = 0; ii < A->m0; ii++) + { + int offset = row_to_out[ii] * A->n0; + int *idx_base = idx_map + ii * A->n0; + for (int jj = 0; jj < A->n0; jj++) + { + idx_base[jj] = offset + jj; + } + } + + sp_free(row_to_out); + sp_free(C_row_perm); + return C; +} + +/* C = stride-sum of A's rows at modular spacing d1. C has shape (d1, A->n); + C[j, :] = sum_{i : i % d1 == j} A[i, :]. */ +static matrix *sum_evenly_spaced_rows_pd_alloc(matrix *self, int d1, int *idx_map) +{ + permuted_dense *A = (permuted_dense *) self; + + // --------------------------------------------------------------------------- + // which buckets of [0, d1) are hit by A->row_perm? + // --------------------------------------------------------------------------- + bool *seen = (bool *) sp_calloc(d1, sizeof(bool)); + for (int ii = 0; ii < A->m0; ii++) + { + seen[A->row_perm[ii] % d1] = true; + } + + // --------------------------------------------------------------------------- + // determine C's row_perm (guarantees sorted order) + // --------------------------------------------------------------------------- + int *C_row_perm = (int *) sp_malloc(A->m0 * sizeof(int)); + int *bucket_to_out_idx = (int *) sp_malloc(d1 * sizeof(int)); + int C_m0 = 0; + for (int ii = 0; ii < d1; ii++) + { + if (seen[ii]) + { + bucket_to_out_idx[ii] = C_m0; + C_row_perm[C_m0++] = ii; + } + } + sp_free(seen); + + matrix *C = + new_permuted_dense(d1, self->n, C_m0, A->n0, C_row_perm, A->col_perm, NULL); + + // --------------------------------------------------------------------------- + // fill idx_map + // --------------------------------------------------------------------------- + for (int ii = 0; ii < A->m0; ii++) + { + int base = bucket_to_out_idx[A->row_perm[ii] % d1] * A->n0; + int *idx_base = idx_map + ii * A->n0; + for (int jj = 0; jj < A->n0; jj++) + { + idx_base[jj] = base + jj; + } + } + + sp_free(bucket_to_out_idx); + sp_free(C_row_perm); + return C; +} + +static matrix *permuted_dense_vtable_sum_alloc(matrix *self, int axis, int d1, + int *idx_map) +{ + if (axis == -1) + { + return sum_all_rows_pd_alloc(self, idx_map); + } + + if (axis == 0) + { + return sum_block_of_rows_pd_alloc(self, d1, idx_map); + } + + return sum_evenly_spaced_rows_pd_alloc(self, d1, idx_map); /* axis == 1 */ +} + static void wire_vtable(permuted_dense *pd) { pd->base.is_permuted_dense = true; @@ -420,6 +562,7 @@ static void wire_vtable(permuted_dense *pd) pd->base.broadcast_fill_values = permuted_dense_vtable_broadcast_fill_values; pd->base.diag_vec_alloc = permuted_dense_vtable_diag_vec_alloc; pd->base.diag_vec_fill_values = permuted_dense_vtable_diag_vec_fill_values; + pd->base.sum_alloc = permuted_dense_vtable_sum_alloc; pd->base.refresh_csc_values = permuted_dense_refresh_csc_values; } diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index 04538d8..06cb5e2 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -18,6 +18,7 @@ #include "utils/sparse_matrix.h" #include "utils/CSC_matrix.h" +#include "utils/CSR_sum.h" #include "utils/linalg_sparse_matmuls.h" #include "utils/matrix.h" #include "utils/mini_numpy.h" @@ -311,6 +312,43 @@ static void sparse_refresh_csc_values(matrix *self) csr_to_csc_fill_values(sm->csr, sm->csc_cache, sm->csc_iwork); } +static matrix *sparse_sum_alloc(matrix *self, int axis, int d1, int *idx_map) +{ + CSR_matrix *A = ((sparse_matrix *) self)->csr; + int m; + if (axis == -1) + { + m = 1; + } + else if (axis == 0) + { + m = A->m / d1; + } + else + { + m = d1; + } + int max_nnz = MIN(A->nnz, m * A->n); + CSR_matrix *out = new_CSR_matrix(m, A->n, max_nnz); + int *iwork = (int *) sp_malloc(MAX(A->n, A->nnz) * sizeof(int)); + + if (axis == -1) + { + sum_all_rows_csr_alloc(A, out, iwork, idx_map); + } + else if (axis == 0) + { + sum_block_of_rows_csr_alloc(A, out, d1, iwork, idx_map); + } + else + { + sum_evenly_spaced_rows_csr_alloc(A, out, m, iwork, idx_map); + } + + sp_free(iwork); + return new_sparse_matrix(out); +} + static void wire_vtable(sparse_matrix *sm) { sm->base.block_left_mult_vec = sparse_block_left_mult_vec; @@ -331,6 +369,7 @@ static void wire_vtable(sparse_matrix *sm) sm->base.broadcast_fill_values = sparse_broadcast_fill_values; sm->base.diag_vec_alloc = sparse_diag_vec_alloc; sm->base.diag_vec_fill_values = sparse_diag_vec_fill_values; + sm->base.sum_alloc = sparse_sum_alloc; sm->base.refresh_csc_values = sparse_refresh_csc_values; sm->base.free_fn = sparse_free; } diff --git a/src/utils/stacked_pd.c b/src/utils/stacked_pd.c index 2310472..0724859 100644 --- a/src/utils/stacked_pd.c +++ b/src/utils/stacked_pd.c @@ -17,8 +17,11 @@ */ #include "utils/stacked_pd.h" +#include "utils/CSR_sum.h" +#include "utils/iVec.h" #include "utils/matrix.h" #include "utils/permuted_dense.h" +#include "utils/sparse_matrix.h" #include "utils/stacked_pd_linalg.h" #include "utils/tracked_alloc.h" #include "utils/utils.h" @@ -378,6 +381,93 @@ static void assert_disjoint_row_perms(int n_blocks, permuted_dense *const *block } #endif +/* C = sum(A, axis) for a stacked_pd A. + + axis == -1: native union path. C is a 1-row permuted_dense whose col_perm + is the sorted union of all blocks' col_perms (every input cell collapses + to exactly one output column). idx_map is filled in block-major order + matching A's base.x layout. + + axis == 0 or 1: internal CSR fallback. A row-reduction of a stacked_pd + doesn't in general fit a single PD output (different output rows may have + different col footprints), so we materialize via to_csr, dispatch to the + existing CSR helper, then re-index idx_map from CSR ordering into A's + block-major base.x ordering so a downstream values-fill pass can read + directly from A->base.x. */ +static matrix *stacked_pd_vtable_sum_alloc(matrix *self, int axis, int d1, + int *idx_map) +{ + stacked_pd *spd = (stacked_pd *) self; + + if (axis == -1) + { + /* multi-way sorted union of each block's col_perm */ + iVec *col_union = iVec_new(8); + const int **col_arrs = + (const int **) sp_malloc(spd->n_blocks * sizeof(int *)); + int *col_lens = (int *) sp_malloc(spd->n_blocks * sizeof(int)); + for (int k = 0; k < spd->n_blocks; k++) + { + col_arrs[k] = spd->blocks[k]->col_perm; + col_lens[k] = spd->blocks[k]->n0; + } + sorted_union_int_arrays(col_arrs, col_lens, spd->n_blocks, col_union); + sp_free(col_arrs); + sp_free(col_lens); + + /* inverse map: column-id (in [0, self->n)) → its position in col_union */ + int *col_to_pos = (int *) sp_malloc(self->n * sizeof(int)); + for (int p = 0; p < col_union->len; p++) + { + col_to_pos[col_union->data[p]] = p; + } + + int row_zero = 0; + matrix *out = new_permuted_dense(1, self->n, 1, col_union->len, &row_zero, + col_union->data, NULL); + iVec_free(col_union); + + /* fill idx_map in block-major order matching spd->base.x layout */ + int native_pos = 0; + for (int k = 0; k < spd->n_blocks; k++) + { + permuted_dense *blk = spd->blocks[k]; + for (int i = 0; i < blk->m0; i++) + { + for (int j = 0; j < blk->n0; j++) + { + idx_map[native_pos++] = col_to_pos[blk->col_perm[j]]; + } + } + } + sp_free(col_to_pos); + return out; + } + + /* axis == 0 or 1: CSR fallback */ + CSR_matrix *A = self->to_csr(self); + int m_out = (axis == 0) ? A->m / d1 : d1; + int max_out_nnz = MIN(A->nnz, m_out * A->n); + CSR_matrix *out = new_CSR_matrix(m_out, A->n, max_out_nnz); + int *iwork = (int *) sp_malloc(MAX(A->n, A->nnz) * sizeof(int)); + + if (axis == 0) + { + sum_block_of_rows_csr_alloc(A, out, d1, iwork, idx_map); + } + else + { + sum_evenly_spaced_rows_csr_alloc(A, out, m_out, iwork, idx_map); + } + sp_free(iwork); + + /* idx_map is currently in CSR row order; re-index to block-major base.x + order so eval_jacobian reads child->jacobian->x directly. */ + compose_csr_idx_map_for_spd(spd, A, idx_map); + + return new_sparse_matrix(out); +} + static void wire_vtable(stacked_pd *spd) { spd->base.is_stacked_pd = true; @@ -398,6 +488,7 @@ static void wire_vtable(stacked_pd *spd) spd->base.diag_vec_fill_values = stacked_pd_vtable_diag_vec_fill_values; spd->base.broadcast_alloc = stacked_pd_vtable_broadcast_alloc; spd->base.broadcast_fill_values = stacked_pd_vtable_broadcast_fill_values; + spd->base.sum_alloc = stacked_pd_vtable_sum_alloc; } matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **blocks, diff --git a/tests/all_tests.c b/tests/all_tests.c index 77a9d12..e571683 100644 --- a/tests/all_tests.c +++ b/tests/all_tests.c @@ -66,8 +66,8 @@ #include "utils/test_csr_matrix.h" #include "utils/test_linalg_sparse_matmuls.h" #include "utils/test_linalg_utils_matmul_chain_rule.h" -#include "utils/test_matrix.h" #include "utils/test_matmul_dispatchers.h" +#include "utils/test_matrix.h" #include "utils/test_permuted_dense.h" #include "utils/test_stacked_pd.h" #include "wsum_hess/affine/test_broadcast.h" @@ -211,6 +211,7 @@ int main(void) mu_run_test(test_jacobian_sum_log_axis_0, tests_run); mu_run_test(test_jacobian_sum_add_log_axis_0, tests_run); mu_run_test(test_jacobian_sum_log_axis_1, tests_run); + mu_run_test(test_jacobian_sum_axis_minus_one_pd_child, tests_run); mu_run_test(test_jacobian_hstack_vectors, tests_run); mu_run_test(test_jacobian_hstack_matrix, tests_run); mu_run_test(test_jacobian_vstack_vectors, tests_run); @@ -405,6 +406,9 @@ int main(void) mu_run_test(test_permuted_dense_BTA_empty_overlap, tests_run); mu_run_test(test_permuted_dense_BTA_partial_overlap, tests_run); mu_run_test(test_permuted_dense_BTDA_decomposition, tests_run); + mu_run_test(test_permuted_dense_sum_all_rows, tests_run); + mu_run_test(test_permuted_dense_sum_block_of_rows, tests_run); + mu_run_test(test_permuted_dense_sum_evenly_spaced_rows, tests_run); mu_run_test(test_BTA_pd_csc_matches_csr, tests_run); mu_run_test(test_BA_pd_matrices_pd_pd_full_block_B, tests_run); mu_run_test(test_BA_pd_matrices_pd_pd_general_B, tests_run); diff --git a/tests/jacobian_tests/affine/test_sum.h b/tests/jacobian_tests/affine/test_sum.h index 150fe53..c2b61bd 100644 --- a/tests/jacobian_tests/affine/test_sum.h +++ b/tests/jacobian_tests/affine/test_sum.h @@ -6,6 +6,7 @@ #include "atoms/elementwise_restricted_dom.h" #include "expr.h" #include "minunit.h" +#include "numerical_diff.h" #include "test_helpers.h" const char *test_jacobian_sum_log(void) @@ -195,3 +196,37 @@ const char *test_jacobian_sum_log_axis_1(void) free_expr(sum_node); return 0; } + +/* End-to-end test that the polymorphic sum vtable (axis=-1) dispatches + correctly when the child Jacobian is a permuted_dense. Constructed as + sum(A2 @ (A1 @ x), -1): a single left_matmul on a leaf variable goes + through the sparse leaf-var fast path, but the outer left_matmul on a + composite child fires the PD jacobian_init path (BA_pd_matrices_alloc), + giving us a PD child for sum. We assert the PD-ness of the child and then + verify the resulting sum Jacobian by finite differences. + + PD child Jacobians with d2 > 1 (needed to make axis=0/1 non-trivial via + the sum atom) are hard to construct from leaf variables — the multi-column + left_matmul path produces a stacked_pd instead. Direct unit tests on the + PD vtable for axis=0 and axis=1 live in tests/utils/test_permuted_dense.h. */ +const char *test_jacobian_sum_axis_minus_one_pd_child(void) +{ + double A1_data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + double A2_data[12] = {1.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 0.0}; + + expr *x = new_variable(2, 1, 0, 2); + expr *A1_x = new_left_matmul_dense(NULL, x, 3, 2, A1_data); + expr *A2_A1_x = new_left_matmul_dense(NULL, A1_x, 4, 3, A2_data); + expr *sum_node = new_sum(A2_A1_x, -1); + + double u_vals[2] = {0.5, -1.5}; + jacobian_init(sum_node); + mu_assert("child Jacobian should be PD", A2_A1_x->jacobian->is_permuted_dense); + + mu_assert("check_jacobian failed", + check_jacobian_num(sum_node, u_vals, NUMERICAL_DIFF_DEFAULT_H)); + + free_expr(sum_node); + return 0; +} diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index da07db6..4e68e58 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -1047,4 +1047,100 @@ const char *test_BA_pd_matrices_fast_path(void) return 0; } +/* Direct vtable tests for sum_alloc. The test PD represents a (6, 4) matrix + with a (3, 2) dense block at rows {0, 3, 4}, cols {1, 3}. We exercise all + three axes; for axis=0 (d1=2) the buckets {0/2, 3/2, 4/2} = {0, 1, 2} + are all distinct and non-decreasing (linear-scan dedupe); for axis=1 + (d1=3) the buckets {0%3, 3%3, 4%3} = {0, 0, 1} collapse two input rows + onto the same output row (bitmap dedupe), so multiple idx_map entries + point to the same output position — a values-fill pass would scatter-add. */ +const char *test_permuted_dense_sum_all_rows(void) +{ + int row_perm[3] = {0, 3, 4}; + int col_perm[2] = {1, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X); + + int idx_map[6]; + matrix *out = M->sum_alloc(M, -1, /*d1 unused*/ 0, idx_map); + + mu_assert("output is PD", out->is_permuted_dense); + permuted_dense *opd = (permuted_dense *) out; + mu_assert("out m", out->m == 1); + mu_assert("out n", out->n == 4); + mu_assert("out m0", opd->m0 == 1); + mu_assert("out n0", opd->n0 == 2); + int expected_row_perm[1] = {0}; + int expected_col_perm[2] = {1, 3}; + mu_assert("out row_perm", cmp_int_array(opd->row_perm, expected_row_perm, 1)); + mu_assert("out col_perm", cmp_int_array(opd->col_perm, expected_col_perm, 2)); + /* every input row collapses to row 0, so idx_map[i*n0+j] = j */ + int expected_idx_map[6] = {0, 1, 0, 1, 0, 1}; + mu_assert("idx_map", cmp_int_array(idx_map, expected_idx_map, 6)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +const char *test_permuted_dense_sum_block_of_rows(void) +{ + int row_perm[3] = {0, 3, 4}; + int col_perm[2] = {1, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X); + + int d1 = 2; /* child shape (d1, d2) = (2, 3); output rows = d2 = 3 */ + int idx_map[6]; + matrix *out = M->sum_alloc(M, 0, d1, idx_map); + + mu_assert("output is PD", out->is_permuted_dense); + permuted_dense *opd = (permuted_dense *) out; + mu_assert("out m", out->m == 3); + mu_assert("out n", out->n == 4); + mu_assert("out m0", opd->m0 == 3); + mu_assert("out n0", opd->n0 == 2); + int expected_row_perm[3] = {0, 1, 2}; /* {0/2, 3/2, 4/2} */ + int expected_col_perm[2] = {1, 3}; + mu_assert("out row_perm", cmp_int_array(opd->row_perm, expected_row_perm, 3)); + mu_assert("out col_perm", cmp_int_array(opd->col_perm, expected_col_perm, 2)); + int expected_idx_map[6] = {0, 1, 2, 3, 4, 5}; + mu_assert("idx_map", cmp_int_array(idx_map, expected_idx_map, 6)); + + free_matrix(out); + free_matrix(M); + return 0; +} + +const char *test_permuted_dense_sum_evenly_spaced_rows(void) +{ + int row_perm[3] = {0, 3, 4}; + int col_perm[2] = {1, 3}; + double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X); + + int d1 = 3; /* output rows = d1 = 3, buckets = {0%3, 3%3, 4%3} = {0, 0, 1} */ + int idx_map[6]; + matrix *out = M->sum_alloc(M, 1, d1, idx_map); + + mu_assert("output is PD", out->is_permuted_dense); + permuted_dense *opd = (permuted_dense *) out; + mu_assert("out m", out->m == 3); + mu_assert("out n", out->n == 4); + mu_assert("out m0", opd->m0 == 2); /* two distinct buckets {0, 1} */ + mu_assert("out n0", opd->n0 == 2); + int expected_row_perm[2] = {0, 1}; + int expected_col_perm[2] = {1, 3}; + mu_assert("out row_perm", cmp_int_array(opd->row_perm, expected_row_perm, 2)); + mu_assert("out col_perm", cmp_int_array(opd->col_perm, expected_col_perm, 2)); + /* input rows 0 and 3 both bucket to output row 0 (idx_map → 0, 1); + input row 4 buckets to output row 1 (idx_map → 2, 3) */ + int expected_idx_map[6] = {0, 1, 0, 1, 2, 3}; + mu_assert("idx_map", cmp_int_array(idx_map, expected_idx_map, 6)); + + free_matrix(out); + free_matrix(M); + return 0; +} + #endif /* TEST_PERMUTED_DENSE_H */ From 121c2b35ef5ddb47cc923eeff629eafaf43e3f8d Mon Sep 17 00:00:00 2001 From: dance858 Date: Sat, 23 May 2026 08:03:41 -0700 Subject: [PATCH 2/3] permuted dense sum different rows --- include/utils/matrix.h | 25 ++++++++++--------------- src/atoms/affine/sum.c | 6 +++--- src/utils/permuted_dense.c | 6 +++--- src/utils/sparse_matrix.c | 5 +++-- src/utils/stacked_pd.c | 6 +++--- tests/utils/test_permuted_dense.h | 10 +++++----- 6 files changed, 27 insertions(+), 31 deletions(-) diff --git a/include/utils/matrix.h b/include/utils/matrix.h index a2c46dc..1ff9c17 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -109,20 +109,15 @@ typedef void (*matrix_diag_vec_fill_values_fn)(matrix *A, matrix *out); /* Allocate C as a row-wise reduction of A. The reduction pattern is chosen by axis: - axis = -1: sum all rows of A. C has shape (1, A->n). - C[0, k] = sum_{i in [0, A->m)} A[i, k]. - - axis = 0: block-sum rows in consecutive groups of d1 (requires - A->m % d1 == 0). C has shape (A->m / d1, A->n). - C[j, k] = sum_{i in [j*d1, (j+1)*d1)} A[i, k]. - - axis = 1: stride-sum rows at modular spacing d1 (requires - A->m % d1 == 0). C has shape (d1, A->n). - C[j, k] = sum_{i : i % d1 == j} A[i, k]. - d1 is ignored when axis == -1. - - Caller pre-allocates idx_map of size A->nnz. On return idx_map[ii] holds the - position in C's base.x where A's base.x[ii] scatter-adds. A values-fill pass - can compute C->x by zeroing it and accumulating A->x[ii] into - C->x[idx_map[ii]] for ii in [0, A->nnz). */ -typedef matrix *(*matrix_sum_alloc_fn)(matrix *A, int axis, int d1, int *idx_map); + - axis = 0: block-sum rows in consecutive groups of d1. C has shape (A->m + / d1, A->n). C[j, :] = sum_{i in [j*d1, (j+1)*d1)} A[i, :]. + - axis = 1: stride-sum rows at modular spacing d1. C has shape (d1, A->n). + C[j, :] = sum_{i : i % d1 == j} A[i, :]. + + Caller pre-allocates idx_map of size A->nnz that can be used to compute the + numerical result of the operation using via accumulation. */ +typedef matrix *(*matrix_sum_row_partition_alloc_fn)(matrix *A, int axis, int d1, + int *idx_map); typedef void (*matrix_free_fn)(matrix *self); @@ -159,7 +154,7 @@ struct matrix matrix_broadcast_fill_values_fn broadcast_fill_values; matrix_diag_vec_alloc_fn diag_vec_alloc; matrix_diag_vec_fill_values_fn diag_vec_fill_values; - matrix_sum_alloc_fn sum_alloc; + matrix_sum_row_partition_alloc_fn sum_row_partition_alloc; /* Lifecycle */ matrix_free_fn free_fn; diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c index 4ce60fb..edbb257 100644 --- a/src/atoms/affine/sum.c +++ b/src/atoms/affine/sum.c @@ -87,11 +87,11 @@ static void jacobian_init_impl(expr *node) sum_expr *snode = (sum_expr *) node; jacobian_init(x); - /* sum_alloc fills idx_map so eval_jacobian can accumulate from + /* sum_row_partition_alloc fills idx_map so eval_jacobian can accumulate from child->jacobian->x. */ snode->idx_map = sp_malloc(x->jacobian->nnz * sizeof(int)); - node->jacobian = - x->jacobian->sum_alloc(x->jacobian, snode->axis, x->d1, snode->idx_map); + node->jacobian = x->jacobian->sum_row_partition_alloc(x->jacobian, snode->axis, + x->d1, snode->idx_map); } static void eval_jacobian(expr *node) diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c index 4907dea..ed7dca5 100644 --- a/src/utils/permuted_dense.c +++ b/src/utils/permuted_dense.c @@ -523,8 +523,8 @@ static matrix *sum_evenly_spaced_rows_pd_alloc(matrix *self, int d1, int *idx_ma return C; } -static matrix *permuted_dense_vtable_sum_alloc(matrix *self, int axis, int d1, - int *idx_map) +static matrix *permuted_dense_vtable_sum_row_partition_alloc(matrix *self, int axis, + int d1, int *idx_map) { if (axis == -1) { @@ -562,7 +562,7 @@ static void wire_vtable(permuted_dense *pd) pd->base.broadcast_fill_values = permuted_dense_vtable_broadcast_fill_values; pd->base.diag_vec_alloc = permuted_dense_vtable_diag_vec_alloc; pd->base.diag_vec_fill_values = permuted_dense_vtable_diag_vec_fill_values; - pd->base.sum_alloc = permuted_dense_vtable_sum_alloc; + pd->base.sum_row_partition_alloc = permuted_dense_vtable_sum_row_partition_alloc; pd->base.refresh_csc_values = permuted_dense_refresh_csc_values; } diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c index 06cb5e2..22c3fc9 100644 --- a/src/utils/sparse_matrix.c +++ b/src/utils/sparse_matrix.c @@ -312,7 +312,8 @@ static void sparse_refresh_csc_values(matrix *self) csr_to_csc_fill_values(sm->csr, sm->csc_cache, sm->csc_iwork); } -static matrix *sparse_sum_alloc(matrix *self, int axis, int d1, int *idx_map) +static matrix *sparse_sum_row_partition_alloc(matrix *self, int axis, int d1, + int *idx_map) { CSR_matrix *A = ((sparse_matrix *) self)->csr; int m; @@ -369,7 +370,7 @@ static void wire_vtable(sparse_matrix *sm) sm->base.broadcast_fill_values = sparse_broadcast_fill_values; sm->base.diag_vec_alloc = sparse_diag_vec_alloc; sm->base.diag_vec_fill_values = sparse_diag_vec_fill_values; - sm->base.sum_alloc = sparse_sum_alloc; + sm->base.sum_row_partition_alloc = sparse_sum_row_partition_alloc; sm->base.refresh_csc_values = sparse_refresh_csc_values; sm->base.free_fn = sparse_free; } diff --git a/src/utils/stacked_pd.c b/src/utils/stacked_pd.c index 0724859..5ac5a3c 100644 --- a/src/utils/stacked_pd.c +++ b/src/utils/stacked_pd.c @@ -394,8 +394,8 @@ static void assert_disjoint_row_perms(int n_blocks, permuted_dense *const *block existing CSR helper, then re-index idx_map from CSR ordering into A's block-major base.x ordering so a downstream values-fill pass can read directly from A->base.x. */ -static matrix *stacked_pd_vtable_sum_alloc(matrix *self, int axis, int d1, - int *idx_map) +static matrix *stacked_pd_vtable_sum_row_partition_alloc(matrix *self, int axis, + int d1, int *idx_map) { stacked_pd *spd = (stacked_pd *) self; @@ -488,7 +488,7 @@ static void wire_vtable(stacked_pd *spd) spd->base.diag_vec_fill_values = stacked_pd_vtable_diag_vec_fill_values; spd->base.broadcast_alloc = stacked_pd_vtable_broadcast_alloc; spd->base.broadcast_fill_values = stacked_pd_vtable_broadcast_fill_values; - spd->base.sum_alloc = stacked_pd_vtable_sum_alloc; + spd->base.sum_row_partition_alloc = stacked_pd_vtable_sum_row_partition_alloc; } matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **blocks, diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h index 4e68e58..53a49ba 100644 --- a/tests/utils/test_permuted_dense.h +++ b/tests/utils/test_permuted_dense.h @@ -1047,8 +1047,8 @@ const char *test_BA_pd_matrices_fast_path(void) return 0; } -/* Direct vtable tests for sum_alloc. The test PD represents a (6, 4) matrix - with a (3, 2) dense block at rows {0, 3, 4}, cols {1, 3}. We exercise all +/* Direct vtable tests for sum_row_partition_alloc. The test PD represents a (6, 4) + matrix with a (3, 2) dense block at rows {0, 3, 4}, cols {1, 3}. We exercise all three axes; for axis=0 (d1=2) the buckets {0/2, 3/2, 4/2} = {0, 1, 2} are all distinct and non-decreasing (linear-scan dedupe); for axis=1 (d1=3) the buckets {0%3, 3%3, 4%3} = {0, 0, 1} collapse two input rows @@ -1062,7 +1062,7 @@ const char *test_permuted_dense_sum_all_rows(void) matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X); int idx_map[6]; - matrix *out = M->sum_alloc(M, -1, /*d1 unused*/ 0, idx_map); + matrix *out = M->sum_row_partition_alloc(M, -1, /*d1 unused*/ 0, idx_map); mu_assert("output is PD", out->is_permuted_dense); permuted_dense *opd = (permuted_dense *) out; @@ -1092,7 +1092,7 @@ const char *test_permuted_dense_sum_block_of_rows(void) int d1 = 2; /* child shape (d1, d2) = (2, 3); output rows = d2 = 3 */ int idx_map[6]; - matrix *out = M->sum_alloc(M, 0, d1, idx_map); + matrix *out = M->sum_row_partition_alloc(M, 0, d1, idx_map); mu_assert("output is PD", out->is_permuted_dense); permuted_dense *opd = (permuted_dense *) out; @@ -1121,7 +1121,7 @@ const char *test_permuted_dense_sum_evenly_spaced_rows(void) int d1 = 3; /* output rows = d1 = 3, buckets = {0%3, 3%3, 4%3} = {0, 0, 1} */ int idx_map[6]; - matrix *out = M->sum_alloc(M, 1, d1, idx_map); + matrix *out = M->sum_row_partition_alloc(M, 1, d1, idx_map); mu_assert("output is PD", out->is_permuted_dense); permuted_dense *opd = (permuted_dense *) out; From ba8a2f30457a11c0894c37d13cb395ab51e592ce Mon Sep 17 00:00:00 2001 From: dance858 Date: Sat, 23 May 2026 08:16:51 -0700 Subject: [PATCH 3/3] minor --- include/utils/matrix.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/utils/matrix.h b/include/utils/matrix.h index 1ff9c17..55d99a1 100644 --- a/include/utils/matrix.h +++ b/include/utils/matrix.h @@ -111,8 +111,8 @@ typedef void (*matrix_diag_vec_fill_values_fn)(matrix *A, matrix *out); - axis = -1: sum all rows of A. C has shape (1, A->n). - axis = 0: block-sum rows in consecutive groups of d1. C has shape (A->m / d1, A->n). C[j, :] = sum_{i in [j*d1, (j+1)*d1)} A[i, :]. - - axis = 1: stride-sum rows at modular spacing d1. C has shape (d1, A->n). - C[j, :] = sum_{i : i % d1 == j} A[i, :]. + - axis = 1: stride-sum rows at spacing d1. C has shape (d1, A->n). C[j, :] + = sum_{i : i % d1 == j} A[i, :]. Caller pre-allocates idx_map of size A->nnz that can be used to compute the numerical result of the operation using via accumulation. */