From cf6a4e9889818143496a7f1882f660e4afa2536e Mon Sep 17 00:00:00 2001
From: dance858 <danielcederberg1@gmail.com>
Date: Sat, 23 May 2026 06:49:41 -0700
Subject: [PATCH 1/3] Polymorphic sum vtable on matrix base (axis = -1, 0, 1)

---
 include/utils/matrix.h                 |  19 ++++
 src/atoms/affine/sum.c                 |  46 +-------
 src/utils/permuted_dense.c             | 143 +++++++++++++++++++++++++
 src/utils/sparse_matrix.c              |  39 +++++++
 src/utils/stacked_pd.c                 |  91 ++++++++++++++++
 tests/all_tests.c                      |   6 +-
 tests/jacobian_tests/affine/test_sum.h |  35 ++++++
 tests/utils/test_permuted_dense.h      |  96 +++++++++++++++++
 8 files changed, 433 insertions(+), 42 deletions(-)

diff --git a/include/utils/matrix.h b/include/utils/matrix.h
index a1c4420..a2c46dc 100644
--- a/include/utils/matrix.h
+++ b/include/utils/matrix.h
@@ -106,6 +106,24 @@ typedef void (*matrix_broadcast_fill_values_fn)(matrix *A, broadcast_type type,
 typedef matrix *(*matrix_diag_vec_alloc_fn)(matrix *A);
 typedef void (*matrix_diag_vec_fill_values_fn)(matrix *A, matrix *out);
 
+/* Allocate C as a row-wise reduction of A. The reduction pattern is chosen by
+   axis:
+     - axis = -1: sum all rows of A. C has shape (1, A->n).
+                  C[0, k] = sum_{i in [0, A->m)} A[i, k].
+     - axis = 0:  block-sum rows in consecutive groups of d1 (requires
+                  A->m % d1 == 0). C has shape (A->m / d1, A->n).
+                  C[j, k] = sum_{i in [j*d1, (j+1)*d1)} A[i, k].
+     - axis = 1:  stride-sum rows at modular spacing d1 (requires
+                  A->m % d1 == 0). C has shape (d1, A->n).
+                  C[j, k] = sum_{i : i % d1 == j} A[i, k].
+   d1 is ignored when axis == -1.
+
+   Caller pre-allocates idx_map of size A->nnz. On return idx_map[ii] holds the
+   position in C's base.x where A's base.x[ii] scatter-adds. A values-fill pass
+   can compute C->x by zeroing it and accumulating A->x[ii] into
+   C->x[idx_map[ii]] for ii in [0, A->nnz). */
+typedef matrix *(*matrix_sum_alloc_fn)(matrix *A, int axis, int d1, int *idx_map);
+
 typedef void (*matrix_free_fn)(matrix *self);
 
 struct matrix
@@ -141,6 +159,7 @@ struct matrix
     matrix_broadcast_fill_values_fn broadcast_fill_values;
     matrix_diag_vec_alloc_fn diag_vec_alloc;
     matrix_diag_vec_fill_values_fn diag_vec_fill_values;
+    matrix_sum_alloc_fn sum_alloc;
 
     /* Lifecycle */
     matrix_free_fn free_fn;
diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c
index 754f262..4ce60fb 100644
--- a/src/atoms/affine/sum.c
+++ b/src/atoms/affine/sum.c
@@ -85,49 +85,13 @@ static void jacobian_init_impl(expr *node)
 {
     expr *x = node->left;
     sum_expr *snode = (sum_expr *) node;
-    int axis = snode->axis;
-
-    /* initialize child's jacobian */
     jacobian_init(x);
-    CSR_matrix *Jx = x->jacobian->to_csr(x->jacobian);
-
-    /* we never have to store more than the child's nnz, nor more than the
-       output's cell count */
-    int max_nnz = MIN(Jx->nnz, node->size * node->n_vars);
-    CSR_matrix *jac = new_CSR_matrix(node->size, node->n_vars, max_nnz);
-    node->work->iwork = sp_malloc(MAX(jac->n, Jx->nnz) * sizeof(int));
-    snode->idx_map = sp_malloc(Jx->nnz * sizeof(int));
-
-    /* the idx_map array maps each nonzero entry j in x->jacobian
-       to the corresponding index in the output row matrix C. Specifically, for
-       each nonzero entry j in A, idx_map[j] gives the position in C->x where
-       the value from x->jacobian->x[j] should be accumulated. */
-
-    if (axis == -1)
-    {
-        sum_all_rows_csr_alloc(Jx, jac, node->work->iwork, snode->idx_map);
-    }
-    else if (axis == 0)
-    {
-        sum_block_of_rows_csr_alloc(Jx, jac, x->d1, node->work->iwork,
-                                    snode->idx_map);
-    }
-    else if (axis == 1)
-    {
-        sum_evenly_spaced_rows_csr_alloc(Jx, jac, node->size, node->work->iwork,
-                                         snode->idx_map);
-    }
-
-    /* For stacked_pd children, child->jacobian->base.x is block-major while
-       csr->x is row-major sorted. Re-index idx_map so it can be applied
-       directly to base.x in eval_jacobian. */
-    if (x->jacobian->is_stacked_pd)
-    {
-        compose_csr_idx_map_for_spd((const stacked_pd *) x->jacobian, Jx,
-                                    snode->idx_map);
-    }
 
-    node->jacobian = new_sparse_matrix(jac);
+    /* sum_alloc fills idx_map so eval_jacobian can accumulate from
+       child->jacobian->x. */
+    snode->idx_map = sp_malloc(x->jacobian->nnz * sizeof(int));
+    node->jacobian =
+        x->jacobian->sum_alloc(x->jacobian, snode->axis, x->d1, snode->idx_map);
 }
 
 static void eval_jacobian(expr *node)
diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c
index eadf9b6..4907dea 100644
--- a/src/utils/permuted_dense.c
+++ b/src/utils/permuted_dense.c
@@ -397,6 +397,148 @@ static void permuted_dense_vtable_block_left_mult_values(const matrix *A,
     I_kron_A_fill_values(A, J, C, pd->kernel_dwork);
 }
 
+/* C = sum-all-rows of A. */
+static matrix *sum_all_rows_pd_alloc(matrix *A, int *idx_map)
+{
+    permuted_dense *pd = (permuted_dense *) A;
+
+    /* allocate C */
+    int zero = 0;
+    matrix *C = new_permuted_dense(1, A->n, 1, pd->n0, &zero, pd->col_perm, NULL);
+
+    /* fill idx_map */
+    for (int i = 0; i < pd->m0; i++)
+    {
+        int *idx_base = idx_map + i * pd->n0;
+        for (int j = 0; j < pd->n0; j++)
+        {
+            idx_base[j] = j;
+        }
+    }
+    return C;
+}
+
+/* C = block-sum of A's rows in consecutive groups of d1. */
+static matrix *sum_block_of_rows_pd_alloc(matrix *A_matrix, int d1, int *idx_map)
+{
+    permuted_dense *A = (permuted_dense *) A_matrix;
+    int C_m0 = 0;
+    int last_bucket = -1;
+    int *C_row_perm = (int *) sp_malloc(A->m0 * sizeof(int));
+
+    /* per input dense row ii, the index of its bucket within C_row_perm */
+    int *row_to_out = (int *) sp_malloc(A->m0 * sizeof(int));
+
+    // ---------------------------------------------------------------------------
+    //                          determine C's row_perm
+    // ---------------------------------------------------------------------------
+
+    /* for every row in the dense block */
+    for (int ii = 0; ii < A->m0; ii++)
+    {
+        /* find the bucket to which this row belongs */
+        int bucket = A->row_perm[ii] / d1;
+
+        /* add the bucket to C if it's new */
+        if (bucket != last_bucket)
+        {
+            C_row_perm[C_m0++] = bucket;
+            last_bucket = bucket;
+        }
+
+        /* map the input row of A to its row in C */
+        row_to_out[ii] = C_m0 - 1;
+    }
+
+    matrix *C = new_permuted_dense(A_matrix->m / d1, A_matrix->n, C_m0, A->n0,
+                                   C_row_perm, A->col_perm, NULL);
+
+    // ---------------------------------------------------------------------------
+    //                          fill idx_map
+    // ---------------------------------------------------------------------------
+    for (int ii = 0; ii < A->m0; ii++)
+    {
+        int offset = row_to_out[ii] * A->n0;
+        int *idx_base = idx_map + ii * A->n0;
+        for (int jj = 0; jj < A->n0; jj++)
+        {
+            idx_base[jj] = offset + jj;
+        }
+    }
+
+    sp_free(row_to_out);
+    sp_free(C_row_perm);
+    return C;
+}
+
+/* C = stride-sum of A's rows at modular spacing d1. C has shape (d1, A->n);
+   C[j, :] = sum_{i : i % d1 == j} A[i, :]. */
+static matrix *sum_evenly_spaced_rows_pd_alloc(matrix *self, int d1, int *idx_map)
+{
+    permuted_dense *A = (permuted_dense *) self;
+
+    // ---------------------------------------------------------------------------
+    //          which buckets of [0, d1) are hit by A->row_perm?
+    // ---------------------------------------------------------------------------
+    bool *seen = (bool *) sp_calloc(d1, sizeof(bool));
+    for (int ii = 0; ii < A->m0; ii++)
+    {
+        seen[A->row_perm[ii] % d1] = true;
+    }
+
+    // ---------------------------------------------------------------------------
+    //              determine C's row_perm (guarantees sorted order)
+    // ---------------------------------------------------------------------------
+    int *C_row_perm = (int *) sp_malloc(A->m0 * sizeof(int));
+    int *bucket_to_out_idx = (int *) sp_malloc(d1 * sizeof(int));
+    int C_m0 = 0;
+    for (int ii = 0; ii < d1; ii++)
+    {
+        if (seen[ii])
+        {
+            bucket_to_out_idx[ii] = C_m0;
+            C_row_perm[C_m0++] = ii;
+        }
+    }
+    sp_free(seen);
+
+    matrix *C =
+        new_permuted_dense(d1, self->n, C_m0, A->n0, C_row_perm, A->col_perm, NULL);
+
+    // ---------------------------------------------------------------------------
+    //                          fill idx_map
+    // ---------------------------------------------------------------------------
+    for (int ii = 0; ii < A->m0; ii++)
+    {
+        int base = bucket_to_out_idx[A->row_perm[ii] % d1] * A->n0;
+        int *idx_base = idx_map + ii * A->n0;
+        for (int jj = 0; jj < A->n0; jj++)
+        {
+            idx_base[jj] = base + jj;
+        }
+    }
+
+    sp_free(bucket_to_out_idx);
+    sp_free(C_row_perm);
+    return C;
+}
+
+static matrix *permuted_dense_vtable_sum_alloc(matrix *self, int axis, int d1,
+                                               int *idx_map)
+{
+    if (axis == -1)
+    {
+        return sum_all_rows_pd_alloc(self, idx_map);
+    }
+
+    if (axis == 0)
+    {
+        return sum_block_of_rows_pd_alloc(self, d1, idx_map);
+    }
+
+    return sum_evenly_spaced_rows_pd_alloc(self, d1, idx_map); /* axis == 1 */
+}
+
 static void wire_vtable(permuted_dense *pd)
 {
     pd->base.is_permuted_dense = true;
@@ -420,6 +562,7 @@ static void wire_vtable(permuted_dense *pd)
     pd->base.broadcast_fill_values = permuted_dense_vtable_broadcast_fill_values;
     pd->base.diag_vec_alloc = permuted_dense_vtable_diag_vec_alloc;
     pd->base.diag_vec_fill_values = permuted_dense_vtable_diag_vec_fill_values;
+    pd->base.sum_alloc = permuted_dense_vtable_sum_alloc;
     pd->base.refresh_csc_values = permuted_dense_refresh_csc_values;
 }
 
diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c
index 04538d8..06cb5e2 100644
--- a/src/utils/sparse_matrix.c
+++ b/src/utils/sparse_matrix.c
@@ -18,6 +18,7 @@
 #include "utils/sparse_matrix.h"
 
 #include "utils/CSC_matrix.h"
+#include "utils/CSR_sum.h"
 #include "utils/linalg_sparse_matmuls.h"
 #include "utils/matrix.h"
 #include "utils/mini_numpy.h"
@@ -311,6 +312,43 @@ static void sparse_refresh_csc_values(matrix *self)
     csr_to_csc_fill_values(sm->csr, sm->csc_cache, sm->csc_iwork);
 }
 
+static matrix *sparse_sum_alloc(matrix *self, int axis, int d1, int *idx_map)
+{
+    CSR_matrix *A = ((sparse_matrix *) self)->csr;
+    int m;
+    if (axis == -1)
+    {
+        m = 1;
+    }
+    else if (axis == 0)
+    {
+        m = A->m / d1;
+    }
+    else
+    {
+        m = d1;
+    }
+    int max_nnz = MIN(A->nnz, m * A->n);
+    CSR_matrix *out = new_CSR_matrix(m, A->n, max_nnz);
+    int *iwork = (int *) sp_malloc(MAX(A->n, A->nnz) * sizeof(int));
+
+    if (axis == -1)
+    {
+        sum_all_rows_csr_alloc(A, out, iwork, idx_map);
+    }
+    else if (axis == 0)
+    {
+        sum_block_of_rows_csr_alloc(A, out, d1, iwork, idx_map);
+    }
+    else
+    {
+        sum_evenly_spaced_rows_csr_alloc(A, out, m, iwork, idx_map);
+    }
+
+    sp_free(iwork);
+    return new_sparse_matrix(out);
+}
+
 static void wire_vtable(sparse_matrix *sm)
 {
     sm->base.block_left_mult_vec = sparse_block_left_mult_vec;
@@ -331,6 +369,7 @@ static void wire_vtable(sparse_matrix *sm)
     sm->base.broadcast_fill_values = sparse_broadcast_fill_values;
     sm->base.diag_vec_alloc = sparse_diag_vec_alloc;
     sm->base.diag_vec_fill_values = sparse_diag_vec_fill_values;
+    sm->base.sum_alloc = sparse_sum_alloc;
     sm->base.refresh_csc_values = sparse_refresh_csc_values;
     sm->base.free_fn = sparse_free;
 }
diff --git a/src/utils/stacked_pd.c b/src/utils/stacked_pd.c
index 2310472..0724859 100644
--- a/src/utils/stacked_pd.c
+++ b/src/utils/stacked_pd.c
@@ -17,8 +17,11 @@
  */
 #include "utils/stacked_pd.h"
 
+#include "utils/CSR_sum.h"
+#include "utils/iVec.h"
 #include "utils/matrix.h"
 #include "utils/permuted_dense.h"
+#include "utils/sparse_matrix.h"
 #include "utils/stacked_pd_linalg.h"
 #include "utils/tracked_alloc.h"
 #include "utils/utils.h"
@@ -378,6 +381,93 @@ static void assert_disjoint_row_perms(int n_blocks, permuted_dense *const *block
 }
 #endif
 
+/* C = sum(A, axis) for a stacked_pd A.
+
+   axis == -1: native union path. C is a 1-row permuted_dense whose col_perm
+   is the sorted union of all blocks' col_perms (every input cell collapses
+   to exactly one output column). idx_map is filled in block-major order
+   matching A's base.x layout.
+
+   axis == 0 or 1: internal CSR fallback. A row-reduction of a stacked_pd
+   doesn't in general fit a single PD output (different output rows may have
+   different col footprints), so we materialize via to_csr, dispatch to the
+   existing CSR helper, then re-index idx_map from CSR ordering into A's
+   block-major base.x ordering so a downstream values-fill pass can read
+   directly from A->base.x. */
+static matrix *stacked_pd_vtable_sum_alloc(matrix *self, int axis, int d1,
+                                           int *idx_map)
+{
+    stacked_pd *spd = (stacked_pd *) self;
+
+    if (axis == -1)
+    {
+        /* multi-way sorted union of each block's col_perm */
+        iVec *col_union = iVec_new(8);
+        const int **col_arrs =
+            (const int **) sp_malloc(spd->n_blocks * sizeof(int *));
+        int *col_lens = (int *) sp_malloc(spd->n_blocks * sizeof(int));
+        for (int k = 0; k < spd->n_blocks; k++)
+        {
+            col_arrs[k] = spd->blocks[k]->col_perm;
+            col_lens[k] = spd->blocks[k]->n0;
+        }
+        sorted_union_int_arrays(col_arrs, col_lens, spd->n_blocks, col_union);
+        sp_free(col_arrs);
+        sp_free(col_lens);
+
+        /* inverse map: column-id (in [0, self->n)) → its position in col_union */
+        int *col_to_pos = (int *) sp_malloc(self->n * sizeof(int));
+        for (int p = 0; p < col_union->len; p++)
+        {
+            col_to_pos[col_union->data[p]] = p;
+        }
+
+        int row_zero = 0;
+        matrix *out = new_permuted_dense(1, self->n, 1, col_union->len, &row_zero,
+                                         col_union->data, NULL);
+        iVec_free(col_union);
+
+        /* fill idx_map in block-major order matching spd->base.x layout */
+        int native_pos = 0;
+        for (int k = 0; k < spd->n_blocks; k++)
+        {
+            permuted_dense *blk = spd->blocks[k];
+            for (int i = 0; i < blk->m0; i++)
+            {
+                for (int j = 0; j < blk->n0; j++)
+                {
+                    idx_map[native_pos++] = col_to_pos[blk->col_perm[j]];
+                }
+            }
+        }
+        sp_free(col_to_pos);
+        return out;
+    }
+
+    /* axis == 0 or 1: CSR fallback */
+    CSR_matrix *A = self->to_csr(self);
+    int m_out = (axis == 0) ? A->m / d1 : d1;
+    int max_out_nnz = MIN(A->nnz, m_out * A->n);
+    CSR_matrix *out = new_CSR_matrix(m_out, A->n, max_out_nnz);
+    int *iwork = (int *) sp_malloc(MAX(A->n, A->nnz) * sizeof(int));
+
+    if (axis == 0)
+    {
+        sum_block_of_rows_csr_alloc(A, out, d1, iwork, idx_map);
+    }
+    else
+    {
+        sum_evenly_spaced_rows_csr_alloc(A, out, m_out, iwork, idx_map);
+    }
+    sp_free(iwork);
+
+    /* idx_map is currently in CSR row order; re-index to block-major base.x
+       order so eval_jacobian reads child->jacobian->x directly. */
+    compose_csr_idx_map_for_spd(spd, A, idx_map);
+
+    return new_sparse_matrix(out);
+}
+
 static void wire_vtable(stacked_pd *spd)
 {
     spd->base.is_stacked_pd = true;
@@ -398,6 +488,7 @@ static void wire_vtable(stacked_pd *spd)
     spd->base.diag_vec_fill_values = stacked_pd_vtable_diag_vec_fill_values;
     spd->base.broadcast_alloc = stacked_pd_vtable_broadcast_alloc;
     spd->base.broadcast_fill_values = stacked_pd_vtable_broadcast_fill_values;
+    spd->base.sum_alloc = stacked_pd_vtable_sum_alloc;
 }
 
 matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **blocks,
diff --git a/tests/all_tests.c b/tests/all_tests.c
index 77a9d12..e571683 100644
--- a/tests/all_tests.c
+++ b/tests/all_tests.c
@@ -66,8 +66,8 @@
 #include "utils/test_csr_matrix.h"
 #include "utils/test_linalg_sparse_matmuls.h"
 #include "utils/test_linalg_utils_matmul_chain_rule.h"
-#include "utils/test_matrix.h"
 #include "utils/test_matmul_dispatchers.h"
+#include "utils/test_matrix.h"
 #include "utils/test_permuted_dense.h"
 #include "utils/test_stacked_pd.h"
 #include "wsum_hess/affine/test_broadcast.h"
@@ -211,6 +211,7 @@ int main(void)
     mu_run_test(test_jacobian_sum_log_axis_0, tests_run);
     mu_run_test(test_jacobian_sum_add_log_axis_0, tests_run);
     mu_run_test(test_jacobian_sum_log_axis_1, tests_run);
+    mu_run_test(test_jacobian_sum_axis_minus_one_pd_child, tests_run);
     mu_run_test(test_jacobian_hstack_vectors, tests_run);
     mu_run_test(test_jacobian_hstack_matrix, tests_run);
     mu_run_test(test_jacobian_vstack_vectors, tests_run);
@@ -405,6 +406,9 @@ int main(void)
     mu_run_test(test_permuted_dense_BTA_empty_overlap, tests_run);
     mu_run_test(test_permuted_dense_BTA_partial_overlap, tests_run);
     mu_run_test(test_permuted_dense_BTDA_decomposition, tests_run);
+    mu_run_test(test_permuted_dense_sum_all_rows, tests_run);
+    mu_run_test(test_permuted_dense_sum_block_of_rows, tests_run);
+    mu_run_test(test_permuted_dense_sum_evenly_spaced_rows, tests_run);
     mu_run_test(test_BTA_pd_csc_matches_csr, tests_run);
     mu_run_test(test_BA_pd_matrices_pd_pd_full_block_B, tests_run);
     mu_run_test(test_BA_pd_matrices_pd_pd_general_B, tests_run);
diff --git a/tests/jacobian_tests/affine/test_sum.h b/tests/jacobian_tests/affine/test_sum.h
index 150fe53..c2b61bd 100644
--- a/tests/jacobian_tests/affine/test_sum.h
+++ b/tests/jacobian_tests/affine/test_sum.h
@@ -6,6 +6,7 @@
 #include "atoms/elementwise_restricted_dom.h"
 #include "expr.h"
 #include "minunit.h"
+#include "numerical_diff.h"
 #include "test_helpers.h"
 
 const char *test_jacobian_sum_log(void)
@@ -195,3 +196,37 @@ const char *test_jacobian_sum_log_axis_1(void)
     free_expr(sum_node);
     return 0;
 }
+
+/* End-to-end test that the polymorphic sum vtable (axis=-1) dispatches
+   correctly when the child Jacobian is a permuted_dense. Constructed as
+   sum(A2 @ (A1 @ x), -1): a single left_matmul on a leaf variable goes
+   through the sparse leaf-var fast path, but the outer left_matmul on a
+   composite child fires the PD jacobian_init path (BA_pd_matrices_alloc),
+   giving us a PD child for sum. We assert the PD-ness of the child and then
+   verify the resulting sum Jacobian by finite differences.
+
+   PD child Jacobians with d2 > 1 (needed to make axis=0/1 non-trivial via
+   the sum atom) are hard to construct from leaf variables — the multi-column
+   left_matmul path produces a stacked_pd instead. Direct unit tests on the
+   PD vtable for axis=0 and axis=1 live in tests/utils/test_permuted_dense.h. */
+const char *test_jacobian_sum_axis_minus_one_pd_child(void)
+{
+    double A1_data[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+    double A2_data[12] = {1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
+                          1.0, 0.0, 1.0, 0.0, 1.0, 0.0};
+
+    expr *x = new_variable(2, 1, 0, 2);
+    expr *A1_x = new_left_matmul_dense(NULL, x, 3, 2, A1_data);
+    expr *A2_A1_x = new_left_matmul_dense(NULL, A1_x, 4, 3, A2_data);
+    expr *sum_node = new_sum(A2_A1_x, -1);
+
+    double u_vals[2] = {0.5, -1.5};
+    jacobian_init(sum_node);
+    mu_assert("child Jacobian should be PD", A2_A1_x->jacobian->is_permuted_dense);
+
+    mu_assert("check_jacobian failed",
+              check_jacobian_num(sum_node, u_vals, NUMERICAL_DIFF_DEFAULT_H));
+
+    free_expr(sum_node);
+    return 0;
+}
diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h
index da07db6..4e68e58 100644
--- a/tests/utils/test_permuted_dense.h
+++ b/tests/utils/test_permuted_dense.h
@@ -1047,4 +1047,100 @@ const char *test_BA_pd_matrices_fast_path(void)
     return 0;
 }
 
+/* Direct vtable tests for sum_alloc. The test PD represents a (6, 4) matrix
+   with a (3, 2) dense block at rows {0, 3, 4}, cols {1, 3}. We exercise all
+   three axes; for axis=0 (d1=2) the buckets {0/2, 3/2, 4/2} = {0, 1, 2}
+   are all distinct and non-decreasing (linear-scan dedupe); for axis=1
+   (d1=3) the buckets {0%3, 3%3, 4%3} = {0, 0, 1} collapse two input rows
+   onto the same output row (bitmap dedupe), so multiple idx_map entries
+   point to the same output position — a values-fill pass would scatter-add. */
+const char *test_permuted_dense_sum_all_rows(void)
+{
+    int row_perm[3] = {0, 3, 4};
+    int col_perm[2] = {1, 3};
+    double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+    matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X);
+
+    int idx_map[6];
+    matrix *out = M->sum_alloc(M, -1, /*d1 unused*/ 0, idx_map);
+
+    mu_assert("output is PD", out->is_permuted_dense);
+    permuted_dense *opd = (permuted_dense *) out;
+    mu_assert("out m", out->m == 1);
+    mu_assert("out n", out->n == 4);
+    mu_assert("out m0", opd->m0 == 1);
+    mu_assert("out n0", opd->n0 == 2);
+    int expected_row_perm[1] = {0};
+    int expected_col_perm[2] = {1, 3};
+    mu_assert("out row_perm", cmp_int_array(opd->row_perm, expected_row_perm, 1));
+    mu_assert("out col_perm", cmp_int_array(opd->col_perm, expected_col_perm, 2));
+    /* every input row collapses to row 0, so idx_map[i*n0+j] = j */
+    int expected_idx_map[6] = {0, 1, 0, 1, 0, 1};
+    mu_assert("idx_map", cmp_int_array(idx_map, expected_idx_map, 6));
+
+    free_matrix(out);
+    free_matrix(M);
+    return 0;
+}
+
+const char *test_permuted_dense_sum_block_of_rows(void)
+{
+    int row_perm[3] = {0, 3, 4};
+    int col_perm[2] = {1, 3};
+    double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+    matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X);
+
+    int d1 = 2; /* child shape (d1, d2) = (2, 3); output rows = d2 = 3 */
+    int idx_map[6];
+    matrix *out = M->sum_alloc(M, 0, d1, idx_map);
+
+    mu_assert("output is PD", out->is_permuted_dense);
+    permuted_dense *opd = (permuted_dense *) out;
+    mu_assert("out m", out->m == 3);
+    mu_assert("out n", out->n == 4);
+    mu_assert("out m0", opd->m0 == 3);
+    mu_assert("out n0", opd->n0 == 2);
+    int expected_row_perm[3] = {0, 1, 2}; /* {0/2, 3/2, 4/2} */
+    int expected_col_perm[2] = {1, 3};
+    mu_assert("out row_perm", cmp_int_array(opd->row_perm, expected_row_perm, 3));
+    mu_assert("out col_perm", cmp_int_array(opd->col_perm, expected_col_perm, 2));
+    int expected_idx_map[6] = {0, 1, 2, 3, 4, 5};
+    mu_assert("idx_map", cmp_int_array(idx_map, expected_idx_map, 6));
+
+    free_matrix(out);
+    free_matrix(M);
+    return 0;
+}
+
+const char *test_permuted_dense_sum_evenly_spaced_rows(void)
+{
+    int row_perm[3] = {0, 3, 4};
+    int col_perm[2] = {1, 3};
+    double X[6] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
+    matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X);
+
+    int d1 = 3; /* output rows = d1 = 3, buckets = {0%3, 3%3, 4%3} = {0, 0, 1} */
+    int idx_map[6];
+    matrix *out = M->sum_alloc(M, 1, d1, idx_map);
+
+    mu_assert("output is PD", out->is_permuted_dense);
+    permuted_dense *opd = (permuted_dense *) out;
+    mu_assert("out m", out->m == 3);
+    mu_assert("out n", out->n == 4);
+    mu_assert("out m0", opd->m0 == 2); /* two distinct buckets {0, 1} */
+    mu_assert("out n0", opd->n0 == 2);
+    int expected_row_perm[2] = {0, 1};
+    int expected_col_perm[2] = {1, 3};
+    mu_assert("out row_perm", cmp_int_array(opd->row_perm, expected_row_perm, 2));
+    mu_assert("out col_perm", cmp_int_array(opd->col_perm, expected_col_perm, 2));
+    /* input rows 0 and 3 both bucket to output row 0 (idx_map → 0, 1);
+       input row 4 buckets to output row 1 (idx_map → 2, 3) */
+    int expected_idx_map[6] = {0, 1, 0, 1, 2, 3};
+    mu_assert("idx_map", cmp_int_array(idx_map, expected_idx_map, 6));
+
+    free_matrix(out);
+    free_matrix(M);
+    return 0;
+}
+
 #endif /* TEST_PERMUTED_DENSE_H */

From 121c2b35ef5ddb47cc923eeff629eafaf43e3f8d Mon Sep 17 00:00:00 2001
From: dance858 <danielcederberg1@gmail.com>
Date: Sat, 23 May 2026 08:03:41 -0700
Subject: [PATCH 2/3] permuted dense sum different rows

---
 include/utils/matrix.h            | 25 ++++++++++---------------
 src/atoms/affine/sum.c            |  6 +++---
 src/utils/permuted_dense.c        |  6 +++---
 src/utils/sparse_matrix.c         |  5 +++--
 src/utils/stacked_pd.c            |  6 +++---
 tests/utils/test_permuted_dense.h | 10 +++++-----
 6 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/include/utils/matrix.h b/include/utils/matrix.h
index a2c46dc..1ff9c17 100644
--- a/include/utils/matrix.h
+++ b/include/utils/matrix.h
@@ -109,20 +109,15 @@ typedef void (*matrix_diag_vec_fill_values_fn)(matrix *A, matrix *out);
 /* Allocate C as a row-wise reduction of A. The reduction pattern is chosen by
    axis:
      - axis = -1: sum all rows of A. C has shape (1, A->n).
-                  C[0, k] = sum_{i in [0, A->m)} A[i, k].
-     - axis = 0:  block-sum rows in consecutive groups of d1 (requires
-                  A->m % d1 == 0). C has shape (A->m / d1, A->n).
-                  C[j, k] = sum_{i in [j*d1, (j+1)*d1)} A[i, k].
-     - axis = 1:  stride-sum rows at modular spacing d1 (requires
-                  A->m % d1 == 0). C has shape (d1, A->n).
-                  C[j, k] = sum_{i : i % d1 == j} A[i, k].
-   d1 is ignored when axis == -1.
-
-   Caller pre-allocates idx_map of size A->nnz. On return idx_map[ii] holds the
-   position in C's base.x where A's base.x[ii] scatter-adds. A values-fill pass
-   can compute C->x by zeroing it and accumulating A->x[ii] into
-   C->x[idx_map[ii]] for ii in [0, A->nnz). */
-typedef matrix *(*matrix_sum_alloc_fn)(matrix *A, int axis, int d1, int *idx_map);
+     - axis = 0:  block-sum rows in consecutive groups of d1. C has shape (A->m
+                  / d1, A->n). C[j, :] = sum_{i in [j*d1, (j+1)*d1)} A[i, :].
+     - axis = 1:  stride-sum rows at modular spacing d1. C has shape (d1, A->n).
+                  C[j, :] = sum_{i : i % d1 == j} A[i, :].
+
+   Caller pre-allocates idx_map of size A->nnz that can be used to compute the
+   numerical result of the operation using via accumulation. */
+typedef matrix *(*matrix_sum_row_partition_alloc_fn)(matrix *A, int axis, int d1,
+                                                     int *idx_map);
 
 typedef void (*matrix_free_fn)(matrix *self);
 
@@ -159,7 +154,7 @@ struct matrix
     matrix_broadcast_fill_values_fn broadcast_fill_values;
     matrix_diag_vec_alloc_fn diag_vec_alloc;
     matrix_diag_vec_fill_values_fn diag_vec_fill_values;
-    matrix_sum_alloc_fn sum_alloc;
+    matrix_sum_row_partition_alloc_fn sum_row_partition_alloc;
 
     /* Lifecycle */
     matrix_free_fn free_fn;
diff --git a/src/atoms/affine/sum.c b/src/atoms/affine/sum.c
index 4ce60fb..edbb257 100644
--- a/src/atoms/affine/sum.c
+++ b/src/atoms/affine/sum.c
@@ -87,11 +87,11 @@ static void jacobian_init_impl(expr *node)
     sum_expr *snode = (sum_expr *) node;
     jacobian_init(x);
 
-    /* sum_alloc fills idx_map so eval_jacobian can accumulate from
+    /* sum_row_partition_alloc fills idx_map so eval_jacobian can accumulate from
        child->jacobian->x. */
     snode->idx_map = sp_malloc(x->jacobian->nnz * sizeof(int));
-    node->jacobian =
-        x->jacobian->sum_alloc(x->jacobian, snode->axis, x->d1, snode->idx_map);
+    node->jacobian = x->jacobian->sum_row_partition_alloc(x->jacobian, snode->axis,
+                                                          x->d1, snode->idx_map);
 }
 
 static void eval_jacobian(expr *node)
diff --git a/src/utils/permuted_dense.c b/src/utils/permuted_dense.c
index 4907dea..ed7dca5 100644
--- a/src/utils/permuted_dense.c
+++ b/src/utils/permuted_dense.c
@@ -523,8 +523,8 @@ static matrix *sum_evenly_spaced_rows_pd_alloc(matrix *self, int d1, int *idx_ma
     return C;
 }
 
-static matrix *permuted_dense_vtable_sum_alloc(matrix *self, int axis, int d1,
-                                               int *idx_map)
+static matrix *permuted_dense_vtable_sum_row_partition_alloc(matrix *self, int axis,
+                                                             int d1, int *idx_map)
 {
     if (axis == -1)
     {
@@ -562,7 +562,7 @@ static void wire_vtable(permuted_dense *pd)
     pd->base.broadcast_fill_values = permuted_dense_vtable_broadcast_fill_values;
     pd->base.diag_vec_alloc = permuted_dense_vtable_diag_vec_alloc;
     pd->base.diag_vec_fill_values = permuted_dense_vtable_diag_vec_fill_values;
-    pd->base.sum_alloc = permuted_dense_vtable_sum_alloc;
+    pd->base.sum_row_partition_alloc = permuted_dense_vtable_sum_row_partition_alloc;
     pd->base.refresh_csc_values = permuted_dense_refresh_csc_values;
 }
 
diff --git a/src/utils/sparse_matrix.c b/src/utils/sparse_matrix.c
index 06cb5e2..22c3fc9 100644
--- a/src/utils/sparse_matrix.c
+++ b/src/utils/sparse_matrix.c
@@ -312,7 +312,8 @@ static void sparse_refresh_csc_values(matrix *self)
     csr_to_csc_fill_values(sm->csr, sm->csc_cache, sm->csc_iwork);
 }
 
-static matrix *sparse_sum_alloc(matrix *self, int axis, int d1, int *idx_map)
+static matrix *sparse_sum_row_partition_alloc(matrix *self, int axis, int d1,
+                                              int *idx_map)
 {
     CSR_matrix *A = ((sparse_matrix *) self)->csr;
     int m;
@@ -369,7 +370,7 @@ static void wire_vtable(sparse_matrix *sm)
     sm->base.broadcast_fill_values = sparse_broadcast_fill_values;
     sm->base.diag_vec_alloc = sparse_diag_vec_alloc;
     sm->base.diag_vec_fill_values = sparse_diag_vec_fill_values;
-    sm->base.sum_alloc = sparse_sum_alloc;
+    sm->base.sum_row_partition_alloc = sparse_sum_row_partition_alloc;
     sm->base.refresh_csc_values = sparse_refresh_csc_values;
     sm->base.free_fn = sparse_free;
 }
diff --git a/src/utils/stacked_pd.c b/src/utils/stacked_pd.c
index 0724859..5ac5a3c 100644
--- a/src/utils/stacked_pd.c
+++ b/src/utils/stacked_pd.c
@@ -394,8 +394,8 @@ static void assert_disjoint_row_perms(int n_blocks, permuted_dense *const *block
    existing CSR helper, then re-index idx_map from CSR ordering into A's
    block-major base.x ordering so a downstream values-fill pass can read
    directly from A->base.x. */
-static matrix *stacked_pd_vtable_sum_alloc(matrix *self, int axis, int d1,
-                                           int *idx_map)
+static matrix *stacked_pd_vtable_sum_row_partition_alloc(matrix *self, int axis,
+                                                         int d1, int *idx_map)
 {
     stacked_pd *spd = (stacked_pd *) self;
 
@@ -488,7 +488,7 @@ static void wire_vtable(stacked_pd *spd)
     spd->base.diag_vec_fill_values = stacked_pd_vtable_diag_vec_fill_values;
     spd->base.broadcast_alloc = stacked_pd_vtable_broadcast_alloc;
     spd->base.broadcast_fill_values = stacked_pd_vtable_broadcast_fill_values;
-    spd->base.sum_alloc = stacked_pd_vtable_sum_alloc;
+    spd->base.sum_row_partition_alloc = stacked_pd_vtable_sum_row_partition_alloc;
 }
 
 matrix *new_stacked_pd_unchecked(int m, int n, int n_blocks, permuted_dense **blocks,
diff --git a/tests/utils/test_permuted_dense.h b/tests/utils/test_permuted_dense.h
index 4e68e58..53a49ba 100644
--- a/tests/utils/test_permuted_dense.h
+++ b/tests/utils/test_permuted_dense.h
@@ -1047,8 +1047,8 @@ const char *test_BA_pd_matrices_fast_path(void)
     return 0;
 }
 
-/* Direct vtable tests for sum_alloc. The test PD represents a (6, 4) matrix
-   with a (3, 2) dense block at rows {0, 3, 4}, cols {1, 3}. We exercise all
+/* Direct vtable tests for sum_row_partition_alloc. The test PD represents a (6, 4)
+   matrix with a (3, 2) dense block at rows {0, 3, 4}, cols {1, 3}. We exercise all
    three axes; for axis=0 (d1=2) the buckets {0/2, 3/2, 4/2} = {0, 1, 2}
    are all distinct and non-decreasing (linear-scan dedupe); for axis=1
    (d1=3) the buckets {0%3, 3%3, 4%3} = {0, 0, 1} collapse two input rows
@@ -1062,7 +1062,7 @@ const char *test_permuted_dense_sum_all_rows(void)
     matrix *M = new_permuted_dense(6, 4, 3, 2, row_perm, col_perm, X);
 
     int idx_map[6];
-    matrix *out = M->sum_alloc(M, -1, /*d1 unused*/ 0, idx_map);
+    matrix *out = M->sum_row_partition_alloc(M, -1, /*d1 unused*/ 0, idx_map);
 
     mu_assert("output is PD", out->is_permuted_dense);
     permuted_dense *opd = (permuted_dense *) out;
@@ -1092,7 +1092,7 @@ const char *test_permuted_dense_sum_block_of_rows(void)
 
     int d1 = 2; /* child shape (d1, d2) = (2, 3); output rows = d2 = 3 */
     int idx_map[6];
-    matrix *out = M->sum_alloc(M, 0, d1, idx_map);
+    matrix *out = M->sum_row_partition_alloc(M, 0, d1, idx_map);
 
     mu_assert("output is PD", out->is_permuted_dense);
     permuted_dense *opd = (permuted_dense *) out;
@@ -1121,7 +1121,7 @@ const char *test_permuted_dense_sum_evenly_spaced_rows(void)
 
     int d1 = 3; /* output rows = d1 = 3, buckets = {0%3, 3%3, 4%3} = {0, 0, 1} */
     int idx_map[6];
-    matrix *out = M->sum_alloc(M, 1, d1, idx_map);
+    matrix *out = M->sum_row_partition_alloc(M, 1, d1, idx_map);
 
     mu_assert("output is PD", out->is_permuted_dense);
     permuted_dense *opd = (permuted_dense *) out;

From ba8a2f30457a11c0894c37d13cb395ab51e592ce Mon Sep 17 00:00:00 2001
From: dance858 <danielcederberg1@gmail.com>
Date: Sat, 23 May 2026 08:16:51 -0700
Subject: [PATCH 3/3] minor

---
 include/utils/matrix.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/utils/matrix.h b/include/utils/matrix.h
index 1ff9c17..55d99a1 100644
--- a/include/utils/matrix.h
+++ b/include/utils/matrix.h
@@ -111,8 +111,8 @@ typedef void (*matrix_diag_vec_fill_values_fn)(matrix *A, matrix *out);
      - axis = -1: sum all rows of A. C has shape (1, A->n).
      - axis = 0:  block-sum rows in consecutive groups of d1. C has shape (A->m
                   / d1, A->n). C[j, :] = sum_{i in [j*d1, (j+1)*d1)} A[i, :].
-     - axis = 1:  stride-sum rows at modular spacing d1. C has shape (d1, A->n).
-                  C[j, :] = sum_{i : i % d1 == j} A[i, :].
+     - axis = 1:  stride-sum rows at spacing d1. C has shape (d1, A->n). C[j, :]
+                  = sum_{i : i % d1 == j} A[i, :].
 
    Caller pre-allocates idx_map of size A->nnz that can be used to compute the
    numerical result of the operation using via accumulation. */