diff --git a/.github/workflows/hub-client-e2e.yml b/.github/workflows/hub-client-e2e.yml index 279cedd63..24390c42d 100644 --- a/.github/workflows/hub-client-e2e.yml +++ b/.github/workflows/hub-client-e2e.yml @@ -1,21 +1,22 @@ name: Hub-Client E2E Tests on: - push: - branches: [main] - paths: - - 'hub-client/**' - - '.github/workflows/hub-client-e2e.yml' - pull_request: - paths: - - 'hub-client/**' - - '.github/workflows/hub-client-e2e.yml' workflow_dispatch: inputs: recreate-all-snapshots: description: 'Delete and recreate ALL visual regression baselines' type: boolean default: false + push: + branches: + - main + pull_request: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number || github.ref }} + cancel-in-progress: true jobs: e2e-tests: diff --git a/Cargo.lock b/Cargo.lock index 7fea400d0..d093c5378 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3384,6 +3384,7 @@ dependencies = [ "serde", "serde_json", "sha1 0.11.0", + "smallvec", "supports-hyperlinks", "tempfile", "tokio", @@ -3877,6 +3878,7 @@ dependencies = [ "rustc-hash", "serde", "serde_json", + "yaml-rust2", ] [[package]] @@ -3963,6 +3965,7 @@ dependencies = [ "serde_json", "serde_yaml", "sha2 0.11.0", + "smallvec", "tempfile", "thiserror 2.0.18", "time", @@ -4277,6 +4280,7 @@ version = "0.1.0" dependencies = [ "serde", "serde_json", + "smallvec", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 6e726d757..d5824d159 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ proc-macro2 = { version = "1.0.106", features = ["span-locations"] } schemars = "1.2.1" serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.149" +smallvec = { version = "1.13", features = ["serde"] } serde_yaml = "0.9" thiserror = "2.0" toml = "0.9.11" diff --git a/claude-notes/designs/incremental-writer-contract.md b/claude-notes/designs/incremental-writer-contract.md new file mode 100644 index 
000000000..be913bf66 --- /dev/null +++ b/claude-notes/designs/incremental-writer-contract.md @@ -0,0 +1,563 @@ +# Incremental writer contract + +The incremental writer (`pampa::writers::incremental`) edits a qmd +source file in place from a pair of ASTs: a baseline AST that +matches what was last produced from the source, and a new AST that +reflects the user's edits. It diffs the two structurally, copies +unchanged bytes from the original source, and re-serializes the +changed regions through the qmd writer. + +This document describes the rules the writer obeys — what it +guarantees, what it forbids, and how callers must shape their inputs +to make the guarantees hold. It is the contract; implementation +specifics, file paths, and migration plans live in plans that +modify the writer. + +**Companion doc:** [`provenance-contract.md`](provenance-contract.md) +covers the *producer* side — how transforms pick the right `SourceInfo` +shapes that this doc tells the writer how to consume. The two are +designed in pairs: if you change either contract, check the other. +The provenance doc also carries the `By::` constructor catalog with +atomicity flags that the §"Atomic-kind `Generated`" section below +draws on. + +## The four primitives + +The writer is one node in a four-primitive grammar: + +| Primitive | What it does | +|---|---| +| **parse** | Lex/parse qmd source bytes into a parse-only AST. No transforms. | +| **transform** | Apply a pipeline's transform stages to an AST. Produces a same-shape AST at a different tier. | +| **reconcile** | Diff two ASTs structurally, producing a plan of `KeepBefore` / `UseAfter` / `RecurseIntoContainer` alignments. | +| **write** | Materialize the plan as qmd bytes — Verbatim-copy source bytes for `KeepBefore`, re-serialize through the qmd writer for `UseAfter` / `Rewrite`. | + +The primitives are orthogonal. 
The writer is pipeline-agnostic: it +diffs the two ASTs it is given and writes accordingly, regardless +of what pipeline produced them. The caller picks which transforms +to apply (or none); the writer just diffs. + +### Pipeline-tier discipline + +The two ASTs handed to the writer must be at the **same pipeline +tier**. Same-tier means: both ASTs were produced by the same +sequence of transform stages, applied to inputs that were both +parsed from the same kind of source. The reconciler is +tier-agnostic — it diffs whatever it is given — but if the two +inputs do not share a tier, every Generated wrapper looks like a +new insertion and the output degrades to whole-document +re-serialization. + +Two tiers are in use today: + +- **parse-only**: the output of `parse_qmd_to_ast(content)`. Used + by q2-debug, q2-slides, and the WASM demos. +- **q2-preview**: the output of + `renderPageInProjectWithAttribution(path, …)`, i.e. post-q2- + preview-pipeline AST. Used by ReactPreview's q2-preview path and + the q2-preview SPA. + +Future pipeline kinds are admitted without writer changes. The +caller composes parse and transform separately and hands the +writer two ASTs; the tier is implicit in whichever baseline the +caller passes. + +## The byte-provenance contract + +The writer materializes bytes constantly. Every Rewrite path emits +new bytes through the qmd writer; even Verbatim copies are a form +of materialization. The contract is not "no materialization" — that +phrasing is too blunt. It is more precise: + +> The writer only emits bytes whose origin can be honestly traced +> to either **existing source bytes in the target file** (Verbatim +> copies, slot preimages via `preimage_in`) or **fresh AST the +> user constructed** (Rewrite paths fed by user-supplied AST +> nodes via the qmd writer's normal arms). + +The case the contract forbids is the one where the writer would +emit bytes synthesized from a wrapper's slot children as flat +content in the parent file. 
The canonical example is include +expansion: an `IncludeExpansion` wrapper carries the included +file's blocks in a content slot. Emitting those blocks as flat +parent-file bytes would put bytes in `parent.qmd` whose provenance +is `foo.qmd` — dishonest at the parent-file boundary. + +The writer's coarsen step prevents this case structurally rather +than catching it at write time. When the reconciler asks the +writer to recurse into a wrapper that is not editable inside (an +atomic CustomNode, an atomic-kind Generated, or any node with no +preimage in the target file), `coarsen` substitutes a safe +alignment — usually KeepBefore — before the qmd writer ever sees +the case. The qmd writer's arms for these wrappers thus become +`unreachable!()` in a well-formed pipeline: a debug-assertion +surface for coarsen bugs, not a user-facing failure mode. + +This is why `incremental_write` returns `Result<(qmd, warnings), +Vec>` — `Ok` is the normal path (write +succeeded; warnings carry any soft-drops); `Err` keeps its +pre-Plan-7 meaning, surfacing qmd-writer failures that bubble up +via `?` from the underlying serializer. Programmer errors — +invariant violations from coarsen bugs, structurally impossible +reconciliation states — do **not** flow through `Result`; they +`panic!()` / `unreachable!()` / `debug_assert!()` inline. This is +the idiomatic q2 pattern (see existing uses across +`pampa/src/writers/`) and the WASM-side surface is loud: +`console_error_panic_hook` is installed at module init, so a panic +becomes a JS exception with a full stack trace. Every user-facing +bad-edit case is handled by soft-drop, not by returning `Err`. + +## The role-asymmetry contract on `Generated.from` + +A `Generated` node's `from` field is a list of `Anchor`s, each +carrying a role and a `source_info` chain. Roles in use today: + +- **`Invocation`**: the source token whose pipeline-time + interpretation produced this node. E.g. 
the `{{< meta title >}}` + shortcode bytes that resolved into the inlines now appearing in + the rendered output. +- **`ValueSource`** (Plan 9): the metadata range whose value the + node was synthesized from. E.g. the YAML byte range of + `meta.title` that the title-block synthesizer read to build the + rendered title block. +- **`Other("…")`**: extension-defined attribution. Carries + whatever identity the extension wants; not interpreted by core. +- **`Dispatch`** (Plan 10, future): the Lua source location of a + filter or shortcode handler that produced this node. + +`preimage_in` — the writer's byte-range lookup — walks **only the +`Invocation` anchor**. All other roles, present and future, are +diagnostic-only. The writer never copies bytes from a non- +`Invocation` anchor's source range. + +This asymmetry is load-bearing. A `ValueSource` anchor points at +YAML metadata bytes; copying those into a document body would +emit raw YAML in the middle of prose. A `Dispatch` anchor points +at Lua filter source; copying those bytes would emit Lua code as +prose. Both are correctness bugs. The writer prevents them by +never walking past the role discrimination. + +Extension authors using `AnchorRole::Other("…")` can rely on this: +their attribution data will not be accidentally consulted by the +writer's byte-copy path, regardless of what they choose to point +it at. The role-asymmetry is the forward-compat guarantee. + +## The unified editability predicate + +`is_editable_inside(node, target_file_id) -> bool` decides whether +inner edits to a node are accepted. The same predicate is consulted +by two surfaces: + +- React's read-only gate (Plan 2A's framework atomic gate) + classifies regions in the rendered DOM and prevents the user + from typing into uneditable regions in the first place. +- The writer's coarsen step uses it to decide whether to recurse + into a container or soft-drop edits aimed at its interior. 
+ +Three structural reasons a node is not editable inside: + +1. **Atomic CustomNodes** — types listed in `ATOMIC_CUSTOM_NODES` + (`CrossrefResolvedRef`, `IncludeExpansion`). These represent + single replaceable units. The user can replace them wholesale + via a component menu; they cannot type inside them. + +2. **Atomic-kind `Generated`** — `Generated` nodes whose `by.kind` + is one of `"shortcode"`, `"filter"`, `"title-block"`, + `"tree-sitter-postprocess"`. Pipeline-emitted content whose + user-source is the invocation token (for shortcode) or whose + source identity is the pipeline stage that produced it (for + the others); not the resolved text the user sees. + +3. **No preimage in target** — nodes whose `preimage_in(target)` + returns `None`. This covers cross-file `Original` nodes + (without a wrapper that pulls them into the target's + provenance), synthesized containers like sectionize / footnotes + / appendix that have empty anchor lists, and gappy `Concat` + chains. + +A node is editable inside iff it has byte-traceable preimage in +the target file AND is not an atomic CustomNode AND is not an +atomic-kind `Generated`. + +The predicate is canonical on the Rust side; React consults an +equivalent TypeScript predicate that reads the same AST shape. +Keeping the two in lockstep is a discipline like the `ATOMIC_CUSTOM_NODES` +const / TS hand-mirror pairing. + +## Soft-drop semantics + +When a reconciliation alignment would target a non-editable region, +`coarsen` substitutes a safe alignment and emits a warning into a +warning sink. The write succeeds; the rejected edit is the only +casualty. + +Six cases: + +- **Inline-level UseAfter on a region where `is_editable_inside` + returns false** (typically: user retyped resolved shortcode + text). `coarsen` substitutes `KeepBefore` for the inline at the + original-side index; the surrounding inline plan continues. + Emits `Q-3-42`. 
+ +- **Block-level RecurseIntoContainer on a non-editable region** + (user edited inside an include, or inside a synthesized-from- + metadata container). `coarsen` substitutes `KeepBefore` for the + wrapper. If the wrapper has preimage in target (atomic + CustomNode whose `source_info` is `Original` covering the + include token), the substitution lands in `Verbatim`. If it + does not (no-preimage Generated container), the substitution + lands in `Omit`; the container regenerates from baseline content + on the next pipeline run. Emits `Q-3-43`. + +- **Block-level UseAfter on an atomic CustomNode** — *let-user- + win*. Kept as `Rewrite`; the qmd writer's CustomNode arm reads + `plain_data` and emits the include syntax from a fresh user- + edit-tagged CustomNode. No warning. This is the deliberate + asymmetry: when the user explicitly destroys or replaces an + atomic CustomNode through an explicit affordance (e.g. a + component menu picker), the intent is unambiguous. + +- **Block-level UseAfter on an atomic-kind `Generated` *with* + preimage in target** — the user edited inside a shortcode- + resolved or filter-output block, and the reconciler split the + edit into a deleted-original + new-block. The new block still + carries the token's `Invocation` anchor; substitute `Verbatim` + of the preimage range and discard the new block. Emits `Q-3-43`. + (Added 2026-05-26; see commit `e584428d` for the implementation + and the lipsum-paragraph repro that motivated it. Without this + branch the let-user-win below would emit the resolved bytes + back into source qmd.) + +- **Block-level UseAfter on a no-preimage Generated container** + — substitute `Omit`; the original container regenerates next + run. There is no source position to anchor a `Rewrite` at, so + let-user-win is not available. Emits `Q-3-43`. + +- **KeepBefore on an atomic-kind `Generated` with empty `from`** + — substitute `Omit`. 
The original content regenerates from + baseline (the filter constructs it again, the title-block + synthesizer reads the metadata again, etc.). No user edit was + involved; this case is normal Coarsen flow, not a soft-drop in + the user-facing sense. The only exception is the shortcode + sub-case discussed below. + +### Non-soft-drop branches in the same cascade + +Two cascade branches share the predicate but do not emit +warnings — they exist alongside the soft-drop cases and are +mentioned here for completeness: + +- **Block-level RecurseIntoContainer on a non-atomic Generated + wrapper with source-bearing children + a `block_container_plans` + entry** — substitute `Transparent`, recursing into the wrapper's + children. No warning emitted at the wrapper level; warnings emerge + from per-child cascades. This is the sectionize / footnotes- + container / appendix-container case. + +- **KeepBefore catch-all** — cross-file `Original`, gappy `Concat`, + Generated wrapper without source-bearing children, etc. — fall + through to `Rewrite` of the original block. No warning. This is + the cascade's silent serialization fallback for shapes the other + rules don't classify; it is also where the algebra is currently + weakest (the serializer walks the entire subtree, so atomic + descendants hidden inside an editable-by-source_info container + could leak resolved bytes). Plan 7d (algebraic-soundness + refactor) names this as the explicit departure from the + byte-provenance contract above. + +## `CoarsenedEntry` self-containment + +The cascade above produces a flat list of `CoarsenedEntry` values +that `emit_entries` walks to produce the output bytes. The +variants — `Verbatim`, `InlineSplice`, `Rewrite`, `Transparent`, +`Omit` — share a structural property worth pinning explicitly: + +> Every variant of `CoarsenedEntry` carries enough information +> to produce its emit bytes **without further context**. No +> deferred index lookups against an ambient slice. 
No "look this +> up at emit time" handoffs. Each entry is self-describing. + +| Variant | Self-contained because | +|---|---| +| `Verbatim` | `byte_range` is absolute into `original_qmd`. | +| `InlineSplice` | Pre-computed `block_text: String` set at coarsen time. | +| `Rewrite` | Pre-computed `block_text: String` set at coarsen time (the same write the emit path used to perform — moved earlier so the entry can travel through `Transparent` recursion intact). | +| `Transparent` | List of self-contained child entries. | +| `Omit` | No bytes. | + +Two `Option<usize>` indices appear on `Verbatim` and `InlineSplice` +(`orig_idx`) — these are hints to `compute_separator`'s +"consecutive-in-original" optimization, **not** byte-production +context. They are always `Option<usize>`: `None` for children inside a +`Transparent` wrapper, where any index would be ambiguous between +top-level and child-level slices. + +Why this matters: `Transparent` recursion is the writer's +compositional escape from the alignment-kind dispatch. A +`Transparent` entry inlines its children into the emit stream as +if the wrapper weren't there. The composition is sound only if +each child can produce its bytes without depending on its position +in some ambient slice. Pre-2026-05-25, `Rewrite` carried +`new_idx: usize` that `emit_entries` looked up against +`new_ast.blocks` top-level; inside a `Transparent` recursion, the +index pointed at a child-relative position and the lookup +panicked. The refactor on `e584428d` lifted `Rewrite` to carry +pre-computed text, matching the shape `InlineSplice` had carried +since `ab10f37b`. + +The self-containment property is *necessary* for the algebra to +be sound, but it is not *sufficient* on its own. The cascade +above still has a "Rewrite as subtree-serializing fallback" arm +(see "Non-soft-drop branches" above and the KeepBefore catch-all) +whose semantics depend on the input subtree not containing +unauthorable descendants.
Plan 7d takes the next step: replace +the subtree-serializing fallback with structural recursion all +the way down to leaves where source_info attests user-authored +content. Self-containment is the substrate that recursion can +land on safely. + +### Anti-patterns for new variants + +Don't add a `CoarsenedEntry` variant that: + +- Defers to a named slice ("index N into `new_ast.blocks`," + "child M of original block at index K"). The moment a future + refactor produces the variant in a different context (recursion, + reuse from a sibling crate, a test fixture), the index points at + the wrong slice and the failure is silent until the panic. +- Depends on context not encoded in the variant itself. +- Requires specific timing of side effects. The current variants' + byte-producing operations are referentially transparent — + `write_block_to_string` depends only on its `Block` argument. + A variant whose correctness depends on emit-time vs coarsen-time + ordering is a sign the entry shape is wrong. + +When in doubt, look at `InlineSplice` — the first variant to carry +pre-computed `block_text` (introduced when partial inline rewrites +made deferral impossible). It is the structural blueprint the +other variants should match. + +### Why soft-drop replaces hard-abort + +The writer could have made every bad-edit case fatal: an +`AtomicViolation` variant returned as `Err`, causing the entire +save to fail until the user undoes the bad edit. Soft-drop is +better because: + +- React (Plan 2A's read-only gate) is the primary safeguard. The + writer is the contract guarantor. If React has a hole, the + writer protects without losing the user's session. +- The user's *other* edits in the same save are not held hostage + to the bad one. A user editing several paragraphs and + accidentally typing into a shortcode resolution loses the + shortcode edit, not the paragraph edits. 
+- The user-facing failure mode "the entire save was rejected" is + not a recoverable state in an autosave context (hub-client and + the SPA both persist on every keystroke; there is no discrete + save the user can discard). + +### User-facing diagnostic surface + +Soft-drop emits warnings, not errors. Two codes: + +- **`Q-3-42` — Shortcode edit dropped.** Inline-level cases: + the user retyped over a shortcode-resolved, filter-decorated, + or title-block-generated inline. The diagnostic body names the + affected text and the source range of the invocation token. +- **`Q-3-43` — Generated content edit dropped.** Block-level + cases: include-expansion recursion, synthesized-container + recursion, synthesized-container replacement. The diagnostic + body names the include `source_path` (for includes) or the + metadata key (for metadata-derived containers), and an + imperative instruction ("To edit this content, open `<source_path>` + directly." / "This content is generated from metadata; edit + `_quarto.yml` to change it.") + +Both warnings carry source ranges and surface in Monaco as +squiggles. The autosave context makes both codes prone to +repeating on every keystroke; the diagnostic-ingest layer applies +suppress-after-3-by-source-range so the user is not flooded. + +## Atomic CustomNodes + +A CustomNode is *atomic* if it represents a single, indivisible +unit at the editing layer. The user cannot type inside one; they +can only replace it wholesale through an explicit affordance +(component menu, palette command). + +The set of atomic CustomNode type names is declared in two places +that must stay in sync: + +- **Rust**: `quarto_core::ATOMIC_CUSTOM_NODES: &[&str]` and the + predicate `quarto_core::is_atomic_custom_node(type_name: &str)`. +- **TypeScript**: a hand-mirrored `ATOMIC_CUSTOM_NODES: ReadonlySet<string>` + in `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts`. + +Built-in atomic types as of this writing: `CrossrefResolvedRef`, +`IncludeExpansion`.
Extensions wanting to declare their own atomic +types will eventually do so via `_extension.yml` schema; until +that lands, the const set covers the cases. + +### Atomic CustomNodes do not block let-user-win + +The let-user-win Rewrite path for block-level UseAfter on an +atomic CustomNode is provenance-honest. When the user constructs +a fresh `IncludeExpansion` through React (with `plain_data = +{ source_path: "bar.qmd" }`) and the writer materializes +`{{< include bar.qmd >}}` into source, the bytes' origin is the +user's edit. The qmd writer's `IncludeExpansion` arm reads +`plain_data`, not `source_info`, and emits the include syntax — +the same arm whether the wrapper came from `IncludeExpansionStage` +(pipeline) or from React (user). That symmetry is what makes +let-user-win clean. + +## Atomic-kind `Generated` and the shortcode-only invariant + +Four `By::kind` values are classified as atomic by +`By::is_atomic_kind()`: + +- `"shortcode"` — resolution of a `{{< … >}}` token +- `"filter"` — filter-emitted construction (e.g. `pandoc.Str(...)`) +- `"title-block"` — title-block synthesizer output +- `"tree-sitter-postprocess"` — tree-sitter postprocess synthesized + whitespace and similar + +These split into two structurally different cases at the writer's +`KeepBefore` branch: + +| Kind | Source token in qmd? | Missing `Invocation` anchor means | Correct writer action | +|---|---|---|---| +| `shortcode` | Yes — `{{< … >}}` | Plan-6 stamper bug; the token bytes get lost in output | Debug-assert; `Omit` in release | +| `filter` | No — filter constructed the node | Expected (no source token exists) | `Omit` — regenerates next run | +| `title-block` | No — synthesized from metadata | Expected | `Omit` — regenerates next run | +| `tree-sitter-postprocess` | No — synthesized space etc. | Expected | `Omit` — regenerates next run | + +Shortcode is the only kind that warrants a debug-assert on the +empty-`from` case. 
For the other three, empty `from` is the +normal shape — there is no source token to anchor at — and +regenerating from baseline is the correct behavior. For +shortcode, empty `from` means the stamper failed to attach the +token's source range, and `Omit` would silently lose the +`{{< … >}}` bytes the user wrote. + +The asymmetry is intentional. Tests covering Coarsen's `Omit` +path must exercise all four kinds (filter / title-block / +tree-sitter-postprocess hit the regular Omit path; shortcode hits +the debug-asserted Omit path under `cfg(debug_assertions)`). + +## Multi-inline dedupe + +A single source token can resolve to multiple AST inlines. The +canonical case is a shortcode whose metadata value parses as +markdown: + + {{< meta title >}} + +with `meta.title: "**Bold** Title"` resolves to three inlines: +`Strong[Str("Bold")]`, `Space`, `Str("Title")`. Each inline +carries the same `Generated { by: shortcode("meta"), from: +[Invocation -> Original{shortcode_token_range}] }` shape. + +At the block level, both reconciliation inputs see the same +three-inline output, the surrounding `Para` is structurally +identical, and the alignment is `KeepBefore` over the whole Para +— one `Verbatim` of the whole Para's bytes. Correct. + +At the inline level (when the user edits something else in the +same Para), the reconciler picks `RecurseIntoContainer` and walks +the inline plan. Without dedupe, each shortcode-derived inline's +`KeepBefore` would Verbatim-copy the shortcode token, emitting +the `{{< meta title >}}` bytes three times. + +The dedupe rule: when iterating inline alignments, group +consecutive `KeepBefore` entries whose inlines' `Invocation` +anchors are `PartialEq`-equal, and emit `Verbatim` once for the +group using the anchor's preimage byte range. + +`SourceInfo` derives `PartialEq`, and `Anchor` carries +`source_info: Arc<SourceInfo>`.
`Arc`'s `PartialEq` compares +the inner value, not the pointer, so structurally-equal anchors +in distinct `Arc`s still compare equal. This is what makes +dedupe work without identity-tracking machinery. + +Dedupe consults `Invocation` only. Two inlines whose `Invocation` +anchors match but whose `ValueSource` or `Dispatch` anchors +differ still dedupe — the user is asking "which source token did +these come from", not "which metadata value" or "which Lua +file". + +## Filter mutations versus constructions + +Plan 4 distinguishes two kinds of filter activity: + +- **Filter construction** — a filter emits a new node from + scratch (`pandoc.Str("decoration")`). The result carries + `Generated { by: filter, from: [] }`, classified atomic. +- **Filter mutation** — a filter modifies a node it received + (`Str.text = upper(Str.text)`). The result keeps the + `Original` source_info of the input node, *not* `Generated`. + Not classified atomic. + +A user edit through React on a filter-mutated `Str` produces an +unusual round-trip. The user types "world" over the filter-output +"HELLO"; the writer Rewrites "world" to source bytes; the next +pipeline run filters "world" → "WORLD". For idempotent filters +(like uppercase) this is fine — the typed text round-trips through +the filter to itself. For non-idempotent filters +(`x => upper(x) + "!"`) the typed text gets a `!` appended on +every save, which is confusing. + +This corner is accepted, not fixed, because: + +- Revising Plan 4 to track filter mutations distinctly from + plain `Original` would be a notable type-system change. +- Plan 7a's runtime user-filter idempotence detection catches the + AST-level non-idempotence that would actually corrupt + round-trip. +- Plan 3's idempotence test enforces the contract for built-in + filters at CI time. + +Users who write non-idempotent filters get a runtime warning +(Q-3-44 / Q-3-45) and can decide whether the trade-off is +acceptable. 
+ +## Design rationale & evolution + +This section captures the *why* behind decisions that read as +arbitrary out of context. + +**Soft-drop replaces hard-abort.** Earlier sketches of the writer +modeled bad-edit cases as a fatal `AtomicViolation` returned as +`Err`. That variant was never implemented — soft-drop subsumes it +before reaching code. The reason is the autosave context: there +is no discrete "save" affordance the user could use to discard a +bad edit, so a save-rejecting error trades one keystroke loss +for an entire session's worth of edits held hostage. Soft-drop +keeps the surface area of failure minimal — only the bad edit is +lost — and gives the contract guarantor a way to protect honesty +without punishing the user. + +**The writer is pipeline-agnostic by signature.** The WASM entry +takes a baseline AST as an argument rather than parsing the +original qmd internally to synthesize one. This makes the writer +ignorant of which pipeline produced its inputs; future pipelines +land without writer changes. The caller composes parse and +transform; the writer just diffs. The change also removes the +writer's dependency on `RenderContext`, `SystemRuntime`, +`Format`, and pipeline construction machinery — its surface +becomes three strings in and one JSON envelope out. + +**No `pipeline_kind` parameter.** The pipeline tier is implicit +in the baseline AST the caller passes. A `pipeline_kind` +parameter would be a redundant claim that the caller could get +wrong; making it implicit removes one consistency requirement +the writer would have to enforce. + +**`Invocation`-only walking is a forward-compat surface for +extensions.** An extension author who attaches attribution via +`AnchorRole::Other("their-thing")` can rely on the writer not +walking their data. They get a free guarantee: whatever they +point at, the writer will not turn it into rendered bytes by +accident. 
This makes the role discrimination both a correctness +mechanism (for `ValueSource`, `Dispatch`) and an extensibility +mechanism (for `Other`). diff --git a/claude-notes/designs/provenance-contract.md b/claude-notes/designs/provenance-contract.md new file mode 100644 index 000000000..41337c281 --- /dev/null +++ b/claude-notes/designs/provenance-contract.md @@ -0,0 +1,399 @@ +# Provenance contract — emitting `SourceInfo` from a transform + +**Status:** Active (Plan 6 landed 2026-05-22 on `feature/provenance`). +**Types:** `quarto_source_map::SourceInfo`, `By`, `Anchor`, `AnchorRole` +([`crates/quarto-source-map/src/source_info.rs`](../../crates/quarto-source-map/src/source_info.rs)). +**Plans:** +[Plan 4](../plans/2026-05-04-q2-preview-plan-4-sourceinfo-anchors.md) +(types) · +[Plan 5](../plans/2026-05-04-q2-preview-plan-5-wire-format.md) +(wire format) · +[Plan 6](../plans/2026-05-04-q2-preview-plan-6-provenance-audit.md) +(this audit) · +[Plan 7](../plans/2026-05-04-q2-preview-plan-7-incremental-writer.md) +(writer / consumer) · +[Plan 8](../plans/2026-05-04-q2-preview-plan-8-include-wrapper.md) +(include wrapper). +**Audit report:** [`claude-notes/research/2026-05-22-plan-6-audit.md`](../research/2026-05-22-plan-6-audit.md). +**Companion doc:** [`incremental-writer-contract.md`](incremental-writer-contract.md) +covers the *consumer* side — what the writer does with the `SourceInfo` +shapes this doc tells producers to emit. The two are designed in pairs: +if you change either contract, check the other. + +## Summary + +Every `SourceInfo` a transform emits must accurately describe where the +node came from. The Plan 4 types give four physical shapes (`Original`, +`Substring`, `Concat`, `Generated`); this doc is the contract for which +shape to pick. The rule that follows replaces the historical "stamp +`SourceInfo::default()` and move on" pattern that Plan 6 audited out +of the transform layer. + +## 1. 
Decision tree for new transforms + +**Pick the shape from where the emitted node's *bytes* come from, not +from how it was constructed.** Five branches: + +| Source of the emitted node | Shape | +|-----------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| Corresponds to source bytes | `Original` — `ctx.source_info.clone()`, or clone the input node's `source_info` field. Never construct an `Original` by hand. | +| Pure synthesis with no preimage | `Generated { by: By::<kind>(), from: smallvec![] }` | +| Resolution of a user-written construct | `Generated { by: By::<kind>(name), from: smallvec![Anchor::invocation(Arc::new(token_si))] }` | +| **Mutation** of a node a filter received (e.g. `Str.text = upper(...)`) | Leave the input node's `Original` source_info untouched. Filter *mutations* are not classified atomic; do **not** rewrite to `Generated`. See [`incremental-writer-contract.md`](incremental-writer-contract.md) §"Filter mutations versus constructions" for the round-trip implications. | +| **Construction** inside a user Lua filter (e.g. `pandoc.Str("...")`) | Leave it alone — `filter_source_info` ([`crates/pampa/src/lua/types.rs:1813`](../../crates/pampa/src/lua/types.rs)) auto-attaches `Generated { by: filter, ... }` on the way out. | + +If two branches feel equally applicable, pick the one with the longer +chain to source: the writer (Plan 7) and attribution +(`resolve_byte_range`) both prefer `Original` over `Generated{from:[]}` +and `Generated{from:[Invocation]}` over `Generated{from:[]}`. + +## 2. `By::` constructor catalog + +The known producer kinds, defined in +[`crates/quarto-source-map/src/source_info.rs`](../../crates/quarto-source-map/src/source_info.rs): + +| Constructor | Line | `kind` string | Purpose | Atomic?
| +|------------------------------|------|---------------------------|-------------------------------------------------------------------------------|---------| +| `By::filter(path, line)` | 458 | `"filter"` | Typed Inline/Block constructed inside a user Lua filter (auto-attached). | yes | +| `By::sectionize()` | 470 | `"sectionize"` | `SectionizeTransform`'s synthesized section `Div`. | no | +| `By::user_edit()` | 479 | `"user-edit"` | React-constructed content reaching the AST through the q2-preview client. | no | +| `By::shortcode(name)` | 494 | `"shortcode"` | Result of resolving a `{{< name … >}}` token. **Requires an `Invocation`.** | yes | +| `By::include()` | 505 | `"include"` | `IncludeStage` expansion wrapper (Plan 8); most include children stay `Original`. | (Plan 8) | +| `By::title_block()` | 513 | `"title-block"` | Title-block stage's synthesized title `h1`. | yes | +| `By::footnotes()` | 521 | `"footnotes"` | Footnotes stage's container `Div` chrome. | no | +| `By::appendix()` | 529 | `"appendix"` | Appendix-structure stage's wrapper `Div` and its helpers. | no | +| `By::tree_sitter_postprocess()` | 538 | `"tree-sitter-postprocess"` | Parser-side synthetic Spaces (e.g. citation/suffix separator). | yes | +| `By::raw(kind, data)` | 552 | open | Escape hatch for extension-defined kinds. | no | + +**Extension namespacing.** Third-party transforms going through +`By::raw` must namespace their kind as `ext/<extension>/<kind>` (e.g. +`ext/quarto-mermaid/diagram`). The `is_atomic_kind` set never matches +extension kinds — they are non-atomic by default in v1. + +## 3. Adding a new `By::` kind + +Worked example, using `bd-12vrr` (callout default-title synthesizer) +as the reference: + +1. **Constructor.** Add `pub fn callout() -> Self` to + [`crates/quarto-source-map/src/source_info.rs`](../../crates/quarto-source-map/src/source_info.rs) + alongside the existing constructors. 
Pick a kebab-case `kind` + string (`"callout"`); leave `data` as `Value::Null` unless the + kind carries per-instance configuration. +2. **Atomicity decision.** Decide whether the new kind belongs in + `is_atomic_kind` (line 570). Default: **no**. Yes only if the + round-trip rule is "treat the entire subtree as one + non-user-editable unit" (see §7). Document the decision in the + beads issue. +3. **Fix the site.** Replace the `SourceInfo::default()` at the + producer with + `SourceInfo::Generated { by: By::callout(), from: smallvec![] }` + (or with an `Invocation` anchor if the new kind resolves a + user-written construct). +4. **Test.** Add a per-transform shape test next to the existing tests + for that transform (e.g. + `test_create_callout_title_has_generated_provenance`), asserting + the produced shape directly. + +The shape test is the per-kind contract — if it fails, the producer +broke the rule. The audit-completion sweep (Plan 6) catches *missing* +provenance; per-transform tests catch *wrong* provenance. + +## 4. `from[]` vs. `by.data` + +**Source-info pointers go in `from[]` as typed `Anchor`s. Per-instance +configuration that is not a source pointer goes in `by.data` as +JSON.** The two are not interchangeable: + +```rust +SourceInfo::Generated { + by: By { + kind: "shortcode".to_string(), + data: serde_json::json!({ "name": "meta" }), // NOT a source pointer + }, + from: smallvec![ + Anchor::invocation(Arc::clone(&token_arc)), // source pointer — typed role + ], +} +``` + +The defined `AnchorRole`s are `Invocation`, `ValueSource`, and +`Other(String)`. New roles are added as enum variants, not as `by.data` +fields. The canonical migration example is **bd-36fr9** (Lua filter +file registration in `SourceContext`): once Lua files have a +`FileId`, the `filter_path`/`line` pair currently living in +`by.data` migrates to a typed `Dispatch` anchor in `from[]`, and +`by.data` for `filter`-kind nodes shrinks to per-kind config only. 
+Treat that as the worked example whenever you're tempted to put a +path-or-range pair in `by.data`. + +### Role-asymmetry — only `Invocation` drives byte-copy + +**The writer walks `Invocation` only.** `ValueSource`, `Dispatch` +(when it lands), and `Other(...)` are diagnostic-only: attribution +machinery may consult them, but the writer's `preimage_in` skips +past them and they never produce verbatim-copy bytes. See +[`incremental-writer-contract.md`](incremental-writer-contract.md) +§"The role-asymmetry contract on `Generated.from`" for the rule +and rationale. + +The producer-side implication: attaching `ValueSource` to a synthesized +node is fine for diagnostic richness (attribution will surface the +metadata range), but it will **not** make the writer copy bytes from +that range into the output. If you want a node's bytes to come from +a specific source range on round-trip, that range must be reachable +through `Invocation`. Extension authors writing custom attribution +via `Other("…")` get the same forward-compat guarantee: whatever they +point at will never be turned into rendered bytes by accident. + +## 5. Enrichment-via-post-walk pattern + +**When you wrap a dispatch and want to layer your own context on top +of provenance the dispatch already attached, walk the result, append +your anchor, and promote `by.kind` — preserving prior `by.data` +fields, renaming where the new context demands.** This is the +canonical pattern for "transform A constructed via transform B." + +Reference implementation: +[`stamp_shortcode_anchors`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) ++ [`enrich_or_create`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) +at `crates/quarto-core/src/transforms/shortcode_resolve.rs:524` (entry +point) and `:774` (the promote/preserve helper). The relevant shape +of `enrich_or_create` is: + +```rust +let by = match existing { + SourceInfo::Generated { by, .. 
} if by.kind == "filter" => { + // promote filter -> shortcode, rename filter_path -> lua_path + let lua_path = by.data.get("filter_path").cloned(); + let lua_line = by.data.get("line").cloned(); + let mut data = serde_json::json!({ "name": name }); + if let Some(p) = lua_path { data["lua_path"] = p; } + if let Some(l) = lua_line { data["lua_line"] = l; } + By { kind: "shortcode".to_string(), data } + } + _ => By::shortcode(name), +}; +SourceInfo::Generated { + by, + from: smallvec![Anchor::invocation(Arc::clone(token_arc))], +} +``` + +Three rules to apply when copying the pattern: + +- **Append, don't replace.** New anchors join `from[]`; prior anchors + stay. +- **Promote, don't downgrade.** `by.kind` moves to a more specific + context (here: `filter` → `shortcode`). Going the other way drops + information. +- **Preserve prior `by.data`, renaming for context.** Filter dispatch + recorded `filter_path` / `line`; the shortcode context renames + them `lua_path` / `lua_line`. Nothing is discarded. + +The post-walk must also recurse into nested AST so every node in the +returned subtree gets the anchor — model the walk on +[`stamp_inline`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) +(`:546`) and +[`stamp_block`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) +(`:612`) rather than the narrower walkers in `callout.rs` / +`theorem.rs` (block-only — they miss `Image.alt` / `Note.content`). + +## 6. `AttrSourceInfo` positional alignment + threaded-source pattern + +**`AttrSourceInfo.attributes[i]` is the `(key_src, val_src)` pair for +the i-th entry of the parallel `Attr.2` (`LinkedHashMap`) in +insertion order.** Two preexisting parser paths break this invariant +(**bd-3aolj** duplicate-key handling, **bd-1e6a5** caption-attr merge +into table); see +[`crates/quarto-pandoc-types/src/attr.rs:28`](../../crates/quarto-pandoc-types/src/attr.rs) +for the full doc comment. + +When a transform needs the value's source range — e.g. 
lifting an +attribute value into a typed Inline — thread `&div.attr_source` through +and index *before* mutating `attr.2`: + +```rust +let name_idx = kvs.keys().position(|k| k == "name")?; +// Empty attr_source signals "no provenance" (the common test pattern). +// Only assert on a populated-but-misaligned attr_source — that's the +// bd-3aolj / bd-1e6a5 failure mode worth catching in dev. +debug_assert!( + attr_source.attributes.is_empty() + || kvs.len() == attr_source.attributes.len(), + "AttrSourceInfo.attributes is out of sync with Attr.2 (bd-3aolj / bd-1e6a5)" +); +let value_source = if kvs.len() == attr_source.attributes.len() { + attr_source.attributes[name_idx].1.clone() +} else { + None +}; +let name = kvs.remove("name")?; +// ... use value_source.unwrap_or_default() as the new node's source_info. +``` + +Reference: +[`crates/quarto-core/src/transforms/theorem.rs:314`](../../crates/quarto-core/src/transforms/theorem.rs) +(`extract_name_attr`), with a parallel implementation in +[`crates/quarto-core/src/transforms/proof.rs:162`](../../crates/quarto-core/src/transforms/proof.rs). + +**The strict form is wrong.** `debug_assert_eq!(kvs.len(), +attr_source.attributes.len())` fires on the common +`AttrSourceInfo::empty()` test pattern (an `Attr` with non-empty `kvs` +constructed by hand without provenance) and panics every existing +theorem/proof test. The "empty OR equal" form is required so empty +provenance signals "unknown," not "bug." Future contributors will hit +this footgun if they copy the wrong form from a draft plan. + +## 7. Atomic-kind set and consumer impact + +**`is_atomic_kind()` controls how downstream consumers treat the +node, not whether the node carries an anchor.** The §2 catalog +above marks which kinds are currently atomic; the canonical +enumeration plus the shortcode-only debug-assert table lives in +[`incremental-writer-contract.md`](incremental-writer-contract.md) +§"Atomic-kind `Generated` and the shortcode-only invariant." 
+ +For producer authors: the rule is "new kinds default to **non-atomic**." +Promote to atomic only when the round-trip rule for nodes you emit +is "the entire subtree is one inseparable unit the user can't edit +in-place." Extension kinds (`ext/<extension>/<kind>`) are never atomic +in v1 — `is_atomic_kind` matches builtin kebab-case names only. + +Two consumers consult `is_atomic_kind` today: Plan 7's writer (round- +trip / soft-drop) and Plan 2A's React framework gate (read-only DOM +regions). The writer doc covers both behaviors; this contract just +says "make the decision deliberately, default no." + +**Where the writer's internal shape is pinned:** +[`incremental-writer-contract.md`](./incremental-writer-contract.md) +§"`CoarsenedEntry` self-containment" documents the rule that +every emitted entry must be self-contained, and how the atomic-kind +decision flows into the choice of `Verbatim` (atomic with preimage) +vs `Omit` (atomic without preimage) vs `Rewrite` (non-atomic +catch-all) vs `Transparent` (non-atomic wrapper with source-bearing +children) at coarsen time. + +## 8. Required-anchor invariants + +**`by.kind == "shortcode"` always carries at least one `Invocation` +anchor.** The producer (the stamper in §5) enforces this; the writer +adds a consumer-side `debug_assert!` so an extension that calls +`By::raw("shortcode", …)` without the required anchor is caught. The +writer-side table that distinguishes "missing `Invocation` is a bug" +(shortcode) from "missing `Invocation` is the normal shape" +(filter / title-block / tree-sitter-postprocess) lives in +[`incremental-writer-contract.md`](incremental-writer-contract.md) +§"Atomic-kind `Generated` and the shortcode-only invariant." + +The pattern generalizes: when a new kind always has a source-side +preimage (e.g. a hypothetical `By::macro_expansion(name)`), declare +the invariant here, enforce it at the producer, and add the +corresponding consumer-side assert in the writer doc. 
Kinds that +*sometimes* have a preimage (sectionize wraps existing content; the +inner `Header` carries the original `source_info`, but the wrapper +`Div` doesn't) are not in this set — they emit `from: smallvec![]` +and don't require any anchor. + +**Sibling contract for these "no source token of its own" wrappers:** +see [`transparent-wrappers.md`](./transparent-wrappers.md). It names +the shape (Generated, no Invocation, block-container with +source-bearing children) and pins the *consumer* rule: any code +that asks "where do the user's source bytes live?" must descend +through transparent wrappers via `first_in_user_tree`, not read +`blocks[0]` directly. The producer side of that — what wrapper +kinds emit `from: []` — lives here in §2's catalog (`sectionize`, +`appendix`, footnotes container, …); the descent invariant lives +there. Adding a new `By::` kind that produces a block-container +wrapper should cross-reference both docs. + +## 9. Outliers — call-site threading vs. the stamper + +**Two shortcode-related sites bypass the stamper because they don't +flow through the dispatch funnel:** + +- [`make_error_inline`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) + (`:1352`) — `?key` Strong wrapping the unknown-shortcode message. +- [`shortcode_to_literal`](../../crates/quarto-core/src/transforms/shortcode_resolve.rs) + (`:1368`) — `{{}}` escaped-shortcode literal text. + +Both branches consume their `shortcode_owned.source_info` directly +and emit an `Original` (the user-visible bytes belong to the token, +not to a synthesized replacement). Plan 7's `is_atomic_kind()` does +not fire on `Original`, so error/escaped regions round-trip +verbatim-copy as plain user content. 
+ +The pattern to recognize: **if the result variant is `Preserve` or +`Error` rather than `Inlines`/`Blocks`, the stamper does not run.** +Whenever you add a new `ShortcodeResult`-style enum variant that +short-circuits the dispatch funnel, thread the token's `source_info` +through the call sites and use it as the emitted node's +`source_info` — don't try to retrofit a `Generated{by:shortcode}` +shape onto content the user can edit directly. + +## 10. Do-not list + +- **Don't emit `SourceInfo::default()` for new synthesized nodes.** + Use the decision tree in §1. `default()` survives in the + Pandoc JSON reader ([`crates/pampa/src/readers/json.rs:80`](../../crates/pampa/src/readers/json.rs)) + by design (the source bytes genuinely don't exist there) and in + test scaffolding; everywhere else it's a bug. +- **Don't put source-info pointers in `by.data`.** Add an + `AnchorRole` variant and a typed `Anchor` in `from[]` instead. See + §4 and the bd-36fr9 migration. +- **Don't drop existing `by.data` when enriching.** Promote / + migrate. See §5. +- **Don't introduce a `CustomNode` wrapper for provenance alone.** + The 2026-05-20 design discussion settled on `Generated` with + typed anchors instead of `CustomNode("ShortcodeResolution")`-style + wrappers because the anchor carries the structural information + cheaply without forcing a new HTML-pipeline resolve transform, a + React component, and a `qmd` writer arm. Wrappers remain + appropriate for the include case (Plan 8) because the cross-file + `FileId` problem genuinely needs anchoring at the parent-file + level. Do not re-litigate. +- **Don't add a `test` arm to a `wasm32` cfg guard** when introducing + new provenance code paths. See + [`.claude/rules/wasm.md`](../../.claude/rules/wasm.md) — the + `#[cfg(any(target_arch = "wasm32", test))]` pattern is prohibited + because it forces native tests through the WASM-restricted Lua + stdlib and fails on Windows. 
+ +## Follow-ups (named, not designed here) + +- **bd-129m3** — `ValueSource` anchor stamping for `meta` / `var` + shortcodes once the metadata loader threads per-key source-info + through. Integration point is `enrich_or_create` (§5). +- **bd-36fr9** — `Dispatch` anchor for Lua-handler filter / shortcode + source location, once Lua files are registered in `SourceContext`. + Migration example for §4. +- **bd-12vrr** — Callout default-title synthesizer needs a `By::callout()` + constructor + atomicity decision. The §3 worked example. +- **bd-1inj0** — Code-block decoration synthesizers + (`code_block_generate` / `code_block_render`) — another small audit + pass to bring into this contract. +- **bd-3aolj** / **bd-1e6a5** — Parser-side `AttrSourceInfo` / + `Attr.2` alignment bugs that the §6 guard works around. + +## Change log + +- **2026-05-25 — v1.** Initial version, written after Plan 6 landed + on `feature/provenance` (2026-05-22). Documents the conventions + that survived implementation: + four-branch decision tree, `By::` catalog, enrichment pattern, + `AttrSourceInfo` threading recipe (with the relaxed + `debug_assert!` form), atomic-kind / required-anchor invariants, + outlier call-site threading, and a do-not list. Plan-6 audit + report lives separately at + [`claude-notes/research/2026-05-22-plan-6-audit.md`](../research/2026-05-22-plan-6-audit.md). +- **2026-05-25 — v1.1.** Cross-linked the consumer-side + [`incremental-writer-contract.md`](incremental-writer-contract.md) + that landed in parallel on Plan 7's review branch. 
Three + substantive edits: §1 decision tree gains a row distinguishing + filter *mutations* (keep input's `Original`) from filter + *constructions* (auto-attached `Generated{by:filter}`); §4 + documents the role-asymmetry — only `Invocation` drives the + writer's byte-copy, `ValueSource` / `Dispatch` / `Other` are + diagnostic-only; §7 / §8 now defer the canonical atomic-kind + enumeration and shortcode-only debug-assert table to the + writer-contract doc rather than duplicating them. diff --git a/claude-notes/designs/transparent-wrappers.md b/claude-notes/designs/transparent-wrappers.md new file mode 100644 index 000000000..2fee01eef --- /dev/null +++ b/claude-notes/designs/transparent-wrappers.md @@ -0,0 +1,218 @@ +# Transparent wrappers — descending past synthesized block containers + +**Status:** Active (introduced 2026-05-25 alongside Plan 7c Phase 8). +**Types:** `pampa::pandoc::Block`, `quarto_source_map::SourceInfo`. +**Reference impl:** +[`crates/pampa/src/writers/incremental.rs`](../../crates/pampa/src/writers/incremental.rs) +(`first_in_user_tree`, `is_transparent_wrapper`, +`derive_target_file_id`, `first_target_anchored_start_in`). +**Plans:** +[Plan 7](../plans/2026-05-04-q2-preview-plan-7-incremental-writer.md) +(writer) · +[Plan 7c](../plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md) +(Phase 8 — target_file_id descent) · +[Plan 8](../plans/2026-05-04-q2-preview-plan-8-include-roundtrip.md) +(IncludeExpansion — *not* a transparent wrapper) · +[Plan 9](../plans/2026-05-22-provenance-plan-9-valuesource-threading.md) +(`title_source_info`) · +[Plan 10](../plans/2026-05-22-provenance-plan-10-dispatch-anchor.md) +(Lua-emitted wrappers). 
+ +## Summary + +The post-render AST that q2-preview hands the React iframe is **not +flat.** The render pipeline wraps the user's blocks in synthesized +containers — most notably a single top-level `Div` from +`SectionizeTransform` — that group content by heading level for +sidebar / cross-reference / outline construction. These wrappers +carry `SourceInfo::Generated` with no `Invocation` anchor: they're +structurally part of the AST but have **no source bytes of their own** +in the user's qmd. + +A *transparent wrapper* is the name for this shape. Code that asks +"where do the user's source bytes live?" must descend through +transparent wrappers, not read `blocks[0]` directly. + +Three writer bugs landed on this rake before the pattern was named +(commits `bdcfdc53`, `b9f64b56`, `2bf92664`): the writer +soft-dropped the wrapper instead of recursing, derived the wrong +file id, and silently deleted the YAML frontmatter. All three were +the same mistake — `blocks[0]` is not necessarily a real user +block. + +## Definition + +A `Block` is a *transparent wrapper* with respect to a +`target_file_id` when **all three** hold: + +1. Its `SourceInfo` is `Generated` with no `Invocation` anchor. + It has no source token of its own; its bytes are synthesized. +2. It is recognised by `block_block_children` (i.e. it's a `Div`, + `BlockQuote`, `Figure`, or `NoteDefinitionFencedBlock` — the + block-container kinds today's synthesizers emit). +3. At least one descendant has real + `preimage_in(target_file_id).is_some()` — there's actual user + content under it. + +Condition (3) is what makes the predicate *structural* rather than +opt-in: a Lua filter that wraps existing user paragraphs in a +`Div.callout` produces a Generated Div whose children still carry +their original preimage → it's transparent → the visual editor sees +through it → user edits inside the wrapped content round-trip +cleanly. 
A filter that constructs a fresh Div from metadata has no +source-bearing children → it's atomic → editor treats it as a unit. +The filter author doesn't have to declare anything; the AST shape +declares it for them. + +## Known transparent wrappers today + +Produced by `pampa::pandoc::sugar::SectionizeTransform` and friends: + +- **sectionize** Div — groups blocks by heading depth (`By::sectionize()`). +- **footnotes-container** Div — collects all footnote definitions. +- **appendix-container** Div — collects appendix-tagged content. + +Plus, by structural construction, any Lua-emitted block-container +that meets the three conditions above (Plan 10). + +**Not** transparent wrappers: + +- `IncludeExpansion` CustomNode (Plan 8) — its `SourceInfo` is + `Original`, anchored to the include-token bytes in the parent qmd. + Descent stops at it; that's correct behaviour. +- Atomic CustomNodes like `CrossrefResolvedRef` — `SourceInfo` + is `Original` pointing at the `@ref` token. +- The synthesized title-block Header (`By::title_block()`) — + `is_atomic_kind` is `true` for `title-block`. Editor treats the + resolved title as read-only; the user's source-side knob is the + YAML `title:` key. (Not block-container shape either.) + +## Sibling primitive on the emission side + +`first_in_user_tree` (below) is the *traversal* primitive — how a +caller descends past transparent wrappers when looking up source +positions. The *emission* primitive is `CoarsenedEntry::Transparent` +in the incremental writer: same wrapper shape, but the question is +"how do I emit bytes through this wrapper?" rather than "where do +the user's source bytes live?" + +Both rely on the same descent rule (skip the wrapper, look at the +children) and the same invariant (a `Generated` block-container +with no Invocation anchor and source-bearing children is +transparent). 
They diverge in what they do with the descent: +traversal stops at the first match; emission walks all children +and concatenates their bytes. + +See [`incremental-writer-contract.md`](./incremental-writer-contract.md) +for the writer-side contract — in particular the rule that every +`CoarsenedEntry` variant must be self-contained, which is what +makes child entries safe to inline through a `Transparent`. + +## Reference primitive: `first_in_user_tree` + +```rust +fn first_in_user_tree<T>( + blocks: &[Block], + extract: &impl Fn(&Block) -> Option<T>, +) -> Option<T> +``` + +Walks `blocks` depth-first. On each block, applies `extract`; if +`Some`, returns it. If `None`, descends through +`block_block_children` and tries again. This is how we see through +transparent wrappers — a wrapper has no source position of its own +(extract returns `None` for it), so the walker looks inside. + +The two consumers today are one-liners: + +```rust +fn derive_target_file_id(blocks: &[Block]) -> FileId { + first_in_user_tree(blocks, &|b| b.source_info().root_file_id()) + .unwrap_or(FileId(0)) +} + +fn first_target_anchored_start_in(blocks: &[Block], target: FileId) -> Option<usize> { + first_in_user_tree(blocks, &|b| { + b.source_info().preimage_in(target).map(|r| r.start) + }) +} +``` + +A `visit_user_blocks(blocks, &mut visit)` sibling (visiting all user +blocks in document order, transparent wrappers skipped) is the +natural extension for callers that need every block, not just the +first; add it the moment a second caller wants it. + +## When to use which + +| Need | Tool | +|---|---| +| Find the first block where some property holds | `first_in_user_tree` | +| Visit all user blocks in document order | (add `visit_user_blocks` when needed) | +| Ask "is *this specific block* a transparent wrapper?" 
| `is_transparent_wrapper` | +| Get the document's editing-file id | `derive_target_file_id` | +| Find where the YAML frontmatter region ends | `first_target_anchored_start_in` | + +`is_transparent_wrapper` is intentionally a small predicate — used +when a caller needs to make an *explicit* decision (e.g. a future +Q-3-44 diagnostic that hints "your filter walked into a sectionize +wrapper; you probably meant to walk its children"). Routine +source-position lookups should use the walkers, not the predicate. + +## Where the code lives, and when to promote it + +The primitives live in +`crates/pampa/src/writers/incremental.rs` next to +`block_block_children`. That's the right home today — the writer +is the only consumer. + +Promote to `quarto-pandoc-types` (or a new +`quarto-pandoc-types::traversal` module) **the moment a second +crate needs them.** Plan 9's `DocumentProfile` extractor (when it +gains a "first H1" fallback), Plan 10's filter-output classifier, +and the project-replay engine's cell walker are the candidates. +Don't promote pre-emptively — premature generalisation has its own +debt. + +## Adding a new synthesizer + +If you're writing a new transform that wraps user content in a Div +(or other block container): + +1. Emit `SourceInfo::generated(By::<kind>())` on the wrapper. + No `Invocation` anchor (because there's no source token). +2. Preserve the children's existing source_info — don't restamp + them with the wrapper's `By`. The whole point is that the + children stay editable. +3. Your wrapper is automatically transparent; nothing else to do. +4. If your `By::<kind>()` should *also* be considered + `is_atomic_kind()` (the resolved children are read-only, like + shortcode resolutions), add it to the atomic-kind set in + `crates/quarto-source-map/src/source_info.rs` — separate + concept, separate decision. + +## Anti-patterns + +- `ast.blocks[0]` for source-position questions (file id, start + offset, "the first user block"). Use `first_in_user_tree`. 
+- `ast.blocks.iter()` flatly for "every user block" enumeration + when the document might be wrapped. Use a descending visitor. +- Declaring a transparent wrapper via a `By::kind` registry. The + predicate is structural; don't add an opt-in mechanism that the + shape already encodes. +- Asking "is this Generated and atomic-kind?" when what you mean + is "should I descend?" — `is_atomic_kind` and transparency are + orthogonal. Shortcode resolutions are atomic *and* have + Invocation anchors (descent is meaningful but the resolved + content is read-only). Sectionize Divs are *neither* atomic + *nor* invocation-anchored. Mixing the two predicates produces + subtle bugs. + +## History + +| Date | Commit | What | +|---|---|---| +| 2026-05-25 | `bdcfdc53` | `coarsen` recurses Transparent into non-atomic Generated wrappers (the first bug — empty qmd) | +| 2026-05-25 | `b9f64b56` | `derive_target_file_id` descends; Plan 7c Phase 8 closed | +| 2026-05-25 | `2bf92664` | `emit_metadata_prefix` descends; YAML frontmatter preserved | +| 2026-05-25 | (this doc) | Pattern named, primitives centralized | diff --git a/claude-notes/instructions/idempotence-contract.md b/claude-notes/instructions/idempotence-contract.md new file mode 100644 index 000000000..b2f69f002 --- /dev/null +++ b/claude-notes/instructions/idempotence-contract.md @@ -0,0 +1,149 @@ +# The q2-preview idempotence contract + +A note for transform / filter authors. Read this before adding a new +Rust transform to `build_q2_preview_transform_pipeline`, a new stage +to `build_q2_preview_pipeline_stages`, or a new built-in Lua filter +under `resources/extensions/`. + +The contract is enforced by the CI gate at +`crates/quarto-core/tests/idempotence.rs`, which is the Phase-3 +deliverable of the provenance epic. The full design lives in +`claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md`. 
+ +## What the contract says + +Running the q2-preview pipeline twice on the same input must produce +the same structural AST: identical `blocks` hash and identical `meta` +hash with `meta.rendered.*` excluded. + +"Same input" means the same byte sequences for the same file layout — +but **not** necessarily the same absolute paths. Each idempotence +fixture runs both pipeline invocations inside a fresh `TempDir`, so +the project root differs across runs while the content is identical. +A transform that captures the absolute project root into the AST will +fail the gate. + +## What the hash includes and excludes + +Defined by `compute_blocks_hash_fresh` / +`compute_meta_hash_fresh_excluding_rendered` in +`crates/quarto-ast-reconcile/src/hash.rs`. + +Included: + +- All block / inline structure (type, text, attributes, children). +- All meta tree structure: scalars by `Yaml` payload; `Map` entries + in **insertion order** (no sort); `Array` entries in order; + `merge_op` on every `ConfigValue`. +- `PandocInlines` / `PandocBlocks` payloads inside meta values, + recursed via the existing block/inline hashers. + +Excluded: + +- `SourceInfo` on every block, inline, and `ConfigValue`. +- `key_source` on every `ConfigMapEntry`. +- Top-level `meta.rendered.*` — chrome transforms, `IncludeResolveStage`, + the favicon transform, and Bootstrap/clipboard injection populate + HTML/text strings under `rendered.*` that may legitimately vary in + trivial whitespace or attribute ordering; HTML-shape canonicalization + is a different concern. + +Source-info is excluded by design so Plan 4's source-info churn +doesn't break the contract. + +## What this means in practice + +A new transform / stage / filter must: + +### 1. Not depend on undefined-iteration-order state + +If you populate a `Map` value in `meta` from a `HashMap`, the +iteration order is undefined and two runs will produce different +hashes. 
The gate uses insertion-order map hashing precisely to catch +this — sorting would silently mask it. + +Use `Vec<(key, value)>`, `BTreeMap`, or `LinkedHashMap` and append +in a deterministic order. + +### 2. Not capture process-local state into the AST + +No timestamps, no PIDs, no random IDs, no absolute paths derived +from the project root, no `temp_dir()` output. If you need to refer +to a file, emit a path relative to the project root. + +Source-info is the only legitimate place absolute paths live, and +the hash excludes source-info by design. + +### 3. Use fresh Lua state per pipeline run (Lua filters / shortcodes) + +The shortcode resolver and per-filter Lua engine are constructed +fresh inside their respective transforms; do not stash global state +on `_G` and expect it to survive between runs. If you need a cache, +key it by the *filter* identity, not the *pipeline run* — and clear +it on `Lua` construction. + +### 4. Not execute engine cells + +CI doesn't run Jupyter / Knitr. Fixtures use only fenced code blocks +(`` ```python `` etc.) — AST nodes, not executed. If your transform's +behavior is conditional on engine-execution side effects, the gate +cannot exercise it. + +## Adding a fixture when you add a new transform + +Every new transform / filter must come with at least one fixture +that exercises its happy path. Add it to +`crates/quarto-core/tests/idempotence.rs`: + +- Trivial single-page fixture: use the `doc_fixture(name, content)` + helper. Writes `index.qmd` to a fresh `TempDir` and runs both + `DriveMode::SingleFile` and `DriveMode::ProjectOrchestrator`. +- Multi-file fixture (sibling files, includes, image resources): + write an inline `setup` closure that writes everything into the + fresh `TempDir`. Same dual-mode run. +- Website-chrome / link / listing fixture: use + `modes: ORCHESTRATOR_ONLY`. Chrome transforms need a populated + `ProjectIndex`, which only the orchestrator pass-1 builds. 
+- Attribution exercise: set `attribution_json: Some(...)` with a + deterministic transport-shape JSON; `PreBuiltAttributionProvider` + is installed on the `RenderContext` automatically. Do not use + `GitBlameProvider` here — it depends on actual git history. + +See `crates/quarto-core/tests/fixtures/idempotence/README.md` for +the per-fixture rules (no engine cells, no absolute paths, mode +mapping). + +## If your new fixture fails on first run + +Two possibilities: + +1. **Your transform really is non-deterministic.** Trace the + `DivergencePoint` the panic message hands you (block index, or + meta key path) and fix the underlying state — usually a + `HashMap` iteration, a `SystemTime::now()`, or an absolute path + stuffed somewhere it shouldn't be. + +2. **The hasher is wrong.** Vanishingly unlikely with FxHasher, + but if you've ruled out (1), file a bug against + `quarto-ast-reconcile`. + +Per the plan's long-lived-integration-branch policy, **do not +`#[ignore]` the failing test** without explicit user approval. +Failing fixtures are the triage backlog; the integration branch +(`feature/provenance`) is allowed to be red while the queue is +drained. + +## Related + +- `claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md` — + the plan that introduced this gate, with the design rationale. +- `claude-notes/plans/2026-05-04-q2-preview-plan-7a-user-filter-idempotence.md` — + the runtime counterpart: per-user-filter idempotence detection at + render time, with `idempotent: false` opt-out. The contract this + file describes is the CI-time half for built-ins; Plan 7a is the + runtime half for user filters. +- `crates/quarto-ast-reconcile/src/hash.rs` — the hash implementations + and unit tests. +- `crates/quarto-core/tests/idempotence.rs` — the gate. +- `crates/quarto-core/tests/fixtures/idempotence/README.md` — the + fixture-format rules. 
diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md b/claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md new file mode 100644 index 000000000..5aa4bfe72 --- /dev/null +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md @@ -0,0 +1,1276 @@ +# Plan 3 — Built-in transform and filter idempotence verification (CI-time) + +**Date:** 2026-05-04 (revised 2026-05-21) +**Branch:** feature/provenance (long-lived integration branch — see +§"Phase 5 — Failure triage" and §"Long-lived branch policy" below) +**Status:** Development plan (work items below) +**Milestone:** M2 verification gate (no new milestone — locks in property +on what's already shipped) + +## Long-lived branch policy + +`feature/provenance` is **not** intended to merge to `main` while any +fixture in this plan is red. The integration branch is the *home* of +the failing-test queue; each red fixture has a beads issue, and the +queue is drained before merge. This is by design — the plan ships a +verification gate, and the gate has to be allowed to be red while it +discovers what's actually non-deterministic in the pipeline today. +See §"Phase 5 — Failure triage" for the operational rules, and +§"CI failure policy & sub-agent prompt template" (under §"Decisions") +for the rationale. + +## Epic context + +Part of the **provenance epic** (Plans 3–8). Plan 3 is the +verification-gate piece: it locks in the idempotence + structural-hash- +stability contract the rest of the epic (typed provenance, incremental +writer, soft-drop) rests on. The file name keeps its q2-preview-plan-N +form for continuity with the earlier discussion notes. + +## Goal + +Verify and lock in the **idempotence + structural-hash-stability** +contract for the q2-preview pipeline. 
Every Rust transform in the +q2-preview transform list **and** every built-in Lua filter shipped +under `resources/extensions/` must produce the same structural AST when +run twice on the same input. Without this, the incremental writer's +reconciliation (Plan 7) cannot reliably preserve untouched regions. + +This plan ships: + +- A canonical fixture set covering each transform and built-in Lua + filter in scope. +- A test that runs each fixture through the q2-preview pipeline twice + and asserts the resulting `blocks` and `meta` (excluding + `rendered.*`) hash equal. +- A `compute_meta_hash_fresh` helper in `quarto-ast-reconcile` + parallel to the existing `compute_blocks_hash_fresh`. +- Documentation of the idempotence contract for future transform/filter + authors. + +When this plan lands on `main` (after Phase 5's failure queue is +drained), the q2-preview round-trip story (Plans 4-8) rests on a +**CI-enforced** stable foundation: every push to `main` runs the +idempotence suite and fails the build on regression. *Until* the +plan lands on `main`, the integration branch +(`feature/provenance`) carries the suite in a possibly-red state +as the queue of discovered non-determinism issues is worked +through — that's the design, not a process gap. See §"Long-lived +branch policy" and §"Phase 5 — Failure triage." + +## Scope + +### What "built-in" covers — the universe under test + +Two distinct classes, both shipped with Quarto and both in scope: + +**Rust transforms** — the source of truth is +`build_q2_preview_transform_pipeline` in +`crates/quarto-core/src/pipeline.rs:1237`, which is +`build_transform_pipeline` minus the four names in +`Q2_PREVIEW_TRANSFORM_EXCLUDED` (`pipeline.rs:1198`). 
As of this +revision, the q2-preview pipeline runs **37 transforms** across four +phases: + +- **Normalization**: callout, shortcode-resolve, metadata-normalize, + code-block-generate, website-title-prefix, website-favicon, + website-bootstrap-icons, website-canonical-url, sectionize, + footnotes, theorem-sugar, proof-sugar, float-ref-target-sugar, + equation-label. +- **Crossref**: crossref-index, crossref-resolve. +- **Navigation**: toc-generate, navbar-generate, sidebar-generate, + page-nav-generate, footer-generate, listing-generate, listing-render, + categories-sidebar, listing-feed-stage (native only), + listing-feed-link, toc-render, navbar-render, sidebar-render, + page-nav-render, footer-render. +- **Finalization**: link-rewrite, appendix-structure, code-block-render, + resource-collector, table-bootstrap-class, attribution-render. + +Excluded by `Q2_PREVIEW_TRANSFORM_EXCLUDED` (out of scope for Plan 3 +because they don't run): callout-resolve, attribution-viewer, +title-block, crossref-render. + +**Stage-level work** in `build_q2_preview_pipeline_stages` +(`pipeline.rs:380`) also runs around `AstTransformsStage` and can +introduce non-determinism: parse-document, metadata-merge, +include-expansion, include-resolve, listing-item-info, document-profile, +link-resolution, unwrap-profile, pre-engine-sugaring, capture-splice, +engine-execution, compile-theme-css, attribution-generate, +user-filters-pre/post, resource-report, code-highlight. These are +exercised implicitly by every fixture (most are no-ops absent specific +metadata). + +`Q2_PREVIEW_STAGE_EXCLUDED` (`pipeline.rs:356`) currently excludes +three stages by name: `math-js`, `render-html-body`, and +`apply-template`. `MathJsStage`'s exclusion means `meta.math` never +appears under this pipeline and contributes nothing to the meta +hash; `RenderHtmlBodyStage` and `ApplyTemplateStage` produce +HTML/text side outputs that wouldn't reach the AST anyway, so their +exclusion is also AST-neutral. 
`BootstrapJsStage` and +`ClipboardJsStage` are *not* excluded — they run on native q2-preview +but write only to `ctx.artifacts`, not to `doc.ast.meta` or +`doc.ast.blocks`, so they don't affect the hash. (Whether they +should be in `Q2_PREVIEW_STAGE_EXCLUDED` at all is a separate +question, filed as **bd-2ag1c** — see §"Open questions for +implementation" for ordering relative to Plan 3.) + +**Lua filters under `resources/extensions/`** — there is exactly **one** +today: `resources/extensions/quarto/video/video-filter.lua`. It rewrites +Header attributes when `background-video` is set on a slide-shaped +header. (The other Lua files in `resources/extensions/` — kbd, video, +lipsum, version, placeholder — are *shortcodes*, not filters, and run +through `shortcode-resolve` rather than `UserFiltersStage`. They're +exercised via shortcode fixtures.) + +### In scope + +- **Canonical fixture set**: small `.qmd` files exercising each + transform / filter in the universe above. Existing fixtures + new + ones from the gap audit below. Detailed listing in §"Coverage gaps to + address during implementation." + +- **`compute_meta_hash_fresh` helper** in + `crates/quarto-ast-reconcile/src/hash.rs`. Walks `ConfigValue` + source-info-agnostically: + - hashes scalars by their `Yaml` payload; + - recurses into `PandocInlines` / `PandocBlocks` via the existing + inline / block hashers; + - hashes `Array` entries in order (matches `Vec` shape); + - hashes `Map` entries as `(key_string, recurse(value))` pairs **in + insertion order — no sort**. Insertion-order hashing is the right + choice for an idempotence test: it catches HashMap-iteration-order + bugs in transforms that stuff results into a meta `Map`. Sorting + would silently mask exactly the class of non-determinism we want to + detect. 
`ConfigValue::Map` is already a `Vec` that + preserves YAML document order, so hashing insertion order is also + the simplest implementation; + - **includes `merge_op`** in the hash (every `ConfigValue` has + `value: ConfigValueKind`, `source_info: SourceInfo`, and + `merge_op: MergeOp` — `merge_op` participates so we catch + transforms that change merge semantics non-deterministically). + `MergeOp::default()` is `Concat` + (`crates/quarto-pandoc-types/src/config_value.rs:75`, derived + `#[default]`) — a stable compile-time constant with no env or + runtime dependence, so transforms that leave `merge_op` at its + default contribute a deterministic value to the hash; + - skips `source_info` and `key_source` (Plan 4's churn must not break + the contract). + + Tests for the helper land alongside it (mirroring the existing + `test_same_content_same_hash` style at `hash.rs:767`). Include a test + proving the helper diverges when `Map` insertion order changes — this + is the regression guard for the no-sort choice. + +- **Idempotence test runner**: takes a fixture, runs the q2-preview + pipeline twice (once per `DriveMode` — see §"What gets tested + concretely"), hashes `doc.ast.blocks` via `compute_blocks_hash_fresh` + and `doc.ast.meta` via `compute_meta_hash_fresh_excluding_rendered` + (everything under `rendered.*` is HTML/text side output — see §"Out + of scope"). Asserts hash equality across the two runs *within a + mode*. One assertion per (fixture, mode) pair; failures name the + fixture, the mode, and which hash diverged. + +- **Divergence-localization helper** in + `crates/quarto-ast-reconcile/src/hash.rs`, alongside the hash fns. + When the (blocks, meta) hashes diverge, the test driver calls + `find_first_divergence(&doc_1, &doc_2) -> DivergencePoint` to + surface a useful location in the failure message. 
Returns one of: + - `DivergencePoint::Block { index, hash_a, hash_b }` — first block + index whose `compute_block_hash_fresh` differs; + - `DivergencePoint::MetaKey { path, hash_a, hash_b }` — first meta + key path (e.g. `["listings", "foo", "items"]`) whose recursive + hash differs, walking the `ConfigValue` tree in insertion order + and excluding `rendered.*`; + - `DivergencePoint::None` — hashes equal at top but a sub-component + differs (would indicate a bug in the hasher itself; vanishingly + unlikely with FxHasher). + + The test driver embeds the returned `DivergencePoint` in the panic + message, so the sub-agent investigation prompt arrives with a + concrete starting point ("block index 7" / "meta.listings.foo + diverged") rather than just "hash diverged." Saves agent triage + time and makes the sub-agent prompt template (§"Open questions for + implementation") fillable from the panic message alone. + +- **Documentation** in `claude-notes/instructions/`: a short note on the + idempotence contract for transform and filter authors, including the + meta-hash-excludes-`rendered.*` rule and how to add a fixture when + introducing a new transform. + +### Out of scope + +- **Round-trip non-idempotence** + (`pipeline(write(pipeline(x))) ≠ pipeline(x)`). Plan 7a's runtime + check handles this. Plan 3 deliberately tests only pipeline + non-determinism — see §"Pipeline-determinism only" below. +- **User-supplied filters**. Per-document, per-user; Plan 7a covers + these at runtime with an `idempotent: false` opt-out. +- **Rust-vs-React rendering parity**. Different contract; later plan. +- **Performance / debouncing**. Idempotence verification doesn't + measure runtime. +- **Engine execution non-determinism**. CI doesn't run jupyter / knitr; + fixtures must contain only fenced code blocks (AST-level), not + executable code cells. 
The `engine-execution` stage is a no-op on + fixtures with no engine cells; the `capture-splice` stage is a + pass-through when no capture is supplied. See §"No executable engine + cells" below. +- **Chrome HTML-string canonicalization**. Meta hash skips + `rendered.*` because those are HTML strings populated by + navbar-render / sidebar-render / etc.; semantically-equal but + textually-different HTML would fail a strict comparison. Structural + non-determinism in chrome transforms shows up elsewhere (e.g., a + navbar transform that emits attributes in non-canonical order + inside its HTML still produces a stable hash *of the meta key + containing the HTML* across runs, because both runs go through + the same code path — what we're missing is HTML-shape determinism, + which is a separate concern best tested with HTML snapshots). +- **`meta.rendered.includes.*` HTML/text strings**. Written by + `IncludeResolveStage` (user-supplied `include-in-header` / + `before-body` / `after-body` files), `WebsiteFaviconTransform` + (favicon `<link>`), `attribution_viewer` (CLI-only — q2-preview + excludes it), and Bootstrap/clipboard injection on the HTML path. + These all sit under `rendered.*` and are skipped by + `compute_meta_hash_fresh_excluding_rendered`. If we ever want to + cover the includes subtree separately (catch a transform that + shuffles include-file ordering, say), the right shape is a separate + helper, not a partial inclusion of the rendered subtree. + +### No executable engine cells + +CI does not execute engine cells. Fixtures must: + +- Use only fenced code blocks (`` ```python ``, ` ```r `, etc.) — AST + nodes, not executed. +- NOT use `{python}` / `{r}` / `{julia}` style executable cells. + +If a fixture happens to include an executable cell, the +`engine-execution` stage will either fail (no kernel available) or +fall through to the markdown passthrough. Either way the test is +unreliable. The fixture-format documentation enforces this. 
+ +## Pipeline-determinism only — round-trip is Plan 7a's job + +Two distinct properties get loosely called "non-idempotence": + +1. **Pipeline non-determinism**: `pipeline(x)` produces different + output on repeat calls. Caused by time / RNG / mutable global state + / undefined-order iteration. **This is what Plan 3 tests.** + +2. **Round-trip non-idempotence**: + `pipeline(write(pipeline(x))) ≠ pipeline(x)`. The pipeline doesn't + re-parse its own output today; this becomes a concern only when + Plan 7's incremental writer lands. Plan 7a covers (2) at runtime + for **user-supplied** Lua filters, with per-filter attribution and + an `idempotent: false` opt-out. **Built-in** filter round-trip is + not covered by any plan in the epic (see Plan 7a's §"Notes" for + the accepted-gap reasoning). + +Plan 3 deliberately scopes to (1) because: + +- (2) isn't exercised by today's pipeline. +- (2)'s test conflates writer-lossiness with filter-non-idempotence; + Plan 7's writer-lossless baseline test (planned for Plan 7's first + commit) and Plan 7a's per-filter isolation disambiguate the user + filter case. +- For built-ins, the universe is small (one Lua filter + + ~36 Rust transforms, all under our control); if (2) bites us in + production after Plan 7 ships, the fix is to extend Plan 7a's + runtime check to also fire on `FilterSource::Extension` filters — + a small follow-up tracked in 7a's §"Out of scope." + +See Plan 7a's §"Two flavors of non-idempotence" for the full +treatment. + +## Design decisions (settled in conversation) + +- **The hash is source-info-agnostic** (verified). `compute_block_hash_fresh` + excludes `source_info`; the new `compute_meta_hash_fresh` will do the + same for `ConfigValue::source_info` and `ConfigMapEntry::key_source`. + Test asserting this lives at `hash.rs:767` for blocks; equivalent + test lands for meta. 
+- **`merge_op` participates; map keys hashed in insertion order, no + sort.** See the helper spec in §"In scope" for the full reasoning. + In one line: an idempotence test wants to *catch* the kind of + non-determinism a sort would hide. +- **Hash covers blocks and meta-minus-`rendered.*`**. Meta inclusion + catches non-determinism in metadata-normalize, listing data, + shortcode-resolved meta values, attribution metadata, etc. The + `rendered.*` keys are HTML strings populated by chrome-render + transforms; their canonicalization is a separate concern. +- **Filter mutation provenance stays Original** (post-Plan 4 unified + `Generated { by: By::filter(...), from: [] }` shape). Idempotence + test sees consistent shape across runs. +- **Each pipeline run uses fresh Lua state.** Two construction sites, + both verified fresh per pipeline invocation: + - **User filters**: `apply_lua_filter` (singular, at + `crates/pampa/src/lua/filter.rs:158`) constructs a fresh + `Lua::new()` per filter. The outer `apply_lua_filters` (plural, at + line 270) loops over `filter_paths` and calls the singular form + once per filter, so every filter in every run starts from a clean + Lua state. + - **Shortcodes**: `LuaShortcodeEngine::new` + (`crates/pampa/src/lua/shortcode.rs:68`) is constructed on the + stack inside `ShortcodeResolveTransform::transform()` at + `crates/quarto-core/src/transforms/shortcode_resolve.rs:513`, so + each pipeline run also gets a fresh shortcode-side `Lua::new()`. + + No cross-run state accumulation on either side. This matches + production (hub-client builds a new pipeline per render) and + resolves the prior "second-run pipeline starts fresh?" open + question. +- **Built-in scope = Rust transforms + ship-with-Quarto Lua filters**. + User filters are out of scope here (Plan 7a covers them). + +## What gets tested concretely + +Every fixture runs through **two pipeline-driver modes**, both compared +against themselves: + +1. 
**Single-file mode** — `run_pipeline` directly with + `build_q2_preview_pipeline_stages`. Mirrors the lowest-level entry + point used by `render_qmd_to_preview_ast` (`pipeline.rs:859`). +2. **Project-orchestrator mode** — calls the existing + `render_active_page_preview` helper at + `crates/quarto-core/tests/render_page_in_project.rs:660`. That + helper already drives + `ProjectPipeline` end-to-end (project + discovery, multi-file re-discovery guard, format setup, `ActivePage` + mode), returns `WasmPassTwoOutput`, and panics on pass-1 / pass-2 + failures. It is exactly the path the real `q2 preview` and + hub-client renders take. We use it as-is; no fresh orchestrator + wiring is required. + +Why both: single-file mode catches stage / transform non-determinism; +project mode additionally exercises any non-determinism introduced by +the orchestrator itself (project discovery, ProjectIndex assembly, +file-iteration order, pass-1 → pass-2 hand-off). + +```rust +use quarto_ast_reconcile::{compute_blocks_hash_fresh, compute_meta_hash_fresh_excluding_rendered}; +use quarto_core::format::Format; +use quarto_core::pipeline::{build_q2_preview_pipeline_stages, run_pipeline}; +use quarto_core::project::pass2_renderer::WasmPassTwoOutput; +use quarto_core::stage::{DocumentAst, PipelineData}; +use quarto_pandoc_types::Pandoc; +use quarto_system_runtime::NativeRuntime; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use tempfile::TempDir; + +/// How a fixture is driven through the pipeline. Every fixture runs +/// once per mode; both modes hash equal across two runs. +#[derive(Clone, Copy, Debug)] +enum DriveMode { + /// `run_pipeline` directly with `build_q2_preview_pipeline_stages`. + SingleFile, + /// Reuses the existing `render_active_page_preview` helper at + /// `crates/quarto-core/tests/render_page_in_project.rs:660`. + ProjectOrchestrator, +} + +/// A test fixture. 
The whole project lives in a `TempDir` that the +/// fixture owns; the `_quarto.yml` (if any) plus the page contents +/// are written by `setup()`. Document-only fixtures still create a +/// temp dir + minimal `index.qmd` so the orchestrator mode has +/// something to discover. +struct Fixture { + name: &'static str, + /// Idempotent setup callback. Receives the project root. + /// Must write at minimum `<root>/<active>` (the page being rendered), + /// optionally `<root>/_quarto.yml` and sibling files. + setup: Box<dyn Fn(&Path)>, + /// The active page, relative to the project root. Defaults to + /// `index.qmd`. + active: PathBuf, +} + +fn run_fixture(fixture: &Fixture, mode: DriveMode) { + let doc_1 = run_q2_preview(fixture, mode); + let doc_2 = run_q2_preview(fixture, mode); + + let blocks_a = compute_blocks_hash_fresh(&doc_1.blocks); + let blocks_b = compute_blocks_hash_fresh(&doc_2.blocks); + let meta_a = compute_meta_hash_fresh_excluding_rendered(&doc_1.meta); + let meta_b = compute_meta_hash_fresh_excluding_rendered(&doc_2.meta); + + if blocks_a != blocks_b || meta_a != meta_b { + // Localize before panicking so the failure message gives the + // sub-agent prompt a concrete starting point. 
+ let point = find_first_divergence(&doc_1, &doc_2); + panic!( + "fixture {} ({mode:?}): non-idempotent\n \ + blocks: {blocks_a:016x} vs {blocks_b:016x}\n \ + meta: {meta_a:016x} vs {meta_b:016x}\n \ + first divergence: {point:?}", + fixture.name, + ); + } +} + +fn run_q2_preview(fixture: &Fixture, mode: DriveMode) -> DocumentAst { + let temp = TempDir::new().unwrap(); + let project_dir = temp.path().canonicalize().unwrap(); + (fixture.setup)(&project_dir); + let active = project_dir.join(&fixture.active).canonicalize().unwrap(); + + match mode { + DriveMode::SingleFile => run_single_file(&project_dir, &active), + DriveMode::ProjectOrchestrator => run_orchestrator(&project_dir, &active), + } +} + +fn run_single_file(project_dir: &Path, active: &Path) -> DocumentAst { + // `run_pipeline` is async; the existing tests (e.g. + // render_page_in_project.rs) drive it via `pollster::block_on`. + pollster::block_on(async { + let runtime: Arc = + Arc::new(NativeRuntime::new()); + let mut project = quarto_core::project::ProjectContext::discover( + active, + runtime.as_ref(), + ) + .unwrap(); + if !project.is_single_file { + project = quarto_core::project::ProjectContext::discover( + &project.dir, + runtime.as_ref(), + ) + .unwrap(); + } + let doc = project + .documents + .iter() + .find(|d| d.path == active) + .expect("active doc in project") + .clone(); + let format = Format::from_format_string("q2-preview") + .expect("q2-preview is a recognized pseudo-format"); + let binaries = quarto_core::render::BinaryDependencies::new(); + let mut ctx = quarto_core::render::RenderContext::new( + &project, &doc, &format, &binaries, + ); + + let content = std::fs::read(active).unwrap(); + let stages = build_q2_preview_pipeline_stages(None, None); + let (output, _diagnostics) = run_pipeline( + &content, + &active.to_string_lossy(), + &mut ctx, + runtime, + stages, + ) + .await + .expect("pipeline run"); + + match output { + PipelineData::DocumentAst(ast) => ast, + other => 
panic!("expected DocumentAst, got {:?}", other.kind()), + } + }) +} + +fn run_orchestrator(project_dir: &Path, active: &Path) -> DocumentAst { + // Delegates to the existing helper. It already drives + // ProjectContext::discover + ProjectPipeline + ActivePage mode, + // and panics if pass-1 / pass-2 surface failures. We just lift + // the AST JSON out of `Pass2Payload::AstJson` and re-parse it + // back into a typed Pandoc via pampa's JSON reader — the source_info + // round-trips, but the hash explicitly excludes source_info, so + // the parse is a clean conversion for hashing purposes. + let output: WasmPassTwoOutput = + render_active_page_preview(project_dir, active); + let ast_json = output + .payload + .as_ast_json() + .expect("orchestrator must emit Pass2Payload::AstJson"); + let mut bytes = ast_json.as_bytes(); + let (pandoc, _ast_ctx) = pampa::readers::json::read(&mut bytes) + .expect("re-parse AST JSON"); + pandoc_to_document_ast(pandoc) +} + +// `pandoc_to_document_ast` converts the re-parsed `Pandoc` into the +// `DocumentAst` shape the hash helpers want. Pandoc carries blocks +// + the document meta; the helpers take `&[Block]` and `&ConfigValue` +// respectively, so this is mostly a field shuffle. Exact body +// determined during implementation once the DocumentAst struct is +// inspected next to Pandoc. +``` + +Notes on the helpers: + +- `run_pipeline` (`pipeline.rs:627`) is the existing entry point for + the single-file mode; no new driver is needed. +- The q2-preview pipeline ends at `CodeHighlightStage`, so its output + is `PipelineData::DocumentAst`. +- Each call constructs fresh `StageContext` (inside `run_pipeline` or + inside the orchestrator's per-page renderer setup) and fresh Lua + engines per filter / shortcode invocation — natural per-run + isolation. +- The orchestrator path's `Pass2Payload::as_ast_json()` accessor + (`crates/quarto-core/src/project/pass2_renderer.rs:272`) already + exists. 
`pampa::readers::json::read` (`crates/pampa/src/readers/json.rs:1063`) + parses the JSON back into a typed `Pandoc`. The source_info that + the JSON writer emits with `include_inline_locations: true` + (`crates/quarto-core/src/pipeline.rs:910` area) round-trips through + the reader, but **the hash explicitly excludes source_info** — so + no stripping pass is required and no production plumbing change is + needed. See §"Decisions" / + §"Orchestrator-mode `DocumentAst` extraction" for why option (a) + beats the typed-plumbing alternative. + +### Fixture-to-mode mapping + +Not every fixture is meaningful in every mode: + +| Fixture class | Single-file | Project-orchestrator | +|---|---|---| +| Plain document (`callout-warning`, `theorem`, `code-block-fenced`, …) | ✓ | ✓ (one-page project) | +| Website chrome (`website-chrome`, `website-links`, `website-listing`) | n/a (chrome stages need ProjectContext) | ✓ | +| Attribution (`attribution-basic`) | ✓ (provider on RenderContext) | ✓ | + +Document fixtures run in both modes against the *same* fixture content +(the orchestrator wraps the document in a tiny synthetic project). +Website fixtures run orchestrator-only because the chrome transforms +require a populated ProjectIndex; running them through single-file +mode would test a partial pipeline that doesn't exist in production. + +### Failure modes the test catches + +- A filter that's truly non-idempotent (e.g., `Str.text + "!"` → + growing text on each run). +- A transform that emits non-deterministic attributes or `plain_data` + (e.g., HashMap iteration order in a sloppy implementation). +- A transform that mutates inputs differently across runs (probably a + bug). +- A metadata transform that synthesizes meta keys non-deterministically + (e.g., listing-item-info that gets file-mtime in racy ways). + +### Failure modes the test does NOT catch + +- A transform that's idempotent but produces *wrong* output (wrong-but- + consistent — needs other testing). 
+- A filter that's idempotent for one input but non-idempotent for + another (need representative fixtures). +- Round-trip non-idempotence — see §"Pipeline-determinism only" above + and Plan 7a. +- HTML-shape non-determinism inside `meta.rendered.*` (excluded from + the hash). + +## Coverage gaps to address during implementation + +Each fixture below covers one or more transforms. **All 26 fixtures +below now ship in `crates/quarto-core/tests/idempotence.rs`** (plus a +27th `smoke_plain_paragraph` not enumerated here). Two are in the +Phase-5 triage queue (marked inline below); the other 24 pass on +first run in both applicable modes. + +**Existing fixtures (carry forward from prior plan draft):** + +- [x] `meta-single` — `{{< meta foo >}}` with single-string foo → + shortcode-resolve, metadata-normalize. +- [x] `meta-markdown` — `{{< meta foo >}}` with `**Bold** title` → + shortcode-resolve (PandocInlines branch). +- [x] `include-trivial` — `{{< include child.qmd >}}` → + include-expansion stage, shortcode-resolve. +- [x] `callout-warning` — `::: {.callout-warning} Body :::` → callout. + (callout-resolve is excluded; CustomNode survives.) +- [x] `theorem` — `::: {.theorem #thm-foo} Math here :::` → + theorem-sugar. +- [x] `figure-ref-target` — `:::: {#fig-foo} ![cap](img.png) ::::` → + float-ref-target-sugar. +- [x] `crossref-to-theorem` — `See @thm-foo` paired with the theorem + above → crossref-index, crossref-resolve. +- [x] `sectionize-multi` — `## A` / `### B` / `## C` with body → + sectionize. +- [x] `footnotes-mixed` — inline `^[...]` + reference `[^foo]` → + footnotes. +- [x] `appendix-license` — `license:` / `copyright:` meta + + `:::{.appendix}` user block + footnotes → appendix-structure + (+ footnotes interaction). +- [x] `combined-stress` — sectionize + callouts + shortcodes + interacting. 
+ +**New fixtures (gap audit):** + +- [x] `code-block-fenced` — fenced ``` ```python ``` block with content + → code-block-generate, code-block-render, code-highlight stage. +- [x] `lua-shortcode-version` — `{{< version >}}` → shortcode-resolve + (Lua-loaded handler path; simplest deterministic case — returns + `quarto.version` joined by dots). +- [x] `lua-shortcode-lipsum-fixed` — `{{< lipsum 3 >}}` (no `random=` + kwarg) → shortcode-resolve via lipsum's Lua handler. The + `math.randomseed` in `lipsum.lua:5` runs but `math.random` is never + called on this code path, so the output is the first three + paragraphs of the canned data deterministically. The `random=true` + variant is intentionally non-deterministic and out of scope. + **In Phase-5 queue (bd-3odjm)**: pipeline IS idempotent (SingleFile + mode passes), but ProjectOrchestrator panics with + `MalformedSourceInfoPool` re-parsing the AST JSON — known Plan-5 + wire-format issue, not a transform bug. +- [x] `proof` — `::: {.proof} ... :::` → proof-sugar. +- [x] `equation-labeled` — `$$ E=mc^2 $$ {#eq-mass}` paired with + `@eq-mass` → equation-label, crossref-resolve (equation branch). +- [x] `toc-on` — `toc: true` + multiple sections → toc-generate, + toc-render. +- [x] `video-filter-header` — exercises + `resources/extensions/quarto/video/video-filter.lua` (the only + built-in Lua filter under `resources/extensions/`). The `quarto/video` + extension is **embedded at compile time** (`include_dir!` of + `resources/extensions/` in + `crates/quarto-core/src/extension/mod.rs:33`) and auto-discovered for + every `StageContext::new()` call (`stage/context.rs:221`), so the + fixture needs no scaffolding beyond declaring the filter. Minimal + shape: + + ```yaml + --- + filters: + - video + --- + + # Title {background-video="https://www.youtube.com/embed/abc"} + ``` + + The filter rewrites `background-video` → `background-iframe` on + Headers whose URL matches one of three video hosts. 
Pattern matches + the smoke-test at + `crates/quarto/tests/smoke-all/extensions/filter-extension/test.qmd`. +- [x] `include-in-header` — `include-in-header: foo.html` in meta with + trivial `foo.html` → include-resolve stage. +- [x] `theme-bootstrap` — `theme: cosmo` (or default) in meta → + compile-theme-css stage. +- [x] `table-bootstrap-class` — a simple pipe table (`| col | + --- | val |`) → `TableBootstrapClassTransform`. The transform + attaches Bootstrap CSS classes (`table`, etc.) to `Table` nodes; + the assertion that the same classes appear in the same order on + both runs is the idempotence check. Minimal shape: one + two-column, two-row pipe table; no extra config needed. + +**Website-project fixtures** (each needs a `ProjectContext` wired to a +`_quarto.yml` with `project.type: website` + the relevant config; one +combined fixture can cover most chrome transforms): + +- [x] `website-chrome` — minimal website with navbar, sidebar, page + navigation, footer, favicon, bootstrap icons → website-title-prefix, + website-favicon, website-bootstrap-icons, website-canonical-url, + navbar-generate/render, sidebar-generate/render, page-nav-generate/render, + footer-generate/render, link-resolution stage. +- [x] `website-links` — internal `.qmd` body links between two project + pages → link-rewrite + link-resolution. (bd-rz2we: fixed by + splitting `vfs_root` into write-root + url-root so native test + helpers can pass a synthetic URL prefix while disk writes still + land in the tempdir; see + `claude-notes/plans/2026-05-21-vfs-url-write-root-split.md`.) +- [x] `website-listing` — minimal listing with two items, one with + categories, one with `feed:` config → listing-generate, listing-render, + categories-sidebar, listing-feed-link, listing-feed-stage (native only), + listing-item-info stage. 
+ +**Attribution fixture** (the test helper installs an +`AttributionSourceProvider` on `RenderContext.attribution_provider`; +`run_pipeline` forwards it to `StageContext.attribution_provider` at +`pipeline.rs:664`): + +- [x] `attribution-basic` — document with an installed + attribution provider → attribution-generate stage, attribution-render + transform. Uses `PreBuiltAttributionProvider` (transport JSON) rather + than `GitBlameProvider` for determinism — git history would vary + across machines. + +**Resource fixture:** + +- [x] `resource-image` — `![alt](./local.png)` with the image file + present → resource-collector. Image is a 67-byte minimal valid PNG + written via `write_bytes`. + +If a fixture in this list discovers non-idempotence on first run, +**leave the test failing** and file a beads issue using the sub-agent +investigation prompt template in §"CI failure policy & sub-agent +prompt template." +The fix lands against the appropriate transform's crate (per §"What +happens when a fixture fails"). Do not silently drop the fixture, and +do not `#[ignore]` it without explicit user approval — failing tests +are the triage backlog. + +## Decisions (was: open questions) + +- **Test crate location** — settled. The test lives at + `crates/quarto-core/tests/idempotence.rs` as a workspace-level + integration test (matches the existing pattern in + `crates/quarto-core/tests/` — `sidebar_pipeline.rs`, + `navbar_footer_pipeline.rs`, `render_page_in_project.rs`, etc.). + Invoke with `cargo nextest run -p quarto-core --test idempotence`. +- **Fixture location** — settled. Files in + `crates/quarto-core/tests/fixtures/idempotence/`, one subdirectory + per non-trivial fixture (for the website/multi-file ones); in-source + literals for the trivial single-page cases written by the fixture's + `setup` closure into a `TempDir`. Pattern matches + `crates/quarto-core/tests/fixtures/websites/hub-smoke/` and + `phase5-website-baseline/`. 
+- **`ProjectContext` setup for website fixtures** — resolved by + reuse. There is **no need** to write a `make_website_project_ctx` + helper. The existing pattern across `crates/quarto-core/tests/` + (used by `render_page_in_project.rs`, `sidebar_pipeline.rs`, + `navbar_footer_pipeline.rs`, `page_navigation_pipeline.rs`, + `listing_pipeline.rs`, `navigation_e2e.rs`, `link_rewriting_pipeline.rs`, + `website_post_render.rs`) is: write `_quarto.yml` + page contents + into a `TempDir`, then let `ProjectContext::discover` do the rest. + Each Plan 3 fixture's `setup` closure does exactly this in 5-30 + lines. The chrome transforms read their config from the discovered + project — they don't need a parameterized builder. For the website + fixtures (`website-chrome`, `website-links`, `website-listing`) + we can either inline the YAML in `setup` or, for the larger ones, + use `copy_fixture(...)` (see `render_page_in_project.rs:616`) to + pull a pre-built fixture directory out of `tests/fixtures/idempotence/`. +- **Fixture-authoring rules for path-recording transforms** — + settled. Fixtures that exercise `resource-collector`, + `include-resolve`, `BUILTIN_EXTENSIONS` (any built-in extension + lookup), or other transforms that record absolute paths into meta + MUST use only paths that resolve relative to the fixture root, + never absolute process paths. Reason: the built-in extensions + resource bundle extracts to a `temp_dir()`'d location whose + absolute path differs across processes (stable within a single + process — fine for Plan 3's two-runs-compare contract, but a + latent issue for any future stored-snapshot variant). The + fixtures README must spell this out. 
Two practical rules: + (1) use relative URLs in fixture body content (`./local.png`, + not `/private/var/.../local.png`); + (2) when a transform's output includes a path, the assertion must + hash the value through `compute_meta_hash_fresh_excluding_rendered` + (which we already do) so test-process-specific paths under + `rendered.*` are excluded by construction. +- **Orchestrator-mode `DocumentAst` extraction** — settled on + option (a). The orchestrator path emits the AST as a JSON string + via `Pass2Payload::AstJson`; `Pass2Payload::as_ast_json()` + (`crates/quarto-core/src/project/pass2_renderer.rs:272`) is + already in the API. `pampa::readers::json::read` + (`crates/pampa/src/readers/json.rs:1063`) parses it back into a + typed `(Pandoc, ASTContext)`. The JSON writer emits source_info + triples (`include_inline_locations: true`), and those round-trip + through the reader — but **the hash explicitly excludes + source_info** (`compute_blocks_hash_fresh` / + `compute_meta_hash_fresh` both skip it), so no stripping pass is + needed. Cost: one JSON-string parse per orchestrator-mode + assertion, no production plumbing change. The earlier draft of + this section preferred option (b) (forward typed `DocumentAst` + through `PreviewAstOutput` / `WasmPassTwoOutput`) — abandoned + because (a) needs *no* type changes and the source_info concern + doesn't actually bite the hash. +- **bd-2ag1c ordering** — Plan 3 lands first; bd-2ag1c (whether + `BootstrapJsStage` / `ClipboardJsStage` belong in + `Q2_PREVIEW_STAGE_EXCLUDED`) waits for Plan 3's coverage. The + rationale: Plan 3 is what *measures* whether those stages + contribute non-determinism to the q2-preview AST; if they don't + (they currently only write to `ctx.artifacts`, not to `meta` or + `blocks`), bd-2ag1c can be closed without changes. If they do, + bd-2ag1c picks up the cleanup with measurements in hand. 
+ +### CI failure policy & sub-agent prompt template + +The test fails noisily if any transform / filter is non-idempotent +— that's the point. Failing fixtures stay **failing** (no +auto-`#[ignore]`). For each failure, file a beads issue whose +description doubles as a self-contained sub-agent investigation +prompt: the fixture path, the two hash values, the diverging key +path (block vs meta), and the suspected stage / transform / filter +to focus on. `#[ignore]` is only applied when the user explicitly +says so. **The integration branch (`feature/provenance`) is allowed +to be red while the queue is being drained** — see §"Long-lived +branch policy" at the top of this plan and §"Phase 5 — Failure +triage" for the operational mechanics. + +Sub-agent prompt template (filled in per failure when filing the +beads issue — the test driver's panic message provides the +fixture, mode, hashes, and `DivergencePoint`, so the agent already +has a concrete starting point): + +> Investigate non-idempotence in q2-preview fixture +> `<fixture-name>` (`<SingleFile | ProjectOrchestrator>` mode). Two consecutive pipeline +> runs over the same input diverge at +> `<DivergencePoint: "Block { index: N }" or "MetaKey { path: ["listings", "foo"] }">`. Hashes: blocks +> `<blocks-hash-a>` vs `<blocks-hash-b>`, meta `<meta-hash-a>` vs `<meta-hash-b>`. Read +> `claude-notes/plans/<this-plan>.md` §"Failure modes the test +> catches" for category guidance. Reproduce with `cargo nextest +> run -p quarto-core --test idempotence <test-name>`. +> Suspected source likely lives in `<suspected stage / transform / filter>` based on +> the divergence location — start there. Verdict: deterministic +> source (HashMap iteration, time, RNG) → propose a fix; +> non-deterministic but semantically equivalent (e.g. attribute +> ordering inside an HTML chrome payload) → propose either +> canonicalization at the source or a targeted hash exclusion. Do +> not `#[ignore]` the test. + +## References + +Line numbers below are accurate as of `feature/provenance` HEAD on +2026-05-21. Plan 4's source_info churn or any pipeline reorganization +may shift them — when in doubt, grep by symbol name. 
The plan's +factual content survives line-number drift; the references are a +convenience for navigating, not a contract. + +- `crates/quarto-core/src/pipeline.rs:1237` + `build_q2_preview_transform_pipeline` — q2-preview transform list, + source of truth. +- `crates/quarto-core/src/pipeline.rs:1198` + `Q2_PREVIEW_TRANSFORM_EXCLUDED` — the four transforms that don't run. +- `crates/quarto-core/src/pipeline.rs:380` + `build_q2_preview_pipeline_stages` — stage-level pipeline. +- `crates/quarto-core/src/pipeline.rs:356` + `Q2_PREVIEW_STAGE_EXCLUDED` — three excluded stages + (`math-js`, `render-html-body`, `apply-template`). +- `crates/quarto-core/src/pipeline.rs:627` + `run_pipeline` — pipeline execution entry point used by the test + runner. +- `crates/quarto-core/src/pipeline.rs:859` + `render_qmd_to_preview_ast` — production entry point that combines + `build_q2_preview_pipeline_stages` + `run_pipeline`; mirrors the + `DriveMode::SingleFile` helper. +- `crates/quarto-core/src/pipeline.rs:168` + `PreviewAstOutput` — currently carries only `ast_json: String` + (no typed `DocumentAst`). +- `crates/quarto-core/src/transforms/` — the Rust transform crate root. + Each transform's `name()` matches the kebab-case strings listed in + §"What 'built-in' covers." +- `crates/quarto-core/src/transforms/code_highlight.rs:126` + `CodeHighlightStage`'s native user-grammar disk scan + (`ctx.project.dir.join("_quarto").join("grammars")`). OS-order- + dependent if a grammar directory is present; not exercised by + Plan 3 fixtures (see §"Noted, not actively tested"). +- `crates/quarto-core/src/transforms/shortcode_resolve.rs:513` + `ShortcodeResolveTransform::transform` — site of the per-pipeline + fresh `LuaShortcodeEngine::new` construction. +- `crates/quarto-ast-reconcile/src/hash.rs:115` + `compute_blocks_hash_fresh` — the existing blocks hasher (slice). 
+- `crates/quarto-ast-reconcile/src/hash.rs:102` + `compute_block_hash_fresh` — the singular per-block hasher used + by `find_first_divergence` for index-keyed comparison. +- `crates/quarto-ast-reconcile/src/hash.rs:767` + `test_same_content_same_hash` — confirms blocks hash excludes + source_info. +- `crates/pampa/src/lua/filter.rs:158` + `apply_lua_filter` — per-filter Lua engine creation point (singular). + Driven by `apply_lua_filters` (plural, line 270), which loops over + `filter_paths` and calls the singular form once per filter. +- `crates/pampa/src/lua/shortcode.rs:68` + `LuaShortcodeEngine::new` — per-pipeline Lua engine for shortcodes + (constructed on the stack inside + `ShortcodeResolveTransform::transform`). +- `crates/pampa/src/readers/json.rs:1063` + `pampa::readers::json::read` — re-parses AST JSON back into a + typed `(Pandoc, ASTContext)`; used by `DriveMode::ProjectOrchestrator` + to recover a typed AST for hashing. +- `crates/quarto-core/src/project/pass2_renderer.rs:272` + `Pass2Payload::as_ast_json` — accessor used by both the existing + test in `render_page_in_project.rs` and Plan 3's orchestrator-mode + helper. +- `crates/quarto-core/src/project/pass2_renderer.rs:254` + `Pass2Payload::AstJson` — variant currently carries only + `ast_json: String`. +- `crates/quarto-core/src/stage/context.rs:221` + `StageContext::new` — calls `discover_extensions` with the embedded + built-in extensions path, so the `quarto/video` filter extension is + always discoverable without per-fixture scaffolding. +- `crates/quarto-core/src/extension/mod.rs:33` + `BUILTIN_EXTENSIONS_DIR` — compile-time + `include_dir!(resources/extensions)` ensures the video/lipsum/version/ + kbd/placeholder extensions are baked into the binary. +- `crates/quarto-core/tests/render_page_in_project.rs:660` + `render_active_page_preview` — the **existing** + `DriveMode::ProjectOrchestrator` helper. Reused verbatim by Plan 3, + not reimplemented. 
+- `crates/quarto-core/tests/render_page_in_project.rs:64` + `render_active_page` — sibling HTML helper; useful prior art for + the project-discovery pattern even though Plan 3 doesn't use it + directly. +- `crates/quarto-core/tests/render_page_in_project.rs:616` + `copy_fixture` — utility for copying a pre-built fixture directory + out of `tests/fixtures/` into a `TempDir`. Available for the + heavier website fixtures. +- `crates/quarto-core/tests/fixtures/websites/hub-smoke/`, + `crates/quarto-core/tests/fixtures/phase5-website-baseline/` — + example website fixture directories with `_quarto.yml` + multi-page + layouts. Demonstrates the shape Plan 3's website fixtures take. +- `resources/extensions/quarto/video/video-filter.lua` — the one + built-in Lua filter today. +- `claude-notes/plans/lua-filter-pipeline/00-index.md` — Carlos's + 2025-12-21 analysis of **TypeScript Quarto**'s `run_as_extended_ast()` + Lua filter pipeline (~78 stages classified by side-effect category). + This is porting reference material for the broader epic, **not** the + inventory Plan 3 tests. Plan 3's universe is enumerated in §"What + 'built-in' covers." Useful when porting an additional TS filter into + Rust and wondering whether the source-side analysis flagged it as + pure / file-reading / network / subprocess. + +## Work items + +### Phase 1 — Hashing infrastructure + +- [x] Add `compute_meta_hash_fresh` in + `crates/quarto-ast-reconcile/src/hash.rs`, parallel to + `compute_blocks_hash_fresh`. Walks `ConfigValue` tree + source-info-agnostically. Hashes scalars by `Yaml` payload, recurses + into `PandocInlines` / `PandocBlocks` via the existing inline / block + hashers, hashes `Array` in order, hashes `Map` entries + `(key_string, recurse(value))` **in insertion order** (no sort), + **includes `merge_op`**, skips `source_info` and `key_source`. (See + §"In scope" for the full spec.) 
+- [x] Add `compute_meta_hash_fresh_excluding_rendered` variant that + skips the `rendered` top-level key (HTML-string side outputs from + chrome transforms + `IncludeResolveStage` + Bootstrap/clipboard + injection). +- [x] Add unit tests for both: + - same content → same hash; + - different content → different hash; + - different `source_info` / `key_source` → same hash; + - same content with `rendered.foo` key only differing → same hash + for the excluding variant; + - **same content with Map keys in different insertion order → + different hash** (regression guard for the no-sort choice); + - different `merge_op` → different hash (regression guard for the + `merge_op`-participates choice). +- [x] Add `find_first_divergence(blocks_a, meta_a, blocks_b, meta_b) + -> DivergencePoint` alongside the hashers. The plan-sketch signature + took `&DocumentAst`, but `DocumentAst` lives in `quarto-core` and + `quarto-ast-reconcile` cannot depend on it; the helper takes the + underlying `&[Block]` + `&ConfigValue` instead, and the test driver + in `quarto-core/tests/idempotence.rs` will project from its + `DocumentAst`. Reuses `compute_block_hash_fresh` for the block walk + and a recursive insertion-order traversal for the meta walk; both + walks short-circuit on the first divergence. +- [x] Unit tests for `find_first_divergence`: + - identical docs → `DivergencePoint::None`; + - one block differs at index N → `Block { index: N, ... }`; + - one meta key path differs → `MetaKey { path: [...], ... }`; + - divergence under a `rendered.*` path → not reported (skipped to + match `compute_meta_hash_fresh_excluding_rendered`). + +### Phase 2 — Test crate scaffolding + +- [x] Create `crates/quarto-core/tests/idempotence.rs`. +- [x] Implement the `Fixture` struct + `run_fixture(fixture, mode)` + helper that loops `DriveMode::{SingleFile, ProjectOrchestrator}` + (see §"What gets tested concretely" for the body). 
+- [x] Implement `run_single_file(project_dir, active) -> DocumentAst` + using `ProjectContext::discover` + `build_q2_preview_pipeline_stages` + + `run_pipeline`. (~50 lines; the only genuinely new driver.) +- [x] Implement `run_orchestrator(project_dir, active) -> DocumentAst` + by delegating to the existing `render_active_page_preview` helper + at `crates/quarto-core/tests/render_page_in_project.rs:660` and + re-parsing `Pass2Payload::as_ast_json()` via + `pampa::readers::json::read`. (Helper copied inline since each + `tests/*.rs` is its own binary; the plan flags this as acceptable.) + No new orchestrator wiring is written; no production plumbing + change is needed. +- [x] Implement `pandoc_to_document_ast(pandoc) -> DocumentAst` — the + small field-shuffle between the re-parsed `Pandoc` and the + hashing helpers' expected shape. Land inline in `idempotence.rs`; + do not promote to library code until a second caller appears. +- [x] Create `crates/quarto-core/tests/fixtures/idempotence/` + directory with a README listing the fixture-format rules: + - no executable engine cells (fenced `` ```python `` blocks only); + - **no absolute process paths** in fixture content — see §"Decisions" + / "Fixture-authoring rules for path-recording transforms"; + - per-fixture mode mapping (document fixtures run in both modes; + website fixtures orchestrator-only). +- [x] Borrow `write` / `canonical` (and `snippet` / `copy_fixture` when + needed by Phase 4 fixtures) from `render_page_in_project.rs` — + copied into `idempotence.rs` for now; pulling them into a shared + `tests/common/` module is out of scope for Plan 3. +- [x] **Phase-2 smoke fixture** (`smoke_plain_paragraph`) drives both + modes on a single-paragraph document. Passing this confirms the + harness is wired correctly before Phases 3-4 add the real fixtures. 
+ +### Phase 3 — Existing-fixture coverage (carry-forward) + +- [x] Add fixtures: `meta-single`, `meta-markdown`, `include-trivial`, + `callout-warning`, `theorem`, `figure-ref-target`, + `crossref-to-theorem`, `sectionize-multi`, `footnotes-mixed`, + `appendix-license`, `combined-stress`. +- [x] Wire one assertion per (fixture, mode) pair — these are all + document fixtures, so each runs in both `SingleFile` and + `ProjectOrchestrator` mode. (For now each `#[test]` calls + `run_in_each_mode` which loops over both modes; if a fixture + later goes red in only one mode, the panic message names the + mode, and we can split into two `#[test]` functions for finer + reporting at that point.) + +All 11 carry-forward fixtures pass on first run, in both modes. +No queue entries. + +### Phase 4 — New-fixture coverage (gap closure) + +- [x] Add document-level fixtures (run in **both** modes), batch 4a + (no extra scaffolding): `code-block-fenced`, `lua-shortcode-version`, + `lua-shortcode-lipsum-fixed` (with module-load `randomseed` comment + in the `.qmd` per §"Noted, not actively tested"), `proof`, + `equation-labeled`, `toc-on`, `video-filter-header`, + `theme-bootstrap`, `table-bootstrap-class`. **8/9 pass on first run.** + `lua_shortcode_lipsum_fixed` fails in `ProjectOrchestrator` mode + only — not a hash mismatch but a `MalformedSourceInfoPool` when + re-parsing the orchestrator's AST JSON. JSON writer/reader + round-trip bug specific to lipsum-shortcode-generated inlines. + Filed as **bd-3odjm**; root-caused 2026-05-21 to the type-code-3 + mismatch between writer (`FilterProvenance` payload + `[filter_path, line]`) and reader (still decodes code 3 as legacy + `Transformed` `[parent_id, ...]`). Fix is owned by + **[Plan 5](2026-05-04-q2-preview-plan-5-wire-format.md)** — + §Goal calls this exact bug out and Plan 5's reader change handles + both shapes. 
Per the long-lived-branch policy below, this stays + red on `feature/provenance` until Plan 5 lands; do not patch + locally. `SingleFile` mode passes — the pipeline itself is + idempotent. +- [x] Add document-level fixtures, batch 4b (multi-file): + `include-in-header` (writes a small HTML stub), + `resource-image` (writes a 67-byte minimal PNG). Both pass on + first run in both modes. +- [x] Add website-project fixtures (orchestrator-mode only): + `website-chrome`, `website-links`, `website-listing`. **All 3 + pass.** `website-chrome` (navbar + sidebar + page-nav + footer + + favicon + bootstrap-icons + canonical-url) is clean. + `website-listing` (listing with categories + feed) is clean. + `website-links` (cross-page `.qmd` body links): initial divergence + filed as **bd-rz2we** turned out to be `ResourceResolverContext` + conflating two roles — disk-write root *and* URL prefix — in a + single `PathBuf`. Native test helpers pointed both at a real + tempdir, so rendered link URLs leaked the absolute tempdir path + into the AST. Fixed by splitting the field into + `{ write_root, url_root }` and adding a per-renderer + `with_url_root("/.quarto/project-artifacts")` builder; native + test helpers now keep the tempdir for disk writes but use the + synthetic prefix for URLs. See + `claude-notes/plans/2026-05-21-vfs-url-write-root-split.md`. +- [x] Add attribution fixture: `attribution-basic` (both modes). + Extended `Fixture` with an optional `attribution_json: Option<&'static str>` + field. `run_single_file` installs a `PreBuiltAttributionProvider` + on `ctx.attribution_provider` when present; `render_active_page_preview` + forwards the JSON to `RenderToPreviewAstRenderer::with_attribution`. + Stub JSON has one actor + one run covering bytes 0..1024 so the + attribution map overlaps the entire fixture body. Passes on first + run in both modes. 
+ +### Phase 5 — Failure triage + +`feature/provenance` is a **long-lived integration branch** that +holds failing fixtures *on purpose* until the queue is drained. +The plan does not merge to `main` while any fixture in this gate is +red. See §"Long-lived branch policy" at the top of this plan for +the rationale; what follows is the operational loop. + +- [x] Run the full test suite. For each failing fixture, classify the + cause (filter non-idempotence, transform non-determinism, + metadata-merge issue, etc.). +- [x] For each failure: either fix in-place (if scope is contained and + obvious) or **file a beads issue using the sub-agent investigation + prompt template** from §"CI failure policy & sub-agent prompt + template." Failing tests **stay failing** — no auto-`#[ignore]`. + Only ignore when the user explicitly says so. +- [x] Keep the (still-failing) tests on the integration branch so each + beads issue has a live reproduction. The integration branch may + stay red for an extended period; the merge to `main` happens only + after the queue is drained (every red fixture either fixed or + explicitly `#[ignore]`-d with a permanent rationale signed off + by the user). The failing tests *are* the triage backlog. + +**Queue state after Phase 4 (initial run):** 25 of 27 fixtures green; +2 in the queue. + +- **bd-3odjm** — `lua_shortcode_lipsum_fixed` orchestrator mode. + `MalformedSourceInfoPool` on `pampa::readers::json::read` for the + AST JSON the orchestrator emits. Root-caused to the type-code-3 + mismatch between the writer (`FilterProvenance` payload + `[filter_path, line]`) and the reader (still decodes code 3 as + legacy `Transformed` `[parent_id, ...]`). Fix is owned by + [Plan 5](2026-05-04-q2-preview-plan-5-wire-format.md). +- **bd-rz2we** — `website_links` orchestrator mode. Block 0 hash + diverges across runs with different project roots; meta hash is + stable. 
Hypothesis: link-rewrite or link-resolution captures the + absolute project root (or a canonicalized tempdir form) into the + AST when it should emit a path-independent relative URL. + +### Phase 6 — Documentation + +- [x] Add `claude-notes/instructions/idempotence-contract.md` covering: + what the contract requires of new transforms, the meta-hash + `rendered.*` exclusion, how to add a fixture when introducing a new + transform, the engine-cells-forbidden rule. +- [x] Cross-link from the README of the fixtures directory. +- [x] Cross-link from Plan 7a (so authors looking at runtime user-filter + idempotence find the CI contract too). + +### Phase 7 — Verification + +- [x] `cargo nextest run --workspace` runs. **9346/9348 pass; 2 fail.** + The two failures are the documented queue items above + (bd-3odjm, bd-rz2we). Every other test in the workspace is green, + including the Phase-1 unit tests in `quarto-ast-reconcile` and + the 25 passing idempotence fixtures. +- [x] `cargo xtask verify` runs (full WASM stack — `npm install` from + repo root, `npm run build:wasm` from hub-client). Steps 1-4 green; + Step 5 (Rust tests with `-D warnings`) fails on the same 2 + queue-item fixtures. Steps 6-12 don't run because of Step 5's + exit; that's the expected long-lived-integration-branch state + per §"Long-lived branch policy" — the gate is red on purpose + until the queue is drained. +- [x] End-to-end invocation recorded in commit messages + (`cargo nextest run -p quarto-core --test idempotence` cited in + every Phase-2 through Phase-4d commit). + +**Plan 3 is complete as a deliverable** — the gate exists, the +hashing infrastructure exists, 27 fixtures cover the universe under +§"What 'built-in' covers", the contract is documented, and the +queue is filed in beads with reproduction commands. Merge to `main` +remains gated on draining the queue (bd-3odjm via Plan 5; bd-rz2we +via a follow-up). 
+ +## Dependencies + +- Depends on: Plan 1 (`build_q2_preview_pipeline_stages` exists and + runs). +- Blocks: implicitly Plans 4-8 (round-trip work assumes this contract + holds — but for pipeline non-determinism only; round-trip itself is + 7a's concern). +- Related to Plan 7a (runtime user-filter idempotence check). Plan 3 + is the **CI-time** half for built-ins (transforms + ship-with-Quarto + Lua filters); Plan 7a is the **runtime** half for user-supplied + filters. The two share `compute_blocks_hash_fresh` / + `compute_meta_hash_fresh` and the same flavor-1-vs-flavor-2 + distinction. See Plan 7a's §"Two flavors of non-idempotence" for the + shared vocabulary. + +### What happens when a fixture fails + +Plan 3 reports failures; the *fix* lands wherever the offending +transform / filter lives. Failure modes and where their fixes go: + +- **Non-idempotent built-in Lua filter**. Edit the filter's Lua + source. Lands in `resources/extensions/quarto/<extension-name>/`. Plan 3 + surfaces the test. +- **Non-deterministic transform attribute / `plain_data` ordering**. + HashMap iteration or similar. Lands in the transform's `.rs` file + under `crates/quarto-core/src/transforms/`. +- **Non-deterministic metadata transform**. Lands in + `metadata_normalize.rs` or wherever the offending merge/normalize + step lives. +- **Source-info-related instability**. Should NOT happen because the + hashers exclude source_info / key_source. If somehow it does, + Plan 4's type changes are the place to investigate. + +If a fixture fails on first run, **leave the test failing** and file +a beads issue (with the sub-agent investigation prompt from §"CI +failure policy & sub-agent prompt template"). The failing test stays +red until the issue is resolved — `#[ignore]` only when the user +explicitly says so. Do not silently disable. + +## Risk areas + +- **A transform or filter might fail the test on first run**. 
Triaged + per Phase 5; **leave failing + file a sub-agent investigation prompt** + (see §"CI failure policy & sub-agent prompt template"). `#[ignore]` + only when the user explicitly says so. +- **Hash stability across binary versions**: `FxHasher`'s output is + stable within a Rust process but not across versions. Tests compare + hashes computed in the same process, not stored as constants. This is + the natural shape of "run pipeline twice and compare" anyway. +- **Pipeline construction non-determinism**: if extension discovery + picks up paths in OS-dependent order, attributes could differ on + different machines. Mitigated by fixture isolation — fixtures don't + reference real OS paths unless explicitly testing a path-aware + feature. The attribution fixture is the main case to watch. +- **Website-project fixture complexity**: assembling a valid + `ProjectContext` is non-trivial. Risk: time spent on test + scaffolding rather than transform coverage. Mitigation: reuse the + existing pattern (write `_quarto.yml` + page contents into a + `TempDir`, call `ProjectContext::discover`) — the same recipe + used by ~10 sibling tests in `crates/quarto-core/tests/`. No + parameterized builder is needed. See §"Decisions" / + "ProjectContext setup for website fixtures." + +### Noted, not actively tested + +Two latent determinism surfaces surfaced during the source review. The +test suite isn't expected to flake on either; they're recorded here so +the next person who *does* hit a hash divergence in their neighborhood +has a head start: + +- **`CodeHighlightStage`'s native disk scan for user grammars** + (`crates/quarto-core/src/transforms/code_highlight.rs:126-129`). + On native, when no `user_grammar_provider` is supplied (CLI + default), the stage falls back to scanning + `ctx.project.dir.join("_quarto").join("grammars")` for user + grammars. If that scan returns paths in OS-dependent order, + attribute output could differ across machines. 
Fixtures here + don't supply user grammars, so the directory is absent and the + early-return at the top of the function makes the scan a no-op + in practice. Not tested today; flag if a future fixture + introduces a grammar dependency. +- **Lipsum module-load `randomseed`** + (`resources/extensions/quarto/lipsum/lipsum.lua:5`). The Lua module + calls `math.randomseed(os.time())` at load time, which runs once per + fresh `LuaShortcodeEngine`. On the non-random code path (`{{< lipsum + 3 >}}` — what `lua-shortcode-lipsum-fixed` exercises) `math.random` + is never reached, so the seed has no observable effect. If a future + variant routes through `math.random` (random shortcode-resolution + paths, random shortcode arg parsing) the test would start flaking + noticeably across runs. The fixture should carry a comment naming + this. + +## Estimated scope + +| Component | Lines (rough) | +|---|---| +| `compute_meta_hash_fresh` + excluding-rendered variant + tests | ~140 | +| `find_first_divergence` + `DivergencePoint` + tests | ~80 | +| Test crate scaffolding — `Fixture` struct, `run_single_file`, `run_orchestrator` (thin wrapper over existing helper), `pandoc_to_document_ast` shuffle | ~100 | +| Per-fixture `.qmd` files / inline literals (~25 fixtures, 5-30 lines each) | ~280 | +| Per-fixture (fixture, mode) test assertions (mostly one-liners; ~25 fixtures × 1-2 modes ≈ 40 pairs) | ~120 | +| `idempotence-contract.md` + fixtures README | ~80 | +| **Total** | **~800** | + +The scaffolding line item dropped from an earlier estimate of ~260 +to ~100 after pinning the orchestrator path on the existing +`render_active_page_preview` helper and choosing option (a) for +`DocumentAst` extraction — neither requires a new orchestrator +driver, a `make_website_project_ctx` builder, or production +plumbing changes. `PreviewAstOutput::ast` plumbing is no longer +needed (was ~20 lines in the earlier draft). 
+ +**Inventory note**: an earlier draft estimated "~10-20 built-in filters" +in `resources/extensions/`. That was wrong — `resources/extensions/` +contains one Lua filter (`video-filter.lua`) plus five shortcodes +(kbd, video, lipsum, version, placeholder). The bulk of the universe +under test is the **37 Rust transforms** in +`build_q2_preview_transform_pipeline`, plus the stage-level work in +`build_q2_preview_pipeline_stages`. + +Realistic shape: 2-3 focused sessions — one for hashing +infrastructure + scaffolding + carry-forward fixtures, one for +gap-closure fixtures (particularly the website-project ones), and +a third for Phase 5 triage if the first run surfaces multiple red +fixtures (which is the expected case, not a surprise). + +## Notes + +The user said: "Yes, idempotency and stable structural hash have to be +the base contract — so we have to work that out as part of this complex +of plans. Everything existing must be verified to have those +properties." This plan encodes that contract as a CI-enforced test. + +The hash function excluding source_info means that future plans (4-8) +that change source_info don't risk breaking idempotence — even if a +transform produces different source_info on different runs (e.g., a +Sectionize that generates synthetic source_info from current +timestamps; not what we do, but illustrative), the hash stays stable. + +Round-trip non-idempotence — the property +`pipeline(write(pipeline(x))) ≠ pipeline(x)` — is deliberately not +tested here. The pipeline doesn't re-parse its own output today, so +there's nothing to break. When Plan 7's incremental writer lands, +the property becomes load-bearing for blocks the writer rewrites. +Plan 7a's runtime check is the natural home for round-trip detection +**on user-supplied filters**: per-document, with per-filter attribution +and an `idempotent: false` opt-out, none of which a CI fixture gate +can provide. 
Round-trip on the built-in side (transforms + one Lua +filter) is consciously left unverified — see Plan 7a's §"Notes" for +the v1 acceptance reasoning. diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-3-filter-idempotence.md b/claude-notes/plans/2026-05-04-q2-preview-plan-3-filter-idempotence.md deleted file mode 100644 index 13262e828..000000000 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-3-filter-idempotence.md +++ /dev/null @@ -1,311 +0,0 @@ -# Plan 3 — Filter idempotence verification - -**Date:** 2026-05-04 -**Branch:** feature/q2-preview -**Status:** Implementation plan (open questions named) -**Milestone:** M2 verification gate (no new milestone — locks in property -on what's already shipped) - -## Goal - -Verify and lock in the **idempotence + structural-hash-stability** contract -for the q2-preview pipeline. This is the contract the user has stated must be -the foundation: every transform and every built-in Lua filter must produce the -same structural output when run twice on the same input. Without this, the -incremental writer's reconciliation cannot reliably preserve untouched -regions. - -This plan ships: -- A canonical fixture set covering the q2-preview transforms. -- A test that runs each fixture through the q2-preview pipeline twice and - asserts the resulting ASTs hash equal. -- Coverage for the built-in Lua filters that ship with Quarto (those in - `resources/extensions/`). - -When this plan lands, we have CI-enforced confidence that the q2-preview -round-trip story (Plans 4-8) rests on a stable foundation. - -## Scope - -### In scope - -- Canonical fixture set: small `.qmd` files exercising: - - Meta shortcode (single-inline resolution): `{{< meta foo >}}` where `foo` - is a single string. - - Meta shortcode (multi-inline resolution): `{{< meta foo >}}` where `foo` - contains markdown like `**Bold** title`. - - Include shortcode: `{{< include child.qmd >}}` (with a trivial child file). 
- - Lua filter (mutating): a filter that uppercases all `Str.text`. - - Lua filter (synthesizing): a filter that adds a `pandoc.Str("decoration")` - to each paragraph. - - Callout: `::: {.callout-warning} Body :::`. - - Theorem: `::: {.theorem #thm-foo} Math here :::`. - - Figure with cross-ref target: `:::: {#fig-foo} ![caption](img.png) ::::`. - - Cross-reference: `See @thm-foo`. - - Sectionized doc: a doc with `## Section A`, content, `### Subsection`, - content, `## Section B`, content. - - **Footnotes**: a doc with one inline footnote (`text^[footnote body]`) and - one reference-style footnote (`text[^foo]` + `[^foo]: definition`). - Exercises `FootnotesTransform` (now included in q2-preview's pipeline per - Plan 2B's audit) — produces the synthesized `` markers and the - `
` container. - - **Appendix**: a doc with `license:`, `copyright:`, and a user - `:::{.appendix} Body :::` block, plus footnotes from the previous fixture. - Exercises `AppendixStructureTransform` (also included per Plan 2B's audit) - — produces the `
` container with footnotes, - license, and copyright sections nested inside. - - Combined: a doc with several of the above interacting. -- Idempotence test runner: takes a fixture, runs the q2-preview pipeline - twice, hashes both ASTs via - `quarto_ast_reconcile::compute_blocks_hash_fresh`, asserts equality. -- Coverage of the built-in extensions' filters (those in - `resources/extensions/`): - - For each shipped filter, run the test against a fixture that triggers - that filter. - - Document which built-in filters pass / fail (in case any are - non-idempotent — flag for follow-up). -- Documentation in `claude-notes/instructions/`: a short note on the - idempotence contract for filter authors and transform authors. - -### Out of scope - -- Verification of *user-supplied* filters. They're per-document; the contract - is enforced at runtime via the idempotence test pattern, but we don't - pre-verify every possible user filter. -- Rust-vs-React rendering parity (different contract; later plan). -- Performance / debouncing — idempotence verification doesn't measure runtime. - -## Design decisions (settled in conversation) - -- **The hash is already source-info-agnostic** (verified during research). - `compute_block_hash_fresh` excludes `source_info`. Two runs producing nodes - with different source_info but identical content/attr/plain_data hash - identically. This is what makes the idempotence test work cleanly. -- **The contract's load-bearing property** is "double-pipeline-run produces - hash-equal AST." Equivalent to "every transform is idempotent, every filter - is idempotent, no transform is non-deterministic about plain_data or attr - ordering." -- **Filter mutation provenance stays Original** (settled during conversation). - Lua filter mutations don't change source_info. Constructions are tagged - `Synthetic { by: By::filter(...) }` (post-Plan 5). Idempotence test sees - consistent shape across runs. -- **Built-in filters in scope; user filters out**. 
Built-in filters ship with - Quarto and the contract applies to them at CI time. User filters are - enforced at edit-time (a non-idempotent user filter breaks q2-preview's - round-trip; the user sees corruption). - -## What gets tested concretely - -For each fixture: - -``` -let pipeline = build_q2_preview_pipeline_stages(); -let runtime = create_test_runtime(); - -let ast_1 = run_pipeline(fixture, pipeline.clone(), runtime.clone()); -let ast_2 = run_pipeline(fixture, pipeline, runtime); - -let hash_1 = compute_blocks_hash_fresh(&ast_1.blocks); -let hash_2 = compute_blocks_hash_fresh(&ast_2.blocks); - -assert_eq!(hash_1, hash_2, "fixture {} non-idempotent", fixture_name); -``` - -Failure modes the test catches: - -- A filter that's truly non-idempotent (e.g., `Str.text + "!"` produces - growing text on each run). -- A transform that emits non-deterministic attributes or plain_data - (e.g., HashMap iteration order in a sloppy implementation). -- A transform that mutates inputs differently across runs (probably - indicates a bug). - -Failure modes the test does NOT catch: - -- A transform that's idempotent but produces *wrong* output (wrong-but- - consistent — needs other testing). -- A filter that's idempotent for one input but non-idempotent for another - (need representative fixtures). -- **Round-trip non-idempotence** — see next section. - -### Two flavors of non-idempotence (and what this plan tests) - -There are two distinct properties that get loosely called "non-idempotence": - -1. **Pipeline non-determinism**: `pipeline(x)` produces different output - on repeat calls with the same input. Caused by filters that depend - on time, RNG, mutable global state, or undefined-order iteration. - **This is what Plan 3's current test catches** — running - `run_pipeline(fixture)` twice on the same source and comparing - hashes detects it cleanly. - -2. **Round-trip non-idempotence**: `pipeline(write(pipeline(x))) ≠ pipeline(x)`. 
- The filter is deterministic — same input always produces same - output — but applying the filter twice (once on source, once on - the qmd-writer-serialized output of the first pass) gives different - results. The classic case is `f(x) = x + "!"`: deterministic, but - `f(f(x)) ≠ f(x)`. **Plan 3's current test does NOT catch this** - because both runs are on the same source; the filter is applied - once to identical input, producing identical output. - -This second property is the one that actually breaks q2-preview's -writer round-trip. When the user edits and saves, the writer Verbatim- -copies unchanged blocks from source and Rewrites changed blocks via -the qmd writer. The Rewrite path emits the *post-filter* AST node -content as new source bytes; on the next pipeline run, the filter -re-applies to those bytes, and `f(f(x)) ≠ f(x)` shows up as text -drift on edited blocks. - -**Plan 7a's runtime check** (`claude-notes/plans/2026-05-04-q2-preview-plan-7a-filter-idempotence.md`) -targets round-trip non-idempotence explicitly, with a check that runs -the round-trip flavor: pipeline → write → pipeline, and hash-compares. -That plan is for **user filters at runtime**. - -### Plan 3 strengthening — folding the round-trip flavor into CI - -Plan 3 should be amended to also check round-trip non-idempotence -for built-in filters. The change is small: - -```rust -// Existing test: pipeline determinism -let ast_1 = run_pipeline(fixture, pipeline.clone(), runtime.clone()); -let ast_2 = run_pipeline(fixture, pipeline.clone(), runtime.clone()); -assert_eq!(blocks_hash(&ast_1), blocks_hash(&ast_2)); - -// New test: round-trip idempotence -let ast_a = run_pipeline(fixture, pipeline.clone(), runtime.clone()); -let qmd_a = qmd_write_to_string(&ast_a); -let ast_b = run_pipeline(&qmd_a, pipeline, runtime); -assert_eq!(blocks_hash(&ast_a), blocks_hash(&ast_b)); -``` - -Per-fixture cost: one extra pipeline pass + one qmd writer call. -Bounded; runs at CI time, not in the editor loop. 
- -This amendment is **in scope for Plan 3** (extends what's already a -CI test for built-ins). User filters get the runtime version via -Plan 7a. Add the second flavor to each fixture's assertion when -implementing Plan 3. - -## Open questions for implementation - -- **Test infrastructure location**: probably `crates/quarto-core/tests/` as - a workspace-level integration test crate. New test file like - `q2_preview_idempotence.rs`. Confirm during implementation. -- **Fixture format**: just `.qmd` files in a fixtures dir, or in-source - literal strings? Files are easier to maintain and review; literal strings - are easier to keep with the test. Probably files for the substantial cases, - literals for trivial ones. -- **How to drive the pipeline twice**: the natural approach is to build the - pipeline once and run it twice, OR build two identical pipelines and run - each on a fresh AST. Pipeline construction includes Lua engine setup which - may be stateful — confirm the second-run pipeline starts fresh. -- **Built-in filter inventory**: enumerate the filters in - `resources/extensions/`. Probably ~10-20. Each gets a fixture (or a - shared fixture if the trigger pattern is similar). -- **CI failure expectation**: does the test fail noisily if any built-in - filter is non-idempotent? Probably yes — that's the point. But we may - discover at first run that one or more is non-idempotent, requiring a - pre-existing fix before this plan can land. - -## References - -- `crates/quarto-ast-reconcile/src/hash.rs::compute_blocks_hash_fresh` — the - hash function we use. Verified excludes source_info. -- `crates/quarto-ast-reconcile/src/hash.rs:768` — existing test - `test_same_content_same_hash` — confirms hash excludes source_info. -- `crates/quarto-core/src/pipeline.rs::build_q2_preview_pipeline_stages` — - the pipeline under test (created by Plan 1). -- `resources/extensions/` — built-in extensions with their Lua filters. 
-- `claude-notes/plans/lua-filter-pipeline/` — Carlos's earlier analysis of - which filters are pure vs. side-effecting. - -## Test plan - -The plan IS the test plan. The deliverable is a test crate. - -- Per-fixture idempotence assertion (the main loop above). -- Per-built-in-filter idempotence assertion. -- Combined fixture (sectionized doc with callouts and shortcodes) as a - stress test. -- Documentation: when a future contributor adds a new transform or filter, - they should add a fixture covering it. Document this expectation in - `claude-notes/instructions/`. - -## Dependencies - -- Depends on: Plan 1 (`build_q2_preview_pipeline_stages` exists and runs). -- Blocks: implicitly Plans 4-8 (round-trip work assumes this contract holds). - We don't need this to *implement* those plans, but landing it before - reviewing them gives us confidence the foundation is solid. -- Related to Plan 7a (runtime user-filter idempotence check). Plan 3 - is the **CI-time** half of the contract for built-in filters; Plan 7a - is the **runtime** half for user-supplied filters. The two share the - same hash function (`compute_blocks_hash_fresh`) and the same - round-trip-vs-non-determinism distinction. See §"Plan 3 strengthening" - above and Plan 7a's §"Plan 3 strengthening" section. - -### What happens when a fixture fails - -Plan 3 reports failures; the *fix* lands in the appropriate downstream -plan, not in Plan 3. Three failure modes and where their fixes go: - -- **Non-idempotent built-in Lua filter**. The filter's contract is - broken. Fix: edit the filter's Lua source. Lands wherever the - filter lives (typically `resources/extensions/...`). Plan 3 just - surfaces the test. -- **Non-deterministic transform attribute ordering**. A transform that - iterates a HashMap or similar and emits attrs in non-deterministic - order. Fix: change the transform to emit deterministically. 
Lands - in the transform's source file (typically a Plan 6-shaped fix even - though it's not strictly a provenance issue — provenance audit and - determinism audit are sister concerns). -- **Source-info-related instability**. Should NOT happen because the - hash function excludes source_info. If somehow it does, Plan 4's - type changes are the place to investigate. - -If a fixture fails on first run, document the failure as a known issue -in Plan 3's commit message and file the fix as a follow-up against the -appropriate plan. Don't silently disable failing fixtures. - -## Risk areas - -- **A built-in filter might fail the test on first run**. If so, we either - (a) fix the filter before this plan lands or (b) document the failure as - a known issue and defer the fix. Plan should not silently disable failing - filters from the test set. -- **Hash stability across binary versions**: `FxHasher`'s output is stable - within a Rust process but not across versions. Tests should compare hashes - computed in the same process, not stored as constants. This is the natural - shape of "run pipeline twice and compare" anyway. -- **Pipeline construction non-determinism**: if the pipeline picks up extension - paths in OS-dependent order, attributes could differ on different machines. - Mitigated by fixture isolation — fixtures don't reference real OS paths - unless explicitly testing a path-aware feature. - -## Estimated scope - -| Component | Lines (rough) | -|---|---| -| Test runner harness | ~80 | -| Per-fixture qmd files | ~100 (across ~10 fixtures) | -| Per-fixture test assertions | ~150 | -| Built-in filter coverage | ~150 | -| Documentation | ~50 | -| **Total** | **~530** | - -Probably one focused session. Risk: if a built-in filter fails idempotence, -fixing the underlying issue may push this into two sessions. 
- -## Notes - -The user said: "Yes, idempotency and stable structural hash have to be the -base contract — so we have to work that out as part of this complex of plans. -Everything existing must be verified to have those properties." This plan -encodes that contract as a CI-enforced test. - -The hash function excluding source_info means that future plans (4-8) that -change source_info don't risk breaking idempotence — even if a transform -produces different source_info on different runs (e.g., a Sectionize that -generates synthetic source_info from current timestamps; not what we do, but -illustrative), the hash stays stable. diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-4-source-info-types.md b/claude-notes/plans/2026-05-04-q2-preview-plan-4-source-info-types.md index 0e9bff4fc..23e3eac33 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-4-source-info-types.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-4-source-info-types.md @@ -1,117 +1,432 @@ -# Plan 4 — SourceInfo provenance types (Synthetic + Derived + By struct) +# Plan 4 — SourceInfo provenance types (Generated + Anchor + AnchorRole) -**Date:** 2026-05-04 +**Date:** 2026-05-04 (substantially revised 2026-05-20) **Branch:** feature/q2-preview -**Status:** Implementation plan (open questions named) -**Milestone:** none directly — foundation for Plans 5/6/7/8 +**Status:** Implementation plan (ready to execute) +**Milestone:** none directly — foundation for the rest of the provenance + epic + +## Epic context + +Plans 3–8 (filter idempotence, this plan, JSON wire format, provenance +audit, incremental writer + soft-drop, runtime filter check, include +round-trip) make up the **provenance epic** — the second wave of work +on the q2-preview branch after Plans 1–2 landed. 
They share a common +target: a typed, source-mapped notion of "where did this AST node come +from" that lets the incremental writer round-trip edits, lets +attribution credit the right author, and lets future diagnostics surface +resolution chains to users. The file names keep their q2-preview-plan-N +form for continuity with the earlier discussion notes. ## Goal -Extend `SourceInfo` with two new variants: - -- `Synthetic { by: By }` — for nodes that have no source preimage at all - (Sectionize's section Divs, filter constructions, synthesized title h1s, - the footnotes container, etc.). Replaces the existing `FilterProvenance - { filter_path, line }` variant — FilterProvenance becomes the special - case `Synthetic { by: By::filter(...) }`. -- `Derived { from: Arc<SourceInfo>, by: By }` — for nodes that have a - source preimage AND distinct atomic semantics. Used for shortcode - resolutions: the resolved Str's `from` chain points at the shortcode - token's bytes, and the `by` records that this is shortcode-derived - content (so the writer can prohibit edits via Plan 7's atomic detection). - *Not* used for filter mutations (those stay `Original` — non-atomic) or - sugar transforms (their CustomNodes inherit Original from their input - Div — also non-atomic). - -`By` is an open `{ kind: String, data: serde_json::Value }` struct that -appears as the payload of both Synthetic and Derived. The `Original`, -`Substring`, `Concat` variants are unchanged. +Extend `SourceInfo` with a single new variant, `Generated`, that +captures every transform-synthesized node in a uniform shape: + +```rust +Generated { by: By, from: SmallVec<[Anchor; 2]> } +``` + +`by` answers "which transform produced me." `from` is a list of +typed, role-labeled source-info pointers that answer "which source +bytes contributed to me."
The list is empty for pure synthesis +(sectionize wrappers, filter constructions); has one `Invocation` +entry for shortcode resolutions; can carry additional roles +(`ValueSource`, future `Dispatch`, extension-defined `Other(...)`) as +the provenance picture sharpens. + +The pre-existing `FilterProvenance` variant folds into `Generated` +(with `by.kind == "filter"`). ## Scope ### In scope -- Add `Synthetic { by: By }` variant to `SourceInfo` enum. -- Add `Derived { from: Arc<SourceInfo>, by: By }` variant. +- Add `Generated { by: By, from: SmallVec<[Anchor; 2]> }` variant to `SourceInfo`. Inline capacity 2 covers the steady-state shape after the deferred follow-ups land (Invocation + ValueSource on `meta`/`var` shortcodes; Invocation + Dispatch on Lua-handler shortcodes); see §Risk areas for the trade-off. - Define `By` struct: `{ kind: String, data: serde_json::Value }`. -- Implement builder methods on `By` for known kinds: `filter`, `sectionize`, - `user_edit`, `shortcode`, `include`, `title_block`, `footnotes`, - `appendix`, `tree_sitter_postprocess`, `raw` (escape hatch). +- Define `Anchor` struct: `{ role: AnchorRole, source_info: Arc<SourceInfo> }`. +- Define `AnchorRole` enum: `Invocation`, `ValueSource`, `Other(String)`. + (`Dispatch` is a planned future role; see "Deferred anchor role" below.) +- Implement builder methods on `By` for known kinds: `filter`, + `sectionize`, `user_edit`, `shortcode`, `include`, `title_block`, + `footnotes`, `appendix`, `tree_sitter_postprocess`, `raw` (escape hatch). +- Implement helper accessors on `SourceInfo` for the `Generated` shape: + - `invocation_anchor(&self) -> Option<&Arc<SourceInfo>>` + - `value_source_anchor(&self) -> Option<&Arc<SourceInfo>>` + - `anchors_with_role(&self, role: &AnchorRole) -> impl Iterator<Item = &Arc<SourceInfo>>` + - `append_anchor(&mut self, role: AnchorRole, source_info: Arc<SourceInfo>)` - Migrate all `SourceInfo::FilterProvenance` construction sites to - `SourceInfo::Synthetic { by: By::filter(...) }`.
-- Migrate all `SourceInfo::FilterProvenance` pattern-match sites (~22 files - flagged earlier). + `SourceInfo::Generated { by: By::filter(...), from: smallvec![] }`, + carrying `(filter_path, line)` in `by.data`. +- Migrate all `SourceInfo::FilterProvenance` pattern-match sites + (15 files, 27 occurrences — see §Risk areas) to the new shape. - Remove the `FilterProvenance` variant. -- Update accessors: `start_offset`, `end_offset`, `length`, `map_offset`, - `remap_file_ids`, `extract_file_id` (in diagnostic.rs) to handle both - new variants. For `Derived`: recurse into `from` for offset accessors - (returns the `from`'s offsets if the chain leads to Original). -- Update Lua serde (`pampa/src/lua/diagnostics.rs`) for both new variants. - Keep `"FilterProvenance"` recognized as a legacy tag that maps to - `Synthetic { by: By::filter(...) }` for back-compat reads. +- Update accessors on `SourceInfo` to handle `Generated`: + - `length`, `start_offset`, `end_offset` — return `0` (same as today's + `FilterProvenance`; Generated has no characteristic local-text length). + - `map_offset` — return `None` (offset-within-current-text is undefined + for Generated; callers wanting source coordinates use + `resolve_byte_range`). + - `resolve_byte_range` — delegate to `invocation_anchor()` and recurse + (returns the invocation anchor's chain-resolved range, or `None` if + there is no invocation anchor). + - `remap_file_ids` — walk every `Anchor.source_info` and recurse via + `Arc::make_mut`. Unlike `FilterProvenance` (no-op), `Generated` CAN + carry `FileId`s inside its anchors. + - File-id extraction across the workspace is **consolidated** into + two new `SourceInfo` accessors (see "File-id accessor consolidation" + below). The six ad-hoc walkers in `diagnostic.rs`, + `pampa/.../location.rs`, `pampa/.../pipe_table.rs`, + `pampa/.../section.rs`, `apply_template.rs` (test), and + `engine_execution.rs` (test) all collapse onto `root_file_id()` / + `collect_file_ids()`. 
The Generated arm is defined once on those + accessors. Empty-`from` Generated returns `None`, which matches + today's `FilterProvenance` behavior; the two call sites in + `to_ariadne_report` (`diagnostic.rs:674`, `:773`) both tolerate + `None` gracefully (the main-location path falls through via `?`; + the detail loop `continue`s), so no caller change is required + beyond the mechanical swap to `si.root_file_id()`. +- Update Lua serde (`pampa/src/lua/diagnostics.rs`) for `Generated`. + Use `t = "Generated"` as the discriminant; the table carries `by` and + `from` sub-tables. Keep `"FilterProvenance"` recognized as a legacy + tag that maps to `Generated { by: By::filter(...), from: smallvec![] }` + for back-compat reads. ### Out of scope - JSON wire format changes (Plan 5 does that). - Audit of transforms emitting `SourceInfo::default()` to fix them - (Plan 6 does that). -- The `preimage_in` accessor (Plan 7 does that). -- Helper accessors like `as_filter()` — minimal interface in Plan 4; - helpers added as call sites need them (Plans 6/7). + (Plan 6 does that). `Default for SourceInfo` itself is unchanged + (stays `Original { FileId(0), 0, 0 }`); Plan 6 fixes incorrect + emissions at transform sites without modifying the trait impl. +- The `preimage_in` accessor (Plan 7 owns it). Plan 7's `preimage_in` + consumes `invocation_anchor()` defined here; the contiguity rule + for `Concat` lives with the implementation in Plan 7. +- The `is_atomic_custom_node` registry for CustomNode types (Plan 7 + owns it). +- The metadata loader changes that would populate `ValueSource` + anchors on `meta` / `var` shortcode resolutions — that's a separate + follow-up (see "Deferred anchor role" and Plan 6's "ValueSource + follow-up" section). +- Registering Lua filter files in `SourceContext` to enable typed + `Dispatch` anchors. See "Deferred anchor role" below. 
+ +## Inherited pre-existing failure (bd-3odjm) + +**One test in the workspace is expected to be red throughout Plan 4 +and only goes green when Plan 5 ships its first reader change.** Do +not try to fix it inside Plan 4. + +- Test: `cargo nextest run -p quarto-core --test idempotence lua_shortcode_lipsum_fixed` + (orchestrator mode only; `SingleFile` passes). +- Symptom: panic with `MalformedSourceInfoPool` when + `pampa::readers::json::read` re-parses the orchestrator's AST JSON. +- Root cause (already established): wire-format type-code-3 + collision — writer emits the new `FilterProvenance` payload + `[filter_path, line]` under code 3, reader still decodes code 3 + as the legacy `Transformed` `[parent_id, ...]`. +- Owner: [Plan 5 — wire format](2026-05-04-q2-preview-plan-5-wire-format.md). + +Plan 4's verification gate (Phase 7) and `cargo xtask verify` +therefore expect **exactly one** failing test in +`quarto-core::idempotence` (the test above) until Plan 5's first +reader fix lands. Any other failure is a Plan-4 regression and must +be triaged before continuing. + +This is the integration branch's intended long-lived-red state per +Plan 3's §"Long-lived branch policy" — Plan 4 ships on top of that +queue, not in spite of it. + +## Work items + +Phase-ordered. Each phase compiles cleanly before the next begins. +"Settled" items below (design decisions, semantics rules) are detailed +later in the plan — this list is the actionable extract. + +### Phase 1 — Type definitions in `quarto-source-map` + +- [x] Add `smallvec` to the workspace `Cargo.toml` (`[workspace.dependencies]`) + with the `serde` feature, and depend on it from + `crates/quarto-source-map/Cargo.toml`. Verified absent in both files + at the start of Plan 4. +- [x] Add `By` struct (`kind: String`, `data: serde_json::Value` with + `#[serde(default, skip_serializing_if = "serde_json::Value::is_null")]` + — the attribute path needs to be fully qualified, not the short + `Value::is_null` form). 
+- [x] Add `AnchorRole` enum (`Invocation`, `ValueSource`, `Other(String)`). +- [x] Add `Anchor` struct (`role: AnchorRole`, `source_info: Arc<SourceInfo>`). +- [x] Add `Generated { by: By, from: SmallVec<[Anchor; 2]> }` variant + to `SourceInfo`. Keep `FilterProvenance` for now — it's removed + at the end of Phase 5. +- [x] Verify the new enum still implements `Debug`, `Clone`, + `PartialEq`, `Serialize`, `Deserialize` (including with the + `SmallVec` field — needs `serde` feature on `smallvec`). + +### Phase 2 — Constructors and accessors + +- [x] `By::filter`, `By::sectionize`, `By::user_edit`, `By::shortcode`, + `By::include`, `By::title_block`, `By::footnotes`, `By::appendix`, + `By::tree_sitter_postprocess`, `By::raw`. +- [x] `By::shortcode` doc-comment states the required-Invocation-anchor + invariant (see §"Required-anchor invariant for `shortcode`" for + the exact wording). +- [x] `By::is_atomic_kind` (returns true for `filter | shortcode | + title-block | tree-sitter-postprocess`). +- [x] `By::is_kind`, `By::as_filter`. +- [x] `Anchor::invocation`, `Anchor::value_source` constructors. +- [x] `SourceInfo::generated(by)` constructor (empty `from`). +- [x] `SourceInfo::invocation_anchor`, `SourceInfo::value_source_anchor`. +- [x] `SourceInfo::anchors_with_role`, `SourceInfo::append_anchor`. + +### Phase 3 — Update existing accessors for the `Generated` arm + +- [x] `length`, `start_offset`, `end_offset` → return `0` (in `source_info.rs`). +- [x] `map_offset` → return `None` (in `mapping.rs`). +- [x] `resolve_byte_range` → delegate to `invocation_anchor()` and recurse. +- [x] `remap_file_ids` → walk `from`, recurse via `Arc::make_mut`. +- [x] Add `SourceInfo::root_file_id() -> Option<FileId>` accessor in + `source_info.rs`. +- [x] Add `SourceInfo::collect_file_ids(&self, out: &mut HashSet<FileId>)` + accessor in `source_info.rs`. +- [x] Migrate `DiagnosticMessage::extract_file_id` + (`quarto-error-reporting/src/diagnostic.rs:556`) → call + `si.root_file_id()`; delete the private fn.
+- [x] Migrate `extract_filename_index` + (`pampa/src/pandoc/location.rs:329`) — deleted entirely (callers + were tests only; tests deleted in favor of the unified + `root_file_id`/`collect_file_ids` coverage in source-map). +- [x] Migrate the inline-match file-id extraction in + `pampa/src/pandoc/treesitter_utils/pipe_table.rs:256-279` → + `table_start.root_file_id().unwrap_or(FileId(0))`. Fixes the + latent nested-Substring `FileId(0)` fall-through. +- [x] Migrate the inline-match file-id extraction in + `pampa/src/pandoc/treesitter_utils/section.rs:129-152` → + `table.source_info.root_file_id().unwrap_or(FileId(0))`. Same + latent-nested-Substring bug fixed. +- [x] Migrate the test-mod `root_file_id` local fn in + `crates/quarto-core/src/stage/stages/apply_template.rs:820` → + `info.root_file_id()`; delete the local fn. +- [x] Migrate the test-mod `walk_source_info` inner fn in + `crates/quarto-core/src/stage/stages/engine_execution.rs:819` + → `si.collect_file_ids(out)`; per-Inline/per-Block walkers + retained, only the inner SourceInfo step swapped. + +### Phase 4 — Lua serde + +- [x] Add `Generated` arm to `source_info_to_lua_table` in + `pampa/src/lua/diagnostics.rs` (`t = "Generated"`, `by` and `from` + sub-tables; `by.data` is JSON-encoded as a string for Lua transit). +- [x] Add `Generated` arm to `source_info_from_lua_table`. +- [x] Keep `"FilterProvenance"` legacy reader: maps to + `Generated { by: By::filter(path, line), from: smallvec![] }`. + Indefinitely accepted; writes never emit it. + +### Phase 5 — Migration + +The migration is atomic — one PR, no deprecated-alias scaffold. Only +4 non-source-map callers of `SourceInfo::filter_provenance(...)` exist, +all trivially co-migrated with the 27 `SourceInfo::FilterProvenance` +pattern sites. + +- [x] Sweep remaining `SourceInfo::FilterProvenance` references — + `git grep "SourceInfo::FilterProvenance"` now returns 0 hits in + `crates/`. 
The legacy `"FilterProvenance"` tag survives only in + the Lua reader (as documented in Phase 4). +- [x] Sweep `SourceInfo::filter_provenance(...)` constructor-function + callers (4 non-source-map files + 1 in-crate test) → new + `Generated` shape inline; constructor deleted from + `source_info.rs`. +- [x] Remove the `FilterProvenance` variant from `SourceInfo`. + +### Phase 6 — Tests (see §Test plan for full descriptions) + +Type / builder: +- [x] Unit tests for every `By` builder (all 10 kinds incl. `raw`). +- [x] `By::is_atomic_kind` coverage (atomic set + extension kinds). +- [x] `By::is_kind` + `By::as_filter` coverage. +- [x] Unit tests for `Anchor::invocation` / `Anchor::value_source`. +- [x] JSON round-trip: `By`, `Anchor`, `Generated` (no anchors / with + Invocation / multi-anchor). + +Accessor tests on `Generated`: +- [x] `length` / `start_offset` / `end_offset` for `Generated` → `0`. +- [x] `map_offset` for `Generated` → `None` (covered by the existing + mapping tests — Generated falls through to the None arm). +- [x] `resolve_byte_range` recursion through `Invocation -> Substring` + → resolves correctly; empty `from` and ValueSource-only `from` + → `None`. +- [x] `remap_file_ids` for `Generated` walks every anchor's source_info + via `Arc::make_mut` (regression guard — must NOT be no-op). +- [x] `root_file_id` for every variant. +- [x] `collect_file_ids` for every variant, including Generated with + mixed-role anchors. +- [x] `invocation_anchor` coverage (present / absent / ValueSource-only). +- [x] `value_source_anchor` coverage (parallel). +- [x] `anchors_with_role` coverage (each known role + unknown role). +- [x] `append_anchor` mutator coverage. + +Structural: +- [x] Rename `test_filter_provenance_tracking` + (`filter_tests.rs:740-813`) → `test_filter_generated_tracking` + and updated assertions to the `Generated` shape with + `by.as_filter()` recovery. +- [x] `combine()` × `Generated` structural test (zero-length Concat + piece). 
+- [x] Lua-serde round-trip including legacy `"FilterProvenance"` tag + back-compat read. + +### Phase 7 — Verification gate + +- [x] `cargo build --workspace` clean. +- [x] `cargo nextest run --workspace --no-fail-fast`: 9370 passed, + 1 failed — `quarto-core::idempotence::lua_shortcode_lipsum_fixed` + (bd-3odjm, owned by Plan 5). No other regressions. +- [x] `cargo xtask verify --skip-rust-tests`: all 12 steps passed + (Rust build + hub-client npm install/build/wasm/tests + q2-preview + SPA build). Rust tests run separately with `nextest --no-fail-fast` + above. +- [x] `git grep "SourceInfo::FilterProvenance"` returns zero hits + across `crates/` (variant gone). +- [x] `git grep "SourceInfo::filter_provenance"` returns zero hits + across `crates/` (no alias was added; original constructor + removed in Phase 5). +- [x] `git grep '"FilterProvenance"'` in Rust code returns only the + legacy-Lua-reader arm (3 hits — comment in doc-comment for + `source_info_to_lua_table`, comment in `source_info_from_lua_table`, + and the match arm itself). No writer emissions, no other readers. + The `SerializableSourceMapping::FilterProvenance` identifier + (wire code 3, Plan 5-owned) is not a string literal and does not + match this grep. +- [x] `git grep "extract_filename_index\|fn root_file_id\|fn walk_source_info"` + across `crates/` returns one hit — the new + `SourceInfo::root_file_id` accessor in + `crates/quarto-source-map/src/source_info.rs`. Six ad-hoc walkers + retired. ## Design decisions (settled in conversation) -- **`Derived` is reintroduced** (we'd dropped it earlier and walked it - back). It came back because pure provenance preservation can't - distinguish "shortcode resolution" (atomic; user edits prohibited at - the writer level) from "filter mutation" (non-atomic; user edits - flow to source). Both have a preimage in the same file; both could - use Original; only Derived gives the writer a type-level way to know - which is which. 
+- **Single `Generated` variant, not two.** Earlier drafts proposed + `Synthetic` + `Derived` to separate "no preimage" from "has preimage + but is atomic." The unified `Generated { by, from: SmallVec<[Anchor; 2]> }` + expresses both with one variant: anchor-list empty for pure + synthesis, anchor-list with `Invocation` for shortcode-style + resolutions. The "has preimage" property is `gen.invocation_anchor().is_some()`, + not a separate enum arm. +- **`by` records generator identity; `from` records source contributions.** + These are orthogonal axes. Atomicity is determined by `by.kind` + (per the `is_atomic_kind()` predicate); anchor-presence is orthogonal + to atomicity. +- **Anchors are typed `Arc<SourceInfo>`, not dynamic JSON.** Path C in + the 2026-05-20 discussion: rather than stuff source-info chain + metadata into `by.data` (dynamic typing), use a typed list of + role-labeled anchors. `by.data` shrinks to per-kind *non-source-info* + configuration. - **Filter mutations stay Original**. A Lua filter that does `Str.text = upper(Str.text)` doesn't change source_info. The mutated - Str retains its Original chain. -- **Filter constructions become Synthetic**. `pandoc.Str("decoration")` - in a Lua filter produces `Synthetic { by: By::filter(filter_path, line) }` - (replaces the existing FilterProvenance auto-attachment). -- **Shortcode resolutions become Derived**. The shortcode resolver - emits `Derived { from: Original{shortcode_token_range}, by: - By::shortcode(name) }` on resolved nodes. Plan 6 owns this. + Str retains its Original chain. This is unchanged from the existing + Lua machinery contract. +- **Filter constructions become `Generated { by: filter, from: [] }`**. + `pandoc.Str("decoration")` in a Lua filter produces this shape (the + Lua machinery's auto-attach replaces the existing FilterProvenance + emission). Lua-file path and line live in `by.data` until + Lua-file-registration lands; then they migrate to a `Dispatch` anchor. 
+- **Shortcode resolutions become `Generated { by: shortcode(name), from: [Invocation -> token_si] }`.** + Plan 6 owns the resolver-side stamping; the resolver appends an + `Invocation` anchor pointing at the shortcode token's source range. - **Sugar transforms stay Original**. CalloutTransform et al. inherit - source_info from their input Div. They're not atomic — the user - editing a callout's body content is fine. + source_info from their input Div. The Div's bytes are the canonical + preimage of the resulting CustomNode wrapper; the wrapper's + `type_name` carries the generator identity, so `source_info` doesn't + need to also encode it. The same reasoning applies to Plan 8's + `CustomNode("IncludeExpansion")` wrapper. See "Original vs Generated + on synthesized nodes" below. - **`By` is an open struct, not a closed enum**. Forward-compatibility for TS-Quarto-Lua-port and extension-defined kinds. Mirrors the - `CustomNode.plain_data` pattern (also `serde_json::Value`-typed). + existing precedent in `CustomNode.plain_data` and `Artifact.metadata` + — open `serde_json::Value` at extension/dispatch seams; static typing + everywhere else. +- **`AnchorRole` is a closed enum with an `Other(String)` escape hatch**. + The known roles (`Invocation`, `ValueSource`) are the load-bearing + ones the core consults. `Other(String)` lets extensions or future + plans add roles without modifying the type. - **Kind-string convention**: kebab-case, namespaced for third-party - (`ext/<name>/foo`). + (`ext/<name>/foo`). Same for `AnchorRole::Other` values. +- **Anchor list ordering is append order**. `from` is a `SmallVec`; + iteration is insertion order. `append_anchor` pushes to the end. + Accessors that find by role (`invocation_anchor`, `value_source_anchor`) + return the first match — at most one anchor per known role by + convention. Serde round-trips preserve order. No producer sorts; + no consumer reorders. - **Builder methods for known kinds, plus `raw` escape hatch**. 
+ `By::raw(kind, data)` accepts any `kind` string — including built-in + names like `"shortcode"` or `"filter"`. Forgery (an extension calling + `By::raw("shortcode", …)` without the required Invocation anchor) + is caught downstream by Plan 6's audit-completion test and Plan 7's + `debug_assert!`, so no constructor-level rejection is needed. The + convention is still `ext//` for third-party kinds — + collisions with built-ins are a misuse caught at audit time, not a + type error. ## The proposed shape +**Naming.** Read the new variant as: this node was generated **by** some +transform, **from** some anchors. `by` records the producer; `from` is +the list of `Anchor`s that record the source-side contributions. The +items in the list are `Anchor` values; methods that operate on individual +items keep "anchor" in their name (`invocation_anchor`, +`value_source_anchor`, `append_anchor`, `anchors_with_role`), while the +field name and any Lua-table key use `from`. `by` / `from` reads cleanly +in both Rust and Lua serializations — preserve that pairing throughout. + ```rust #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub enum SourceInfo { Original { file_id: FileId, start_offset: usize, end_offset: usize }, Substring { parent: Arc, start_offset: usize, end_offset: usize }, Concat { pieces: Vec }, - Synthetic { by: By }, - Derived { from: Arc, by: By }, + Generated { by: By, from: SmallVec<[Anchor; 2]> }, } #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct By { - /// Short kind tag, kebab-case. Examples: "filter", "sectionize", - /// "user-edit", "shortcode", "include", "title-block". + /// Short kind tag, kebab-case. Examples: "filter", "shortcode", + /// "sectionize", "user-edit", "title-block". /// Third-party kinds should namespace: "ext/my-extension/foo". pub kind: String, - /// Free-form structured data specific to this kind. + /// Per-kind configuration that is NOT a source-info pointer. 
+ /// Anchors live in `Generated.from`, not here. /// `Null` for kinds that don't carry per-instance data. #[serde(default, skip_serializing_if = "serde_json::Value::is_null")] pub data: serde_json::Value, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Anchor { + pub role: AnchorRole, + pub source_info: Arc, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum AnchorRole { + /// The user-written construct that triggered this node's creation + /// (e.g. the `{{< meta foo >}}` token in the active document). + /// Load-bearing: the writer's `preimage_in` and attribution's + /// `resolve_byte_range` consult the first anchor with this role. + /// At most one per node by convention. + Invocation, + + /// Where the VALUE this node carries was defined, when distinct + /// from the invocation site (e.g. `footer:` in `_metadata.yml` for + /// a `{{< meta footer >}}` resolution). Diagnostic-only — does not + /// affect the writer or attribution decisions in v1. + ValueSource, + + /// Extension-defined or future role we haven't enumerated. + /// String is kebab-case, namespaced (`ext//`). + Other(String), +} + impl By { pub fn filter(filter_path: impl Into, line: usize) -> Self { ... } pub fn sectionize() -> Self { ... } @@ -123,6 +438,78 @@ impl By { pub fn appendix() -> Self { ... } pub fn tree_sitter_postprocess() -> Self { ... } pub fn raw(kind: impl Into, data: serde_json::Value) -> Self { ... } + + /// True if a `Generated { by: , .. }` node should be treated + /// as atomic by the incremental writer. Atomic nodes are produced + /// by the pipeline and represent content the user shouldn't edit + /// through React (filter constructions, shortcode resolutions, + /// synthesized title h1, tree-sitter-inserted spaces). + /// + /// Atomicity is determined by `kind` alone — orthogonal to + /// anchor-presence. A `Generated { by: shortcode, from: [...] }` + /// is atomic; so is a `Generated { by: filter, from: [] }`. 
+ pub fn is_atomic_kind(&self) -> bool { + matches!( + self.kind.as_str(), + "filter" | "shortcode" | "title-block" | "tree-sitter-postprocess" + ) + } + + pub fn is_kind(&self, kind: &str) -> bool { self.kind == kind } + + /// If this is a `filter` kind, return its `(filter_path, line)` payload. + pub fn as_filter(&self) -> Option<(&str, usize)> { + if self.kind != "filter" { return None; } + let path = self.data.get("filter_path")?.as_str()?; + let line = self.data.get("line")?.as_u64()? as usize; + Some((path, line)) + } +} + +impl Anchor { + pub fn invocation(source_info: Arc) -> Self { + Self { role: AnchorRole::Invocation, source_info } + } + pub fn value_source(source_info: Arc) -> Self { + Self { role: AnchorRole::ValueSource, source_info } + } +} + +impl SourceInfo { + pub fn generated(by: By) -> Self { + SourceInfo::Generated { by, from: SmallVec::new() } + } +} + +// Helper methods on Generated-shape access — typically called via +// matching `SourceInfo::Generated { by, from } => ...`. We provide +// the helpers as free functions on the variant pattern; example: + +impl SourceInfo { + /// If this is `Generated`, return the first anchor whose role is + /// `Invocation`. Returns `None` otherwise (including for + /// non-`Generated` variants). + pub fn invocation_anchor(&self) -> Option<&Arc> { + match self { + SourceInfo::Generated { from, .. } => from + .iter() + .find(|a| matches!(a.role, AnchorRole::Invocation)) + .map(|a| &a.source_info), + _ => None, + } + } + + /// If this is `Generated`, return the first anchor whose role is + /// `ValueSource`. Returns `None` otherwise. + pub fn value_source_anchor(&self) -> Option<&Arc> { + match self { + SourceInfo::Generated { from, .. } => from + .iter() + .find(|a| matches!(a.role, AnchorRole::ValueSource)) + .map(|a| &a.source_info), + _ => None, + } + } } ``` @@ -134,227 +521,666 @@ impl By { combine_all). Existing pattern. 
**Contiguity expectation**: writer paths that need to Verbatim-copy a Concat (Plan 7's `preimage_in`) return `Some(range)` only when all pieces resolve into the target - file AND are byte-contiguous in source order (`pieces[i].end == - pieces[i+1].start`). Non-contiguous Concats (rare; would arise if a - transform composed source-info from disparate file regions) return - `None` from `preimage_in`, and Plan 7's coarsen falls through to - Rewrite for that node. This is a Plan 7 invariant, not a Plan 4 - type-system invariant — Plan 4 doesn't forbid gappy Concats. If a - future use case needs to construct a gappy Concat intentionally, no - Plan 4 change is required; Plan 7's writer behavior already handles - the case. -- **Synthetic**: NO source preimage. The node was created from nothing. - Sectionize wrappers, filter constructions, synthesized title h1s. - Writer omits or recurses (Plan 7). -- **Derived**: HAS a source preimage but is a distinct transform output. - The `from` chain points at the source bytes; `by` describes the - transform. Writer treats as atomic (Plan 7) — KeepBefore Verbatim - copies preimage; UseAfter triggers AtomicViolation. Used for shortcode - resolutions; later for crossref cite resolutions if/when needed. + file AND are byte-contiguous in source order. Non-contiguous Concats + return `None`, and Plan 7's coarsen falls through to Rewrite. +- **Generated**: produced by a pipeline transform. `by` records the + producer; `from` records any source-side contributions. The + variant subsumes the previous `Synthetic`/`Derived` distinction: + - Empty anchors → pure synthesis (sectionize wrappers, filter + constructions, title-block h1, tree-sitter postprocess, footnotes + container, appendix wrapper, user-edit). + - `Invocation` anchor present → has a source-side preimage (every + shortcode resolution; future filter-with-trigger-anchor cases). 
+ - `ValueSource` anchor present → records where the value came from + (future, gated on metadata-loader changes). + - `Other(...)` anchor present → extension-defined. + + Writer behavior (Plan 7) consults `by.is_atomic_kind()` for + atomicity and `gen.invocation_anchor()` for the preimage byte range. + +## Original vs Generated on synthesized nodes + +Two pieces of provenance information need to land somewhere when a +transform produces a node: + +1. **Generator identity** — "which transform produced me." +2. **Source anchor** — "which source bytes are this node's canonical preimage." + +For non-CustomNode synthesized nodes (sectionize Div, filter Str, +footnotes Div), there's no other slot for (1), so `source_info` carries +both via `Generated { by, from }`. + +For CustomNode synthesized nodes, (1) is already encoded in +`CustomNode.type_name`. The wrapper *is* a `Callout` / `IncludeExpansion` +/ `CrossrefResolvedRef` by virtue of `type_name`; `source_info` only +needs to do (2). And the natural shape for (2) — when the CustomNode +1:1-substitutes for a parser-emitted source-mapped node — is the +inherited `Original` (or whatever `SourceInfo` shape the substituted +node carried). + +| Synthesized node kind | Has CustomNode `type_name`? | Substitutes 1:1 for source-mapped node? | `source_info` shape | +|---|---|---|---| +| `IncludeExpansion` wrapper (Plan 8) | Yes | Yes (the include-line Paragraph) | Original (inherited) | +| `Callout` / `Theorem` / `Proof` / etc. 
| Yes | Yes (the source Div) | Original (inherited) | +| `CrossrefResolvedRef` | Yes | Yes (the source Cite) | Original (inherited) | +| `FloatRefTarget` | Yes | Yes (the source Div) | Original (inherited) | +| Sectionize Section Div | No | No (structural grouping) | `Generated { by: sectionize, from: [] }` | +| Footnotes container Div | No | No (structural grouping) | `Generated { by: footnotes, from: [] }` | +| Appendix wrapper Div | No | No (structural grouping) | `Generated { by: appendix, from: [] }` | +| Title-block synthesized h1 | No | No (synthesized from `title:` YAML) | `Generated { by: title_block, from: [] }` | +| Tree-sitter postprocess Space | No | No (inserted between nodes) | `Generated { by: tree_sitter_postprocess, from: [] }` | +| Shortcode resolution output | No | No (resolved from value, distinct from token bytes) | `Generated { by: shortcode("…"), from: [Invocation, …] }` | +| Filter-constructed node | No | No (filter computed it) | `Generated { by: filter, from: [] }` (Dispatch anchor in the future) | + +The rule: + +> A synthesized node uses **Original** `source_info` if and only if it +> is a CustomNode whose 1:1 source preimage is a parser-emitted node. +> Everything else uses **Generated**. + +## `by.data` shape per kind + +`by.data` is open `serde_json::Value` (matching the `CustomNode.plain_data` +and `Artifact.metadata` precedents). The known shapes per kind are: + +| `by.kind` | `by.data` contents | +|---|---| +| `shortcode` (Rust handler) | `{ "name": "" }` | +| `shortcode` (Lua handler) | `{ "name": "", "lua_path": "", "lua_line": }` until Lua-file-registration; then just `{ "name": "" }` | +| `filter` | `{ "filter_path": "", "line": }` until Lua-file-registration; then `{}` | +| `sectionize` / `footnotes` / `appendix` / `title-block` / `tree-sitter-postprocess` / `user-edit` | `{}` (empty) | +| `ext//` (third-party) | extension-defined, opaque to core | + +Convention: `data` is a JSON object with kind-specific known fields. 
+Consumers must treat unknown fields as opaque metadata. Producers may +add fields without breaking readers that don't look for them. Adding a +new field to a known kind's `data` is a non-breaking change. + +This same convention applies to `CustomNode.plain_data`; Plan 4 codifies +it once for both seams. The pattern is "open Value at extension/dispatch +seams; static typing everywhere else" — `Anchor.source_info` stays +typed `Arc<SourceInfo>`; only the truly per-kind, heterogeneous data +sits in `by.data`. + +## Atomic-kind set + +`By::is_atomic_kind()` returns true for kinds whose nodes are "atomic" +from the incremental writer's perspective — nodes the user can't edit +honestly through React, because the pipeline regenerated them from +source-side input. + +| `by.kind` | Atomic? | Role | +|---|---|---| +| `filter` | Yes | filter-constructed leaves; user edits the filter, not the output | +| `shortcode` | Yes | shortcode resolutions; user edits the token, not the resolved content | +| `title-block` | Yes | synthesized title h1; user edits `title:` metadata | +| `tree-sitter-postprocess` | Yes | parser-side synthetic spaces | +| `sectionize` | No (Transparent) | structural wrapper; children are editable | +| `footnotes` | No (Transparent) | container; children are editable | +| `appendix` | No (Transparent) | container; children are editable | +| `user-edit` | No | React-constructed; user-typed by definition | + +Atomicity is per-kind, orthogonal to `from`. A `Generated { by: shortcode, +from: [Invocation -> token_si] }` is atomic; so is a +`Generated { by: filter, from: [] }`. The writer's coarsen +(Plan 7) consults `by.is_atomic_kind()` and `gen.invocation_anchor()` +independently. + +Extensions that contribute new `by.kind` values are not atomic by +default. If an extension wants its kind to be atomic, the +`is_atomic_kind()` predicate (or a follow-up extension-registration +mechanism — see Plan 7 §Open questions) needs to recognize it. v1 +hardcodes the built-in set. 
+ +### Required-anchor invariant for `shortcode` + +A `Generated { by: shortcode(...), from: [] }` is **not a valid state**. +Every shortcode-resolution node must carry at least one `Invocation` +anchor pointing at the source token's byte range. The resolver +(Plan 6) is responsible for maintaining this invariant; downstream +consumers (Plan 7's writer, error-reporting) may assume it. + +Plan 4 documents the invariant; enforcement is split across the two +producers/consumers of the shape: + +- **Plan 6 (producer)** owns the audit-completion test that walks the + post-stamping AST and asserts no `Generated { by: shortcode, from: [] }` + remains. The stamper is the only construction site for `by: shortcode` + in v1; the test verifies it always attaches the `Invocation` anchor. +- **Plan 7 (consumer)** adds a `debug_assert!` in `coarsen`'s + atomic-no-anchor branch. The writer routes "atomic + no invocation" + to `Omit` (drop the node, pipeline regenerates next run); for filter + that's correct, for shortcode it's silent data loss — the assertion + catches the bad shape before that branch fires, in dev / test builds. + +No constructor-level enforcement in v1. The `By::shortcode(name)` +builder stays symmetric with the other `By::xxx()` builders; the +required-anchor invariant is a *resolver* invariant, not a *type* +invariant. If a second required-anchor rule appears later, promote +the audit assertion into a shared validator pass. + +The `By::shortcode` doc-comment must state the invariant explicitly, +so anyone reaching for the builder from a new call site reads: + +```rust +/// Construct a `By` for a shortcode resolution. +/// +/// **Invariant.** Every `Generated { by: shortcode(...), .. }` must +/// carry at least one `Invocation` anchor in `from` pointing at the +/// source token's byte range. 
Use only inside a `Generated` whose +/// anchor list is populated; constructing the bare shape with empty +/// `from` is rejected by Plan 6's audit-completion test and trips +/// Plan 7's writer `debug_assert!`. +pub fn shortcode(name: impl Into) -> Self { ... } +``` ## Migrations The pre-existing `FilterProvenance` is renamed/folded: - **Construction**: `SourceInfo::filter_provenance("path", 42)` → - `SourceInfo::Synthetic { by: By::filter("path", 42) }`. - Add a deprecated alias `SourceInfo::filter_provenance` that constructs - the new shape, eased migration; remove after migration completes. -- **Pattern-match**: every `SourceInfo::FilterProvenance { filter_path, line }` - arm becomes `SourceInfo::Synthetic { by }` and inspects `by.kind == - "filter"` and `by.data["filter_path"]` / `by.data["line"]`. Or a small - helper `By::as_filter() -> Option<(&str, usize)>` for the common case. + `SourceInfo::Generated { by: By::filter("path", 42), from: smallvec![] }`. + The `(filter_path, line)` pair lives in `by.data` until + Lua-file-registration lands. No deprecated alias is shipped; the + 4 non-source-map callers are migrated inline in the same PR (see + Phase 5). +- **Pattern-match (production)**: every `SourceInfo::FilterProvenance { filter_path, line }` + arm becomes `SourceInfo::Generated { by, .. }` and inspects via + `by.as_filter()` to recover the path/line. +- **Pattern-match (tests)**: `Some(SourceInfo::FilterProvenance { filter_path, line })` + becomes `Some(SourceInfo::Generated { by, .. })` with `by.as_filter()` + for path/line recovery. Empty-bind sites + (`Some(SourceInfo::FilterProvenance { .. }) => {}`) become the + guard form: `Some(SourceInfo::Generated { by, .. }) if by.is_kind("filter") => {}`. + Affected sites verified by grep: `pampa/src/lua/diagnostics.rs:444, 509, 802`, + `pampa/src/lua/filter_tests.rs:1802`, plus the renamed + `test_filter_provenance_tracking` at `filter_tests.rs:740-813`. 
+- **JSON writer arm** (`pampa/src/writers/json.rs:314`): the + pattern-match site must stay exhaustive over `SourceInfo` after the + variant is gone. Plan 4 produces only `by.kind == "filter"` + Generated values; Plan 5 owns wire-code 4 for non-filter kinds. + The interim arm emits the legacy code-3 payload exactly as today, + preserving bd-3odjm's expected failure mode: -## `By` helper accessors + ```rust + SourceInfo::Generated { by, .. } => { + let (filter_path, line) = by.as_filter().expect( + "Plan 4 produces only filter-kind Generated; non-filter \ + Generated requires Plan 5's wire-code 4 emitter", + ); + ( + 0, + 0, + SerializableSourceMapping::FilterProvenance { + filter_path: filter_path.to_string(), + line, + }, + ) + } + ``` -Plan 4 ships these helpers up front, so call sites in Plans 6 and 7 read -provenance consistently rather than each writing ad-hoc string-equality -checks against `by.kind`: + Plan 5 replaces this with the wire-code 4 emitter and removes the + `SerializableSourceMapping::FilterProvenance` variant. +- **Lua serde**: read `"FilterProvenance"` tag (legacy) and reconstruct + as `Generated { by: By::filter(...), from: smallvec![] }`. New + constructions emit `"Generated"` tag with `by` and `from` sub-tables + (per §In scope). -```rust -impl By { - /// True if this kind matches the given string (sugar for `self.kind == kind`). - pub fn is_kind(&self, kind: &str) -> bool { self.kind == kind } +## File-id accessor consolidation - /// If this is a `filter` kind, return its `(filter_path, line)` payload. - /// Returns None for any other kind. - pub fn as_filter(&self) -> Option<(&str, usize)> { - if self.kind != "filter" { return None; } - let path = self.data.get("filter_path")?.as_str()?; - let line = self.data.get("line")?.as_u64()? 
as usize; - Some((path, line)) - } +Six SourceInfo walkers across the workspace conceptually do the same +operation — "give me the FileId(s) this SourceInfo refers to" — but +diverge on Concat semantics, Substring recursion depth, and return +type: - /// True if a `Synthetic { by: }` node should be treated as - /// atomic by the incremental writer. Atomic Synthetic nodes are - /// constructed by the pipeline with no source preimage and represent - /// content the user shouldn't edit through React (filter-constructed - /// inlines, synthesized title h1, tree-sitter-inserted spaces). - /// - /// The writer's coarsen step (Plan 7) uses this to decide: - /// - KeepBefore on atomic Synthetic → Omit (drop from output; - /// pipeline regenerates next run). - /// - UseAfter / RecurseIntoContainer on atomic Synthetic → soft-drop - /// substitution + Q-3-42 warning. +| Site | Returns | Concat policy | Substring | Status | +|---|---|---|---|---| +| `quarto-error-reporting/src/diagnostic.rs:556` `extract_file_id` | `Option` | `first().and_then` | full recursion | private, production | +| `pampa/src/pandoc/location.rs:329` `extract_filename_index` | `Option` | `iter().find_map` | full recursion | pub, production, has tests | +| `pampa/src/pandoc/treesitter_utils/pipe_table.rs:256-279` (inline match) | `FileId` (FileId(0) fallback) | first piece only | **one level only** — broken for nested Substring | production, latent bug | +| `pampa/src/pandoc/treesitter_utils/section.rs:129-152` (inline match) | `FileId` (FileId(0) fallback) | first piece only | **same shallow bug** | production, latent bug | +| `quarto-core/src/stage/stages/apply_template.rs:820` `root_file_id` | `Option` | `first().and_then` | full recursion | test mod | +| `quarto-core/src/stage/stages/engine_execution.rs:813` `collect_file_ids` / `walk_source_info` | `HashSet` | walks every piece | full recursion | test mod | + +Plan 4 consolidates these onto two methods on `SourceInfo`: + +```rust +impl SourceInfo { + 
/// First FileId reachable from this SourceInfo's root. /// - /// Non-atomic Synthetic kinds are transparent containers (Sectionize, - /// Footnotes, Appendix wrappers) whose children carry their own - /// source preimage; the writer recurses into children rather than - /// dropping or substituting. - pub fn is_atomic_synthesizer(&self) -> bool { - matches!( - self.kind.as_str(), - "filter" | "title-block" | "tree-sitter-postprocess" - ) + /// Original → `Some(file_id)`. + /// Substring → recurse parent. + /// Concat → `pieces.iter().find_map(|p| p.source_info.root_file_id())` + /// (find_map semantics — strict superset of every existing + /// "first piece" caller; skips Generated holes and empty pieces). + /// Generated → `invocation_anchor().and_then(|si| si.root_file_id())`; + /// `None` when no Invocation anchor is present. + pub fn root_file_id(&self) -> Option { ... } + + /// Every FileId reachable from this SourceInfo. Walks every + /// Original, every Substring parent, every Concat piece, and + /// every Generated anchor (all roles — Invocation, ValueSource, + /// Other). + pub fn collect_file_ids(&self, out: &mut HashSet) { ... 
} +} +``` + +Migration table (Phase 3): + +| Old | New | +|---|---| +| `DiagnosticMessage::extract_file_id(si)` | `si.root_file_id()` (delete private fn) | +| `extract_filename_index(si)` | `si.root_file_id().map(\|fid\| fid.0)` (kept as a one-line shim or inlined) | +| pipe_table.rs inline match → `FileId` | `table_start.root_file_id().unwrap_or(FileId(0))` — also fixes nested-Substring bug | +| section.rs inline match → `FileId` | `table.source_info.root_file_id().unwrap_or(FileId(0))` — same fix | +| test `root_file_id` (apply_template.rs) | `info.root_file_id()` (delete local fn) | +| test `walk_source_info` (engine_execution.rs) | `si.collect_file_ids(out)` (delete inner fn) | + +Net effect: ~60 LOC of duplicate walkers removed, two latent +production bugs fixed (nested-Substring fall-through to FileId(0)), +and the Generated arm is defined exactly once. + +## Deferred anchor role + +**`Dispatch` anchor (future).** When a Lua-implemented shortcode +handler or user filter constructs a node, the natural shape for +"where in Lua source was this constructed" is: + +```rust +Anchor { + role: AnchorRole::Dispatch, // not in v1 + source_info: Arc::new(Original { file_id: kbd_lua_id, start, end }), +} +``` + +This requires Lua filter files to be registered in `SourceContext` so +they have `FileId`s. That's its own infrastructure work touching the +Lua engine, the source context, the diagnostic machinery, and the +cache-key surface. We defer it. + +In the interim, the Lua machinery continues to carry `(filter_path, +line)` in `by.data` (see the `by.data` table above for `filter` and +Lua-dispatched `shortcode` kinds). When the Lua-file-registration +follow-up lands, the data migrates out of `by.data` and into a +`Dispatch` anchor; `AnchorRole::Dispatch` joins the enum (a +forward-compatible enum extension); and `by.data` for those kinds +shrinks to per-kind config only. 
+ +The migration applies to **both** affected kinds, symmetrically: + +| kind | shape today | shape after Lua-file-registration | +|---|---|---| +| `filter` | `Generated { by: filter{path, line}, from: [] }` | `Generated { by: filter{}, from: [Dispatch -> lua_si] }` | +| `shortcode` (Lua handler) | `Generated { by: shortcode{name, lua_path, lua_line}, from: [Invocation -> token_si] }` | `Generated { by: shortcode{name}, from: [Invocation -> token_si, Dispatch -> lua_si] }` | +| `shortcode` (Rust handler) | `Generated { by: shortcode{name}, from: [Invocation -> token_si] }` | unchanged (no Lua source to point at) | + +A Lua-handler shortcode after registration carries **two** anchors — +`Invocation` for the user-written token, `Dispatch` for the Lua +handler that resolved it. The anchor list is what makes this clean: +adding `Dispatch` doesn't disturb `Invocation`, and the writer's +preimage walk (Plan 7) still looks at `invocation_anchor()` only. + +Tracked as **bd-36fr9** ("Provenance follow-up: Dispatch anchor for +Lua-handler filter & shortcode"). + +**`ValueSource` anchor (defined, deferred firing).** +`AnchorRole::ValueSource` is defined in Plan 4's type. The shortcode +resolver doesn't attach it yet, because the metadata loader doesn't +record per-key source-info today (every metadata key's `source_info` +points at where the value was parsed from, but the merged metadata +that the resolver consults doesn't expose this). A separate follow-up +issue covers extending the metadata loader to thread per-key source +through to the merged value. When that lands, Plan 6's stamper +appends `ValueSource` anchors for `meta` and `var` shortcode +resolutions whose values came from outside the active document. + +Tracked as **bd-129m3** ("Provenance follow-up: ValueSource anchor +stamping for meta/var shortcodes"). + +Both follow-ups are pure additions when they land — neither requires +reopening Plan 4's type design. The shape is forward-compatible by +construction. 
+ +## Resolve-byte-range semantics + +`resolve_byte_range` is Plan 4's responsibility (existing accessor on +`SourceInfo`, gains a `Generated` arm). `preimage_in` is Plan 7's — +Plan 4 only ships the building block it depends on, `invocation_anchor()`. + +```rust +impl SourceInfo { + pub fn resolve_byte_range(&self) -> Option<(usize, usize, usize)> { + match self { + SourceInfo::Original { file_id, start_offset, end_offset } => + Some((file_id.0, *start_offset, *end_offset)), + SourceInfo::Substring { parent, start_offset, end_offset } => { + let (fid, parent_start, _) = parent.resolve_byte_range()?; + Some((fid, parent_start + start_offset, parent_start + end_offset)) + } + SourceInfo::Concat { .. } => None, + SourceInfo::Generated { .. } => self + .invocation_anchor() + .and_then(|si| si.resolve_byte_range()), + } } } ``` -Atomic vs. transparent vs. editable Synthetic kinds (decided in -conversation; the table in §Notes shows the full mapping): - -- **Atomic** (`is_atomic_synthesizer() == true`): `filter`, `title-block`, - `tree-sitter-postprocess`. Pipeline-generated content with no source - preimage; user can't edit honestly. -- **Transparent** (`is_atomic_synthesizer() == false`, has children): - `sectionize`, `footnotes`, `appendix`. Container synthesis; children - are editable per their own provenance. -- **Editable** (`is_atomic_synthesizer() == false`, materializable): - `user-edit`. Explicitly user-typed; qmd writer serializes via Rewrite. -- **Escape hatch** (`raw`): not atomic by default; extensions that need - atomic behavior should namespace their kind under `ext//...` and - consider whether `is_atomic_synthesizer` needs to recognize their - kinds (open extension question; v1 doesn't address registration). - -Add more accessors as Plans 6/7 surface concrete repeated patterns. The -above three cover the immediate needs (filter-provenance recovery in -tests, generic kind matching in writer dispatch, atomicity classification -for the writer). 
Don't proliferate accessors preemptively — -`as_shortcode()`, `as_sectionize()`, etc. can be added if their call -sites prove repetitive. - -## Builder list is extensible - -The `By` builder list above (`filter`, `sectionize`, `user_edit`, etc.) is -the v1 known set. **Plan 6's audit may discover sites Plan 4 didn't -anticipate** — if so, Plan 6 adds new `By::()` builders to extend -the set. Builders are inert from Plan 4's perspective (a builder is just -a constructor that produces `By { kind: "...", data: ... }`); adding one -doesn't require reasoning about Plan 4's invariants. - -Convention: each new builder gets a doc-comment explaining what kind of -node uses it and why. Keeps the `By` type's purpose discoverable. - -## Open questions for implementation - -- **Lua serde back-compat**: read `"FilterProvenance"` tag (legacy) and - reconstruct as `Synthetic { by: By::filter(...) }`. New constructions - emit `"Synthetic"` tag. Read both indefinitely; writes migrate to new - immediately. -- **Tests update**: `pampa/src/lua/filter_tests.rs::test_filter_provenance_tracking` - asserts on `SourceInfo::FilterProvenance`. Update to assert on - `Synthetic { by }` with `by.is_kind("filter")` and check - `by.as_filter()` returns the right path/line. +The `Generated` arm collapses to "look up the invocation anchor; +recurse into its source_info." Pure synthesis (empty `from`) returns +`None`. Multi-anchor Generateds (when `ValueSource` lands) still only +consult `Invocation` — `ValueSource` is diagnostic-only. + +Plan 7's `preimage_in` follows the same `Generated` pattern (it +delegates to `invocation_anchor()`); see Plan 7 §"`preimage_in` +semantics" for the full implementation including Concat contiguity. ## References -- `crates/quarto-source-map/src/source_info.rs:22` — current SourceInfo enum. -- `crates/quarto-source-map/src/source_info.rs:48-54` — current - FilterProvenance variant. 
-- `crates/quarto-source-map/src/source_info.rs:185-237` — accessors that - need updating (start_offset, end_offset, length, remap_file_ids). -- `crates/quarto-source-map/src/mapping.rs:17-74` — `map_offset` recursion; - needs new arm. -- `crates/pampa/src/lua/diagnostics.rs:60-145` — Lua serde to extend. -- `crates/pampa/src/lua/filter_tests.rs:663-728` — test to update. +- `crates/quarto-source-map/src/source_info.rs:21-55` — current + `SourceInfo` enum (incl. `FilterProvenance` variant at lines 49-54). +- `crates/quarto-source-map/src/source_info.rs:162-264` — accessors that + need updating (`length`, `start_offset`, `end_offset`, + `resolve_byte_range`, `remap_file_ids`). +- `crates/quarto-source-map/src/mapping.rs:17-74` — `map_offset` + recursion; needs `Generated` arm (returns `None`, like + `FilterProvenance` does today). +- `crates/quarto-error-reporting/src/diagnostic.rs:556-575` — + `extract_file_id` private fn; retired in favor of + `SourceInfo::root_file_id()`. +- `crates/pampa/src/pandoc/location.rs:328-344` — `extract_filename_index`; + reduced to a one-line shim over `root_file_id()` (or inlined at + callers). Has dedicated tests at `location.rs:588-655`. +- `crates/pampa/src/pandoc/treesitter_utils/pipe_table.rs:256-279` — + inline file-id extraction; retired in favor of + `root_file_id().unwrap_or(FileId(0))`. Also fixes a latent + nested-Substring bug. +- `crates/pampa/src/pandoc/treesitter_utils/section.rs:129-152` — + same shape and same latent fix. +- `crates/quarto-core/src/stage/stages/apply_template.rs:820-829` — + test-mod `root_file_id`; retired in favor of `SourceInfo::root_file_id()`. +- `crates/quarto-core/src/stage/stages/engine_execution.rs:813-832` — + test-mod `walk_source_info`; retired in favor of + `SourceInfo::collect_file_ids()`. +- `crates/pampa/src/lua/diagnostics.rs:50-145` — Lua serde to extend. 
+- `crates/pampa/src/lua/filter_tests.rs:740-813` — `test_filter_provenance_tracking`; rename and update assertions to the `Generated` shape. - `crates/quarto-pandoc-types/src/custom.rs:75` — `CustomNode.plain_data` - (the prior-art shape we're mirroring). + (the prior-art for `serde_json::Value` at extension seams; same + convention now applies to `By.data`). +- `crates/quarto-core/src/artifact.rs:71` — `Artifact.metadata` + (second precedent for the same pattern). ## Test plan -- Unit tests for each `By` builder method (constructs the right kind and data). +### Type / builder tests + +- Unit tests for each `By` builder method (constructs the right kind + and data). Cover all ten: `filter`, `sectionize`, `user_edit`, + `shortcode`, `include`, `title_block`, `footnotes`, `appendix`, + `tree_sitter_postprocess`, `raw`. +- `By::is_atomic_kind()` test: confirms the set named in §"Atomic-kind + set" returns `true` exactly for `filter | shortcode | title-block | + tree-sitter-postprocess` and `false` for everything else (including + extension `ext/…/…` kinds). +- `By::is_kind()` / `By::as_filter()` coverage. +- Unit tests for `Anchor::invocation()` / `Anchor::value_source()` + constructors. - Round-trip test: `By` → JSON → `By` (serde derive). -- Integration test: filter-provenance test (renamed from - `test_filter_provenance_tracking`) confirms a filter-created Str gets - `Synthetic { by: By::filter(...) }` source_info. -- Derived round-trip: build a `Derived { from: Original, by: By::shortcode("...") }` - value; round-trip through JSON (Plan 5) and Lua serde; assert structural - equality. -- Accessor recursion test: a `Derived` value's `start_offset()` / `end_offset()` - / `length()` walk through `from` and return the from's offsets. +- Round-trip test: `Anchor` → JSON → `Anchor` (serde derive). + +### Accessor tests on `Generated` + +- `length()` / `start_offset()` / `end_offset()` for `Generated` + return `0` regardless of `from` contents. 
+- `map_offset()` for `Generated` returns `None` regardless of offset + argument. +- `resolve_byte_range()` recursion: a + `Generated { from: [Invocation -> Substring{parent: Original{42, 100, 200}, 10, 20}] }` + resolves to `(42, 110, 120)`. A `Generated` with empty `from` returns + `None`. A `Generated` with only a `ValueSource` anchor (no + `Invocation`) returns `None`. (Plan 7 owns the matching `preimage_in` + tests.) +- `remap_file_ids()` for `Generated`: build a + `Generated { from: [Invocation -> Original{FileId(0), …}, ValueSource -> Original{FileId(3), …}] }`, + apply `|id| FileId(id.0 + 10)`, assert both anchors' source_info + carry remapped FileIds. This catches the "no-op like FilterProvenance" + regression — `Generated` must NOT be a no-op since it can hold FileIds. +- `root_file_id()` coverage on every variant. Generated with an + Invocation anchor pointing at `Original{file_id: FileId(7), ...}` + returns `Some(FileId(7))`. Generated with only a `ValueSource` + anchor returns `None` (matches the empty-`from` case — only + Invocation participates in `root_file_id`). Concat with + `[Generated{empty}, Original{42}]` returns `Some(FileId(42))` + (find_map skips the empty Generated piece) — this also pins the + Plan-3 latent bug fixed by the new accessor on + pipe_table.rs / section.rs. +- `collect_file_ids()` coverage: Generated with + `[Invocation -> Original{FileId(1), ...}, ValueSource -> Original{FileId(2), ...}, Other(...) -> Original{FileId(3), ...}]` + populates the set with `{FileId(1), FileId(2), FileId(3)}` — confirms + that all anchor roles participate, not just Invocation. Concat, + Substring, nested compositions: every reachable FileId lands. +- `invocation_anchor()` accessor: a Generated with `[Invocation -> X]` + returns `Some(X)`; with `[]` returns `None`; with `[ValueSource -> Y]` + (no Invocation) returns `None`. +- `value_source_anchor()` accessor: parallel coverage. 
+- `anchors_with_role()` accessor: a Generated with + `[Invocation -> X, ValueSource -> Y, Other("foo") -> Z]` returns the + right anchors for each role, and an empty iterator for an unknown role. +- `append_anchor()` mutator: starting from `Generated { from: [] }`, + append an Invocation then a ValueSource; assert both are present in + order. + +### Structural tests + +- Integration test: filter-provenance test renamed from + `test_filter_provenance_tracking` (at `filter_tests.rs:740-813`) + confirms a filter-created Str gets `Generated { by: filter, from: [] }` + with `(filter_path, line)` recoverable via `by.as_filter()`. +- `combine()` × `Generated` structural test: combining an `Original` + with a `Generated` produces a `Concat` whose Generated piece has + length `0` (matches `Generated::length()`). `map_offset` over the + combined Concat skips the Generated piece. This pins behavior even + though no production code path combines Generated source_info today. - Lua-serde round-trip: typed → Lua table → typed, including legacy - `"FilterProvenance"` tag back-compat. + `"FilterProvenance"` tag back-compat (reads as `Generated { by: + filter, from: [] }`; never round-trips back to `FilterProvenance`). ## Dependencies -- Depends on: nothing (pure type change in the foundation crate). -- Blocks: Plan 5 (wire format extension), Plan 6 (provenance audit), Plan 7 - (writer's preimage walk uses Synthetic and Derived). +- Depends on: nothing (pure type change in the foundation crate, plus + consolidation of file-id walkers across `quarto-core`, `pampa`, and + `quarto-error-reporting` that all already depend on + `quarto-source-map`). +- Blocks: Plan 5 (wire format extension), Plan 6 (provenance audit), + Plan 7 (writer's preimage walk uses Generated and the + `invocation_anchor` helper). ## Risk areas -- **Migration scope**: ~22 files pattern-match `SourceInfo` variants. Each - needs migration arms for *both* `Synthetic` and `Derived`. 
Most are - mechanical: Synthetic arm returns what FilterProvenance did (usually - `0`, `0`, or `None`); Derived arm recurses into `from` for offset - accessors and returns the same as Synthetic for FileId-extracting helpers. -- **`Derived` accessor recursion**: `start_offset()`, `end_offset()`, - `length()` need to recurse into `from`. A long Derived chain could - in principle stack overflow, but in practice chains are 1-2 deep. - Same risk profile as Substring. -- **`serde_json::Value` in PartialEq derives**: `Value` implements `PartialEq` - but with potentially weird semantics for floats. For our use, kinds are - string + small structured data; should be fine. Test the cases. +- **Migration scope**: 15 files pattern-match `SourceInfo::FilterProvenance` + (27 occurrences total — verified by grep against the worktree). + Phase 3's file-id-walker consolidation retires ~6 of those by + replacing entire match expressions (the file-id-extraction sites in + `diagnostic.rs`, `location.rs`, `pipe_table.rs`, `section.rs`, + `apply_template.rs`, `engine_execution.rs`). Phase 5 sweeps the + ~21 remaining arms. Most are mechanical: the `Generated` arm + returns what `FilterProvenance` did today (`0`/`0`/`None` for + offset/length accessors; delegates to `invocation_anchor()` for + `resolve_byte_range`). File-id traversals are handled exactly once, + inside the new `root_file_id` / `collect_file_ids` accessors — + callers walk through those rather than re-implementing the recursion + per call site. +- **Anchor-list allocation**: `from` is typed `SmallVec<[Anchor; 2]>` + from day 1 (with the `serde` feature enabled). 
Inline capacity of 2 + covers all expected shapes through the deferred follow-ups with zero + heap allocation: + - empty (sectionize / footnotes / appendix / title-block / + tree-sitter-postprocess / filter constructions today) — the bulk + of synthesized nodes; + - one Invocation (Rust-handler shortcode resolutions, today); + - two anchors (Invocation + ValueSource for `meta`/`var` once + bd-129m3 lands; Invocation + Dispatch for Lua-handler shortcodes + once bd-36fr9 lands). + Cap=2 grows the `SmallVec<[Anchor; …]>` field by ~40 bytes (the size + of one inline `Anchor` slot — `AnchorRole`'s largest variant + `Other(String)` is 32 bytes, plus 8 for `Arc<SourceInfo>`). Because + the `SourceInfo` enum's stack size is dictated by its largest + variant, **every** `SourceInfo` value in the AST grows by that 40 + bytes — not just `Generated` instances. For a doc with thousands of + Block/Inline nodes (each carrying a `SourceInfo` by value, not + Arc-boxed), the cap=1 → cap=2 step costs ~40 bytes per node, i.e. + tens-to-hundreds of KB on a large document. The trade is paid in + exchange for eliminating the heap spill cap=1 would incur on every + multi-anchor shortcode in the steady state. Three-or-more-anchor + Generateds (Invocation + ValueSource + Dispatch on a Lua-handler + `meta` shortcode) still spill — same cost as `Vec<Anchor>` would have + been. If memory-per-node turns out to matter for the q2-preview + interactive editor, revisit by Arc-boxing the `Generated` variant + (so the SourceInfo enum's stack size drops back to a single pointer + for that variant) rather than by reverting to cap=1. Adds a + `smallvec` workspace dependency (verified absent today). +- **`serde_json::Value` in PartialEq derives**: `Value` implements + `PartialEq` but with potentially weird semantics for floats. For our + use, kinds carry string + small structured data; should be fine. + Test the cases. 
(Verified: no production call site relies on + `SourceInfo == SourceInfo` today — the `PartialEq` derive is required + by the wider `Block`/`Inline` derives but isn't itself load-bearing. + Plan 7's coarsen may compare structurally once it lands; the + `Value::PartialEq` semantics on small kebab-case objects are + well-behaved.) - **Removing `FilterProvenance` is a breaking change for downstream - consumers**. Within the q2 workspace this is bounded; if any external code - imports the variant by name, they'd break. Search for non-workspace usages - before removing (probably none). + consumers**. Within the q2 workspace this is bounded; if any external + code imports the variant by name, they'd break. Search for + non-workspace usages before removing (probably none). +- **`Default` on containers of `SourceInfo`**: verified no struct in + `quarto-pandoc-types/src/{block,inline}.rs` derives `Default` (each + `SourceInfo`-bearing struct is constructed explicitly), so changing + `SourceInfo`'s arm set can't cascade into a broken + `#[derive(Default)]`. The hand-written `Default for SourceInfo` impl + (the `Original { FileId(0), 0, 0 }` zero-value) stays unchanged. +- **`combine()` with a `Generated` operand**: structurally valid (it + produces a `Concat` with a zero-length `Generated` piece, since + `Generated::length()` returns `0`), but semantically dead — the + Generated side carries no preimage bytes for adjacent-text coalescing, + and `map_offset` will skip over the zero-length piece. Verified: all + 17 `.combine(` call sites in the workspace (`attr.rs`, + `postprocess.rs`, `location.rs`, `yaml/parser.rs`, etc.) combine + Original/Substring shapes; nothing combines FilterProvenance today, so + Generated won't be combined either unless a future transform reaches + for it. The Phase 6 `combine() × Generated` test documents the + intended fall-through behavior for any future caller, not a current + regression. No type-level prevention in v1. 
## Estimated scope | Component | Lines (rough) | |---|---| -| `Synthetic` variant + accessors | ~50 | -| `Derived` variant + recursive accessors | ~50 | -| `By` struct + builders | ~100 | -| Pattern-match migrations (~22 files, both new variants) | ~250 | +| `Generated` variant + `Anchor` + `AnchorRole` types | ~80 | +| Accessors (invocation_anchor, value_source_anchor, etc.) | ~60 | +| `By` struct + builders + `is_atomic_kind` | ~120 | +| `resolve_byte_range` / `map_offset` / `remap_file_ids` updates | ~40 | +| `root_file_id` + `collect_file_ids` accessors | ~50 | +| File-id walker consolidation (6 sites → 2 methods, net delete) | **-30** | +| Pattern-match migrations (~9 files, ~21 occurrences post-consolidation) | ~140 | | FilterProvenance construction site migrations | ~30 | -| Lua serde extension + back-compat (both variants) | ~80 | -| Test updates and new tests | ~200 | -| **Total** | **~760** | +| Lua serde extension + back-compat | ~80 | +| Test updates and new tests | ~280 | +| **Total** | **~850** | + +One to two focused sessions. The unified-variant design reduces the +total cost vs. the previous Synthetic-plus-Derived dual-variant draft +(every accessor and migration site collapses one arm). -One focused session, possibly stretching into a second given the slightly -larger scope from carrying Derived alongside Synthetic. +## Implementation surprises (recorded 2026-05-22 after Plan 4 landed) + +A few things diverged from the plan-as-written. Annotating them here so +Plan 5+ readers can adjust expectations. + +- **`gen` is a reserved keyword in current Rust.** Test locals and + method-receiver bindings must avoid the identifier `gen` (raw form + `r#gen` works but is ugly). The plan's pseudocode used + `gen.invocation_anchor()` / `gen.preimage_in()` etc. as shorthand; + in real code use `generated`, `g`, or destructure the variant. + Plan 7's `preimage_in` sketches should be amended before + implementation — the same trap applies. 
+ +- **Phase 1's "compiles cleanly" holds only for `quarto-source-map`, + not the workspace.** Adding the `Generated` variant immediately + triggered non-exhaustive-match errors across ~10 crates. Phase 3's + six-walker consolidation rescues part of it, but the workspace + doesn't build green again until **Phase 5** lands. The phase boundary + semantics are "the source-map crate plus directly-touched + consumers"; expect downstream crates to be red between Phase 1 and + Phase 5. Future plans that add new `SourceInfo` variants should plan + for a "transitional arms inline" interlude or accept that the + workspace is red mid-implementation. + +- **`extract_filename_index` was tests-only.** The plan suggested + "thin shim or inline at the few callers" — turned out the only + callers were the function's four dedicated tests plus one + commented-out reference in `pampa/src/writers/json.rs`. Deleted the + function and the four tests entirely; the equivalent coverage now + lives in `quarto-source-map`'s `test_root_file_id_per_variant`. + Cleaner than the plan anticipated. Future grep-and-replace plans + should re-verify caller counts at start-of-implementation, not just + at planning time. + +- **`anchors_with_role` returns `Box`, not `impl + Iterator`.** The plan's signature was + `-> impl Iterator>`, but the two match arms + return different concrete iterator types (a `filter_map` over the + anchor list for the `Generated` arm, `std::iter::empty()` for + everything else). The fix is `Box + 'a>`. Static + dispatch would require either a hand-rolled iterator enum or + `Either` from `itertools` — not worth it for a method called + in non-hot paths. + +- **`cargo xtask verify` modifies a second lockfile.** The WASM build + leg (Step 9, `npm run build:wasm`) re-resolves + `crates/wasm-quarto-hub-client/Cargo.lock`, which is distinct from + the workspace `Cargo.lock`. Both ended up in the Plan-4 commit. 
Not + a problem, but plans that touch any crate transitively used by + `wasm-quarto-hub-client` should expect that second lockfile to be + dirty after verification. + +- **The bd-3odjm carve-out behaved exactly as predicted.** Single + failure, in + `quarto-core::idempotence::lua_shortcode_lipsum_fixed`, panicking + with `MalformedSourceInfoPool` from the wire-code-3 collision + between writer (Generated → code 3 with `[filter_path, line]` + payload) and reader (code 3 = legacy `Transformed`). This is a + *non-surprise* worth recording: the plan's "Inherited pre-existing + failure" section was correct down to the test name, the panic + message, and the root cause. ## Notes -The conceptual surface is "two new variants, one of which (`Synthetic`) -generalizes `FilterProvenance`." The pattern-match migration touches many -files but most arms are mechanical — Synthetic behaves like FilterProvenance -for offset accessors (returns 0, 0); Derived recurses into `from`. +The conceptual surface is "one new variant, `Generated`, with a typed +anchor list." The pattern-match migration touches many files but most +arms are mechanical. -Per the open-struct decision, `By` is `{ kind, data }` rather than a closed -enum. Builder methods give ergonomic, self-documenting construction at known -call sites; `By::raw` lets extensions add kinds without modifying the type. -The same `By` value appears as the payload of both Synthetic and Derived — -many kinds can be either depending on context, though in practice they -correspond cleanly: +Per the open-struct decision, `By` is `{ kind, data }` rather than a +closed enum. Builder methods give ergonomic, self-documenting +construction at known call sites; `By::raw` lets extensions add kinds +without modifying the type. The `Anchor` list is typed throughout — +each entry's `source_info` is an `Arc`, not dynamic JSON. 
-| Kind | Variant | When used | -|---|---|---| -| `filter` | Synthetic | Lua filter constructions (`pandoc.Str(...)`) | -| `sectionize` | Synthetic | SectionizeTransform's section Divs | -| `title-block` | Synthetic | TitleBlockTransform's synthesized h1 | -| `footnotes` | Synthetic | FootnotesTransform's container Div | -| `appendix` | Synthetic | AppendixStructureTransform's wrapper Div | -| `tree-sitter-postprocess` | Synthetic | parser-side synthetic Spaces | -| `user-edit` | Synthetic | React-constructed nodes | -| `shortcode` | Derived | shortcode resolutions (Plan 6) | -| `include` | (wrapped, not Derived) | wrapper CustomNode in Plan 8 | -| `crossref-resolve` | (wrapped, not Derived) | already a CustomNode today | - -Reintroducing Derived was a reversal of an earlier "drop it" decision. -The reversal happened when we recognized that Original chains alone can't -distinguish "shortcode resolution" (atomic) from "filter mutation" -(non-atomic). Derived gives Plan 7 the type-level distinction it needs to -trigger AtomicViolation correctly. +The earlier `Synthetic`/`Derived` split was a useful intermediate during +design discussion (it crystallized the atomic-vs-not distinction), but +the unified `Generated` shape captures the same information with fewer +moving parts. The "has preimage" property becomes +`gen.invocation_anchor().is_some()` rather than a separate enum arm; +atomicity stays per-`by.kind`, orthogonal to anchor-presence. 
+ +| Kind | Variant | Anchors | When used | +|---|---|---|---| +| `filter` | Generated | `[]` (Dispatch later) | Lua filter constructions (`pandoc.Str(...)`) | +| `sectionize` | Generated | `[]` | SectionizeTransform's section Divs | +| `title-block` | Generated | `[]` | TitleBlockTransform's synthesized h1 | +| `footnotes` | Generated | `[]` | FootnotesTransform's container Div | +| `appendix` | Generated | `[]` | AppendixStructureTransform's wrapper Div | +| `tree-sitter-postprocess` | Generated | `[]` | parser-side synthetic Spaces | +| `user-edit` | Generated | `[]` | React-constructed nodes | +| `shortcode` | Generated | `[Invocation]` (`+ValueSource` later, `+Dispatch` later for Lua) | shortcode resolutions (Plan 6) | +| `include` | (wrapped CustomNode, source_info Original) | — | wrapper CustomNode in Plan 8 | +| `crossref-resolve` | (wrapped CustomNode, source_info Original) | — | already a CustomNode today | diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-5-wire-format.md b/claude-notes/plans/2026-05-04-q2-preview-plan-5-wire-format.md index 92a4dca4e..36fce02b6 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-5-wire-format.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-5-wire-format.md @@ -1,122 +1,261 @@ -# Plan 5 — JSON wire format extension + code-3 fix +# Plan 5 — JSON wire format extension for Generated -**Date:** 2026-05-04 +**Date:** 2026-05-04 (revised 2026-05-20) **Branch:** feature/q2-preview **Status:** Implementation plan (open questions named) -**Milestone:** none directly — fixes a latent bug, prepares wire for Plans 6/7/8 +**Milestone:** none directly — fixes a latent bug, prepares wire for + the rest of the provenance epic + +## Epic context + +Part of the **provenance epic** (Plans 3–8). Plan 5 carries the wire +format adjustments needed so the typed provenance Plan 4 introduces can +cross the WASM/JSON boundary and round-trip without information loss. 
+The file name keeps its q2-preview-plan-N form for continuity with +earlier discussion notes. ## Goal -Extend the source-info pool's JSON wire format to encode two new variants -introduced by Plan 4: `Synthetic { by: By }` and `Derived { from: SourceInfo, -by: By }`. In the same change, fix a latent bug: today's writer emits -`FilterProvenance` as type code `3` with payload `[filter_path, line]`, but -today's reader interprets code `3` as the long-removed `Transformed` variant -and tries to parse it as `[parent_id, ...]` — resulting in a hard -`MalformedSourceInfoPool` error on any AST that crosses the JSON boundary -with a FilterProvenance value in it. +Extend the source-info pool's JSON wire format to encode the +`Generated { by, from }` variant introduced by Plan 4. In the same +change, fix a latent bug: today's writer emits `FilterProvenance` as +type code `3` with payload `[filter_path, line]`, but today's reader +interprets code `3` as the long-removed `Transformed` variant and tries +to parse it as `[parent_id, ...]` — resulting in a hard +`MalformedSourceInfoPool` error on any AST that crosses the JSON +boundary with a FilterProvenance value in it. + +The latent bug doesn't surface in current main because `parse_qmd_to_ast` +doesn't run filters that produce FilterProvenance. **But the q2-preview +pipeline (already shipped via Plans 1–2) does run filters and +shortcodes**, and the latent bug becomes reachable as soon as a +built-in or user filter constructs a node whose JSON-serialized +source_info crosses the WASM boundary. Plan 5 is therefore higher +priority than the original "prepares wire for downstream plans" +framing suggested — it fixes a bug that's no longer latent in design, +only in reach. + +## Inherited failure that must close on Plan 5's first reader change (bd-3odjm) + +Plan 3's idempotence gate already ships a live reproduction of this +bug as a failing test on the integration branch. 
Plan 5 *inherits* +it as the canonical first-iteration target. + +- Test: `cargo nextest run -p quarto-core --test idempotence lua_shortcode_lipsum_fixed` + (orchestrator mode only; `SingleFile` passes — the pipeline itself + is idempotent). +- Beads issue: **bd-3odjm**. +- Symptom: `MalformedSourceInfoPool` from + `pampa::readers::json::read` re-parsing the orchestrator's AST JSON + for a lipsum-shortcode-bearing document. +- Pre-Plan-5 cause: code-3 collision (writer emits FilterProvenance + `[filter_path, line]`; reader decodes as legacy Transformed + `[parent_id, ...]`). + +**The contract:** the very first time Plan 5 runs the idempotence +suite after a reader change lands, `lua_shortcode_lipsum_fixed` must +go green. The full chain is: -The latent bug doesn't surface today because `parse_qmd_to_ast` doesn't run -the transforms that produce `FilterProvenance`. The instant Plan 1 enables -the q2-preview pipeline (which runs filters and shortcodes), we'd hit it. + 1. Plan 5 lands the legacy code-3 reader change (per §"Code 3 — + Legacy reader only" below) — recognize FilterProvenance's + string-array payload, produce + `Generated { by: filter, from: vec![] }`, fall through to + legacy Transformed for the numeric-array payload. + 2. `cargo nextest run -p quarto-core --test idempotence + lua_shortcode_lipsum_fixed` passes. + 3. The full Plan-3 idempotence suite is green (27/27). + +**If step 2 fails after the reader change**, the Plan-5 author has a +real signal: either the reader's discrimination between the two +code-3 shapes is wrong, or the lipsum path produces a code-3 shape +that neither arm handles. In that case, do not move on to other +Plan-5 work — the failing test on the integration branch is the +canonical reproduction and must be the focus until green. 
+ +This is also a positive: bd-3odjm is the most realistic Plan-5 +regression test available — a real fixture, a real pipeline, a real +round-trip — so it doubles as the smoke check before any of the +hand-constructed tests in §"Test plan" run. ## Scope ### In scope -- Add wire format code `4` for `Synthetic { by: By }`. Payload encoding: - `d` carries `{"kind": "...", "data": ...}` (or `{"kind": "..."}` if - `by.data` is null). -- Add wire format code `5` for `Derived { from, by }`. Payload encoding: - `d` carries `{"from": , "by": {"kind": "...", "data": ...}}`. - The `from` is interned in the source-info pool just like `Substring.parent`. -- Fix the code-3 reader. Today's reader interprets code 3 as Transformed and - tries to read a parent_id out of `data[0]`. Make it accept *both* shapes: - - **Legacy Transformed** (`data` is `[parent_id, ...]` of numbers): map to - `Substring` (current behavior), preserving back-compat for old JSON. - - **Latent FilterProvenance** (`data` is `[filter_path, line]` — string - then number): decode as `Synthetic { by: By::filter(filter_path, line) }`. - This recovers the FilterProvenance shape that was being silently corrupted. -- After the fix, the writer no longer emits code 3 for new content (codes 4 - and 5 cover everything). Code 3 becomes a read-only legacy compat path. -- Round-trip tests: every `SourceInfo` variant survives Rust → JSON → Rust - unchanged. +- Add wire format code `4` for `Generated { by, from }`. Payload + encoding: + ```json + { + "by": { "kind": "...", "data": }, + "from": [ + { "role": "", "si_id": }, + ... + ] + } + ``` + Outer `from` mirrors the Rust field name (`Generated.from`). Inner + `si_id` is the source-info pool reference — it points to another + entry in the pool, typically an `Original` covering the source bytes + the anchor describes. 
The name is deliberately distinct from + `Substring`'s `parent_id`: a Substring genuinely *has* a parent in + the chain (the slice's ancestor), but an anchor's reference is a + sideways pointer, not a containment relationship. `si_id` reads as + "source-info pool index" with no tree-structure overclaim. Multiple + anchors share an `si_id` naturally (multi-inline shortcode: every + resolved inline's `Invocation` anchor references the same token's + pool entry). +- Anchor role encoding: `"invocation"`, `"value-source"`, or + `"other:"` for `AnchorRole::Other(String)`. + Kebab-case throughout. +- Fix the code-3 reader. Today's reader interprets code 3 as + Transformed and tries to read a parent_id out of `data[0]`. Make it + accept *both* shapes: + - **Legacy Transformed** (`data` is `[parent_id, ...]` of numbers): + map to `Substring` (current behavior), preserving back-compat for + old JSON. + - **Latent FilterProvenance** (`data` is `[filter_path, line]` — + string then number): decode as `Generated { by: By::filter(filter_path, line), from: smallvec![] }`. + This recovers the FilterProvenance shape that was being silently + corrupted. +- After the fix, the writer no longer emits code 3 for new content (code + 4 covers everything). Code 3 becomes a read-only legacy compat path. +- **Code 5 is unassigned.** Earlier drafts proposed code 5 for a + separate `Derived` variant; that variant was unified into `Generated` + during the 2026-05-20 design discussion and never shipped. Code 5 + remains free for future reservation. +- Round-trip tests: every `SourceInfo` variant survives Rust → JSON → + Rust unchanged. ### Out of scope -- Lua serde changes (Plan 4 covers those — the Lua format is independent of - the JSON pool wire format). -- The wire format for `By.data` itself is just `serde_json::Value` (already - handled by serde derives on `By`). +- Lua serde changes (Plan 4 covers those — the Lua format is + independent of the JSON pool wire format). 
+- The wire format for `By.data` itself is just `serde_json::Value` + (already handled by serde derives on `By`). +- The metadata-loader changes that would populate `ValueSource` anchors + (separate follow-up; the wire format is forward-compatible — anchor + arrays simply gain entries when the resolver starts attaching them). +- Lua-file-registration that would convert `Dispatch` anchor data from + `by.data` into typed `Original`-backed anchors (separate follow-up; + wire-format forward-compatible the same way). ## Design decisions (settled in conversation) -- **Two new wire codes (4 and 5)**: Synthetic and Derived. The `Derived` - variant came back in the conversation after we saw that pure-provenance - alone couldn't distinguish "shortcode resolution" (atomic; user edits - prohibited at the writer level) from "filter mutation" (non-atomic; user - edits flow to source). Derived gives the type-level distinction. -- **Code 3 stays as a legacy reader** — fixes the latent bug AND retires - `FilterProvenance` in one step. The reader recognizes both old shapes - (legacy Transformed array of numbers; FilterProvenance `[filter_path, line]`) - and dispatches accordingly. Post-Plan 5, writers never emit code 3. -- **Verbose keys (`kind`, `data`, `from`, `by`) over compact ones** at the - payload level for self-documentation. The wire format's outer fields - (`t`, `r`, `d` at the SourceInfoJson level) stay compact for consistency - with existing code. +- **One new wire code (4)**, not two. The original Plan 4 / 5 drafts + split `Synthetic` (code 4) and `Derived` (code 5). The unified + `Generated` variant collapses these. Code 5 remains unassigned. +- **Typed anchor list at the wire level.** Each entry in the `from` + array carries a `role` string and an `si_id` pool reference. This + keeps the source-info chain typed even at the wire boundary — + `si_id` refers to another pool entry, never an inlined object. 
+- **Code 3 stays as a legacy reader** — fixes the latent bug AND + retires `FilterProvenance` in one step. The reader recognizes both + old shapes (legacy Transformed array of numbers; FilterProvenance + `[filter_path, line]`) and dispatches accordingly. Post-Plan 5, + writers never emit code 3. +- **`from` is one name across three layers, with different inner types + at each layer.** Worth knowing before reading any one layer in + isolation: + - **User-facing (`quarto-source-map`):** `SourceInfo::Generated.from: + SmallVec<[Anchor; 2]>` where `Anchor { role, source_info: Arc<SourceInfo> }`. + Carries actual `Arc<SourceInfo>` references. + - **Writer-internal (`writers/json.rs`):** `SerializableSourceMapping::Generated.from: + Vec<(AnchorRole, usize)>` where the `usize` is the pool ID returned + by `intern` for the anchor's source_info. Same semantic concept, + flattened to pool IDs. + - **On the wire (JSON):** `"from": [{ "role": "...", "si_id": <pool-id> }, ...]`, + omitted when empty. Same data, JSON-shaped. + The name `from` is preserved at every layer so the implementer can + read top-down without renames; the inner type changes are + deliberate (Arc → ID → JSON) and follow the pattern already + established by `Substring.parent` → `parent_id`. +- **Verbose keys (`kind`, `data`, `by`, `from`, `role`, `si_id`)** + at the payload level for self-documentation. The wire format's outer + fields (`t`, `r`, `d` at the SourceInfoJson level) stay compact for + consistency with existing code. The asymmetry is intentional: outer + fields appear once per pool entry across the whole pool (N×K bytes + for K outer fields, repeated for each of N entries — the compact + names amortize across thousands of entries), while the inner payload + keys appear only inside Generated entries (a minority of pool entries + — most are Substring/Original from parsing). 
Document-level overhead + from the verbose payload keys is empirically small; clarity at the + new boundary outweighs it. 
Pool JSON is also gzipped on the wire in + the orchestrator and hub-client transports, which collapses the + repeated short keys further. ## Concrete wire format -### Code 4 — Synthetic +### Code 4 — Generated -The source-info pool entry for a `Synthetic` value: +The source-info pool entry for a `Generated` value with **no anchors** +(pure synthesis — sectionize, filter, title-block, footnotes, appendix, +tree-sitter-postprocess, user-edit): ```json -{ - "t": 4, - "r": [0, 0], - "d": { "kind": "filter", "data": { "filter_path": "/path/to/f.lua", "line": 42 } } -} +{ "t": 4, "r": [0, 0], "d": { "by": { "kind": "sectionize" } } } ``` -For kinds without per-instance data: - ```json -{ "t": 4, "r": [0, 0], "d": { "kind": "sectionize" } } +{ "t": 4, "r": [0, 0], "d": { "by": { "kind": "filter", "data": { "filter_path": "/path/to/f.lua", "line": 42 } } } } ``` -(`"data"` field omitted when the inner `By.data` is null, per the serde -`skip_serializing_if` on the `By` struct from Plan 4.) +(The `"data"` field is omitted when `By.data` is `null`, per the serde +`skip_serializing_if` on `By`. The `"from"` field is omitted when the +list is empty.) 
-### Code 5 — Derived +The source-info pool entry for a `Generated` value with **one +Invocation anchor** (shortcode resolution): -The source-info pool entry for a `Derived` value: +```json +{ + "t": 4, + "r": [0, 0], + "d": { + "by": { "kind": "shortcode", "data": { "name": "meta" } }, + "from": [ + { "role": "invocation", "si_id": 7 } + ] + } +} +``` + +The source-info pool entry for a `Generated` value with **multiple +anchors** (future: a shortcode resolution that also records its value +source after the metadata-loader follow-up lands): ```json { - "t": 5, + "t": 4, "r": [0, 0], "d": { - "from": 7, - "by": { "kind": "shortcode", "data": { "name": "meta" } } + "by": { "kind": "shortcode", "data": { "name": "meta" } }, + "from": [ + { "role": "invocation", "si_id": 7 }, + { "role": "value-source", "si_id": 12 } + ] } } ``` -The `from` field is a pool ID referencing another entry in the source-info -pool — typically an `Original` entry covering the shortcode token's bytes. -The `by` carries the same shape as Synthetic's `d` (`{kind, data}` with -`data` optional). +The pool entry's `r: [0, 0]` because Generated doesn't carry its own +offsets — ranges are obtained via the `resolve_byte_range` / +`preimage_in` chain-walk through the `Invocation` anchor. -The pool entry's `r: [0, 0]` because Derived doesn't carry its own offsets -— ranges are obtained via the `preimage_in` walk through the `from` chain. +### Code 3 — Legacy reader only -## The dual-shape code-3 reader +Post-Plan-5 writers never emit code 3. The arm exists only to read +pre-Plan-5 JSON. Two shapes are possible and the dispatch order is +**numeric-first, then string-headed** — JSON `Number` and `String` are +disjoint types, so the order is unambiguous; numeric goes first because +legacy `Transformed` is the historically larger producer. ```rust 3 => { - // Legacy code-3: either old `Transformed` (data is [parent_id, ...]) - // or the buggy FilterProvenance writer (data is [filter_path, line]). 
+ // Legacy code-3 reader. Writers no longer emit code 3. + // - Legacy Transformed: data = [parent_id, ...] (number-headed) + // - Latent FilterProvenance: data = [filter_path, line] (string-headed) + // Both shapes are read strictly — `MalformedSourceInfoPool` on any + // length/type mismatch (same convention as the Substring / Concat + // arms above). let array = data.as_array().ok_or(MalformedSourceInfoPool)?; if array.is_empty() { return Err(MalformedSourceInfoPool); } @@ -127,10 +266,12 @@ The pool entry's `r: [0, 0]` because Derived doesn't carry its own offsets // ...current logic... SourceInfo::Substring { parent: ..., start_offset, end_offset } } else if let Some(filter_path) = array[0].as_str() { - // Latent FilterProvenance shape. Decode to Synthetic. - let line = array.get(1).and_then(|v| v.as_u64()).unwrap_or(0) as usize; - SourceInfo::Synthetic { + // Latent FilterProvenance shape: must be exactly [path, line]. + if array.len() != 2 { return Err(MalformedSourceInfoPool); } + let line = array[1].as_u64().ok_or(MalformedSourceInfoPool)? as usize; + SourceInfo::Generated { by: By::filter(filter_path.to_string(), line), + from: smallvec![], } } else { return Err(MalformedSourceInfoPool); @@ -138,173 +279,879 @@ The pool entry's `r: [0, 0]` because Derived doesn't carry its own offsets } ``` -Future writers don't emit code 3. Eventually code 3 can be retired entirely -(once we're confident no on-disk JSON files contain it), but for now it's a -no-cost read-only compat shim. +Future writers don't emit code 3. Eventually code 3 can be retired +entirely (once we're confident no on-disk JSON files contain it), but +for now it's a no-cost read-only compat shim. -## The new code-4 reader +### Code 4 — Reader / writer ```rust 4 => { - // Synthetic { by: By } - let by_obj = data.as_object().ok_or(MalformedSourceInfoPool)?; - let kind = by_obj.get("kind") - .and_then(|v| v.as_str()) - .ok_or(MalformedSourceInfoPool)? 
- .to_string(); - let data = by_obj.get("data").cloned().unwrap_or(Value::Null); - SourceInfo::Synthetic { by: By { kind, data } } + // Generated { by, from }. The outer `r` field is parsed by the + // caller and *ignored here* — Generated entries don't carry their + // own offsets; ranges come from chain-walking the Invocation anchor + // via `resolve_byte_range` / `preimage_in`. The writer hard-codes + // `r: [0, 0]` for code-4 entries; downstream code that reads `r` + // directly will see zeros — that's the signal to walk the anchor + // chain instead. A code-4 entry with `r != [0, 0]` from an + // older/future writer is silently accepted (precedent: today's + // Concat arm also parses `r` but doesn't use it). + // + // Strict on every other shape: missing `by`, `by.kind`, `from` entry + // missing `role`/`si_id`, `from` present but not an array, or an + // `Other("")` role string → `MalformedSourceInfoPool`. Same + // convention as the Substring/Concat arms above. + let obj = data.as_object().ok_or(MalformedSourceInfoPool)?; + let by_obj = obj.get("by").and_then(|v| v.as_object()) + .ok_or(MalformedSourceInfoPool)?; + let kind = by_obj.get("kind").and_then(|v| v.as_str()) + .ok_or(MalformedSourceInfoPool)?.to_string(); + let by_data = by_obj.get("data").cloned().unwrap_or(Value::Null); + let by = By { kind, data: by_data }; + + let mut from = SmallVec::<[Anchor; 2]>::new(); + match obj.get("from") { + None => {} // absent ≡ empty (writer skips empty `from`) + Some(v) => { + let from_arr = v.as_array().ok_or(MalformedSourceInfoPool)?; + for entry in from_arr { + let entry_obj = entry.as_object() + .ok_or(MalformedSourceInfoPool)?; + let role_str = entry_obj.get("role").and_then(|v| v.as_str()) + .ok_or(MalformedSourceInfoPool)?; + let role = parse_anchor_role(role_str)?; + let si_id = entry_obj.get("si_id").and_then(|v| v.as_u64()) + .ok_or(MalformedSourceInfoPool)? 
as usize; + if si_id >= current_index { + return Err(CircularSourceInfoReference(si_id)); + } + let si = pool.get(si_id).cloned() + .ok_or(InvalidSourceInfoRef(si_id))?; + from.push(Anchor { role, source_info: Arc::new(si) }); + } + } + } + + SourceInfo::Generated { by, from } +} + +fn parse_anchor_role(s: &str) -> Result { + match s { + "invocation" => Ok(AnchorRole::Invocation), + "value-source" => Ok(AnchorRole::ValueSource), + _ => { + let name = s.strip_prefix("other:") + .ok_or(MalformedSourceInfoPool)?; + if name.is_empty() { return Err(MalformedSourceInfoPool); } + Ok(AnchorRole::Other(name.to_string())) + } + } } ``` -The new code-4 writer: +Writer: ```rust -SerializableSourceMapping::Synthetic { by } => { +SerializableSourceMapping::Generated { by, from } => { let mut by_json = json!({ "kind": by.kind }); - if !by.data.is_null() { - by_json["data"] = by.data.clone(); + if !by.data.is_null() { by_json["data"] = by.data.clone(); } + + let mut d = json!({ "by": by_json }); + if !from.is_empty() { + let arr: Vec = from.iter() + .map(|(role, si_id)| json!({ + "role": serialize_anchor_role(role), + "si_id": si_id, + })) + .collect(); + d["from"] = Value::Array(arr); + } + + (4, d) +} + +fn serialize_anchor_role(role: &AnchorRole) -> String { + match role { + AnchorRole::Invocation => "invocation".to_string(), + AnchorRole::ValueSource => "value-source".to_string(), + AnchorRole::Other(s) => format!("other:{}", s), } - (4, by_json) } ``` -(start_offset and end_offset for Synthetic are both 0 — there's no source -range. The writer continues to emit `r: [0, 0]`.) +The serializer interns each anchor's `source_info` into the pool when +first encountered and reuses the ID on later references — the same +`arc_parent_ids` HashMap pattern already used for `Substring.parent`. +Multi-inline shortcode resolution thus produces N `Generated` entries, +each with one `Invocation` anchor, all referencing the same pool ID for +the shortcode token's `Original` entry. 
-## The new code-5 reader/writer +### TypeScript wire-format definitions -```rust -5 => { - // Derived { from: Arc, by: By } - let obj = data.as_object().ok_or(MalformedSourceInfoPool)?; - let from_id = obj.get("from") - .and_then(|v| v.as_u64()) - .ok_or(MalformedSourceInfoPool)? as usize; - if from_id >= current_index { - return Err(CircularSourceInfoReference(from_id)); - } - let from = pool.get(from_id).cloned().ok_or(InvalidSourceInfoRef(from_id))?; - let by_obj = obj.get("by").and_then(|v| v.as_object()) - .ok_or(MalformedSourceInfoPool)?; - let kind = by_obj.get("kind").and_then(|v| v.as_str()) - .ok_or(MalformedSourceInfoPool)?.to_string(); - let by_data = by_obj.get("data").cloned().unwrap_or(Value::Null); - SourceInfo::Derived { from: Arc::new(from), by: By { kind, data: by_data } } -} +`ts-packages/preview-renderer/src/types/sourceInfo.ts` is a hand-mirror +of the Rust wire format. Earlier provenance-epic churn (during the +2026-05-20 design discussion) left it carrying a stale forward-declared +split: code 4 = `Synthetic { d: By }`, code 5 = `Derived { d: { from, by } }`. +That split never shipped. 
Plan 5 reconciles the file with the unified +Generated design: + +**Before Plan 5 (current file):** + +```ts +export type SourceInfoEntry = + | { t: 0; r: [number, number]; d: number } + | { t: 1; r: [number, number]; d: number } + | { t: 2; r: [number, number]; d: Array<[number, number, number]> } + | { t: 3; r: [number, number]; d: [string, number] } + | { t: 4; r: [0, 0]; d: By } // Synthetic — never shipped + | { t: 5; r: [0, 0]; d: { from: number; by: By } }; // Derived — never shipped ``` -Writer: +**After Plan 5:** -```rust -SerializableSourceMapping::Derived { from_id, by } => { - let mut by_json = json!({ "kind": by.kind }); - if !by.data.is_null() { by_json["data"] = by.data.clone(); } - (5, json!({ "from": from_id, "by": by_json })) +```ts +export interface AnchorRef { + role: string; // "invocation" | "value-source" | "other:<name>" + si_id: number; // index into the source-info pool } + +export type SourceInfoEntry = + | { t: 0; r: [number, number]; d: number } // Original + | { t: 1; r: [number, number]; d: number } // Substring + | { t: 2; r: [number, number]; d: Array<[number, number, number]> } // Concat + | { t: 3; r: [number, number]; d: [string, number] | [number, ...number[]] } // legacy reader only (no new writes) + | { t: 4; r: [0, 0]; d: { by: By; from?: AnchorRef[] } }; // Generated +// code 5 — unassigned, free for future reservation ``` -`from_id` is an interned pool ID, the same way `Substring.parent_id` works. -The serializer interns the `from` SourceInfo when first encountered and -reuses the ID on later references — natural deduplication for shortcode -resolutions where many resolved nodes share the same `from`. +Changes vs. current file: + +- Code 4's `d` shape narrows from bare `By` to `{ by: By; from?: AnchorRef[] }`. +- Code 5's entry is removed entirely. It was never emitted by any + shipping writer; no on-disk artifact carries it. Removing the variant + is safe.
+- Code 3's `d` shape widens to a union to reflect the dual-shape legacy + reader (string-headed = FilterProvenance, numeric-headed = old + Transformed). New writers don't emit code 3 either way, so this is a + read-side typing only. +- `from?` is absent (not `[]`) when empty — writer skips the field via + `if !from.is_empty()`. TS consumers use `entry.d.from ?? []` as the + canonical access pattern; absent and `[]` are treated equivalently. +- The file's header doc-comment (lines 10–19 of the current file) + references `Synthetic` and `Derived` by name and says "Plan 5 wires + this up." Rewrite it to describe Generated instead and drop the + Synthetic/Derived nomenclature. + +**`utils/sourceInfo.ts` reconciliation** (full enumeration of the +"audit" called for in Phase 5): + +- `entryFor(node, pool)` — unchanged. +- `isDerived(node, pool)` — **delete entirely.** It checks `entry?.t === 5`, + which after Plan 5 is unreachable (code 5 unassigned). Any caller + still using it migrates to `isAtomicSourceInfo`. +- `isAtomicSourceInfo(node, pool, atomicKinds)` — rewrite. The current + body branches on `entry.t === 5` (always atomic) OR + `entry.t === 4 && atomicKinds.has(entry.d.kind)`. After Plan 5: only + `entry.t === 4 && atomicKinds.has(entry.d.by.kind)` — the `kind` + field moves from `entry.d.kind` to `entry.d.by.kind`, and the code-5 + branch is removed. +- `ATOMIC_SYNTHETIC_KINDS` constant (currently empty) — **rename to + `ATOMIC_KINDS`** to match the Rust canonical name `By::is_atomic_kind`, + and populate with the Plan-4 atomic set: + `new Set(["filter", "shortcode", "title-block", "tree-sitter-postprocess"])`. + The accompanying doc-comment ("mirrors `By::is_atomic_synthesizer()`") + is updated to "mirrors `By::is_atomic_kind()`." + +The TS type and the Rust serializer must agree byte-for-byte; the +header doc-comment cites the Rust file as the source of truth, same +convention as for the atomic-CustomNodes registry. + +## Work items + +Phase-ordered. 
Each phase compiles cleanly **and leaves the workspace +fully green** before the next begins. Phase 1 lands on its own as the +bd-3odjm fix even if the rest of Plan 5 stalls. + +**Ordering note.** The naive 1 → 2 → 3 → 4 order would break round-trip +between Phase 2 (writer emits code 4) and Phase 4 (reader decodes code +4) — every fixture containing a filter or shortcode would fail with +`MalformedSourceInfoPool` on code 4 in that window. The order below +puts the code-4 reader (renumbered Phase 2) before the writer change +so each phase leaves the workspace green. Phases 3 (writer) and 4 +(streaming writer) **must land atomically** as a single commit/squash +because Phase 3 removes `SerializableSourceMapping::FilterProvenance`, +which the streaming writer references — splitting them produces a +build break. + +### Phase 0 — Start gate + +- [x] Confirm Plan 4 (Generated + By + Anchor + AnchorRole) has merged + into `feature/provenance`. If not, stop — Plan 5 cannot build. + Verify with `git grep -n "enum SourceInfo" crates/quarto-source-map/src/source_info.rs` + and confirm a `Generated` arm exists. +- [x] Confirm the Plan-4 interim writer state is present in + `crates/pampa/src/writers/json.rs`: a `SourceInfo::Generated { by, .. }` + arm in `SourceInfoSerializer::intern` that recovers + `(filter_path, line)` via `by.as_filter().expect(...)` and emits + `SerializableSourceMapping::FilterProvenance`. This is the arm + Phase 3 rewrites. As of Plan 4's commit, the arm lives around + `writers/json.rs:314-331`; refresh before implementing. Verify + with `git grep -n "Plan 5's wire-code 4 emitter" crates/pampa/src/writers/json.rs` + — exactly one hit (Plan 4's `expect` message). +- [x] Confirm `SerializableSourceMapping::FilterProvenance` still + exists as a variant in `writers/json.rs` (it does post-Plan-4 — + Plan 4 deliberately kept the *serializable* enum variant even + though the source-map variant is gone, because the interim + writer arm above still emits it). 
Verify with + `git grep -n "SerializableSourceMapping::FilterProvenance" crates/pampa/` + — expect ~4 hits (writer's `to_json` arm, the interim `intern` + arm above, the streaming writer's two arms in + `stream_write_source_info_pool`). All four go away in Phase 3+4. +- [x] Confirm no on-disk JSON snapshots carry code-3 entries that the + new dual-shape reader would need to decode. Verified at planning + time: `grep -rn '"t":3\|"t": 3' crates/ tests/ hub-client/` + returns zero hits and `grep -rln 'FilterProvenance' crates/pampa/snapshots + crates/pampa/tests/snapshots crates/quarto-core/tests/snapshots` + is also empty. Re-run before starting Phase 1 to confirm nothing + has been added in the interim. **No fixture migration needed.** + +### Phase 1 — Legacy code-3 dual-shape reader (closes bd-3odjm) + +- [x] Add `parse_anchor_role` helper in `crates/pampa/src/readers/json.rs` + (used by Phase 2 too — landing it here is a no-op until then). +- [x] Rewrite the code-3 arm in `SourceInfoDeserializer::new` (currently + `crates/pampa/src/readers/json.rs:252-283`) per §"Code 3 — Legacy + reader only": dispatch on `data[0]` numeric → legacy Substring; + string → strict `[path, line]` decode to `Generated { by: + By::filter(path, line), from: smallvec![] }`; otherwise + `MalformedSourceInfoPool`. No silent `unwrap_or(0)` — line must + be a number or the entry is malformed. +- [x] Rewrite the code-3 reader's doc-comment to: + "Legacy reader for code 3 — accepts both old Transformed + numeric-array and buggy FilterProvenance string-array; writers + never emit code 3." +- [x] Run `cargo nextest run -p quarto-core --test idempotence lua_shortcode_lipsum_fixed` + → green (closes bd-3odjm). +- [x] Run the full Plan-3 idempotence suite → 27/27 green. +- [x] **Per-phase verification gate:** `cargo nextest run --workspace` + → all green. bd-3odjm closed; no regressions.
Phase 1 is + independently revertible (the reader change is purely additive + — restoring the prior arm removes only the new FilterProvenance + recovery branch). +- [x] **Rollback signal:** the Phase-1 reader change only touches the + code-3 arm; other code-paths and other pool entries are + unaffected. If a Plan-3 idempotence case *other than* + `lua_shortcode_lipsum_fixed` regresses (or a workspace test + outside the idempotence suite regresses), that is a real signal + — either the dual-shape discriminator misclassifies a payload + shape that *isn't* the buggy FilterProvenance, or the new + `Generated` recovery loses information a downstream test + depended on. Do not paper over it by relaxing the strict + rejection rules. Investigate the failing case's pool entries + with `jq '.astContext.sourceInfoPool'` on the offending fixture's + JSON, identify which code-3 entries are present, and decide + whether the discriminator needs an additional case or the failing + test had a buggy pre-existing expectation. Either way, file a + beads issue. + +### Phase 2 — Code-4 reader + +Lands before any writer change so the reader is forward-compatible +when Phase 3 starts emitting code 4. Phase 2 alone leaves the workspace +green: no production code emits code 4 yet, so the new arm is exercised +only by hand-constructed tests. + +- [x] Add a `4 => { … }` arm in `SourceInfoDeserializer::new` + (`readers/json.rs:154-287`) per §"Code 4 — Reader / writer": + decode `by` (kind + optional data), decode `from` array entries + via `parse_anchor_role` + `si_id`, with the `si_id < current_index` + circular-ref guard. +- [x] Reject malformed code-4 payloads with `MalformedSourceInfoPool`: + missing `by`; missing `by.kind`; `from` present but not an array; + `from` entry not an object; `from` entry missing `role`; `from` + entry missing `si_id`; unrecognized role string; `Other("")` with + empty suffix. 
See §"Code 4 — Reader / writer" for the full + snippet — same strictness as the Substring/Concat arms. +- [x] Silently accept code-4 entries with `r != [0, 0]` (one-line + comment in the arm; precedent: today's Concat arm). +- [x] Add the forward-compat unit tests in `readers/json.rs::tests` — + see Phase 6 for the full list of tests landing here. + +### Phase 3 — Writer code-4 emit (`SerializableSourceMapping` + intern + `to_json`) **+ Phase 4 streaming-writer parity, landed atomically** + +Phases 3 and 4 (below) must land in one commit / squash: Phase 3 +removes `SerializableSourceMapping::FilterProvenance`, which Phase 4's +streaming writer references — splitting them produces a build break. -`r: [0, 0]` for Derived too — offsets are recovered through the chain via -`preimage_in` (Plan 7), not stored on the Derived entry itself. +Starting state from Plan 4: `SourceInfo::FilterProvenance` is gone, but +`SerializableSourceMapping::FilterProvenance` survives because Plan 4's +interim writer arm (see Plan 4 §"Migrations", `pampa/src/writers/json.rs:314`) +converts `SourceInfo::Generated { by: filter, .. }` into the legacy +shape via `by.as_filter().expect(...)`. That arm panics for non-filter +Generated kinds, so the workspace only stays buildable as long as no +non-filter Generated is constructed — Plan 6 doesn't ship shortcode +stamping until later, so Plan 4's expect is safe in the interim. +Phase 3 removes both the interim arm and the `SerializableSourceMapping::FilterProvenance` +variant at once. + +- [x] Add `Generated { by: By, from: Vec<(AnchorRole, usize)> }` to + `SerializableSourceMapping` in `crates/pampa/src/writers/json.rs`. +- [x] Replace Plan 4's interim `SourceInfo::Generated { by, .. } => … + SerializableSourceMapping::FilterProvenance` arm with a real + `SerializableSourceMapping::Generated { … }` construction (no more + `by.as_filter().expect(...)`); supports all `by.kind` values + uniformly. 
+- [x] Remove `SerializableSourceMapping::FilterProvenance` (no longer + reachable after the interim arm above is rewritten). +- [x] Update `SourceInfoSerializer::intern` (`writers/json.rs:260-333`): + - Recognize `SourceInfo::Generated { by, from }`. + - **Recursively intern each anchor's `source_info` BEFORE pushing + the parent pool entry** (same pattern as today's `Concat` and + `Substring` arms), so anchor `si_id`s are strictly less than + the Generated's own id. The reader's `si_id < current_index` + guard requires this invariant. + - **Reuse the existing `arc_parent_ids` cache** (keyed by + `Arc::as_ptr(&anchor.source_info)`) for anchor dedup. Same cache, + same key shape as `Substring.parent`. Multi-inline shortcode + resolutions (every resolved inline shares one `Arc` for the + token's `Original`) hit the cache and produce a single pool + entry for the shared target — exactly the dedup behavior the + "Anchor dedup test" in Phase 6 verifies. + - Build the **`intern`-match-arm return tuple** as + `(0, 0, SerializableSourceMapping::Generated { by, from: from_ids })` + — `intern` returns `(start_offset, end_offset, mapping)`; the + `r: [0, 0]` rule is enforced by hard-coding the first two + components to zero, exactly as today's FilterProvenance arm at + lines 314-322 does. +- [x] Update `SerializableSourceInfo::to_json` (`writers/json.rs:169-190`) + with the code-4 arm per §"Code 4 — Reader / writer". +- [x] Add `serialize_anchor_role` helper. +- [x] Update the `SourceInfoJson.t` legend comment at + `writers/json.rs:115` from + `"0=Original, 1=Substring, 2=Concat, 3=FilterProvenance"` to + `"0=Original, 1=Substring, 2=Concat, 3=Legacy (read-only), 4=Generated"`. + +### Phase 4 — Streaming writer parity (atomic with Phase 3) + +- [x] Add the code-4 arm in `stream_write_source_info_pool` + (`writers/json.rs:3482-3532` as of `eb06c4cf`; refresh before + implementing); mirror the `to_json` shape exactly. 
+- [x] Remove the FilterProvenance arms (lines 3509-3514 emit, line 3526 + tag as of `eb06c4cf`). They become unreachable once + `SerializableSourceMapping::FilterProvenance` is gone from Phase 3. + +### Phase 5 — TypeScript types + +- [x] Rewrite `ts-packages/preview-renderer/src/types/sourceInfo.ts` + per §"TypeScript wire-format definitions": + - Add `AnchorRef` interface. + - Code 4's `d` becomes `{ by: By; from?: AnchorRef[] }`. + - Code 3's `d` becomes `[string, number] | [number, ...number[]]`. + - Remove the code-5 entry. + - Rewrite the header doc-comment to describe Generated, not + Synthetic/Derived. The current header cites + `crates/pampa/src/writers/json.rs:54-91`, which is stale (the + wire-format types now live at ~lines 109-207 of that file). The + new doc-comment should cite **two** sources of truth: the Rust + enum `SourceInfo` in + `crates/quarto-source-map/src/source_info.rs` (canonical + producer-side definition) and the JSON wire mirror in + `crates/pampa/src/writers/json.rs` (`SerializableSourceMapping` + ~lines 193-207, `SourceInfoJson` ~lines 109-116, code-4 + serializer in `to_json` ~lines 167-190). Do not bake in exact + line numbers — cite the type names; they will outlast line + drift. +- [x] Update `ts-packages/preview-renderer/src/utils/sourceInfo.ts` per + §"TypeScript wire-format definitions" → `utils/sourceInfo.ts` + reconciliation: + - Delete `isDerived` entirely. + - Rewrite `isAtomicSourceInfo` to read `entry.d.by.kind` (was + `entry.d.kind`) and drop the code-5 branch. + - **Rename** `ATOMIC_SYNTHETIC_KINDS` → `ATOMIC_KINDS` to match + the Rust canonical `By::is_atomic_kind`. + - Populate `ATOMIC_KINDS` with `new Set(["filter", "shortcode", + "title-block", "tree-sitter-postprocess"])` (mirrors Plan 4's + `By::is_atomic_kind`). + - Update the file's doc-comment from "mirrors + `By::is_atomic_synthesizer()`" to "mirrors `By::is_atomic_kind()`." 
+ - Migrate any remaining `isDerived` callers (`grep -rn isDerived ts-packages/`) + to the new `isAtomicSourceInfo` shape. +- [x] Update `ts-packages/preview-renderer/src/utils/sourceInfo.test.ts` + — the existing tests will not compile after the changes above. + Specifically: + - Drop the `import { isDerived, ATOMIC_SYNTHETIC_KINDS }` lines + and the entire `describe('isDerived', …)` block. `isDerived` is + gone; `ATOMIC_SYNTHETIC_KINDS` is renamed `ATOMIC_KINDS` and now + populated (the existing `is empty in 2A` assertion no longer + holds). + - Rewrite `samplePool`: + - Drop the code-5 entry entirely (code 5 is unassigned post-Plan-5). + - Reshape the code-4 entry from `d: { kind: 'IncludeShortcode' }` + (bare `By`) to `d: { by: { kind: 'shortcode', data: { name: 'meta' } } }` + (no `from` — absent is the canonical empty form). Add a second + code-4 entry with `from: [{ role: 'invocation', si_id: 0 }]` + so the `entry.d.from ?? []` access pattern is exercised. + - Reshape the code-3 entry: keep one with `d: ['filter.lua', 42]` + (string-headed legacy FilterProvenance) and add a sibling with + `d: [0]` (numeric-headed legacy Transformed) to exercise the + new dual-shape `d` type. + - Rewrite the `isAtomicSourceInfo` describe block: the + "Synthetic vs Derived" framing is dead. Drive new assertions + against `ATOMIC_KINDS` populated with the Plan-4 atomic set, + using a code-4 entry whose `by.kind` is `"shortcode"` (atomic) + and another whose `by.kind` is `"sectionize"` (non-atomic). + - Add an `ATOMIC_KINDS` describe block asserting the four + Plan-4 atomic kinds are members and at least one non-atomic kind + (`"sectionize"`) is not. Replaces the deleted + `ATOMIC_SYNTHETIC_KINDS` block. + - Run `cd hub-client && npm run build:all` after the rewrite — the + production build (`tsc -b && vite build`) is stricter than + `tsc --noEmit` / vitest and catches type-narrowing errors that + unit tests miss.
+ +### Phase 6 — Tests + +**Test placement.** All tests are hand-written (no proptest in this +file; the repo doesn't use it heavily). Unit tests extend the existing +test modules; the end-to-end integration test extends the existing +integration crate: + +- Writer-side unit tests → `crates/pampa/src/writers/json.rs::tests` + (joins the existing `test_source_info_pool_*` cluster at + `writers/json.rs:3688+`). +- Reader-side unit tests → `crates/pampa/src/readers/json.rs::tests` + (joins the existing `test_deserialize_source_info_pool_*` cluster at + `readers/json.rs:2479+`). +- End-to-end integration test → `crates/pampa/tests/json_reader_smoke_tests.rs` + (existing integration crate that drives file fixtures through + `pampa::readers::json::read`). + +Per-phase landing: forward-compat tests for the code-4 reader and the +legacy code-3 recovery test land with Phase 1/2 (reader-only); writer +round-trips, dedup, and the end-to-end test land with Phases 3+4 once +the writer emits code 4. + +**Tests:** + +- [x] Round-trip property test for every `SourceInfo` variant (Original, + Substring, Concat, Generated with various By kinds and `from` + configurations). Hand-written cases (one per shape). See §Test + plan. +- [x] Concat-of-Generated round-trip case: a `Concat { pieces }` whose + pieces' `source_info` is `Generated`. Serialize → deserialize → + assert structural equality. Closes a coverage gap — current + production paths emit this shape (e.g. coalesced filter-emitted + spans). Sits in the writer-side test module since it exercises + the recursive intern of mixed-variant pieces. +- [x] Substring-of-Generated round-trip case: a + `Substring { parent: Arc::new(Generated { … }), … }` — e.g. a + filter-emitted span whose substring is later coalesced. The + writer's existing `intern` recursion routes + `Substring.parent: Arc` through the new code-4 path + with no extra logic, and the reader's existing Substring arm + reads the parent_id back as a code-4 pool entry. 
The test serves + as a regression guard for that path: confirm pool ordering + (parent Generated entry interns strictly before the Substring + child) and assert structural equality across serialize → + deserialize. Co-located with the Concat-of-Generated case in + the writer-side test module. +- [x] Filter-provenance recovery test (hand-constructed code-3 with + string-array payload → `Generated { by: filter, from: smallvec![] }`). +- [x] Legacy Transformed back-compat test (hand-constructed code-3 with + numeric-array payload → `Substring`). +- [x] Strict code-3 rejection tests: `[path]` (missing line) and + `[path, "not-a-number"]` (non-numeric line) both + → `MalformedSourceInfoPool`. Guards the no-`unwrap_or(0)` rule. +- [x] Forward-compat test (code-4 with unknown `by.kind`, arbitrary + `data` → preserved round-trip). +- [x] Strict code-4 rejection tests: missing `by`, missing `by.kind`, + `from` present but not an array, `from` entry not an object, + `from` entry missing `role`/`si_id`, role string `"other:"` + (empty suffix) → all `MalformedSourceInfoPool`. +- [x] **Anchor dedup test (writer-side only).** Hand-construct an AST + with N inlines, each carrying + `Generated { by: By::shortcode("meta"), from: smallvec![Anchor::invocation(Arc::clone(&shared))] }`. + Serialize. Assert: the pool contains the shared target exactly + once and every Generated entry's `from[0].si_id` references that + single ID. **Read-side note:** deserialization rebuilds each anchor + with a fresh `Arc`, so a subsequent re-serialization produces N + copies — this test verifies the *write-time* optimization keyed + on `Arc::as_ptr`. See [[anchor-dedup-invariant]] in §"Risk areas" + for the broader contract. Test passes Plan-5-alone (no shortcode + resolver needed — Arc sharing is hand-wired). +- [x] Streaming-writer parity test. 
Helper shape: + `roundtrip_via_stream(ast) -> ast` that calls `stream_write_pandoc` + into a `Vec<u8>`, reads back via `pampa::readers::json::read`, + and asserts SourceInfo equality at chosen Generated nodes. The + streaming writer's match arms are independent of `to_json`'s; + without this coverage, a Phase-4 regression in + `stream_write_source_info_pool` could slip through. +- [x] AnchorRole round-trip test: build a `Generated` with each role + (`Invocation`, `ValueSource`, `Other("ext/foo/bar")`) wrapped in + anchors; serialize through JSON via the writer's code-4 path; + deserialize via the reader's code-4 path; assert the role survives. +- [x] End-to-end production reachability test (kbd-shortcode fixture → + `render_qmd_to_preview_ast` → JSON → `pampa::readers::json::read` + → assert success and recovered shape). Lives in + `crates/pampa/tests/json_reader_smoke_tests.rs`. +- [x] TypeScript-side type round-trip (parse a JSON pool with Generated + entries; confirm `SourceInfoEntry` shape matches; confirm + `entry.d.from ?? []` access pattern works for both absent and + present `from`). + +### Phase 7 — Verification gate + +- [x] `cargo build --workspace` clean. +- [x] `cargo nextest run --workspace --no-fail-fast` all green + (bd-3odjm closed in Phase 1; no other regressions). Use + `--no-fail-fast` so a single regression doesn't hide downstream + green tests — same convention used to close Plan 4. +- [x] `cargo xtask verify` (full — `quarto-core`/`pampa` are WASM + consumers; hub-build leg matters). The WASM rebuild leg will + modify `crates/wasm-quarto-hub-client/Cargo.lock` as a side + effect (separate lockfile from the workspace one); include it + in the commit. Plan 4 hit this and committed it without issue. +- [x] `git grep "FilterProvenance"` returns only legacy-reader / legacy + doc references (no writer emissions, no `SerializableSourceMapping` + variant).
+- [x] Update bd-3odjm: close at the Phase-1 commit (the reader change + that turns `lua_shortcode_lipsum_fixed` green). The close trigger + is the commit itself, not a downstream PR or merge — Plan 5 lands + on the `feature/provenance` integration branch via merge commits, + not a standalone PR, so tying the close to the commit gives the + issue a concrete reference. Refresh its description to use `from:` + not `anchors:` if reopened for any reason. **If Phase 3 or 4 + introduces a *new* failure mode in the lipsum fixture, file a + fresh beads issue** rather than reopening bd-3odjm — that issue is + specifically the code-3 collision and should stay scoped to it. + +## Implementation guidance carried over from Plan 4 + +A few small things came up during Plan 4 that are worth knowing before +starting Plan 5: + +- **`SmallVec::new()` is the construction pattern, not `smallvec![]`.** + Plan 4 uniformly used `SmallVec::<[Anchor; 2]>::new()` for empty + lists, never the `smallvec!` macro. The reader file + `crates/pampa/src/readers/json.rs` does not currently import + `smallvec::smallvec`. Code samples in this plan that show + `smallvec![]` are pseudocode — when implementing, write + `SmallVec::new()` (matches Plan 4's convention, avoids a needless + import). The `SmallVec` type itself needs + `use smallvec::SmallVec;` at the top of the file — Plan 4 added + this to every consumer it touched (`pampa/src/lua/diagnostics.rs`, + `pampa/src/lua/types.rs`); `readers/json.rs` and the writer's + Generated arm (Phase 3) will need it too. + +- **Don't name a local `gen`.** Rust 2024 makes `gen` a reserved + keyword. Plan 4's test code had to rename a `let gen = ...` to + `let generated = ...`. None of Plan 5's code samples currently use + `gen` as an identifier — keep it that way. (Plan 7's prose still + uses `gen.invocation_anchor()` as shorthand; that's pseudocode, not + literal Rust to type.) 
+ +- **Phase boundary "compiles cleanly" semantics.** Plan 4 found that + "each phase compiles cleanly" really means "the directly-touched + crate compiles cleanly" — adding a new `SourceInfo` variant + immediately broke `match` exhaustiveness across ~10 crates, and the + workspace stayed red between Plan-4 Phase 1 and Phase 5. Plan 5's + Phase 1 → 2 → 3+4 ordering above explicitly avoids this trap (each + phase leaves the workspace green); the *atomic* Phase 3+4 squash is + the only place where you have to land more than one commit's worth + of code in a single push. + +- **`cargo xtask verify --skip-rust-tests` is a useful intermediate.** + Plan 4 ran `cargo nextest run --workspace --no-fail-fast` first + (confirms only bd-3odjm is red), then `cargo xtask verify + --skip-rust-tests` (confirms the WASM/hub-client legs are green + without re-running the same Rust tests). Plan 5 should follow the + same split for the final verification gate. ## Open questions for implementation -- **Eventually retiring code 3**: at some point, no JSON files in the wild - contain code 3 (the buggy FilterProvenance shape never round-tripped before - Plan 5; the legacy Transformed shape predates a transition we made earlier). - Could remove the legacy reader. Don't need to decide now. -- **Detecting malformed code 4/5 payloads**: if shape doesn't match - expectation, error with `MalformedSourceInfoPool`. Confirm the exact - error variant for each malformation. -- **Streaming writer parity** (`stream_write_custom_block` and the streaming - source-info-pool writer): both writer paths need updating. Today both have - the same code-3 → FilterProvenance shape — the bug applies to both. - Update both to emit code 4 for Synthetic and code 5 for Derived. 
-- **Pool deduplication of Derived `from` references**: when many Derived - source_infos share the same `from` (e.g., a multi-inline shortcode - resolution where every resolved inline points at the same shortcode - token), the writer should intern `from` once and reuse the ID. The - existing `arc_parent_ids` HashMap pattern (used for `Substring.parent`) - applies here. +- **Eventually retiring code 3**: at some point, no JSON files in the + wild contain code 3 (the buggy FilterProvenance shape never + round-tripped before Plan 5; the legacy Transformed shape predates a + transition we made earlier). Could remove the legacy reader. Don't + need to decide now. +- **Detecting malformed code 4 payloads**: settled in Phase 2 of + §"Work items" — `MalformedSourceInfoPool` for missing `by`, missing + `by.kind`, `from` not an array, `from` entry not an object, `from` + entry missing `role`/`si_id`, unrecognized role string, and empty + `Other("")` suffix. +- **Streaming writer parity** (`stream_write_source_info_pool`): settled + in Phase 4 of §"Work items" — atomic with Phase 3 (writer code-4 emit). +- **Pool deduplication of anchor `si_id` references**: when many + Generated entries share the same anchor target (multi-inline + shortcode), the writer interns once and reuses the ID. The existing + `arc_parent_ids` HashMap pattern (already used for `Substring.parent`) + handles this — same interning mechanism, different reader-side name + (`si_id` for anchors, `parent_id` for substrings). This is a + **writer-side optimization only** — deserialization rebuilds each + anchor with a fresh `Arc`, so pool-size is not stable over + read-write-read. AST content and Plan-3 hashes (which exclude + `source_info`) are stable. See [[anchor-dedup-invariant]] in §"Risk + areas". +- **TypeScript hand-mirror updates**: see §"TypeScript wire-format + definitions" above. Settled — code 4's `d` becomes `{ by; from? 
}`, + code 5 is removed, code 3's `d` becomes a union for the dual-shape + legacy reader, `ATOMIC_SYNTHETIC_KINDS` renames to `ATOMIC_KINDS` + with the Plan-4 atomic set populated. The companion test file + `utils/sourceInfo.test.ts` is rewritten in lockstep — see Phase 5. +- **Writer JSON-build style**: hand-build via `json!` macro, matching + the existing convention throughout `writers/json.rs`. Not derive-based. + Settled. +- **`By::kind` canonical enumeration**: see Plan 4's `By::` builders + (`filter`, `sectionize`, `user_edit`, `shortcode`, `include`, + `title_block`, `footnotes`, `appendix`, `tree_sitter_postprocess`, + `raw`) for the full set. Plan 5 emits whatever `by.kind` string is + present, kebab-case throughout. Atomic-kind list mirrors + `By::is_atomic_kind` (`filter | shortcode | title-block | + tree-sitter-postprocess`). Cross-plan invariant — no Plan-5-owned + decision here. ## References -- `crates/pampa/src/writers/json.rs:80` — type code comment. -- `crates/pampa/src/writers/json.rs:132-155` — `SerializableSourceInfo::to_json`. -- `crates/pampa/src/writers/json.rs:145-148` — current FilterProvenance → - code 3 emit (the buggy line). -- `crates/pampa/src/writers/json.rs:225-298` — full SerializableSourceInfo - enum and conversion. -- `crates/pampa/src/readers/json.rs:155-290` — pool reader; the code-3 - branch is at line 252. -- `crates/quarto-source-map/src/source_info.rs:22-55` — SourceInfo enum - (extended by Plan 4). +(Line numbers as of `feature/provenance` @ 4c465768. Plan 4's migration +will shift these; refresh before implementing.) + +- `crates/pampa/src/writers/json.rs:115` — `SourceInfoJson.t` field + comment, currently `"0=Original, 1=Substring, 2=Concat, 3=FilterProvenance"`. + Plan 5 extends the legend to include `4=Generated` and notes code 3 + as legacy reader only. +- `crates/pampa/src/writers/json.rs:160-190` — `SerializableSourceInfo` + struct and `to_json` method. Code-3 emit at lines 180-182 (the bug). 
+- `crates/pampa/src/writers/json.rs:193-207` — `SerializableSourceMapping` + enum (Original/Substring/Concat/FilterProvenance arms). Phase 3 adds + a `Generated` arm and removes `FilterProvenance`. +- `crates/pampa/src/writers/json.rs:260-333` — `SourceInfoSerializer::intern`; + Phase 3 adds a `SourceInfo::Generated` arm with topologically-ordered + anchor recursion. +- `crates/pampa/src/writers/json.rs:3482-3532` — `stream_write_source_info_pool`; + Phase 4 mirrors the to_json changes here (lines 3509-3514 emit, line + 3526 tag). +- `crates/pampa/src/readers/json.rs:99-293` — `SourceInfoDeserializer::new` + (the pool reader). Code-3 arm at lines 252-283 (Phase 1 rewrites); + Phase 2 adds a code-4 arm. +- `crates/quarto-source-map/src/source_info.rs:21-55` — `SourceInfo` enum + (extended by Plan 4 — confirm Generated/By/Anchor/AnchorRole present + before Plan 5 starts; see Phase 0). +- `ts-packages/preview-renderer/src/types/sourceInfo.ts` — JS-side + `SourceInfoEntry`. See §"TypeScript wire-format definitions" for the + full before/after. +- `ts-packages/preview-renderer/src/utils/sourceInfo.ts` — JS-side + helpers (`isAtomicSourceInfo`, etc.); needs adjustment for the new + shape per Plan 4 / Plan 7. ## Test plan +(Hand-written tests; the repo doesn't use proptest in this area. See +Phase 6 for test-file placement and per-phase landing.) + - **Round-trip property test**: for each variant (Original, Substring, - Concat, Synthetic, Derived with various By kinds), build a `SourceInfo`, - serialize to JSON, deserialize, assert equality. Cover the full enum. -- **Filter-provenance recovery test**: hand-construct a JSON pool entry with - the buggy code-3-with-string-array-payload shape. Read it. Assert the - reader produces `Synthetic { by: By::filter(...) }` with the right path/line. -- **Legacy Transformed back-compat test**: hand-construct a JSON pool entry - with code-3-with-numeric-array-payload (the legacy Transformed shape). 
- Assert the reader still produces a `Substring` (preserving today's - back-compat behavior). -- **Forward-compat test**: hand-construct a JSON pool entry with code 4 and - an unknown kind (`"kind": "ext/future/foo"`, arbitrary data). Assert it - decodes as `Synthetic { by: By { kind: "ext/future/foo", data: ... } }`. - Round-trips unchanged. Same test for code 5. -- **Derived dedup test**: build an AST where multiple inlines have Derived - source_info sharing the same `from`. Serialize. Confirm the pool contains - the `from` Original entry exactly once and each Derived entry references - it by ID (rather than re-encoding the Original each time). -- **End-to-end with Plan 4**: build an AST containing Synthetic-tagged AND - Derived-tagged nodes, serialize to JSON via the existing JSON writer, - deserialize via the reader, assert structural equality. + Concat, Generated with various By kinds and anchor configurations), + build a `SourceInfo`, serialize to JSON, deserialize, assert + equality. Cover the full enum. +- **Concat-of-Generated round-trip**: a `Concat { pieces }` whose + pieces' `source_info` is `Generated` (the shape produced by coalesced + filter-emitted spans). Serialize → deserialize → assert structural + equality. Closes a coverage gap not exercised by the per-variant + property test above. +- **Substring-of-Generated round-trip**: a + `Substring { parent: Arc::new(Generated { … }), … }`. + `Substring.parent: Arc<SourceInfo>` is structurally unrestricted, so + this shape can arise whenever a transform produces a span and a + downstream coalesce or slice carves a substring out of it. The + serializer's `Substring` arm interns the parent recursively, which + routes through the new code-4 arm; the reader's `Substring` arm then + reads the parent_id back. Round-trip the construction and assert + structural equality. +- **Filter-provenance recovery test**: hand-construct a JSON pool entry + with the buggy code-3-with-string-array-payload shape. Read it. 
+ Assert the reader produces `Generated { by: filter, from: smallvec![] }` + with the right path/line via `by.as_filter()`. +- **Strict code-3 rejection**: hand-construct `[path]` (missing line) + and `[path, "not-a-number"]` (non-numeric line); assert both + → `MalformedSourceInfoPool`. Guards the no-`unwrap_or(0)` rule. +- **Legacy Transformed back-compat test**: hand-construct a JSON pool + entry with code-3-with-numeric-array-payload (the legacy Transformed + shape). Assert the reader still produces a `Substring` (preserving + today's back-compat behavior). +- **Forward-compat test**: hand-construct a JSON pool entry with code 4 + and an unknown kind (`"kind": "ext/future/foo"`, arbitrary data). + Assert it decodes as `Generated { by: By { kind: "ext/future/foo", + data: ... }, from: smallvec![] }`. Round-trips unchanged. +- **Strict code-4 rejection**: missing `by`, missing `by.kind`, `from` + present but not an array, `from` entry not an object, `from` entry + missing `role`/`si_id`, unrecognized role string, and role string + `"other:"` (empty `Other` suffix) → all `MalformedSourceInfoPool`. +- **Anchor dedup test (writer-side only)**: build an AST where N + inlines carry Generated source_info each with an `Invocation` anchor + wrapping `Arc::clone(&shared)`. Serialize. Confirm the pool contains + the shared target exactly once and each Generated entry's + `from[0].si_id` references it by ID. *Read-side note:* deserialization + rebuilds each anchor with a fresh `Arc`; this test only verifies the + write-time optimization (see [[anchor-dedup-invariant]] in §"Risk + areas"). Test passes Plan-5-alone (no shortcode resolver needed). +- **Streaming-writer parity test**: implement helper + `roundtrip_via_stream(ast) -> ast` that streams the AST via + `stream_write_pandoc` into a `Vec<u8>` and reads back through + `pampa::readers::json::read`. Run a representative Generated-bearing + AST through it; assert equality. 
The streaming writer's match arms + are independent of `to_json`'s, so a Phase-4 regression could + otherwise slip through. +- **AnchorRole round-trip test**: build a `Generated` with each role + (`Invocation`, `ValueSource`, `Other("ext/foo/bar")`) wrapped in + anchors; serialize through JSON via the writer's code-4 path; + deserialize via the reader's code-4 path; assert the role survives. +- **Live regression test already on the integration branch:** + `cargo nextest run -p quarto-core --test idempotence lua_shortcode_lipsum_fixed` + (filed as **bd-3odjm**; see §"Inherited failure that must close on + Plan 5's first reader change (bd-3odjm)" above). This is the + fastest first-iteration smoke check: it drives a real pipeline + a + real shortcode + a real JSON round-trip + the existing Plan-3 + hashing harness, and goes red until Plan 5 fixes the code-3 + collision. Run it before the hand-constructed tests below. +- **End-to-end production reachability test** (additional regression + guard for the bug Plan 5 fixes — current main would fail this test + as soon as the JSON round-trip is exercised on a Lua-shortcode-bearing + document): + 1. Build a fixture using `{{< kbd Ctrl+C >}}` (the kbd extension's + `kbd.lua` calls `pandoc.Span(...)`, which the Lua machinery's + `filter_source_info` auto-attach tags with FilterProvenance / + post-Plan-4 `Generated { by: filter, ... }`). + 2. Run it through `render_qmd_to_preview_ast` (or the equivalent + production path that drives the JSON writer with + filter-constructed nodes in the AST). + 3. Take the resulting JSON, feed it back through + `pampa::readers::json::read`. + 4. Assert the round-trip succeeds (no `MalformedSourceInfoPool` + error) AND the recovered source_info is `Generated { by: + shortcode, from: [Invocation -> ...] }` after Plan 6's + post-walk has stamped it. 
(If running Plan 5 alone — before + Plan 6 lands — the recovered shape is `Generated { by: filter, + from: [] }` with `(filter_path, line)` in `by.data`; the + round-trip still succeeds.) + + This is distinct from the hand-constructed "Filter-provenance + recovery test" above. That test exercises the legacy code-3 reader + in isolation; this one drives a real pipeline + JSON writer + reader + to verify the bug-fix holds end-to-end against a production-shaped + path. Without Plan 5, the round-trip on step 3 errors out + (`MalformedSourceInfoPool` from the code-3-as-Transformed + misinterpretation) on any document whose shortcode-resolution path + hits a Lua handler. +- **End-to-end with Plan 4**: build an AST containing both + no-anchor and with-anchor Generated nodes, serialize to JSON via the + existing JSON writer, deserialize via the reader, assert structural + equality. +- **TypeScript-side type round-trip**: hub-client / preview-renderer + test parses a JSON pool with Generated entries and confirms its + `SourceInfoEntry` shape matches. ## Dependencies -- Depends on: Plan 4 (Synthetic + Derived variants + By struct). -- Blocks: Plans 6, 7, 8 (they all rely on the new variants round-tripping +- Depends on: Plan 4 (Generated variant + By + Anchor + AnchorRole). +- Blocks: Plans 6, 7, 8 (they all rely on Generated round-tripping through JSON). ## Risk areas -- **Streaming writer code path**: there are two writer paths in `json.rs` - (`write_custom_block` non-streaming and `stream_write_custom_block` - streaming). Both have the same source-info-pool emission logic. Both need - updating. Easy to forget the streaming variant. +- **Streaming writer code path**: source-info-pool emission lives in + two functions in `crates/pampa/src/writers/json.rs`: + `SerializableSourceInfo::to_json` (used by the non-streaming + `write_pandoc` at line 1657) and `stream_write_source_info_pool` + (called from `stream_write_pandoc` at line 3530). 
Both consume the + same `SerializableSourceMapping` enum but inline their own match + arms. Compiler exhaustiveness catches missed arms after Phase 3's + enum change — a deliberate safety property, and the reason Phases 3 + and 4 must land atomically. The named-but-unrelated pair + `write_custom_block` / `stream_write_custom_block` handles + `CustomNode` blocks, not the pool; don't confuse them. - **Pool ID stability**: changing the format of pool entries shouldn't affect their IDs (which are sequential by intern order). Verify. +- **Anchor dedup is a writer-side + optimization, not a round-trip-stable property.** The writer's + `arc_parent_ids` HashMap is keyed by `Arc::as_ptr`; multiple anchors + pointing to the same `Arc<SourceInfo>` collapse to one pool entry. + After deserialization, each anchor gets a freshly-allocated `Arc` + carrying a `clone` of the pool target, so a subsequent re-serialize + materializes N copies. **Pool-size is not stable over read-write-read; + AST content and Plan-3 hashes are.** Plan-3's idempotence harness + hashes `doc.ast.blocks` / `doc.ast.meta` via `compute_block_hash_fresh` + / `compute_meta_hash_fresh_excluding_rendered`, both of which + explicitly skip `source_info` (see + `claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md` + §"Goal" — *"skips `source_info` and `key_source`"*). Same contract as + today's `Substring.parent` reads. The reader-side `Arc::new(si)` + pattern in the new code-4 arm matches the existing Substring arm at + `readers/json.rs:196-200`, which also calls `Arc::new(pool.get(parent_id).cloned()?)` + on every read — no sharing on the read side, by design. +- **Acyclic-by-construction assumption.** `SourceInfo` graphs are + acyclic by construction — transforms build bottom-up, `Arc<SourceInfo>` + is immutable post-construction. The writer's recursive interning + relies on this invariant — same precondition as today's + Substring/Concat arms. No cycle detection in the reader either. 
+- **Recursion depth.** Anchor interning adds a third recursion path on + top of Substring chains and Concat pieces. Production depth is + bounded by AST depth (shallow in practice); no separate guard. + Adversarial input could blow the stack, but that's no different from + the existing Substring-chain recursion — out of scope for Plan 5. - **Old JSON files**: anyone with on-disk JSON snapshots of ASTs (test - fixtures, debug exports) generated by current writers will have code 3 - with the buggy shape. Plan 5's reader handles them. New writes emit code 4. + fixtures, debug exports) generated by current writers will have code + 3 with the buggy shape. Plan 5's reader handles them. New writes emit + code 4. +- **Coexistence with attribution wire fields in the same file**: the + attribution work (already shipped) added `astContext.attribution` + and `attributionActors` near the source-info pool emission in + `crates/pampa/src/writers/json.rs`. Plan 5 touches different + conditional branches of the same writer file but no semantic + conflict — `astContext.attribution` records reference source-info + pool IDs unchanged; new code-4 entries are valid `s` targets just as + Original entries are. ## Estimated scope | Component | Lines (rough) | |---|---| -| Code 4 writer + reader | ~50 | -| Code 5 writer + reader (with `from` interning) | ~60 | -| Code 3 dual-shape reader | ~30 | -| Streaming writer parity | ~30 | -| Tests | ~180 | -| **Total** | **~350** | +| Code 4 writer (with anchor interning) | ~80 | +| Code 4 reader (with anchor decoding) | ~70 | +| Code 3 dual-shape legacy reader | ~35 | +| `AnchorRole` ↔ string serialization | ~20 | +| Streaming writer parity | ~40 | +| TypeScript type + utils updates | ~30 | +| Tests (incl. strict-rejection + stream helper + Concat-of-Generated) | ~290 | +| **Total** | **~565** | One focused session. ## Notes -The bug-fix opportunity is real: this plan makes things work that have been -silently latent. 
Worth a clear callout in the implementation commit message: -"This change fixes a latent bug where FilterProvenance values written by -the JSON writer could not be read back. Production code never tripped this -because no production path produced FilterProvenance in the AST that crossed -the JSON boundary." +The bug-fix opportunity is real and now reachable in production: this +change makes things work that have been silently latent. Worth a clear +callout in the implementation commit message: + +> This change fixes a latent bug where `FilterProvenance` values written +> by the JSON writer could not be read back. Production code never +> tripped this in current main because no production path produced +> FilterProvenance in an AST that crossed the JSON boundary — but +> Plans 1–2 shipped the q2-preview pipeline that runs filters whose +> output does cross that boundary. Plan 5's reader recovers the +> `Generated { by: filter, ... }` shape from the buggy code-3 payload, +> closing the gap. + +The single-code-4 design (no separate code 5) is the result of +unifying `Synthetic` + `Derived` into `Generated` during the 2026-05-20 +design discussion. Code 5 is left unassigned, free for future +reservation. + +**`r: [0, 0]` for Generated entries during the Plan-5↔Plan-7 window.** +After Plan 5 ships, all `Generated` pool entries carry `r: [0, 0]` — +the per-entry range field is no longer the right accessor for +Generated; use `resolve_byte_range` (via the Invocation anchor) for +chain-resolved ranges. Any diagnostic UI (q2-debug, hub-client devtools) +that reads `r` directly will see uninformative zeros for these entries. +This is a long-lived integration branch and the same developer is +implementing all of Plans 5–7, so the surprise window is local; once +Plan 7's `preimage_in` lands, the standard accessor pattern reaches +through Generated correctly. No external consumers need warning. 
diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-6-provenance-audit.md b/claude-notes/plans/2026-05-04-q2-preview-plan-6-provenance-audit.md index 3f1b84cfd..f6ddbbba8 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-6-provenance-audit.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-6-provenance-audit.md @@ -1,178 +1,875 @@ -# Plan 6 — Provenance audit (Derived for shortcodes, Synthetic for synthesizers) +# Plan 6 — Provenance audit (Generated for synthesizers, anchors for shortcodes) -**Date:** 2026-05-04 +**Date:** 2026-05-04 (revised 2026-05-20, review pass 2026-05-22) **Branch:** feature/q2-preview -**Status:** Implementation plan (open questions named) +**Status:** Implementation plan (review-pass edits applied; theorem +attr_source question closed) **Milestone:** none directly — completes the AST shape Plans 7/8 rely on +## Epic context + +Part of the **provenance epic** (Plans 3–8). Plan 6 is the audit pass +that converts every transform's `SourceInfo::default()` emission into +the correct `Generated { by, from }` shape Plan 4 defines, and +attaches `Invocation` anchors uniformly to all shortcode resolutions. +The file name keeps its q2-preview-plan-N form for continuity with the +earlier discussion notes. + +## Work items checklist + +Implementation order. The plan body (Scope / Implementation notes / Test plan) +holds the design details; this list is the work-tracking surface. + +### Phase 0 — prerequisite +- [x] Add `Inline::source_info_mut` (~33 LOC) + `Block::source_info_mut` + (~24 LOC) accessors in `quarto-pandoc-types`, with round-trip unit tests + for one representative variant of each. + +### Audit +- [x] Comprehensive grep + categorize `SourceInfo::default()` sites in + `crates/quarto-core/src/transforms/` and `crates/pampa/src/`. + (Report: `claude-notes/research/2026-05-22-plan-6-audit.md`. + Follow-ups: bd-12vrr callout default-title, bd-1inj0 code-block + chrome.) 
+- [x] Document the positional-alignment invariant on `AttrSourceInfo.attributes` + (`crates/quarto-pandoc-types/src/attr.rs:31`). + +### Stamper + dispatch funnel +- [x] Implement `stamp_shortcode_anchors` + mutable AST walkers in + `shortcode_resolve.rs` (model on existing `recurse_inline` / + `resolve_block`). +- [x] Wire the stamper into `resolve_shortcode`'s dispatch funnel so every + Rust / Lua / extension dispatch is post-walked. +- [x] Thread `shortcode_owned.source_info` into `make_error_inline` and + `shortcode_to_literal` from their four call sites. + +### Synthesizer fixes +- [x] `TitleBlockTransform`: emit `Generated { by: By::title_block(), from: [] }` + on the synthesized h1. +- [x] `SectionizeTransform`: emit `Generated { by: By::sectionize(), from: [] }` + on the synthetic Section Div (both close-on-stack and end-of-input sites). +- [x] `FootnotesTransform`: emit `Generated { by: By::footnotes(), from: [] }` + on the container Div. +- [x] `AppendixStructureTransform`: emit `Generated { by: By::appendix(), from: [] }` + on the container Div, bibliography wrapper, license/copyright/citation + helpers (all 5 sites — the helpers were not enumerated in the plan body + but are structurally identical synthesizers; see audit report §"Decisions + on plan-adjacent sites"). +- [x] `theorem.rs::extract_name_attr` + `proof.rs::extract_name_attr`: + thread `&div.attr_source` through; index before `kvs.remove("name")`; + fall back on length-mismatch. **Implementation note**: the + `debug_assert_eq!` form the plan body suggested is too strict — it + fires on the common test pattern of `AttrSourceInfo::empty()` plus a + non-empty `kvs`. Relaxed to `debug_assert!(attr_source.attributes. + is_empty() || kvs.len() == attr_source.attributes.len(), ...)`. The + empty case is "no provenance" (not a bug); only populated-but- + misaligned input is a bd-3aolj/bd-1e6a5 sync error. 
+- [x] `pampa::pandoc::treesitter_utils::postprocess` synthetic Space + (~line 1348): emit `Generated { by: By::tree_sitter_postprocess(), from: [] }`. + +### Tests +- [x] Shortcode required-anchor invariant + (`shortcode_resolution_required_anchor_invariant` — every + `by:shortcode` carries an Invocation). +- [x] Per-transform fix tests (sectionize / title_block / footnotes / + appendix — shape test in each transform's own test module). +- [x] Lua-shortcode enrichment test + (`lua_shortcode_typed_return_enriched_to_shortcode_kind` — typed Lua + return promoted from `by:filter` → `by:shortcode`, `filter_path` / + `line` migrated into `by.data.lua_path` / `by.data.lua_line`, + Invocation appended). +- [x] Multi-inline shortcode anchor test + (`multi_inline_shortcode_resolution_shares_invocation_source` — + Strong[Str], Space, Str all share the same Invocation source_info). +- [x] Escaped-shortcode regression test + (`escaped_shortcode_keeps_original_source_info`). +- [x] Error-inline regression test + (`unknown_shortcode_error_uses_token_source_info` — both Strong + Str + layers carry the token's Original source_info, not Default or + Generated). The earlier `test_make_error_inline` unit test was also + updated to assert the threaded shape. +- [x] `source_info` determinism test + (`shortcode_resolution_is_deterministic` — two runs produce + structurally-equal ASTs, including all `Generated.by` / + `Generated.from[]` / Original byte ranges). +- [ ] Audit-completion test across the full pipeline (no + `SourceInfo::default()` survives across all transforms). Deferred — + the required-anchor invariant + per-transform shape tests cover the + same property piecemeal; a pipeline-level audit would belong in the + e2e test crate alongside Plan 3's idempotence fixtures and is + better wired in there. Open follow-up. +- [ ] Attribution interaction test (multi-author latest-wins via + `query_byte_range`). 
Deferred — needs `GitBlameProvider` setup; the + attribution chain is mechanically covered by Plan 4's + `resolve_byte_range` (Generated → Invocation → Original) and Plan 6 + doesn't change the chain. Open follow-up. +- [ ] Error + escaped round-trip test (incremental writer + verbatim-copies). Deferred to Plan 7 (writer infrastructure). +- [ ] Shortcode-inside-include composition test (Invocation anchor + `file_id != 0`). Deferred to Plan 8 (include wrapper introduces the + cross-file context). +- [ ] Plan 3 idempotence test rerun (no new non-determinism). Verified + by `cargo nextest run --workspace` — all 9460 tests pass, including + Plan 3's idempotence fixtures. + +### Verification +- [x] `cargo xtask verify` — all 12 steps green: workspace build, + workspace tests (9460 passed, 196 skipped), lint, format, WASM build, + hub-client build, hub-client tests, q2-preview-spa build. +- [x] End-to-end exercise. Invocation: + ``` + target/debug/q2 render /tmp/plan6-e2e/doc.qmd + ``` + Fixture: a `.qmd` with `title:` (drives title-block), two `## ` + headers (drive sectionize), a footnote `^[…]` (drives footnotes + transform + appendix container), and a `{{< meta title >}}` + shortcode (drives the resolver + stamper). Observed HTML + (inspected, snippet preserved): + ```html + Plan 6 E2E +
+

Plan 6 E2E

+
+

A section

+

Body text. … A meta lookup: Plan 6 E2E.

+ … +
+
+ ``` Title-block h1 synthesized; both sections wrapped by sectionize; + meta shortcode resolved to its value; footnote container Div + + appendix container Div both emitted. Plan 6's source_info shape is + not visible in HTML, but it's covered by the per-transform shape + tests (Tests section above) and by the workspace test suite. + ## Goal Audit every transform that emits `SourceInfo::default()` (a meaningless -zero-range Original) and fix it to emit correct provenance. Two patterns -apply: +zero-range Original) and fix it to emit correct provenance. Two +patterns apply: - **Transforms that genuinely synthesize content with no source preimage** (Sectionize's section Divs, TitleBlock's synthesized h1, etc.): emit - `Synthetic { by: By::<kind>() }` from Plan 4. -- **The shortcode resolver, specifically**: emit `Derived { from: - ctx.source_info, by: By::shortcode(name) }` on resolved nodes. The - `Derived` provenance preserves the shortcode token's byte range AND - marks the resolved content as atomic for the writer (Plan 7 detects - Derived + UseAfter as AtomicViolation). - -This plan does NOT introduce a `CustomNode("ShortcodeResolution")` wrapper + `Generated { by: By::<kind>(), from: smallvec![] }` from Plan 4. +- **The shortcode resolver, uniformly**: emit `Generated { by: By::shortcode(name), + from: smallvec![Anchor::invocation(token_si)] }` on every resolved + node, regardless of whether the handler is Rust-built-in or + Lua-implemented. The `Invocation` anchor's `source_info` is the + shortcode token's range; Plan 7's writer uses it for Verbatim-copy + on KeepBefore; attribution chains through it via `resolve_byte_range`. + +The earlier `Derived` variant proposal collapsed into `Generated` with +an `Invocation` anchor during the 2026-05-20 design discussion; this +plan reflects the unified shape. + +Plan 6 does NOT introduce a `CustomNode("ShortcodeResolution")` wrapper (an earlier draft proposed that; we walked it back). 
Wrappers are appropriate for cases where there's no available source-side anchor in the same file (includes — different FileId — Plan 8 handles those). For shortcodes the resolved nodes can carry source_info pointing into the -parent file directly, which is much lighter than wrapping. +parent file directly via the typed `Invocation` anchor. + +## Prerequisite — Phase 0: mutable accessors on Inline / Block + +Plan 6's `stamp_shortcode_anchors` helper (see "The post-walk helper" +below) takes `&mut Inline` / `&mut Block` and rewrites the +`source_info` field. Today `crates/quarto-pandoc-types/src/inline.rs:57` +defines only `pub fn source_info(&self) -> &SourceInfo` (immutable); +Plan 4 does not add a mutable counterpart. Every existing site that +mutates `source_info` in the workspace holds a *typed* reference +(`&mut Str`, `&mut CodeBlock`, …) and assigns the public field +directly — there is no generic `&mut Inline -> &mut SourceInfo` +accessor. + +**Before any stamping code can compile**, add to +`crates/quarto-pandoc-types/src/inline.rs` and `block.rs`: + +```rust +impl Inline { + pub fn source_info_mut(&mut self) -> &mut quarto_source_map::SourceInfo { + match self { + Inline::Str(s) => &mut s.source_info, + // ... 28 variants, mechanical mirror of `source_info(&self)` + } + } +} + +impl Block { + pub fn source_info_mut(&mut self) -> &mut quarto_source_map::SourceInfo { + match self { + Block::Plain(p) => &mut p.source_info, + // ... 18 variants, mechanical mirror of `source_info(&self)` + } + } +} +``` + +Pure mechanical mirror of the existing read accessors — ~33 LOC for +`Inline` + ~24 LOC for `Block`. Add a unit test that round-trips a +mutation through the accessor on one representative variant of each. 
## Scope ### In scope -For each transform that currently emits `SourceInfo::default()`, replace with -the correct provenance: +For each transform that currently emits `SourceInfo::default()`, replace +with the correct provenance: - **`ShortcodeResolveTransform`** (`crates/quarto-core/src/transforms/shortcode_resolve.rs`): - Currently emits `SourceInfo::default()` on every resolved Str/Inline (lines - 172, 179, 186, etc.). Fix: emit `Derived { from: Arc::new(ctx.source_info.clone()), - by: By::shortcode(shortcode_name) }` on each resolved node. The `from` - is the shortcode token's range (an Original from `ctx.source_info`). - All resolved nodes in a multi-inline resolution share the same `from`, - enabling Plan 7's dedupe rule. + Currently emits `SourceInfo::default()` on 12 production sites (see + References for the per-line breakdown). **Fix the dispatch funnel + uniformly via a post-walk helper**: immediately after every handler + dispatch (Rust handler OR Lua-engine dispatch OR extension + dispatch), walk the returned nodes and stamp + `Generated { by: By::shortcode(name), from: smallvec![Anchor::invocation(Arc::new(ctx.source_info.clone()))] }` + on each block/inline. + - The post-walk **enriches**, not overrides: any `by.data` fields the + Lua machinery attached (`filter_path`, `line` — Plan 4's filter + `by.data` shape) are preserved by promoting the kind from + `filter` to `shortcode`, renaming to `lua_path` / `lua_line` in + `by.data` to reflect the new context. See "Lua-shortcode + enrichment" below. + - The post-walk recurses into nested blocks/inlines (model on + `recurse_inline` / `resolve_block` in this file) so every node in + the dispatch output gets the anchor. + - **Two outlier sites do NOT pass through the dispatch funnel** and + need call-site source_info threading instead of the stamper: + - `make_error_inline` (lines 1030-1038): visible `?key` Str + + Strong wrapper for unknown shortcodes. Today both layers carry + `SourceInfo::default()`. 
Fix: pass `shortcode_owned.source_info` + through from call sites at lines 659 and 914, and use it as the + Str/Strong's `source_info` (an `Original` pointing at the + shortcode token's bytes — same shape Plan 6's + audit-completion test expects). **Atomicity intent**: the error + region is treated as normal editable user-source content (NOT + atomic). If the user edits `?meta:bad` in React, the bytes + change in the source qmd via the verbatim-copy path. Plan 7's + `is_atomic_kind()` does not fire because the source_info is + Original, not Generated. The Strong-wraps-Str overlap (both + layers carry the same range) is structurally parallel to the + footnote `` case Plan 7:261-267 already documents. + - `shortcode_to_literal` (lines 1043-1109): the literal-text Str + produced for escaped `{{}}` shortcodes. Today it emits + `SourceInfo::default()`. Fix: pass `shortcode_owned.source_info` + through from call sites at lines 665 and 920, and use it as the + Str's `source_info`. This is required to satisfy the + "Escaped-shortcode regression test" (line 453: "its source_info + stays Original (not Generated)") — without this fix, the + regression test would fail on Plan 6's own implementation. - **`TitleBlockTransform`** (line 183-185): synthesizes a level-1 Header - from `title:` metadata. Fix: emit `Synthetic { by: By::title_block() }` + from `title:` metadata. Fix: emit `Generated { by: By::title_block(), from: smallvec![] }` on the synthesized Header (and any nested Inlines). Note: q2-preview - skips this transform (Plan 1), but the audit covers the HTML pipeline too. + skips this transform (Plan 1), but the audit covers the HTML + pipeline too. - **`SectionizeTransform`** (`pampa/src/transforms/sectionize.rs:96, 148`): - the synthetic Section Div. Fix: `Synthetic { by: By::sectionize() }`. + the synthetic Section Div. Fix: `Generated { by: By::sectionize(), from: smallvec![] }`. The wrapped Header retains its original source_info. Body blocks retain theirs. 
-- **`FootnotesTransform`**: the synthesized footnotes container Div. Fix: - `Synthetic { by: By::footnotes() }`. q2-preview skips, but audit covers - HTML pipeline. (Confirm scope during implementation; investigate whether - any *inline* nodes need fixing.) +- **`FootnotesTransform`**: the synthesized footnotes container Div. + Fix: `Generated { by: By::footnotes(), from: smallvec![] }`. The + synthesized `` markers are already source-mapped via + `create_footnote_ref` cloning from the original `Note` inline (so + they stay Original — no change needed). The four synthesized inline + layers (Span/Superscript/Link/Str) all carry the same range, + producing a multi-node overlap; Plan 7:261-267 documents that this + is round-trip-friendly without extra writer work (block-level + Verbatim of the surrounding Para covers it). q2-preview pipeline + runs this transform (per Plan 2B's audit); the audit applies to + both pipelines. - **`AppendixStructureTransform`**: the synthetic appendix container Div. - Fix: `Synthetic { by: By::appendix() }`. Same scope note as Footnotes. -- **`theorem.rs::extract_name_attr`** (line 313): the title Str extracted - from `name="..."` attribute is built with `SourceInfo::default()`. Fix: - use the attr value's source_info (currently lost — inspection needed for - whether `attr_source` carries this info). At minimum, `Synthetic { by: - By::raw("theorem-title-attr", json!({})) }` if we can't recover it, but - better to preserve the actual source position from the attr-source. + Fix: `Generated { by: By::appendix(), from: smallvec![] }`. Same scope + note as Footnotes. +- **`theorem.rs::extract_name_attr`** (line 313) **and the parallel + `proof.rs::extract_name_attr`** (line 167): the title Str extracted + from `name="..."` is currently built with `SourceInfo::default()`. 
+ Fix: thread `&div.attr_source` into `extract_name_attr` in both + files; index by `kvs.keys().position(|k| k == "name")` *before* the + `remove`; use `attr_source.attributes[idx].1` (an + `Option<SourceInfo>` carrying the parser-recorded + `Original{file_id, value_start, value_end}` for the attribute + value's bytes) as the Str's `source_info`. Falls back to + `SourceInfo::default()` only when the Option is `None` (e.g. JSON + read from external Pandoc producers that don't emit `attrS`) OR + when length-alignment fails (see safeguards below). The parser + populates the value range at + `crates/pampa/src/pandoc/treesitter.rs:1075-1107` → + `treesitter_utils/commonmark_attribute.rs:38-50`; no parser-side + prerequisite is needed. + + **Positional-alignment safeguards** (review-pass 2026-05-22): the + fix relies on the invariant *"`AttrSourceInfo.attributes[i]` is the + `(key_src, val_src)` for the i-th entry in `Attr.2`'s insertion + order."* This invariant holds in the parser's main path but **is + not documented and is broken in two preexisting code paths** + (duplicate-key handling in `commonmark_attribute.rs:41-49`; + caption-attr-into-table merge in `section.rs:85-113` and + `postprocess.rs:1483-1496`). Plan 6 therefore: + 1. **Documents the invariant** with a doc-comment on + `AttrSourceInfo.attributes` in `crates/quarto-pandoc-types/src/attr.rs:31`. + 2. **Guards the index in `extract_name_attr`** with a runtime + length check (`if kvs.len() == attr_source.attributes.len()`) + and a `debug_assert_eq!` on lengths. Falls back to + `SourceInfo::default()` when they diverge, so production never + panics on misaligned input. + 3.
**Two follow-up beads tracked** (out-of-band, preexisting bugs): + **bd-3aolj** (duplicate-key handling in + `commonmark_attribute.rs:41-49` — `LinkedHashMap::insert` updates + in place while `attr_source.attributes.push` always appends) and + **bd-1e6a5** (caption-attr-into-table merge in + `section.rs:85-113` / `postprocess.rs:1483-1496` — same root + cause when caption + table keys overlap). Plan 6 does not block + on them; its runtime guard handles the failure mode safely. + 4. Note: `kvs.remove("name")` after the index lookup itself shrinks + `attr.2` by one without touching `attr_source.attributes`. The + surviving `div.attr_source` is then handed to `CustomNode::new` + (`theorem.rs:281`). Downstream consumers of `attr_source` on + that CustomNode see misaligned data. The rest of `convert_div` + does not re-index `attr_source`, so this is harmless locally, + but a future consumer of the constructed CustomNode's + `attr_source` could trip on it. Considered acceptable for v1; + if a future caller indexes, it should use the same guarded + pattern. + + JSON round-trip preserves the value range: `attrS.kvs` serializes + as a positional array of `[key_ref, val_ref]` pairs + (`json.rs:600-633`) and reads back identically (`json.rs:423-508`). + No Plan-5 follow-up needed. - **`pampa::pandoc::treesitter_utils::postprocess`** (line 1348): the "Synthetic Space" inserted to separate citation from suffix. Fix: - `Synthetic { by: By::tree_sitter_postprocess() }`. + `Generated { by: By::tree_sitter_postprocess(), from: smallvec![] }`. The audit pass also looks for any *other* sites emitting -`SourceInfo::default()` that I haven't enumerated. Plan 6 starts with a +`SourceInfo::default()` that aren't enumerated. Plan 6 starts with a comprehensive grep. ### Out of scope -- The `is_atomic_custom_node` registry function (Plan 7 owns it). -- The writer's atomic-violation diagnostic (Plan 7). 
+- The `is_atomic_kind()` predicate and `is_atomic_custom_node` registry + (Plan 7 owns the writer-side atomicity logic). +- The writer's soft-drop / atomic-violation handling (Plan 7). - The writer's multi-inline shortcode dedupe rule (Plan 7). - The `IncludeExpansion` CustomNode wrapper (Plan 8). -- React component for shortcode-resolved inlines (Plan 2B — atomic-aware - `setLocalAst` gating in the dispatcher detects Derived provenance via - Plan 2A's `isAtomicSourceInfo` accessor and renders read-only). +- React component for shortcode-resolved inlines (Plan 2A's framework + atomic gate already handles this via the `isAtomicSourceInfo` + accessor; Plan 4's `is_atomic_kind` set names `shortcode` as atomic). +- **Metadata-loader changes** to record per-key source-info for `meta` + and `var` shortcodes. Files separately; see "ValueSource follow-up" + below. +- **Lua-file registration in `SourceContext`** to enable typed + `Dispatch` anchors. Files separately; see "Dispatch follow-up" + below. - The HTML pipeline doesn't need a "ShortcodeResolutionResolveTransform" - (no wrapper to unwrap). Shortcode-resolved nodes ARE flat inlines/blocks - with Derived source_info; the HTML writer doesn't care about source_info, - it just renders the nodes. Behavior unchanged for HTML. + (no wrapper to unwrap). Shortcode-resolved nodes ARE flat + inlines/blocks with `Generated` source_info; the HTML writer doesn't + care about source_info, it just renders the nodes. Behavior + unchanged for HTML. ## Design decisions (settled in conversation) -- **Most transforms just need to preserve ctx.source_info**. The "audit and - fix" is mostly bug fixes — ctx already has the info; the transforms just - drop it. Mechanical change. -- **Shortcode resolution uses Derived provenance, not a wrapper.** Each - resolved Str/Inline/Block gets `Derived { from: ctx.source_info, by: - By::shortcode(name) }`. 
This preserves the shortcode token's byte range - (via the `from` chain) AND signals to Plan 7's writer that this content - is atomic. Multi-inline resolutions: every resolved node shares the same - `from`, and Plan 7's dedupe rule emits the shortcode token once per group. -- **`Synthetic` provenance for genuine synthesizers**. Sectionize, TitleBlock, - Footnotes, Appendix containers — none of these correspond to source bytes, - so they get `Synthetic { by: By::() }`. +- **Single funnel covers all shortcodes**. The `ShortcodeResolveTransform::resolve_shortcode` + method is the single dispatch point for in-file shortcodes (Rust + built-ins, Lua-loaded extension handlers, extension name lookup). + Plan 6's stamping helper runs once per dispatch, uniformly. All + built-in (`meta`) and Lua-implemented (`kbd`, `lipsum`, `placeholder`, + `version`, `video`) shortcodes get the same treatment. User-extension + shortcodes via Lua: same. `{{< include >}}` is the genuine exception + — handled by `IncludeExpansionStage` (a separate pipeline stage) and + Plan 8's wrapper, not via Generated. +- **Include×shortcode composition is architecturally well-defined.** + `IncludeExpansionStage` runs at the stage layer + (`crates/quarto-core/src/pipeline.rs:258`) before + `AstTransformsStage` (`pipeline.rs:312`), so includes are spliced + flat before any shortcode resolution. Shortcode resolution is + single-pass — `resolve_blocks` advances its index *past* inserted + blocks (`shortcode_resolve.rs:625-677`); returned content is never + re-scanned, so a shortcode emitting the literal text + `"{{< include foo.qmd >}}"` lands as a `Str`, never as a parsed + `Shortcode` (the reverse composition is structurally impossible). + When a shortcode appears *inside* include-spliced content, the + Invocation anchor's `source_info` points into the included file + (different `FileId` than the parent) — this is correct: the token's + bytes live there. 
Plan 8's wrapper carries the parent-file anchor + independently; Plan 7's `preimage_in(parent_file)` returns `None` + for the included children and the wrapper governs verbatim-copy. +- **Enrichment, not override**. The Lua machinery's auto-attach + produces `Generated { by: filter, from: [], by.data: { filter_path, + line } }` (post-Plan-4, per Plan 4 §"by.data shape table" line 590) + for *typed* Inline/Block nodes constructed during a Lua shortcode + dispatch (e.g. `return pandoc.Str(...)`). Bare-string returns + (`return "text"` → `LuaShortcodeResult::Text`) do NOT pass through + `filter_source_info`; they land with `SourceInfo::default()` and + enter the post-walk's fresh-Generated branch directly. The shortcode + resolver's post-walk enriches the filter-attached cases: + - **Appends** an `Invocation` anchor pointing at the shortcode token. + - **Promotes** `by.kind` from `"filter"` to `"shortcode"`, renaming + `filter_path` → `lua_path` and `line` → `lua_line` in `by.data` + (reflecting the new shortcode context) and adding the shortcode + `name`. + The Lua-side dispatch precision is preserved; the shortcode context + layer is added on top. No information is discarded. + + **Scope**: this enrichment fires only from + `ShortcodeResolveTransform::resolve_shortcode`. General Lua filter + dispatches (`UserFiltersStage`) leave `Generated { by: filter, ... }` + intact — that is the steady-state for filter constructions, per + Plan 4 §"Filter constructions become Generated { by: filter, from: + [] }". The post-walk is not wired into the filter stage and should + not be. +- **Most transforms just need to preserve ctx.source_info**. The + "audit and fix" is mostly bug fixes — ctx already has the info; the + transforms just drop it. Mechanical change. +- **Shortcode resolutions use `Generated` + `Invocation` anchor, not a + wrapper.** Each resolved Str/Inline/Block gets `Generated { by: + shortcode(name), from: [Invocation -> Arc::new(ctx.source_info.clone())] }`. 
+ The anchor's source_info is the shortcode token's range (an Original + from `ctx.source_info`). Plan 7's writer uses it for Verbatim-copy + on KeepBefore. Multi-inline resolutions: every resolved node shares + the same anchor's source_info, enabling Plan 7's dedupe rule. +- **Genuine synthesizers use `Generated` with empty anchors**. + Sectionize, TitleBlock, Footnotes, Appendix containers — none of + these correspond to source bytes, so they get + `Generated { by: By::<transform>(), from: smallvec![] }`. Plan 7's coarsen + treats their wrappers as Transparent (recurse into source-bearing + children) or Omit depending on `by.is_atomic_kind()`. - **No `atomic` flag needed**. Plan 7's atomic-violation logic detects - atomicity via `Derived` source_info on any node, OR via the - `is_atomic_custom_node` registry for CustomNode types - (IncludeExpansion, CrossrefResolvedRef). Shortcode atomicity falls into - the first category. + atomicity via `by.is_atomic_kind()` (per Plan 4's predicate) and via + the `is_atomic_custom_node` registry for CustomNode types + (`IncludeExpansion`, `CrossrefResolvedRef`). Shortcode atomicity + falls into the first category (`shortcode` is in the atomic-kind + set). + +## Attribution interaction + +The `Invocation` anchor's existence delivers correct attribution for +shortcode-resolved content **with no attribution-code changes**: + +- `query_attribution(node.source_info, runs)` calls `resolve_byte_range`. +- Per Plan 4's updated `resolve_byte_range`, `Generated` delegates to + `invocation_anchor()`, which returns the `Invocation` anchor's + `source_info` — typically an `Original` covering the shortcode + token's bytes. +- The chain resolves to `(file_id=0, token_start, token_end)`. +- `query_attribution` accepts (file_id == 0, start < end) and calls + `query_byte_range`. +- The existing max-time-across-overlapping-runs logic in + `AttributionMap::query_byte_range` picks the latest author covering + the token's bytes.
+ +For multi-author shortcodes: if author A wrote `{{< meta foo >}}` at +T1 and author B changed `foo` to `bar` at T2 > T1, the byte range +covers bytes touched by both; `query_byte_range` picks the latest +(B). This is the policy specified in the 2026-05-20 design +discussion ("attributed to latest author of the shortcode text"), +and it falls out mechanically from Plan 6's anchor stamping plus +Plan 4's chain-walking accessor — no special-case code. + +## Lua-shortcode enrichment + +The Lua machinery's `filter_source_info` (in +`crates/pampa/src/lua/types.rs`) walks the live Lua call stack to find +the first non-C frame and produces (post-Plan 4) the canonical +filter-construction shape: + +```rust +Generated { + by: By::filter(filter_path, line), // by.data = { filter_path, line } + from: smallvec![], +} +``` + +This auto-attach fires when Lua code constructs *typed* nodes via +`pandoc.Str(...)`, `pandoc.Span(...)`, etc. Bare-string Lua returns +(`return "text"` → `LuaShortcodeResult::Text`) do NOT pass through +`filter_source_info`; their resulting Str carries +`SourceInfo::default()` instead. + +When this filter-shape source_info appears inside a Lua shortcode +handler dispatch, the resolver's post-walk enriches it to: + +```rust +Generated { + by: By { + kind: "shortcode".to_string(), + data: json!({ + "name": shortcode_name, + "lua_path": <filter_path>, + "lua_line": <line>, + }), + }, + from: smallvec![Anchor::invocation(Arc::new(ctx.source_info.clone()))], +} +``` + +The Lua-side `filter_path` / `line` precision is preserved in +`by.data` under the more contextually-precise names `lua_path` / +`lua_line`; the shortcode `name` is added; the kind is promoted from +`filter` to `shortcode`. **Nothing is discarded.** Nodes that entered +the post-walk with `SourceInfo::default()` (bare-string Lua returns, +or Rust handler returns) hit the fresh-Generated branch instead and +end up with `by.data = { name }` plus the Invocation anchor.
+ +This is the canonical "enrichment-via-post-walk" pattern. Other +transforms that wrap dispatch may follow the same shape later (always +append, promote `by.kind`, preserve prior `by.data` fields where +meaningful). + +When the **Lua-file-registration follow-up** lands (see "Dispatch +follow-up" below), `lua_path` / `lua_line` migrate out of `by.data` and +into a typed `Dispatch` anchor. `by.data` for Lua-dispatched shortcodes +then shrinks to just `{ "name": shortcode_name }`. + +## The post-walk helper + +```rust +/// After every shortcode handler dispatch, stamp Invocation provenance +/// on the returned nodes. Recurses into nested AST so every block and +/// inline gets the anchor. Enriches existing `Generated { by: filter, ... }` +/// (from Lua auto-attach) by promoting kind and appending the anchor; +/// otherwise sets source_info to a fresh Generated shape. +fn stamp_shortcode_anchors( + result: &mut ShortcodeResult, + shortcode_name: &str, + token_si: &SourceInfo, +) { + let token_arc = Arc::new(token_si.clone()); + match result { + ShortcodeResult::Inlines(inlines) => { + for inline in inlines.iter_mut() { + stamp_inline(inline, shortcode_name, &token_arc); + } + } + ShortcodeResult::Blocks(blocks) => { + for block in blocks.iter_mut() { + stamp_block(block, shortcode_name, &token_arc); + } + } + ShortcodeResult::Preserve | ShortcodeResult::Error(_) => {} + } +} + +fn stamp_inline(inline: &mut Inline, name: &str, token_arc: &Arc<SourceInfo>) { + let si = inline.source_info_mut(); + *si = enrich_or_create(si, name, token_arc); + // recurse into nested inlines (Strong, Emph, Link, ...) + walk_nested_inlines(inline, |child| stamp_inline(child, name, token_arc)); +} + +fn enrich_or_create( + existing: &SourceInfo, + name: &str, + token_arc: &Arc<SourceInfo>, +) -> SourceInfo { + // If the Lua machinery attached Generated { by: filter, ... }, + // promote it. Otherwise fresh Generated.
+ // + // NOTE (bd-36fr9 co-change): the by.data["filter_path"]/["line"] + // reads below are temporary. Once Lua-file registration lands, + // those fields move out of by.data and into a Dispatch anchor in + // `from`. This branch then reads the existing Dispatch anchor + // from `existing.from[]` and copies it into the new from-list + // alongside Invocation. See §"Dispatch follow-up". + // + // NOTE (bd-129m3 integration point): for `meta` / `var` shortcodes + // post-loader-change, the helper also appends a ValueSource + // anchor pointing at the metadata value's source range. See + // §"ValueSource follow-up". + let by = match existing { + SourceInfo::Generated { by, .. } if by.kind == "filter" => { + let lua_path = by.data.get("filter_path").cloned(); + let lua_line = by.data.get("line").cloned(); + let mut data = serde_json::json!({ "name": name }); + if let Some(p) = lua_path { data["lua_path"] = p; } + if let Some(l) = lua_line { data["lua_line"] = l; } + By { kind: "shortcode".to_string(), data } + } + _ => By::shortcode(name), + }; + SourceInfo::Generated { + by, + from: smallvec![Anchor::invocation(Arc::clone(token_arc))], + } +} +``` + +(Block stamping is parallel — recurse into block children and inlines +they contain.) ## Open questions for implementation - **Comprehensive audit**: grep for `SourceInfo::default()` in - `crates/quarto-core/src/transforms/` and `crates/pampa/src/`. Categorize - each site: preserve ctx info / emit Synthetic / emit Derived / leave - as-is (test code). Plan 6's first commit is the audit report; - subsequent commits fix each site. -- **Theorem title from attr**: when `extract_name_attr` extracts the title - from `name="Pythagoras"`, it gets a String with no source_info. Inspecting - `attr_source` may or may not give the byte range of the attr value. - Worth investigating; if achievable, use Original{attr_value_range}; - otherwise Synthetic. 
-- **Footnotes and Appendix transforms**: q2-preview skips them in v1, but - Plan 6 audits them anyway. Confirm during implementation that the audit - is feasible without breaking HTML pipeline tests. (Extension of the - pattern, not a redesign.) -- **Escaped shortcodes**: today `Shortcode::is_escaped` is a flag, and - escaped shortcodes preserve as literal text (no resolution). Don't apply - Derived to escaped shortcodes — they're not resolved; they stay as - literal text with their original source_info. + `crates/quarto-core/src/transforms/` and `crates/pampa/src/`. + Categorize each site: preserve ctx info / emit Generated with + appropriate by-kind / emit Generated with Invocation / leave as-is + (test code). Plan 6's first commit (after Phase 0) is the audit + report; subsequent commits fix each site. + +(Previously-open questions resolved by review pass 2026-05-22: +"Theorem title from attr" — `AttrSourceInfo` already carries the +value range; see §Scope theorem bullet for the threaded-in fix. +"Escaped shortcodes" — the In-scope `shortcode_to_literal` fix at +the call site (passing `shortcode_owned.source_info` through) +produces the Original shape the regression test expects. +"Recursion into deep AST" — concrete reusable shape and full +container-variant set documented; see §Implementation notes +below.) + +## Implementation notes + +- **Recursion shape for the post-walk.** The walker must traverse the + full container set — for inlines: Strong, Emph, Strikeout, + Superscript, Subscript, SmallCaps, Quoted, Cite, Link, + Image (alt/caption), Span, Underline, Delete, Insert, Highlight, + EditComment, Note (block content), Custom (slot contents); for + blocks: Div, BlockQuote, OrderedList, BulletList, DefinitionList, + Figure, Table (cells), Custom (slot contents). 
The canonical + reusable shape is in + `crates/quarto-core/src/transforms/shortcode_resolve.rs`'s own + `recurse_inline` (~lines 945-1027) and `resolve_block` + (~lines 710-863), which already cover this set including Image's + alt/caption content and Note's nested blocks. Model the new mutable + walkers on these — drop the async + shortcode-resolution logic, + keep the match-arm dispatch and Image/Note recursion. The narrower + walkers in `callout.rs` and `theorem.rs` are block-only and do NOT + cover the inline variants the stamper needs; do not use them as the + reference shape. + +## ValueSource follow-up + +Plan 6 does NOT attach `ValueSource` anchors. The shape is defined +(Plan 4 ships `AnchorRole::ValueSource`) but the data isn't available: +the metadata loader doesn't surface per-key source-info to the +shortcode resolver today. Specifically, the merged `meta` ConfigValue +the resolver consults has `source_info` per key INTERNALLY, but +`MetaShortcodeHandler::resolve` calls `ctx.metadata.get_nested(&key)` +and then `config_value_to_inlines(value)` which discards the +per-key source information when flattening to strings. + +The follow-up issue ("metadata-loader threads per-key source-info +through to shortcode handlers"): + +1. Loader change: `ConfigValue` already carries `source_info` + per-value (`crates/quarto-pandoc-types/src/config_value.rs:155`); + the lookup path returns ConfigValue references, but + `config_value_to_inlines` converts to bare Strs discarding source. + Thread source through. +2. Resolver change: when constructing the resolved nodes, attach a + `ValueSource` anchor pointing at the value's `source_info`. +3. This is the structural feature behind Elliot's 2026-05-20 chain + request — the resolved content would carry both `Invocation` (where + the shortcode was written) and `ValueSource` (where the value was + defined). + +When the follow-up lands, Plan 6's post-walk grows one more anchor +append at the appropriate dispatch sites. 
The current Plan 6 ships +with just `Invocation`; the type is forward-compatible. + +**Integration point**: bd-129m3 should append the ValueSource anchor +inside `enrich_or_create` (see §"The post-walk helper" above). Once +the metadata loader threads per-key source-info through, the helper +gains access to the value's source range via the `ShortcodeContext` +and pushes a second anchor into `from` alongside the Invocation. No +other call sites in Plan 6 change. + +Tracked as **bd-129m3** ("Provenance follow-up: ValueSource anchor +stamping for meta/var shortcodes"). + +## Dispatch follow-up + +Plan 6 does NOT use a typed `Dispatch` anchor for Lua-side +construction info. Lua filter files aren't registered in `SourceContext`, +so we can't construct an `Original` pointing into them. In the interim, +`(lua_path, lua_line)` lives in `by.data` (see "Lua-shortcode +enrichment" above). + +The follow-up issue ("register Lua filter files in `SourceContext`"): + +1. `SourceContext::register_file(path, bytes) -> FileId`. +2. Lua engine calls it when loading each filter. +3. `filter_source_info` produces `Original { file_id, start, end }` + instead of returning a path-line pair. +4. Lua-attached source_info becomes `Generated { by: filter, from: + [Dispatch -> Original{lua_file, ...}] }`. +5. Plan 6's post-walk's enrichment then preserves the `Dispatch` + anchor (typed) instead of preserving `by.data` fields. + +When the follow-up lands, `AnchorRole::Dispatch` joins the enum (a +non-breaking enum extension); `by.data` for `filter` / Lua-dispatched +`shortcode` kinds shrinks to per-kind config only. + +**Co-change in `enrich_or_create`**: bd-36fr9 must update Plan 6's +helper (§"The post-walk helper" above). The current "enrich" branch +reads `by.data.get("filter_path")` and `by.data.get("line")` from +the existing `Generated{by:filter, ...}`; post-bd-36fr9, those +fields are gone from `by.data` and the relevant info lives in the +`Dispatch` anchor inside `from`.
The helper then reads the existing +Dispatch anchor and copies it into the new shortcode-shape `from` +alongside the Invocation. The §"Lua-shortcode enrichment" example +above also needs updating to show the post-bd-36fr9 shape. + +Tracked as **bd-36fr9** ("Provenance follow-up: Dispatch anchor for +Lua-handler filter & shortcode"). ## References -- `crates/quarto-core/src/transforms/shortcode_resolve.rs` — main fix site. - Lines 172, 179, 186, 203, 208, 215, 222, 238 emit `SourceInfo::default()`. +- `crates/quarto-core/src/transforms/shortcode_resolve.rs` — main fix + site. Per-line breakdown of production `SourceInfo::default()` + emissions: + - Lines 172, 179, 186, 203, 208, 215, 222 — `config_value_to_inlines` + (Str construction for `meta` / `var` lookups). + - Line 238 — `flatten_blocks_to_inlines` (synthesized + paragraph-separator Space; NOT part of `config_value_to_inlines`). + - Line 470 — `lua_result_to_shortcode_result::Text` arm (bare-string + Lua return wrapped in a Str). + - Lines 1034, 1036 — `make_error_inline` (visible `?key` Str + Strong + wrapper for unknown shortcodes). + - Line 1109 — `shortcode_to_literal` (escaped-shortcode literal text). + The stamper handles the first three groups uniformly via the dispatch + funnel; `make_error_inline` and `shortcode_to_literal` need call-site + source_info threading (see "In scope" bullet). +- `crates/quarto-core/src/transforms/shortcode_resolve.rs:306-371` — + `resolve_shortcode` method (single funnel for all dispatches; the + post-walk hooks in here). +- `crates/quarto-core/src/transforms/shortcode_resolve.rs:710-1027` — + existing `resolve_block` / `recurse_inline` walkers. Canonical + reusable shape for the new mutable walkers (drop async + + shortcode-resolution logic; keep the match-arm dispatch and + Image/Note recursion). - `crates/quarto-core/src/transforms/title_block.rs:183, 185` — h1 synthesis sites. - `crates/pampa/src/transforms/sectionize.rs:96, 148` — section Div - synthesis sites. 
-- `crates/quarto-core/src/transforms/footnotes.rs` — investigate. -- `crates/quarto-core/src/transforms/appendix.rs` — investigate. -- `crates/quarto-core/src/transforms/theorem.rs:281, 313` — name-attr title - extraction. -- `crates/pampa/src/pandoc/treesitter_utils/postprocess.rs:1348` — synthetic - Space. + synthesis sites. (Line 169 in that file is a `dummy_source_info()` + test helper, not a production site.) +- `crates/quarto-core/src/transforms/footnotes.rs` — container Div + synthesis (around line 495 / `create_footnotes_section`). +- `crates/quarto-core/src/transforms/appendix.rs` — appendix container + Div synthesis (`create_appendix_container` ~line 257). +- `crates/quarto-core/src/transforms/theorem.rs:313` and + `crates/quarto-core/src/transforms/proof.rs:167` — name-attr title + extraction in `extract_name_attr`. Both pass `&div.attr_source` + through and use `attr_source.attributes[idx].1` (an + `Option<SourceInfo>`). +- `crates/quarto-pandoc-types/src/attr.rs:27-32` — `AttrSourceInfo` + shape (`attributes: Vec<(Option<SourceInfo>, Option<SourceInfo>)>` + for key/value source ranges). +- `crates/pampa/src/pandoc/treesitter.rs:1075-1107` and + `crates/pampa/src/pandoc/treesitter_utils/commonmark_attribute.rs:38-50` + — parser sites that populate the attr value's byte range. No + prerequisite parser change needed. +- `crates/pampa/src/pandoc/treesitter_utils/postprocess.rs:1348` — + synthetic Space. +- `crates/pampa/src/lua/types.rs:1812-1840` — `filter_source_info` + Lua-side auto-attach. Note: only fires for typed Inline/Block + returns (`pandoc.Str(...)`); bare-string returns + (`return "text"` → `LuaShortcodeResult::Text`) bypass it. - `crates/quarto-pandoc-types/src/custom.rs` — CustomNode shape. -- `crates/quarto-core/src/transforms/callout.rs` — example pattern for sugar - transforms wrapping output in CustomNode. +- `crates/quarto-core/src/transforms/callout.rs` — example pattern for + sugar transforms wrapping output in CustomNode.
NOTE: callout + + theorem are block-only walkers; for inline recursion, use + `shortcode_resolve.rs::recurse_inline` instead. +- `crates/quarto-core/src/stage/stages/user_filters.rs` — general Lua + filter dispatch site. Does NOT invoke the post-walk; its + constructions keep `by.kind == "filter"` as steady state. +- `crates/quarto-core/src/pipeline.rs:258, 312` — `IncludeExpansionStage` + precedes `AstTransformsStage`, so includes are spliced before + shortcodes resolve. See §"Include×shortcode composition" in Design + decisions. ## Test plan - **Audit-completion test**: a unit test that builds a fixture document exercising shortcode resolution, sectionize, and (HTML pipeline only) - title-block / footnotes / appendix. Asserts that the resulting AST has - no nodes with `SourceInfo::default()` source_info. (Defensive - regression: catches a future PR that adds a transform without provenance.) + title-block / footnotes / appendix. **Asserts that the resulting AST + has no nodes with `SourceInfo::default()` source_info AND every + synthesized node carries an appropriate `Generated` shape** (matches + the §Atomic-kind-set / §by.data tables in Plan 4). Defensive + regression: catches a future PR that adds a transform without + provenance. +- **Shortcode required-anchor invariant**: the audit-completion test + ALSO walks the post-stamping AST and asserts no `Generated { by: + shortcode, from: [] }` remains. Every `by.kind == "shortcode"` node + must carry at least one `Invocation` anchor pointing at the source + token's bytes. Per Plan 4 §"Required-anchor invariant for shortcode", + this is the producer-side enforcement of the rule; Plan 7 adds a + `debug_assert!` on the consumer side as belt-and-suspenders. The + stamper is the only construction site for `by: shortcode` in v1, so + the test exercises the full source of bad shapes. 
- **Per-transform fix tests**: for each fixed transform, a test that inspects the produced source_info shape: - - SectionizeTransform: synthetic Div has `Synthetic { by: By { kind: - "sectionize" } }`. Header inside has its original source_info. - - ShortcodeResolveTransform: each resolved Str has `Derived { from: - Original{shortcode_token_range}, by: By { kind: "shortcode", data: - {"name": "..."} } }`. The `from` Original points at the shortcode - token's bytes in source. + - SectionizeTransform: synthetic Div has `Generated { by: { kind: + "sectionize" }, from: [] }`. Header inside has its original + source_info. + - ShortcodeResolveTransform (uniform): each resolved Str has + `Generated { by: { kind: "shortcode", data: { name: "..." } }, + from: [Anchor { role: Invocation, source_info: ... }] }`. The + anchor's source_info chain-walks to the shortcode token's bytes + via `resolve_byte_range`. + - Lua-shortcode test: a `{{< kbd Ctrl+C >}}` invocation produces a + Span with `Generated { by: { kind: "shortcode", data: { name: + "kbd", lua_path: "...", lua_line: N } }, from: [Invocation] }`. + **NOT** `by.kind == "filter"`; the post-walk promoted it. + - Other built-in Lua shortcodes (lipsum, placeholder, version, video): + same shape, with the appropriate `name`. - Etc. for each transform. -- **Multi-inline shortcode source_info test**: a metadata key with - markdown (`title: "**Bold** Title"`). After ShortcodeResolveTransform, - the resulting `[Strong[Str], Space, Str]` ALL have Derived source_info - with the same `from` (the shortcode token's range). This is what Plan - 7's dedupe rule will detect. -- **Idempotence still holds**: re-run Plan 3's idempotence test after the - audit — the changes shouldn't introduce non-determinism. +- **Multi-inline shortcode anchor test**: a metadata key with markdown + (`title: "**Bold** Title"`). 
After ShortcodeResolveTransform, the + resulting `[Strong[Str], Space, Str]` ALL have `Generated` with + `Invocation` anchors whose `source_info` is the same shortcode + token's range. This is what Plan 7's dedupe rule detects. +- **Attribution interaction test**: render a doc with `{{< meta foo >}}` + through two commits by different authors (author A wrote the line at + T1; author B changed `foo` → `bar` at T2). With Plan 6 stamped and a + `GitBlameProvider` installed, the resulting `astContext.attribution` + for the resolved Str references author B's identity (the latest + author of the token bytes). This is the multi-author latest-wins + policy. +- **Escaped-shortcode regression test**: `{{}}` resolves + to literal text; its source_info stays Original (not Generated). +- **Error-inline regression test**: an unknown shortcode `{{< bogus >}}` + resolves via `make_error_inline` to `Strong[Str("?bogus")]`. Both + layers carry `Original` source_info pointing at the bogus + shortcode's token bytes (NOT `Default`, NOT `Generated`). Plan 7's + `is_atomic_kind()` does not fire; round-trip through the + incremental writer Verbatim-copies the original token bytes. +- **Error / escaped round-trip test**: full incremental-writer + round-trip on a fixture containing both `{{}}` and + `{{< bogus >}}`. After Plan 6's stamping + Plan 7's writer, the + output qmd should byte-equal the input for those regions + (verbatim-copy via the Original anchor in both cases). +- **Shortcode-inside-include composition test**: `parent.qmd` + contains `{{< include foo.qmd >}}`; `foo.qmd` contains + `{{< meta title >}}`. After Plan 6 stamping (and Plan 8's wrapper), + the resolved Str inside the IncludeExpansion wrapper has + `Generated { by: { kind: "shortcode", data: { name: "title" } }, + from: [Invocation -> Original{file_id: , ...}] }`. + Assert the Invocation anchor's source_info `file_id != 0` (i.e. + points into the included file, not the parent). 
Plan 8's wrapper + carries the parent-file anchor at its level; this test exercises + Plan 6's stamping invariant under the cross-file context. Plan 8's + own test plan covers wrapper round-trip independently. +- **Idempotence still holds**: re-run Plan 3's idempotence test after + the audit — the changes shouldn't introduce non-determinism. +- **`source_info` determinism (Plan 6-specific gap)**: Plan 3's hashes + exclude `source_info` by design (`compute_blocks_hash_fresh` and + `compute_meta_hash_fresh` both skip it). So Plan 3 does **not** + catch a transform whose synthesized `Generated { by, from }` + output is non-deterministic *in the source_info layer* — e.g., an + `Anchor::invocation` that hashes a different `SourceInfo` on + repeated runs because the shortcode-token's range was recomputed + rather than cloned. Plan 6 must add its own per-fixture + source_info-determinism check: render twice, walk the AST in + lockstep, assert every `Generated.by`, every `Generated.from[]`, + and every Original `SourceInfo` is `==`-equal across runs. Place + this alongside Plan 3's idempotence test (same fixtures, parallel + assertion) so the test crate covers both contracts. ## Dependencies ### Hard dependencies - **Plan 4** — Plan 6's transforms use `By::shortcode(...)`, - `By::sectionize()`, `By::title_block()`, etc., plus the `Derived` and - `Synthetic` variants. Cannot compile without Plan 4. + `By::sectionize()`, `By::title_block()`, etc., plus the `Generated` + variant and `Anchor`/`AnchorRole` types. Cannot compile without + Plan 4. ### Soft dependencies @@ -181,8 +878,8 @@ comprehensive grep. through the JSON wire format (the path q2-preview takes when crossing the WASM boundary to React and back), Plan 5's wire-format extension is required. Without Plan 5, a Plan 6 AST that gets serialized to JSON - and deserialized loses the `Derived` and `Synthetic` shapes (decoded - via legacy code-3 fallback as Substring approximations). 
+ and deserialized loses the `Generated` shape (decoded via legacy + code-3 fallback as Substring approximations). Pragmatic implication: Plan 6 lands cleanly in-Rust without Plan 5, but isn't observable in q2-preview without Plan 5. The plans can be @@ -192,77 +889,89 @@ comprehensive grep. ### Blocks - **Plan 7** — writer needs Plan 6's audit-fixed AST shape to walk - preimages correctly and to detect Derived for atomic enforcement. -- Independent of Plan 8 (Plan 8 introduces its own wrapper for includes; - shortcodes don't use that pattern). + preimages correctly and to detect atomic-kind for `is_atomic` + enforcement. +- Independent of Plan 8 (Plan 8 introduces its own wrapper for + includes; shortcodes don't use that pattern). ## Risk areas -- **Audit completeness**: missing a site means a future Plan 7 round-trip - silently corrupts that region. Mitigation: the audit-completion test - scans for `SourceInfo::default()` in produced ASTs. -- **Breaking existing HTML pipeline tests**: the audit changes source_info - on many nodes. The hash-based reconciler doesn't care, but tests that - inspect specific source_info shapes might fail. Run the full workspace - test suite after each transform fix. +- **Audit completeness**: missing a site means a future Plan 7 + round-trip silently corrupts that region. Mitigation: the + audit-completion test scans for `SourceInfo::default()` AND for + synthesized-but-not-Generated shapes in produced ASTs. +- **Breaking existing HTML pipeline tests**: the audit changes + source_info on many nodes. The hash-based reconciler doesn't care, + but tests that inspect specific source_info shapes might fail. Run + the full workspace test suite after each transform fix. - **Shortcode-resolved nodes change source_info shape**: existing tests that assert "the resolved title Str has SourceInfo::default()" or - similar will fail. Update them to expect Derived. The HTML output + similar will fail. Update them to expect Generated. 
The HTML output doesn't change shape (still flat inlines/blocks); only source_info on those nodes changes. -- **No new CustomNode type added** (deliberate change from earlier draft). - The HTML pipeline isn't affected — shortcode-resolved content remains - flat inlines/blocks; the HTML writer renders them normally. +- **No new CustomNode type added** (deliberate, retained from the + earlier draft). The HTML pipeline isn't affected — shortcode-resolved + content remains flat inlines/blocks; the HTML writer renders them + normally. +- **Post-walk recursion bugs**: missing a nested AST shape in the walk + means some inner nodes don't get the anchor. Cover Strong/Emph/Link + for inlines and Div/BlockQuote/Span-in-Plain for blocks. ## Estimated scope | Component | Lines (rough) | |---|---| +| Phase 0: `Inline::source_info_mut` + `Block::source_info_mut` accessors + unit tests | ~70 | | Audit pass (grep + categorize) | ~30 (mostly notes) | -| Shortcode resolver fix (~12 sites, all emit Derived now) | ~80 | +| `stamp_shortcode_anchors` helper + mutable recursion walks (modeled on `shortcode_resolve.rs::recurse_inline` / `resolve_block`) | ~220 | +| Shortcode resolver dispatch-site fixes — 12 production sites: `config_value_to_inlines` ×7, `flatten_blocks_to_inlines` ×1, `lua_result_to_shortcode_result::Text` ×1, `make_error_inline` ×2, `shortcode_to_literal` ×1. Most covered by the stamper; `make_error_inline` and `shortcode_to_literal` need call-site source_info threading. 
| ~70 | | TitleBlock fix | ~20 | | Sectionize fix | ~20 | | Footnotes fix | ~30 | | Appendix fix | ~30 | -| Theorem title-from-attr fix | ~20 | +| Theorem + proof title-from-attr fix (thread `attr_source` through `extract_name_attr` in both files) | ~30 | | TreeSitter postprocess fix | ~10 | -| Tests | ~200 | -| **Total** | **~440** | +| Tests | ~280 | +| **Total** | **~810** | -Smaller than the earlier draft (which included a ShortcodeResolution -wrapper, qmd writer arm, and HTML pipeline implications). One focused -session likely. +The earlier "~540" estimate omitted the Phase-0 mut accessors (~70 LOC), +under-counted the recursion walkers (mutable walks over the full +inline/block container set are ~220 LOC, not ~80), and missed the +`make_error_inline` / `shortcode_to_literal` / `proof.rs` fix sites. ## Notes -This is a "scattered fixes" plan — touches many transform files with small -per-file changes. Most of the diff is mechanical: `SourceInfo::default()` -→ `ctx.source_info.clone()` (Original) for synthesizers that DO have a -source preimage but currently drop it; `Synthetic { by: By::() }` -for genuine synthesizers; `Derived { from, by }` for shortcode resolutions. +This is a "scattered fixes" plan — touches many transform files with +small per-file changes. Most of the diff is mechanical: `SourceInfo::default()` +→ either `ctx.source_info.clone()` (Original) for synthesizers that DO +have a source preimage but currently drop it, or +`Generated { by: By::(), from: smallvec![] }` for genuine +synthesizers, or `stamp_shortcode_anchors(...)` for shortcode +dispatches. The conceptual surface is small; the file count is not. The earlier-draft "wrap shortcode resolutions in `CustomNode("ShortcodeResolution")`" -approach was walked back. Per the user's reasoning: wrappers were heavy for -what's fundamentally a provenance problem. 
Derived gives us atomic detection -at the writer level (Plan 7) without the structural cost of a new CustomNode -type, the qmd writer arm, the HTML-pipeline-resolve transform, or the -React component for the wrapper. Includes (Plan 8) still use a wrapper -because their cross-file FileId issue genuinely requires anchoring at the +approach was walked back. Per the user's reasoning: wrappers were heavy +for what's fundamentally a provenance problem. The typed `Invocation` +anchor in `Generated` gives Plan 7 atomic detection at the writer +level (via `by.is_atomic_kind()` returning true for `shortcode`) +without the structural cost of a new CustomNode type, the qmd writer +arm, the HTML-pipeline-resolve transform, or the React component for +the wrapper. Includes (Plan 8) still use a wrapper because their +cross-file FileId issue genuinely requires anchoring at the parent-file level. The shortcode-resolution provenance change propagates to: q2-preview -rendering (Plan 2B's atomic-aware `setLocalAst` gating in the -framework's `Inline` dispatcher — `framework/dispatchers.tsx`, -post-2pre — detects Derived inlines via Plan 2A's -`isAtomicSourceInfo` accessor. The original "MaybeReadOnlyInline -wrapper" framing was resolved during the 2026-05-06 / 2026-05-07 -review sessions into the framework's unified `Block` / `Inline` -dispatchers gaining the atomic gate, rather than a separate -wrapper component or per-format duplication. Both q2-debug and -q2-preview pick up the gate "for free"), -writer round-trip (Plan 7's atomic logic detects Derived + UseAfter -as AtomicViolation; Plan 7's dedupe rule handles multi-inline -shortcode resolutions), and possibly some existing tests that -asserted on the flat Str's source_info shape. 
+rendering (Plan 2A's framework atomic gate in `dispatch.tsx`'s `Node` +detects `shortcode` kind via `ATOMIC_GENERATED_KINDS` and the +JS-side `isAtomicSourceInfo` accessor), writer round-trip (Plan 7's +soft-drop logic detects `by.is_atomic_kind()` + UseAfter and emits +Q-3-42; Plan 7's dedupe rule handles multi-inline shortcode +resolutions via the shared anchor source_info), and possibly some +existing tests that asserted on the flat Str's source_info shape. + +The post-walk's enrichment pattern (promote kind, preserve prior +`by.data`, append anchor) is the canonical shape for any future +transform that wraps a Lua dispatch. Document the pattern in Plan 6's +helper so future contributors have a reference. diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md b/claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md index 3a7a7b7f6..ca2b0002e 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md @@ -1,422 +1,847 @@ -# Plan 7 — Incremental writer preimage walk + Transparent + atomic-violation + multi-inline dedupe - -**Date:** 2026-05-04 -**Branch:** feature/q2-preview -**Status:** Implementation plan (open questions named) +# Plan 7 — Incremental writer: preimage walk, Transparent / Omit, atomic soft-drop, multi-inline dedupe + +**Date:** 2026-05-04 (revised 2026-05-24; closed 2026-05-26) +**Branch:** feature/provenance +**Status:** **Shipped** — Phases 1-7 + 9 landed on `feature/provenance`. +Phase 8's broader Playwright matrix deferred to Plan 7b. Q-3-41 + +TS-side editability predicate deferred to Plan 7c. Algebraic +soundness refactor of the coarsen/write path tracked under Plan 7d. **Milestone:** M3 (edit-back works for non-include, non-pure-synthesis edits) +> **Reading this plan in 2026-05-26+:** the `[x]` checkboxes reflect +> what shipped. 
A handful of `[ ]` items remain — all of them are +> explicitly deferred with a pointer to the follow-up plan that owns +> them. Plan 7 itself is closed; no remaining work lives here. + +## Epic context + +Part of the **provenance epic** (Plans 3–10). Plan 7 is the keystone: +once the writer understands the typed provenance from Plans 4–6, it +can correctly round-trip user edits, soft-drop bad edits with clear +diagnostics, and surface warnings on both hub-client and the `q2 +preview` SPA. The file name keeps its `q2-preview-plan-7-` form for +git-history continuity; new plans in the epic adopt the +`provenance-plan-N-` convention (see Plan 9 / Plan 10). + ## Goal -Teach the incremental writer (`pampa::writers::incremental`) to handle the -new provenance shapes introduced by Plans 4-6 so that q2-preview round-trip -edits work correctly. Five new behaviors: - -- **`preimage_in(target_file_id)` accessor**: a recursive walk through - Substring/Concat/Derived chains that returns the byte range in the target - file IF the chain resolves there, else None. -- **`Transparent` coarsen variant**: for `KeepBefore` nodes whose source_info - is `Synthetic` but whose children have recoverable preimages (Sectionize's - case), recurse into the children rather than emit a useless empty - Verbatim. The wrapper itself contributes nothing to the output. -- **Atomic detection via `Derived`**: nodes with `Derived` source_info are - atomic. KeepBefore + Derived → Verbatim copies the preimage (the shortcode - token, etc.). UseAfter or RecurseIntoContainer touching a Derived node → - AtomicViolation. -- **Atomic detection via `is_atomic_custom_node`**: `IncludeExpansion` - CustomNode is atomic via type_name lookup. Same outcome as Derived case - (KeepBefore Verbatim; anything else → AtomicViolation). Plus - `CrossrefResolvedRef` is atomic (already a CustomNode in the AST). 
-- **Multi-inline dedupe rule**: when assembling a run of consecutive inlines - (in InlineSplice or inline assembly contexts) that all share the same - Derived source_info `from`, emit Verbatim *once* for the group rather - than N times. This handles multi-inline shortcode resolutions. - -This plan also adds a `pipeline_kind: Option` parameter to -`incremental_write_qmd` (per Decision D — param with default) that runs the -q2-preview pipeline on the baseline AST before reconciling, making the -reconcile symmetric. Existing callers pass `None` and get today's -parse-only baseline behavior; q2-preview's call site passes -`Some("preview")`. The string is the wasm-bindgen-friendly form of -the `Option<&'static str>` selector Plan 1 added to `Format` -(`crates/quarto-core/src/format.rs::Format::pipeline_kind`); inside -`incremental_write_qmd` it maps to the same kind string the render -side already uses ("preview"). +Teach the incremental writer (`pampa::writers::incremental`) to +handle the typed provenance shapes Plans 4–6 introduce so that +q2-preview round-trip edits work correctly. Five new behaviors: + +- **`preimage_in(target_file_id)` accessor** on `SourceInfo`: a + recursive walk through Substring / Concat / Generated chains that + returns the byte range in the target file if the chain resolves + there, else `None`. For `Generated`, walks through the + `Invocation` anchor only — never `ValueSource`, never `Dispatch`, + never `Other`. +- **`Transparent` coarsen variant**: for `KeepBefore` nodes whose + `source_info` is `Generated` with empty anchors AND non-atomic + kind (Sectionize wrappers, footnotes container, appendix + container), recurse into the children rather than emit a useless + empty Verbatim. The wrapper itself contributes nothing to output. 
+- **`Omit` coarsen variant**: for `KeepBefore` nodes that have no + preimage in target and no source-bearing children (atomic-kind + Generated with no Invocation anchor — filter constructions, + title-block synthesis, tree-sitter postprocess space). The node + is dropped from output; the next pipeline run regenerates it from + baseline content. +- **Unified editability gate, applied via soft-drop**: a region is + editable iff it has byte-traceable preimage in the target file + AND is not an atomic-kind `Generated` AND is not an atomic + CustomNode. Edits to non-editable regions soft-drop with + diagnostic warnings rather than aborting the entire write. +- **Multi-inline dedupe rule**: when assembling a run of consecutive + inlines (in InlineSplice or inline-assembly contexts) whose + `Invocation` anchors are structurally equal (`PartialEq`), emit + Verbatim *once* for the group rather than N times. Handles + multi-inline shortcode resolutions. + +Plan 7 also changes the WASM-facing `incremental_write_qmd` +signature: the caller now supplies the baseline AST explicitly +instead of having the writer parse the original qmd internally. +This makes the writer pipeline-agnostic — it diffs the two ASTs +the caller hands it and writes accordingly, regardless of what +pipeline produced them. When this plan lands, ReactPreview's read-only guard from Plan 1 -lifts (one-block early-return in `handleSetAst`, deletable per Plan -1's design), and edits in q2-preview round-trip correctly. The -**render-side dispatches** Plan 1's §"Multi-plan contract: cleanup -owed to Plan 7" originally targeted (`AstTransformsStage::run()` -and `ReactPreview.tsx::doRender`) **already use the structured -selector** as of Plan 1's implementation: `AstTransformsStage` -reads `ctx.format.pipeline_kind`, and `ReactPreview.doRender` -dispatches via the `pipelineKindForFormat(format)` helper at -`hub-client/src/utils/pipelineKind.ts`. 
Plan 7 therefore adds the -**write-side parameter** rather than refactoring those render-side -sites; see §Scope for the verification step. +lifts, and edits in q2-preview round-trip correctly. The q2-preview +SPA gains edit-back via the same writer path — replacing its +current `noopSetAst` with a real handler that routes through +`incrementalWriteQmd` to the sync-client's `updateFileContent` +and through automerge to the ephemeral hub's disk-write. + +## API decomposition: parse / transform / reconcile / write + +The writer is one node in a four-primitive grammar: + +| Primitive | Rust signature (existing) | What it does | +|---|---|---| +| **parse** | `qmd_to_pandoc(bytes) → (Pandoc, ASTContext)` | Lex/parse qmd source to a parse-only AST. No transforms. | +| **transform** | `build__transform_pipeline()` + `run_pipeline()` | Apply a pipeline's transform stages to a parse-only AST. Produces a same-shape AST at a different tier. | +| **reconcile** | `compute_reconciliation(&a, &b) → ReconciliationPlan` | Diff two ASTs structurally, producing a plan of KeepBefore / UseAfter / RecurseIntoContainer alignments. | +| **write** | `incremental_write(qmd, original_ast, new_ast, plan)` | Materialize the plan as qmd bytes — Verbatim-copy source bytes for KeepBefore, qmd-writer-serialize for UseAfter / Rewrite. | + +The Rust internals already implement this decomposition. The WASM +bridge layer exposes the compositions that callers need. + +**Pipeline tier discipline.** "Same pipeline tier" means: the +baseline AST and the new AST were both produced by the same +sequence of transform stages, applied to ASTs that were both +parsed from the same kind of source. The reconciler is tier-agnostic +— it just diffs structures — but the caller must supply ASTs at the +same tier or every Generated wrapper looks like a new insertion. +Two tiers matter today: + +- **parse-only**: `parse_qmd_to_ast(content)` output. 
Used by + q2-debug, q2-slides, and the WASM demos (kanban, hub-react-todo). +- **q2-preview**: `renderPageInProjectWithAttribution(path, …)` + output (post-q2-preview-pipeline AST). Used by ReactPreview's + q2-preview path and the q2-preview SPA. ## Scope ### In scope -- `preimage_in` accessor on `SourceInfo` (in `quarto-source-map`). Walks - Substring's `parent`, Concat's `pieces`, Derived's `from`. Returns - `Some(byte_range)` if the chain resolves to an `Original` in the target - file, else `None`. -- `coarsen` rules. Two new entry variants (`Transparent`, `Omit`) plus - **soft-drop substitution logic** for atomic content: - - **Verbatim**: KeepBefore + `preimage_in` resolves into target file. - Today's behavior, generalized via `preimage_in` to work on Derived - chains too. - - **Transparent (recurse)**: KeepBefore + Synthetic source_info + block - has children with recoverable preimages. Recurse on children, produce - a child-entry list. Wrapper itself emits nothing. Handles Sectionize. - - **Omit**: KeepBefore + atomic-Synthetic node, OR KeepBefore + Synthetic - with no recoverable children. The node is dropped from output; the - next pipeline run regenerates it from baseline content. Used for - filter-constructed leaves and the rare structurally-stable Synthetic - leaf. - - **Rewrite**: UseAfter or non-atomic Recurse-with-changes. Today's - behavior. Includes the let-user-win case for block-level UseAfter - on atomic nodes (see §"The coarsen logic" — atomicity does NOT - block this path; the qmd writer's CustomNode arms know how to write - fresh atomic CustomNodes from `plain_data`). - - **InlineSplice**: today's behavior, extended with the multi-inline - Derived dedupe rule and the **inline-level soft-drop substitution** - described below. -- **Soft-drop substitutions** for the bad-edit cases. 
Coarsen detects - these and **substitutes a safe alignment** rather than aborting the - whole write: - - **Inline-level UseAfter on a Derived inline** (user retyped resolved - shortcode text): substitute KeepBefore for that one inline within - the surrounding `InlineReconciliationPlan`. The rest of the inline - plan continues as-is. Emit a `Q-3-42` warning into the warnings - sink describing what was reverted. - - **Block-level RecurseIntoContainer on an atomic CustomNode** (user - edited inside an include): substitute KeepBefore for the wrapper. - The wrapper's source_info points at the parent-file include token - (Plan 8); Verbatim copy preserves it. Inner edits never reach the - qmd writer's CustomNode arm. Emit a `Q-3-43` warning. - - **Block-level UseAfter on an atomic node** (user replaced or - deleted an atomic block via React): **let-user-win** — keep as - Rewrite. The new block goes through the qmd writer's normal arms - (Plan 8's IncludeExpansion arm reads `plain_data["source_path"]` - and emits `{{< include … >}}` from a fresh user-edit-tagged - CustomNode just as cleanly as from a pipeline-emitted one). No - warning — the user explicitly chose this. -- **No `AtomicViolation` variant**. The previous design had coarsen - produce an `AtomicViolation` entry that caused `incremental_write` to - return `Err`. Under soft-drop, every bad-edit case has a safe - substitution, so `AtomicViolation` is unnecessary. The writer's - return type stays `Result<(String, Vec<DiagnosticMessage>), Vec<DiagnosticMessage>>`-shaped - (see "Warning channel mechanism" below); `Ok` carries the saved qmd - plus any soft-drop warnings. -- **Warning channel mechanism**: `coarsen` accepts a - `&mut Vec<DiagnosticMessage>` warning sink as a parameter. Soft-drop - substitutions push warnings into the sink. The top-level - `incremental_write` returns `Ok((String, Vec<DiagnosticMessage>))` - when no fatal error occurs (warnings can be present), and `Err` only - for true write failures (UTF-8 errors, qmd writer panics on - malformed input — same as today).
The hub-client's `RenderResponse` - already carries a `warnings: [...]` field (Plan 1's pipeline - diagnostics use it); soft-drop warnings flow through the same path. -- **Diagnostic codes** (per the Q-3 conventions; see - `crates/quarto-error-reporting/src/error_catalog.json`): - - `Q-3-42` — "Shortcode edit dropped". Emitted when an inline-level - edit to Derived content was substituted by KeepBefore. Body: - affected inline's Derived `by.kind` and resolved-to text, plus the - shortcode token's source range so editor UIs can highlight it. - - `Q-3-43` — "Include block edit dropped". Emitted when a - block-level RecurseIntoContainer on an atomic CustomNode was - substituted by KeepBefore. Body: the include's `source_path` from - `plain_data`, plus the wrapper's source range. Actionable message: - "to edit this content, open `` directly." - Both are `DiagnosticKind::Warning`. No new structural fields on - `DiagnosticMessage` — discriminants are in the code+notes. -- `is_atomic_custom_node` registry, defined in **`quarto-core`** as - `pub const ATOMIC_CUSTOM_NODES: &[&str]` plus - `pub fn is_atomic_custom_node(type_name: &str) -> bool`. Plan 7 - ships the **Rust side** (writer in `pampa` consumes it; Plan 8 - extends the const to add `IncludeExpansion`). The **TypeScript - hand-mirror** at `hub-client/src/utils/atomicCustomNodes.ts` ships - with **Plan 2A** because Plan 2B is the first consumer (atomic-aware - `setLocalAst` gating in the dispatcher); ownership was reassigned - during the 2026-05-06 review session. The TS file's header comment - documents the sync convention — both sides are kept aligned via - doc comments + code review (no codegen). This matches the codebase's - existing pattern for cross-language type pairs (e.g., - `hub-client/src/types/intelligence.ts` mirrors `quarto-lsp-core` - types this way; `hub-client/src/types/diagnostic.ts` mirrors - `DiagnosticMessage`). 
Initial set: - `["IncludeExpansion", "CrossrefResolvedRef"]` (Plan 8 adds - `IncludeExpansion` to both sides). Note: `ShortcodeResolution` is - NOT in this set — shortcode atomicity is handled via the `Derived` - source_info path, not via a wrapper. - - **Migration path for extension-contributed atomic types**: the - hand-mirror is the right shape for built-ins. Extension-contributed - atomic types (a future plan; see §Open questions - "is_atomic_custom_node lookup — extension forward-compat") will - replace the JS const with a `wasm_bindgen` runtime lookup populated - per-render from loaded extensions. The migration changes the JS - data source but not the React-side dispatch logic — components - continue to call `isAtomicCustomNode(typeName)`; the function's - implementation switches from a const lookup to a context lookup. -- `assemble`: - - Walks Transparent entries by emitting each child's bytes with - separators computed from the children's original positions. - - Omit entries contribute nothing to the output (the original - Synthetic node is dropped; baseline regenerates next pipeline run). - - Inline-level dedupe: within an inline-splice or inline-assembly run, - detect consecutive inlines sharing the same Derived `from` and emit - one Verbatim (the from's preimage range) instead of N. - - No AtomicViolation handling — soft-drop substitutions happened in - coarsen; assemble sees only safe entries. -- `pipeline_kind` parameter added to `incremental_write_qmd`. When - `Some("preview")`: - - Re-parses `original_qmd` (today's behavior). - - **Runs the q2-preview transform pipeline on the baseline** (this is the - NEW step). Produces a baseline AST at the same pipeline tier as the - live AST. - - Reconciles new vs baseline. - - Writes via the updated coarsen/assemble logic. -- Lift the `handleSetAst` read-only guard in `ReactPreview.tsx` introduced - in Plan 1. Wire `setLocalAst` through with `pipeline_kind: "preview"`. 
-- **Verify: structured pipeline dispatch is already in place - (Plan 1 commits `a7143cc7` + `60658a4e` + `a5e00b20`).** Plan 1's - §"Multi-plan contract: cleanup owed to Plan 7" originally framed - this as scaffolding Plan 7 would refactor, but Plan 1 implemented - the structured form directly: - 1. `AstTransformsStage::run()` reads `ctx.format.pipeline_kind` - (the `Option<&'static str>` field on `Format`) and dispatches - to `build_q2_preview_transform_pipeline` when it equals - `Some("preview")`. - 2. `ReactPreview.tsx::doRender` dispatches via - `pipelineKindForFormat(format)` from - `hub-client/src/utils/pipelineKind.ts`, returning `'preview'` - for q2-preview and `undefined` for everything else. - Plan 7 therefore has no render-side cleanup work. During Plan 7 - implementation, **verify the write-side parameter threads through - the same selector**: Plan 7's new `pipeline_kind: Option` - argument on `incremental_write_qmd` (§Scope item below) should be - populated at the JS call site by `pipelineKindForFormat(format)` - and threaded through `wasmRenderer.ts::incrementalWriteQmd` to - the WASM boundary. Internally, the string maps to the same - `pipeline_kind` value the render side already uses. +#### `preimage_in` accessor (in `quarto-source-map`) -### Out of scope +```rust +impl SourceInfo { + pub fn preimage_in(&self, target: FileId) -> Option>; +} +``` -- Include round-trip via wrapper-CustomNode (Plan 8 — uses this plan's - atomic-detection + soft-drop logic but introduces the wrapper itself). -- Engine output as Derived (deferred future work). -- Editable CustomNode slots (e.g., editing a Callout's title and body - through React with edits round-tripping back to source). See - `claude-notes/research/2026-05-05-editable-custom-nodes.md`. -- Promoting the qmd writer to a fallible `Result` interface throughout. 
- Soft-drop semantics make this unnecessary for q2-preview; the - remaining panic paths are debug assertions for genuine programming - errors (e.g., `unreachable!()` in Plan 8's qmd-writer arm for atomic - CustomNodes in non-Verbatim paths), not user-facing failure modes. - -## Design decisions (settled in conversation) - -- **Sectionize's transparent recurse pattern**: `Synthetic` wrappers with - source-bearing children get the Transparent treatment. Children's bytes - are contiguous in source (Sectionize doesn't reorder), so emitting them - in order produces the right output. The wrapper emits nothing. -- **`FootnotesTransform` and `AppendixStructureTransform` containers also fit - the Transparent pattern.** Plan 2B's audit added both transforms to the - q2-preview pipeline. Their synthesized container Divs (`
`, - `
`) have no source preimage, but their children - carry source_info from the user-typed footnote content / user-defined - `:::{.appendix}` blocks. Same Transparent treatment as Sectionize. - Worth noting: `FootnotesTransform`'s synthesized `` markers are NOT - pure Synthetic — `create_footnote_ref` at `crates/quarto-core/src/transforms/footnotes.rs:440-460` - clones source_info from the original `Note` inline, so the markers carry - the same byte range as the user's `^[footnote text]` syntax. Round-trip-friendly - as `Original` without extra writer work; only the bare `
` - wrapper is the Transparent case. -- **Atomic detection has three paths** (all converging through the same - `is_atomic` helper): - 1. **Derived source_info** (shortcode resolutions). Any node whose - `source_info` is `Derived` is atomic. - 2. **Atomic Synthetic source_info** (filter constructions, title-block - synthesis, tree-sitter postprocess space, etc.). Detected via - `By::is_atomic_synthesizer()` (Plan 4 method on the `By` struct, - keyed off `by.kind`). - 3. **Atomic CustomNode types** (IncludeExpansion, CrossrefResolvedRef). - Looked up via `is_atomic_custom_node(&type_name) -> bool`. -- **Why three paths**: shortcode resolutions and filter constructions - don't get wrappers (wrappers are too heavy for non-cross-file cases); - they propagate atomicity via source_info shape. Includes use a - wrapper because of the cross-file FileId issue (the included blocks - live in another file; we need an anchor in the parent file). -- **Soft-drop, not abort**: bad-edit cases substitute a safe alignment - in coarsen and emit a warning rather than aborting the entire write. - The user's other (valid) edits go through; the bad edit is reverted - to KeepBefore (or KeepBefore-equivalent for inline-level cases). - Reasoning: the React side (Plan 2B) is the primary safeguard via - read-only enforcement; the writer is the contract guarantor; if both - are correct the warning channel rarely fires; if React has a hole the - writer protects without losing the user's session. "Edit cannot apply" - is honored (the bad edit doesn't reach source); "edit cannot apply - silently" is not (a Q-3-42/Q-3-43 warning surfaces in the diagnostic - panel). -- **Let-user-win for block-level UseAfter on atomic** (user replaced - or deleted an atomic block via React). Coarsen does NOT substitute - here; the new block goes through Rewrite via the qmd writer. 
The - qmd writer's CustomNode arms know how to write fresh atomic types - from `plain_data` (Plan 8's IncludeExpansion arm reads - `plain_data["source_path"]`). This composes naturally — a fresh - user-edit-tagged IncludeExpansion serializes the same way as a - pipeline-emitted one. No warning; the user's intent is clear. -- **Multi-inline shortcode dedupe**: a multi-inline shortcode resolution - produces several inlines all sharing the same Derived `from`. The - writer's inline-assembly path needs to detect this and emit Verbatim - *once* for the group. Without this, the assembly emits the shortcode - token N times. -- **Param-with-default for `incremental_write_qmd`** (Decision D): add a - `pipeline_kind: Option` parameter. `None` = current behavior - (parse-only baseline). `Some("preview")` = run q2-preview pipeline on - baseline. Existing callers (q2-debug demos, sync client, ReactPreview's - q2-debug path) continue to work unchanged. - -## The coarsen logic +Walks Substring's `parent`, Concat's `pieces`, Generated's +`Invocation` anchor (via `invocation_anchor()`). Returns +`Some(byte_range)` if the chain resolves to an `Original` in the +target file, else `None`. -``` -fn is_atomic(node) -> bool { - match node.source_info() { - SourceInfo::Derived { .. } => true, - SourceInfo::Synthetic { by } if by.is_atomic_synthesizer() => true, - _ => {} +**`Invocation` is the only role consulted.** `ValueSource` (Plan 9) +and `Dispatch` (Plan 10) are diagnostic-only. `AnchorRole::Other` +roles are also not walked. This is the binary asymmetry contract: +copying bytes from a `ValueSource` source range would emit raw YAML +metadata into the body — a hard correctness bug. The contract is +documented on `preimage_in` and on `AnchorRole::Other`'s doc-comment. + +Future anchor roles default to non-walked unless they're explicitly +added to `preimage_in`'s implementation. 
Extensions introducing +`AnchorRole::Other("…")` should treat this as a feature: their +attribution data isn't accidentally consulted by the writer. + +#### Unified editability predicate + +The same predicate gates two surfaces: Plan 2A's React read-only +check (preventing the user from typing into uneditable regions in +the first place) and the writer's soft-drop logic (the contract +guarantor if React has a hole). + +```rust +fn is_editable_inside(node: &Node, target_file_id: FileId) -> bool { + // Atomic CustomNodes (IncludeExpansion, CrossrefResolvedRef): + // single replaceable units, not editable inside. The user can + // replace them wholesale via a component menu; they can't type + // inside them. + if let Node::Block(Block::Custom(cn)) = node + && is_atomic_custom_node(&cn.type_name) + { + return false; } - match node { - Block::Custom(cn) if is_atomic_custom_node(&cn.type_name) => true, - _ => false, + // Atomic-kind Generated source_info (shortcode, filter, + // title-block, tree-sitter-postprocess): pipeline-emitted + // content whose user-source is the invocation token, not the + // resolved text. + if let SourceInfo::Generated { by, .. } = node.source_info() + && by.is_atomic_kind() + { + return false; } + // Catch-all: editable iff the region has byte-traceable preimage + // in the target file. This covers: + // - Original in target: editable. ✓ + // - Original / Substring rooted outside target: not editable. + // - Generated with Invocation anchor pointing into target: + // editable IFF non-atomic kind (handled above; this branch + // never sees atomic-kind Generated). + // - Generated with empty anchors (sectionize, footnotes, + // appendix containers): not editable — preimage_in returns + // None. + // - Generated with only ValueSource / Dispatch anchors + // (Plan 9/10 shapes): not editable — preimage_in walks + // Invocation only. 
+ node.source_info().preimage_in(target_file_id).is_some() } +``` + +The catch-all clause is the change Plan 7 introduces over earlier +drafts. Non-atomic synthesized containers (sectionize wrappers, +footnotes container, appendix container) are now classified as +non-editable on both surfaces. Edits to them via React go through +the writer's soft-drop path; the React side classifies the region +as read-only and shows the user no edit affordance. + +#### `coarsen` rules — two new entry variants plus soft-drop + +`CoarsenedEntry` gains two variants alongside today's `Verbatim`, +`Rewrite`, and `InlineSplice`: + +- **`Transparent`**: KeepBefore on a `Generated` wrapper with empty + anchors AND non-atomic kind AND source-bearing children. Recurses + on the children, producing a child-entry list. The wrapper itself + emits nothing. Handles Sectionize, footnotes-container, + appendix-container. +- **`Omit`**: KeepBefore on an atomic-kind `Generated` node with no + Invocation anchor (filter-constructed leaves, title-block h1, + tree-sitter postprocess space), OR on a non-atomic `Generated` + with no children. The node is dropped from output; the next + pipeline run regenerates it. + +Soft-drop substitutions cover the bad-edit cases. Each substitutes +a safe alignment in coarsen and emits a warning rather than +aborting the entire write: + +- **Inline-level UseAfter on a region where `is_editable_inside` + returns false** (typically: user retyped resolved shortcode + text): substitute KeepBefore for that one inline within the + surrounding `InlineReconciliationPlan`. The rest of the inline + plan continues as-is. Emit a `Q-3-42` warning. +- **Block-level RecurseIntoContainer on a region where + `is_editable_inside` returns false** (user edited inside an + include, OR inside a synthesized-from-metadata container): + substitute KeepBefore for the wrapper. 
For an atomic CustomNode + (include), the wrapper's `source_info` is Original pointing at + the include token; Verbatim copy preserves it. For a no-preimage + `Generated` container, the substitution lands in `Omit` — the + container regenerates next pipeline run. Either way, inner edits + never reach the qmd writer's arm. Emit a `Q-3-43` warning. +- **Block-level UseAfter on a region where `is_editable_inside` + returns false but the node is an atomic CustomNode** (user + replaced or deleted an atomic block via React's component menu): + **let-user-win** — keep as Rewrite. The qmd writer's CustomNode + arm reads `plain_data` and emits the include syntax from a fresh + user-edit-tagged CustomNode. No warning — the menu is the + affordance the user took; the intent is unambiguous. +- **Block-level UseAfter on a region where `is_editable_inside` + returns false and the node has no preimage** (user replaced a + synthesized-from-metadata container via React): soft-drop — + there's no source byte range to anchor a Rewrite at. Substitute + Omit; the original container regenerates next pipeline run. + Emit a `Q-3-43` warning. + +Earlier drafts had an `AtomicViolation` variant that caused +`incremental_write` to return `Err`. Soft-drop replaces it: every +bad-edit case has a safe substitution, so `AtomicViolation` is +unnecessary. The writer's return type carries warnings alongside +the saved qmd, not as fatal errors. + +**Writer return type after Plan 7.** `incremental_write` returns +`Result<(String, Vec<DiagnosticMessage>), Vec<DiagnosticMessage>>`. +`Ok((qmd, warnings))` carries the qmd plus the soft-drop warnings +collected by `coarsen`. `Err(diags)` keeps its existing meaning: +qmd-writer failures that bubble up via `?` from the underlying +serializer (e.g. UTF-8 validation in `write_inline_to_string` at +`incremental.rs:813`). The WASM bridge maps `Err` to `{ success: +false, error: "Incremental write failed: ..." }` unchanged from +today.
`compute_incremental_edits` takes the same shape: +`Result<(Vec, Vec), Vec>`. + +**Programmer errors do not flow through `Result`.** Invariant +violations — Plan-6-stamper bugs, structurally impossible +reconciliation states, post-coarsen contract violations — are +`panic!()` / `unreachable!("...")` / `debug_assert!()` inline. +This is the idiomatic q2 pattern: see existing uses at +`incremental.rs:825`, throughout `pampa/src/writers/json_stream.rs`, +`pampa/src/writers/html.rs:1188`, `pampa/src/writers/ansi.rs:223`, +and 10+ sites across `pampa/src/writers/json.rs`. The WASM bridge +already installs `console_error_panic_hook` at module init +(`wasm-quarto-hub-client/src/lib.rs:115`), so an in-process panic +surfaces as a JS exception with full stack trace — loud, immediate, +and the surface we want for "this should never happen." No +`WriterError` enum is introduced. + +#### Coarsen pseudo-code + +``` +fn coarsen(...) -> Vec: For each block alignment from the reconciler: if alignment is KeepBefore(orig_idx): - let original_block = original_ast.blocks[orig_idx]; - if let Some(range) = original_block.source_info().preimage_in(target_file) { - // Includes the atomic case (Derived + KeepBefore): Verbatim copy - // of the preimage. preimage_in walks Derived chains to the from. + let block = original_ast.blocks[orig_idx]; + if let Some(range) = block.source_info().preimage_in(target_file) { + // Original / Substring / Concat-contiguous / Generated-via- + // Invocation-anchor: all resolve here uniformly. Atomic-kind + // shortcode case lands here too — its Invocation anchor + // resolves to the token bytes. CoarsenedEntry::Verbatim { byte_range: range, orig_idx } } - else if matches!(original_block.source_info(), SourceInfo::Synthetic { by }) - && by.is_atomic_synthesizer() + else if matches!(block.source_info(), SourceInfo::Generated { by, .. }) + && by.is_atomic_kind() { - // Atomic Synthetic with no preimage (filter construction etc.). 
+ // Atomic-kind Generated with no Invocation anchor (filter + // construction, title-block, tree-sitter-postprocess). // Drop from output; baseline regenerates next pipeline run. + // + // Belt-and-suspenders enforcement of Plan 4's required-anchor + // invariant for shortcode: a shortcode-Generated without an + // Invocation anchor would mean silent data loss. + debug_assert!( + !by.is_kind("shortcode"), + "Generated {{ by: shortcode, from: [] }} reached the writer — \ + Plan 6's stamper must always attach an Invocation anchor \ + for shortcode resolutions." + ); CoarsenedEntry::Omit } - else if matches!(original_block.source_info(), SourceInfo::Synthetic { .. }) - && original_block has children + else if matches!(block.source_info(), SourceInfo::Generated { .. }) + && block has source-bearing children { - // Non-atomic Synthetic wrapper (Sectionize etc.) — Transparent recurse. + // Non-atomic Generated wrapper (Sectionize, footnotes-container, + // appendix-container) with source-bearing children: Transparent + // recurse. CoarsenedEntry::Transparent { child_entries: } } else { - // Synthetic with no children, or some other shape with no preimage. - CoarsenedEntry::Omit + // Catch-all: KeepBefore with no preimage and no Generated-cascade + // shape that maps to Omit or Transparent. Examples: cross-file + // Original (no Plan-8 wrapper yet), Substring chain rooted outside + // target. (Gappy Concat is structurally impossible from in-repo + // callers — see §Open questions — but the catch-all is the safe + // fallback if a malformed JSON ingested via WASM produces one.) + // Fall back to Rewrite — re-serialize the + // unchanged block through the qmd writer. Lossy at the byte level + // (whitespace, formatting may shuffle) but preserves content. The + // earlier draft routed these to Omit; that path was data-loss-shaped + // and should never reach the writer. 
+ // + // The reconciler's KeepBefore alignment ties orig_idx to a specific + // new-side block (they were classified structurally equal). The + // catch-all serializes that aligned new-side block — equivalently + // the original-side block, since they compare equal — so the + // existing `Rewrite { new_idx }` variant fits without modification. + // Coarsen looks up the aligned new_idx from the plan; no separate + // variant or field is needed. + CoarsenedEntry::Rewrite { new_idx: aligned_new_idx } } if alignment is UseAfter(new_idx): - // Let user win — including for atomic types. The qmd writer's - // CustomNode arms know how to write fresh atomic CustomNodes from - // plain_data (Plan 8's IncludeExpansion arm reads source_path). - // No atomic check here; trust the alignment. - CoarsenedEntry::Rewrite { new_idx } + let new_block = new_ast.blocks[new_idx]; + let was_atomic_custom_node = matches!(&new_block, Block::Custom(cn) + if is_atomic_custom_node(&cn.type_name)); + let was_no_preimage_generated = matches!(new_block.source_info(), + SourceInfo::Generated { .. }) + && new_block.source_info().preimage_in(target_file).is_none(); + + if !was_atomic_custom_node && was_no_preimage_generated { + // User replaced a synthesized-from-metadata container wholesale. + // No source position to anchor at; can't Rewrite. Soft-drop. + warnings.push(diagnostic_q3_43_widened(new_block)); + CoarsenedEntry::Omit + } else { + // Let user win — including for atomic CustomNodes (the user + // replaced an include via the component menu; the qmd writer's + // CustomNode arm handles this). + CoarsenedEntry::Rewrite { new_idx } + } if alignment is RecurseIntoContainer { before_idx, after_idx }: - let original_block = original_ast.blocks[before_idx]; - if is_atomic(original_block) { - // SOFT-DROP: inner edits to an atomic block are reverted. - // Substitute KeepBefore — Verbatim copy of the wrapper's preimage. 
- warnings.push(diagnostic_q3_43(original_block)); - if let Some(range) = original_block.source_info().preimage_in(target_file) { + let block = original_ast.blocks[before_idx]; + if !is_editable_inside(block, target_file) { + // Inner edits to a non-editable container are reverted. + warnings.push(diagnostic_q3_43(block)); + if let Some(range) = block.source_info().preimage_in(target_file) { + // Atomic CustomNode with preimage (include token): Verbatim. CoarsenedEntry::Verbatim { byte_range: range, orig_idx: before_idx } } else { - // Atomic node lacks a preimage in target — extremely unusual. - // Substitute Omit; warning already pushed. + // No-preimage container (synthesized): Omit; regenerates next run. CoarsenedEntry::Omit } } else { // Existing recurse logic for inline plans, custom_node_plans, etc. - // The inline-plan-walking step has its own soft-drop substitution + // Inline-plan-walking has its own soft-drop substitution // (see "Inline-level soft-drop" below). ... } ``` -**Inline-level soft-drop** (applied during `assemble_inline_content` and -when constructing the inline plan for InlineSplice): +#### Inline-level soft-drop + +Applied during `assemble_inline_content` and when constructing the +inline plan for `InlineSplice`: ``` For each inline alignment in plan.inline_alignments: -if alignment is UseAfter(new_idx) and is_atomic(new_inlines[new_idx]): - // User retyped over a Derived inline (shortcode resolution). - // Substitute KeepBefore for the corresponding original inline. - warnings.push(diagnostic_q3_42(new_inlines[new_idx])); - treat as KeepBefore() +if alignment is UseAfter(new_idx) and !is_editable_inside(orig_inlines[before_idx], target): + // User retyped over a non-editable inline (typically: shortcode + // resolution). Substitute KeepBefore for the original inline at + // before_idx — the position the alignment already names. 
The + // earlier draft suggested matching the *new* inline's Invocation + // anchor against original-side anchors, but user-edit inlines + // don't carry Invocation anchors so there'd be nothing to match. + warnings.push(diagnostic_q3_42(orig_inlines[before_idx])); + treat as KeepBefore(before_idx) -if alignment is RecurseIntoContainer and the original inline is_atomic: - // Same shape as the block-level recurse-on-atomic case. +if alignment is RecurseIntoContainer and !is_editable_inside(orig_inlines[before_idx], target): warnings.push(diagnostic_q3_42(orig_inlines[before_idx])); treat as KeepBefore(before_idx) ``` -The "corresponding original index" for inline-level UseAfter substitution -is the index in `orig_inlines` whose Derived `from` matches the new inline's -`from`. In the multi-inline shortcode case, multiple original inlines share -the same `from`; any of them produces the right Verbatim result (they all -preimage to the same shortcode token bytes, which the dedupe rule emits -once anyway). +#### `assemble` updates + +- **Transparent entries** emit each child's bytes with separators + computed from the children's original positions. The wrapper + itself contributes nothing. +- **Omit entries** contribute nothing to output. The original + `Generated` node is dropped; baseline regenerates next pipeline + run. +- **Multi-inline dedupe**: within an inline-splice or inline-assembly + run, detect consecutive `KeepBefore` entries whose inlines' + `Invocation` anchors are structurally equal (compared via + `PartialEq` on the anchor's `source_info` — `SourceInfo` derives + `PartialEq`, so value equality across the full chain). Emit + Verbatim *once* for the group, using the anchor's preimage byte + range. Without dedupe, a multi-inline shortcode resolution like + `**Bold** Title` → `[Strong[Str], Space, Str]` would emit the + shortcode token N times. 
+- No `AtomicViolation` handling — soft-drop substitutions happened + in coarsen; `assemble` sees only safe entries. + +#### `incremental_write_qmd` signature change + +Today: +```rust +pub fn incremental_write_qmd(original_qmd: &str, new_ast_json: &str) -> String; +``` -The `assemble` step iterates coarsened entries: +After Plan 7: +```rust +pub fn incremental_write_qmd( + original_qmd: &str, + baseline_ast_json: &str, + new_ast_json: &str, +) -> String; // JSON: { success, qmd, warnings, error?, diagnostics? } +``` -- Verbatim → copy byte range from `original_qmd`. -- Rewrite → use the qmd writer to serialize the new block. -- InlineSplice → existing splice logic, extended with (a) the - multi-inline Derived dedupe rule and (b) inline-level soft-drop - substitutions before assembly. -- Transparent → emit children's bytes recursively. -- Omit → skip (contribute nothing to output). +The third positional argument (`baseline_ast_json`) is the +caller-supplied baseline AST at the same pipeline tier as +`new_ast_json`. The writer no longer parses `original_qmd` to +synthesize a baseline; it uses the caller-supplied one. This makes +the writer pipeline-agnostic: it diffs the two ASTs it's given and +writes accordingly. + +The TS wrapper at `ts-packages/preview-runtime/src/wasmRenderer.ts` +mirrors the signature change: `incrementalWriteQmd(originalQmd, +baselineAst, newAst): { qmd, warnings }` (today: `(originalQmd, +newAst): string`). + +No `pipeline_kind` parameter. The pipeline tier is implicit in +whichever baseline AST the caller passes. + +#### Warning channel mechanism + +`coarsen` accepts a `&mut Vec` warning sink as +a parameter. Soft-drop substitutions push warnings into the sink. +The WASM bridge serializes the warnings into the response JSON's +existing `warnings` field (already present on `AstResponse`; today +always `None` for `incremental_write_qmd`). The TS wrapper returns +`{ qmd, warnings }`. 
The hub-client's existing diagnostic collation +(`ReactPreview.tsx::allDiagnostics`, `Editor::diagnosticsToMarkers`) +displays soft-drop warnings the same way it displays pipeline +diagnostics — as Monaco squiggles for located warnings, and as +the `.diagnostics-banner` for unlocated. + +#### Diagnostic codes + +Three codes, per the Q-3 conventions in +`crates/quarto-error-reporting/error_catalog.json`: + +- **`Q-3-41` — "Edit dropped — render not ready yet".** Emitted + when the user makes an edit before the first successful render + has produced a baseline AST (`ast === ''` on hub-client, + `astJson === null` on the SPA). Body: imperative — "Your edit + was dropped because the document hasn't finished rendering. Try + again in a moment." Warning severity. No source range (the + edit is pre-render, so there's no rendered DOM to point at). + Suppress-after-3 still applies by code rather than range. + **Implementation deferred to Plan 7c** (Phases 1 + 3) — the + catalog entry and TS-side emission did not ship with Plan 7's + Rust work; the writer code path Plan 7 shipped never reaches the + no-baseline case (the bridge intercepts before invocation). + +- **`Q-3-42` — "Shortcode edit dropped".** Emitted when an + inline-level edit to shortcode-resolved (or other atomic-Generated) + content was substituted by KeepBefore. Body: the affected inline's + text and the source range of the invocation token (from the + `Invocation` anchor) so editor UIs can highlight it. + +- **`Q-3-43` — "Generated content edit dropped".** Three emission + paths, sharing the same code and structural shape: + - Block-level RecurseIntoContainer on an atomic CustomNode + (Plan 8's `IncludeExpansion`): body names the include's + `source_path` from `plain_data`. Message: "To edit this content, + open `<source_path>` directly."
+ - Block-level RecurseIntoContainer on a no-preimage Generated + container (synthesized appendix / footnotes container after + Plan 9 stamps ValueSource anchors): body names the metadata + key when available. Message: "This content is generated from + metadata; edit `_quarto.yml` / frontmatter to change it." + - Block-level UseAfter on a no-preimage Generated container: + same body as the previous case. + +`Q-3-42` and `Q-3-43` are both `DiagnosticKind::Warning`; both carry source ranges +(the wrapper's preimage range when available, else the surrounding +block's range), so they squiggle naturally in Monaco. + +**Catalog mechanics** (verified). Each Q-* code in +`error_catalog.json` carries one static `message_template` plus +title / subsystem / docs_url. Per-call-site body text uses the +existing `DiagnosticMessageBuilder` API +(`crates/quarto-error-reporting/src/builder.rs`): -The function returns `Ok((String, Vec<DiagnosticMessage>))` carrying the -saved qmd plus any soft-drop warnings that fired during coarsen. It only -returns `Err` for genuine write failures (UTF-8 errors, qmd writer failures -on malformed input — same as today's writer). +```rust +DiagnosticMessageBuilder::warning("Generated content edit dropped") + .with_code("Q-3-43") + .problem(format!("To edit this content, open `{}` directly.", + source_path)) + .add_hint("...") + .build() +``` -## Multi-inline shortcode dedupe +The catalog entry provides one generic `message_template`; the +three emission paths supply their distinct text via the builder. +**No template-able-body infrastructure needed** — the existing +builder API already covers it. Phase 3 ships one catalog entry per +code and three builder helper functions (`diagnostic_q3_42`, +`diagnostic_q3_43_include`, `diagnostic_q3_43_metadata`).
-When `{{< meta foo >}}` resolves to multiple inlines (e.g., metadata is -markdown like `**Bold** Title` → `[Strong[Str], Space, Str]`), each -resolved inline has the same `Derived { from: Original{shortcode_range}, -by: By::shortcode("meta") }` source_info. +#### `is_atomic_custom_node` registry -Block-level: if both pipeline runs produce the same multi-inline output, -the surrounding Para is structurally identical → KeepBefore at block -level → Verbatim copy of the WHOLE Para's bytes (including the shortcode -token). One copy. ✓ +Defined in `quarto-core` as: +```rust +pub const ATOMIC_CUSTOM_NODES: &[&str] = &["CrossrefResolvedRef"]; +pub fn is_atomic_custom_node(type_name: &str) -> bool; +``` -Inline-level recursion (when the user edits something else in the same -Para): the reconciler picks `RecurseIntoContainer` with an inline plan. -Each shortcode-derived inline is `KeepBefore` individually. Without -dedupe, each one's Verbatim emits the shortcode token → N copies in -output. +Plan 7 ships the Rust side. The TypeScript hand-mirror at +`ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` +already exists (Plan 2A shipped it with `CrossrefResolvedRef`, +ahead of the Rust source-of-truth). From Plan 7 onward, the +lockstep convention applies: when one side changes, the other +must too — same discipline as `By::is_atomic_kind()` ↔ +`ATOMIC_KINDS` (`ts-packages/preview-renderer/src/utils/sourceInfo.ts:54`) +and `DiagnosticMessage` ↔ `types/diagnostic.ts`. Plan 8 adds +`IncludeExpansion` to both sides. + +Extensions that need to contribute atomic types use a future +registration mechanism (see §Open questions); the const set +covers built-ins. + +#### Hub-client integration + +**Scope clarification: first-demo UX.** Plan 7 lifts the coarse +`pipelineKindForFormat(format) === 'preview'` read-only guard at +`ReactPreview.tsx:429-440` and replaces it with the writer's +soft-drop path. 
The writer's Q-3-42 / Q-3-43 diagnostics are the +user-facing safety net for the first demo — bad edits don't reach +source, and the user sees a warning. A fine-grained React-side +gate (greying out the affordance per region via the +`is_editable_inside` predicate consulted from JS) is **deferred** +to a future frontend pass. For the first demo, the experience is +"you can type, but it doesn't take, and you see a warning"; that +is the deliverable. Plan 2A's existing atomic-CustomNode gate +continues to prevent the most surprising cases (editing inside +includes) without further work. + +- Lift the `handleSetAst` read-only guard in `ReactPreview.tsx:429-440` + introduced in Plan 1. Wire `setLocalAst` through with the current + `ast` state as the baseline; merge the returned warnings into the + diagnostics flow. + + Concrete shape: + ```ts + const writeBackWarningsRef = useRef([]); + const lastRenderDiagnosticsRef = useRef([]); + + const handleSetAst = useCallback((newAst) => { + if (ast === '') { + // First-edit-before-render — see §Plan-7-specific decisions + writeBackWarningsRef.current = applySuppressByRange([ + ...writeBackWarningsRef.current, + diagnosticQ3_41(), + ]); + onDiagnosticsChange([ + ...lastRenderDiagnosticsRef.current, + ...writeBackWarningsRef.current, + ]); + return; + } + const { qmd, warnings } = incrementalWriteQmd( + content, ast, JSON.stringify(newAst), + ); + writeBackWarningsRef.current = applySuppressByRange([ + ...writeBackWarningsRef.current, + ...warnings, + ]); + onDiagnosticsChange([ + ...lastRenderDiagnosticsRef.current, + ...writeBackWarningsRef.current, + ]); + onContentRewrite(qmd); + }, [content, ast, onContentRewrite, onDiagnosticsChange]); + ``` + + Lifecycle. `lastRenderDiagnosticsRef.current` is set inside + `doRenderWithStateManagement` (the existing render-result callback + at `ReactPreview.tsx:354`) on every render completion. 
On the + **success** branch, `writeBackWarningsRef.current = []` — the + regenerated AST replaces the stale baseline, so warnings about + edits against the previous baseline are no longer current. On + the **failure** branch, the previous write-back warnings persist + (the user's edit was real; the render error doesn't invalidate + the warning). + + Suppress-after-3-by-source-range (`applySuppressByRange`) runs at + the merge site, not in the writer — the writer is policy-free. + Monaco squiggles remain as the persistent signal for repeated + edits over the same range. + + The `ast` state already holds the previously-rendered + post-pipeline AST (set by the regular render effect on every + successful render). No new caching mechanism is required; React's + `useState` is the cache. + +#### q2 preview SPA integration + +- Replace `noopSetAst` at `q2-preview-spa/src/PreviewApp.tsx:241` + with a real handler. The baseline AST is the SPA's + currently-displayed AST — `state.astJson` in `PreviewAppState`, + set on every successful render at `PreviewApp.tsx:530`. The + `content` argument is read via `getFileContent(state.activeFile)` + from `@quarto/preview-runtime/automergeSync` (verified to exist + at `ts-packages/preview-runtime/src/automergeSync.ts:177`) — the + SPA holds no local `content` state because automerge is the + source-of-truth. 
Skeleton: + ```ts + const handleSetAst = useCallback(async (newAst) => { + if (!state.activeFile || state.astJson === null) { + // First-edit-before-render — emit Q-3-41 into DiagnosticStrip + pushDiagnostics([diagnosticQ3_41()]); + return; + } + const content = getFileContent(state.activeFile); + if (content === null) return; // file gone / binary; defensive + const { qmd, warnings } = incrementalWriteQmd( + content, state.astJson, JSON.stringify(newAst), + ); + const hash = await computeHash(qmd); + echoHashRef.current = { path: state.activeFile, hash }; + syncClient.updateFileContent(state.activeFile, qmd); + pushDiagnostics(warnings); + }, [state.activeFile, state.astJson]); + ``` +- Add **content-match echo-prevention** in the SPA's + `onFileContent` handler. Just before calling + `updateFileContent`, hash the qmd being emitted (e.g. SHA-256 or + a cheaper FNV-1a — exact algorithm settled during implementation) + and stash `(path, hash)` in a ref. In `onFileContent(path, + content)`, suppress the re-render if `(path, hash(content))` + matches the stashed value; otherwise process normally. Robust + against interleaved unrelated file updates (an unrelated file's + `onFileContent` doesn't match the stashed `path`, so it processes + normally). +- Ship `q2-preview-spa/src/components/DiagnosticStrip.tsx` — a + small SPA-local component (~50 lines TSX + ~20 lines CSS) that + displays Q-3-42 / Q-3-43 warnings returned by `incrementalWriteQmd`. + Mirrors hub-client's `.diagnostics-banner` visual style. Applies + suppress-after-3-by-source-range (see "Autosave-context spam + mitigation" below). +- Both single-file mode (bd-tnm3k) and project mode work via the + same code path — the ephemeral hub bridges automerge ↔ disk + uniformly. No SPA-side branching needed. + +#### Move `pipelineKindForFormat` to shared package + +`pipelineKindForFormat` lives in `hub-client/src/utils/pipelineKind.ts` +today. The SPA can't import from hub-client. 
The writer no longer +needs the helper (no `pipeline_kind` parameter), but the SPA's +**display path** does — to choose between `parse_qmd_to_ast` and +`render_page_in_project_with_attribution` when rendering. + +Move to `ts-packages/preview-runtime/src/pipelineKind.ts`. Both +hub-client and the SPA import from there. Mechanical move; ~5 LOC +of import-path updates. + +#### Diagnostic surfacing in hub-client + +Warnings flow through the existing `RenderResponse.warnings` channel +(the same path Plan 1's pipeline diagnostics and attribution-render +diagnostics use). `ReactPreview.tsx::allDiagnostics` already collates +them; `Editor::diagnosticsToMarkers` splits into Monaco markers and +the existing `.diagnostics-banner`. Q-3-42 and Q-3-43 both carry +source ranges, so they squiggle naturally. **No new hub-client UI +needed.** + +One known UX gap: the banner is gated on `!isFullscreenPreview`, so +users in fullscreen-preview mode rely on the Monaco squiggles +(visible when they exit fullscreen) rather than the banner. Accepted. + +### Out of scope + +- **Include round-trip via wrapper-CustomNode** (Plan 8 — uses + this plan's atomic-detection + soft-drop logic but introduces the + wrapper itself). +- **Running the transform pipeline inside the writer.** The writer + is pipeline-agnostic by design; the caller supplies the baseline + AST at whatever tier they need. Future plans don't change this. +- **Engine output as Generated** (deferred future work). +- **Editable CustomNode slots** (e.g., editing a Callout's title and + body through React with edits round-tripping back to source). See + `claude-notes/research/2026-05-05-editable-custom-nodes.md`. +- **Promoting the qmd writer to a fallible `Result` interface + throughout.** Soft-drop semantics make this unnecessary for + q2-preview; the remaining panic paths are debug assertions for + genuine programming errors, not user-facing failure modes. See + §"The byte-provenance contract" below. 
+- **Lifting hub-client's diagnostic banner + SPA's `DiagnosticStrip` + into a shared `@quarto/preview-renderer` component.** Filed as a + follow-up against the hub-client decomposition epic (bd-hfjj); not + on Plan 7's critical path. +- **Appendix-license end-to-end round-trip test.** Exercises + end-to-end correctness of soft-dropping a metadata-derived edit + against the appendix synthesizer's ValueSource-stamped output. + Depends on Plan 9 stamping ValueSource on a real consumer; lands + in Plan 9 Phase 5. (The **structural unit test** for + non-Invocation-role-skipping is in Plan 7's Phase 1 — it + hand-builds anchors with generic `By` plus + `Anchor::value_source()` / `AnchorRole::Other(...)`, no consumer + dependency. Plan 9 Phase 5 also carries an appendix-specific + version using `By::appendix(...)` once that constructor exists.) + +## Design decisions (settled) + +- **Decompose into orthogonal primitives.** Parse / transform / + reconcile / write are independent operations. The writer doesn't + know about pipelines; the caller composes. The WASM bridge layer + exposes the compositions callers actually use; future entries + can land without changing the writer's signature. +- **Caller supplies baseline AST.** Removes the writer's dependency + on `RenderContext`, `SystemRuntime`, `Format`, and pipeline + construction machinery. The writer's surface is three strings + (qmd, baseline, new) in and one JSON envelope out. +- **`Invocation` is the only anchor role the writer consults.** + `ValueSource`, `Dispatch`, and `Other` are diagnostic-only. The + asymmetry is load-bearing: copying bytes from a `ValueSource` + source range would emit raw YAML into the body — a hard + correctness bug. Documented on `preimage_in` and on + `AnchorRole::Other`. +- **Soft-drop, not abort.** Bad-edit cases substitute a safe + alignment in coarsen and emit a warning rather than aborting the + entire write. The user's other valid edits go through; the bad + edit is reverted. 
React (Plan 2A's framework atomic gate) is the + primary safeguard via read-only enforcement; the writer is the + contract guarantor; if React has a hole, the writer protects + without losing the user's session. +- **Unified editability predicate.** Plan 2A's React-side read-only + check and the writer's coarsen-side soft-drop logic consult the + same `is_editable_inside(node, target_file_id) -> bool`. Three + reasons content is uneditable: atomic CustomNode (replaceable + wholesale via menu, not editable inside); atomic-kind Generated + (shortcode / filter / title-block — content represents the + resolved value of an invocation token); no preimage in target + (synthesized-from-metadata containers — sectionize / footnotes / + appendix, after Plan 9's stamping). +- **Let-user-win for block-level UseAfter on atomic CustomNode.** + Replacing an `IncludeExpansion` wholesale (e.g. swapping to a + different `source_path` via a component menu) goes through the + qmd writer's CustomNode arm. No warning — the user's intent is + unambiguous. +- **Soft-drop for block-level UseAfter on no-preimage Generated.** + Replacing a synthesized-from-metadata container has no source + position to anchor at; Rewrite would have nowhere to write. + Substitute Omit + Q-3-43 warning. +- **Multi-inline dedupe via `PartialEq` on anchor source_info.** + Two consecutive inlines share an `Invocation` anchor iff their + anchor's `source_info` is `==` (value equality). `SourceInfo` + derives `PartialEq`, so this is structural — Substring chains, + Concat pieces, etc. compare element-wise. +- **Inline-level UseAfter substitution targets `before_idx`.** The + alignment from the reconciler already carries the original-side + index being replaced; the writer uses that directly. Earlier + drafts suggested matching the *new* inline's `Invocation` anchor + against original-side anchors — but user-edit inlines don't + carry `Invocation` anchors, so there's nothing to match. 
+- **No `pipeline_kind` parameter on `incremental_write_qmd`.** The + pipeline tier is implicit in the baseline AST the caller passes. +- **No backward-compat shim for the signature change.** Three + first-class consumers (ReactPreview, kanban demo, hub-react-todo + demo) + one type interface (`quarto-sync-client`'s `astOptions`) + + one TS wrapper (`ts-packages/preview-runtime`'s + `wasmRenderer.ts`). All in-repo, lockstep-migrable. No npm-exposed + consumers. No wire-format persistence — the function emits qmd + text, not a serialized envelope. The codebase has no + `#[deprecated]` convention; the migration is one PR. +- **Plan 7 keeps its existing filename (`2026-05-04-q2-preview- + plan-7-incremental-writer.md`)** for git-history continuity. New + plans in the epic use the `provenance-plan-N-` convention + (Plan 9, Plan 10). -Dedupe rule: when iterating inline alignments in -`assemble_inline_content`, group consecutive `KeepBefore` entries whose -inlines share the same `Derived` source (compare the `Arc` -identity of `from`, or by structural equality of the `from` value). Emit -Verbatim *once* for the group, using the `from`'s preimage byte range. +## Multi-inline shortcode dedupe -This applies only at the inline level (where multi-inline shortcode -resolutions occur). Block-level rarely sees this case. +See `claude-notes/designs/incremental-writer-contract.md` §"Multi-inline +dedupe" for the full rule and rationale. Brief: when `{{< meta foo >}}` +resolves to multiple inlines all sharing the same `Invocation` anchor, +inline-level reconciliation groups consecutive `KeepBefore` entries +with `PartialEq`-equal anchors and emits Verbatim once for the group. 
+ +## Plan-7-specific decisions + +- **First-edit / no-baseline behavior — drop + warn.** When the + WASM entry receives an empty / null / unparseable + `baseline_ast_json`, it returns `success: false` with a clear + error message rather than falling back to parsing `original_qmd` + internally (which would silently misbehave for q2-preview-tier + callers). Callers gate `incrementalWriteQmd` on having a + successfully-rendered baseline cached AND emit `Q-3-41` to the + active diagnostic surface when the gate trips: + - ReactPreview gates on `ast !== ''`; on a tripped gate, + push `diagnosticQ3_41()` into `writeBackWarningsRef.current` + and flush via `onDiagnosticsChange` (see §Hub-client integration). + - SPA gates on `astJson !== null` AND `activeFile !== null`; on + a tripped gate, push `Q-3-41` into `DiagnosticStrip`. + Pre-Plan-7 today the edit is silently dropped — no DOM affordance, + no console signal. Plan 7 makes the drop visible so users whose + interaction model beats the first render (e.g. paste-on-boot) + learn what happened. +- **Inline-level UseAfter substitution targets `before_idx`.** The + alignment from the reconciler already carries the original-side + index being replaced; the writer uses that directly. An earlier + draft suggested matching the *new* inline's `Invocation` anchor + against original-side anchors — but user-edit inlines don't + carry `Invocation` anchors, so there is nothing to match. +- **Programmer-error surface.** No `WriterError` enum. Invariant + violations panic inline (`debug_assert!()` / `unreachable!()` / + `panic!()` — idiomatic q2; see §"Writer return type after Plan 7" + for the citation set). User-genuine failures keep flowing + through the existing `Result::Err(Vec)` arm — + same shape today's writer uses for inline-splice qmd-writer + errors. The WASM bridge's `Err` arm is unchanged. 
+- **Anchor-role doc-comment placement.** The "non-`Invocation` + roles are not walked" policy is doc-commented in three places + with the canonical statement on `SourceInfo::preimage_in`: + - `SourceInfo::preimage_in` (canonical) — full policy. + - `AnchorRole::Other` — points back: "see `preimage_in` for the + walking policy; `Other` is among the non-walked roles." + - `AnchorRole` enum doc — one-liner: "Only `Invocation` is walked + by the writer's `preimage_in`; see `preimage_in` doc for the + full policy and rationale." + Plan 10 adds an equivalent pointer on `AnchorRole::Dispatch` + when it lands. +- **Tier-mismatch sanity check at WASM bridge.** No runtime + verifier in Plan 7 — the caller-contract documentation suffices. + Future hardening (assert that all FileIds referenced by + `new_ast_json` also appear in `baseline_ast_json`, or similar) + is a follow-up if real bugs surface; do not add speculative + guards now. ## `preimage_in` semantics @@ -435,370 +860,946 @@ impl SourceInfo { } SourceInfo::Concat { pieces } => { // All pieces must resolve into target file AND be contiguous. + // Note: `SourceInfo::concat()` computes each piece's + // `offset_in_concat` cumulatively (sum of prior lengths), so + // gaps are structurally impossible in any Concat produced by + // the in-repo constructors. This branch is defensive against + // malformed JSON deserialization, not against in-repo callers. let ranges: Vec<_> = pieces.iter() .map(|p| p.source_info.preimage_in(target)) .collect::<Option<Vec<_>>>()?; if ranges.is_empty() { return None; } - // Confirm contiguous: ranges[i].end == ranges[i+1].start if ranges.windows(2).all(|w| w[0].end == w[1].start) { Some(ranges.first()?.start .. ranges.last()?.end) } else { - None // gappy concat — can't Verbatim-copy + None // defensive: gaps shouldn't arise from in-repo + // constructors; if they do, fall through to the + // catch-all Rewrite branch below. } } - SourceInfo::Synthetic { .. } => None, - SourceInfo::Derived { from, ..
} => { - // Walk through the `from` chain to find a preimage in the target. - from.preimage_in(target) + SourceInfo::Generated { .. } => { + // Walk through the Invocation anchor's chain. + // Never walks ValueSource (Plan 9), Dispatch (Plan 10), + // or Other — these are diagnostic-only. + self.invocation_anchor() + .and_then(|si| si.preimage_in(target)) } } } } ``` -The `Derived` case delegates to `from`, which usually resolves to an -`Original` covering the source token bytes. So a `Derived` shortcode -resolution successfully returns its preimage range; the writer Verbatim -copies the shortcode token from source. +The `Generated` case delegates to `invocation_anchor()`, which +returns the first `Invocation` anchor's source_info — typically an +`Original` covering the source token's bytes. So a +shortcode-resolution Generated successfully returns its preimage +range; the writer Verbatim-copies the shortcode token from source. + +## Migration plan + +### Rust function signatures (`pampa::writers::incremental`) + +```rust +// Before: +pub fn incremental_write( + original_qmd: &str, + original_ast: &Pandoc, + new_ast: &Pandoc, + plan: &ReconciliationPlan, +) -> Result>; + +pub fn compute_incremental_edits( + original_qmd: &str, + original_ast: &Pandoc, + new_ast: &Pandoc, + plan: &ReconciliationPlan, +) -> Result, Vec>; + +// After (both gain a warnings channel in the Ok variant; Err +// keeps its existing meaning — qmd-writer failures bubbling up +// via `?` from the underlying serializer): +pub fn incremental_write( + original_qmd: &str, + original_ast: &Pandoc, + new_ast: &Pandoc, + plan: &ReconciliationPlan, +) -> Result<(String, Vec), Vec>; + +pub fn compute_incremental_edits( + original_qmd: &str, + original_ast: &Pandoc, + new_ast: &Pandoc, + plan: &ReconciliationPlan, +) -> Result<(Vec, Vec), Vec>; +``` + +`coarsen` (private) gains a `&mut Vec` warning +sink parameter and keeps its existing `Result, +Vec>` return shape — the `Err` arm still +surfaces hard 
errors from inline-splice assembly via `?`. +Programmer errors (Plan-6 stamper bugs, structurally impossible +states) `panic!()` / `unreachable!()` / `debug_assert!()` inline. + +### WASM entry signature (`incremental_write_qmd`) + +```rust +// Before: +pub fn incremental_write_qmd(original_qmd: &str, new_ast_json: &str) -> String; + +// After: +pub fn incremental_write_qmd( + original_qmd: &str, + baseline_ast_json: &str, + new_ast_json: &str, +) -> String; +// JSON: { success, qmd, warnings, error?, diagnostics? } +``` + +### TypeScript wrapper (`ts-packages/preview-runtime/src/wasmRenderer.ts:712`) + +```ts +// Before: +export function incrementalWriteQmd(originalQmd: string, newAst: RustQmdJson): string; + +// After: +export function incrementalWriteQmd( + originalQmd: string, + baselineAst: RustQmdJson | string, // accept either parsed or JSON-string for ergonomics + newAst: RustQmdJson, +): { qmd: string; warnings: Diagnostic[] }; +``` + +### Sync-client interface (`ts-packages/quarto-sync-client/src/types.ts:169`) + +```ts +// Before: +incrementalWriteQmd?: (originalQmd: string, newAst: unknown) => string; + +// After: +incrementalWriteQmd?: ( + originalQmd: string, + baselineAst: unknown, + newAst: unknown, +) => { qmd: string; warnings: Diagnostic[] }; +``` + +### Sync-client call site (`ts-packages/quarto-sync-client/src/client.ts:957`) + +```ts +// Before: +qmdText = astOptions.incrementalWriteQmd(cached.source, ast); + +// After: +const result = astOptions.incrementalWriteQmd(cached.source, cached.ast, ast); +qmdText = result.qmd; +// Optional: surface result.warnings to a sync-client callback. Default +// to ignore; sync-client is policy-free. +``` + +The `astCache` already maintains both `source` (qmd) and `ast` (last +parsed AST) per file. `cached.ast` IS the baseline. No demo-side +state changes required. + +### Consumer migrations + +1. 
**`hub-client/src/components/render/ReactPreview.tsx:429-440`** + — `handleSetAst` updated to pass the current `ast` state as the + baseline. The existing read-only guard for `pipelineKindForFormat + === 'preview'` deletes. Warnings from the response feed into + `allDiagnostics` collation alongside pipeline diagnostics. + +2. **`q2-demos/kanban/src/{useSyncedAst.ts:93, wasm.ts:79}`** — the + `astOptions.incrementalWriteQmd` lambda forwards the new third + argument. `wasm.ts:79`'s wrapper accepts and forwards + `baselineAst`. The demo's app state is unchanged; sync-client's + astCache supplies the baseline. + +3. **`q2-demos/hub-react-todo/src/{useSyncedAst.ts:93, wasm.ts:79}`** + — same as kanban. + +4. **`q2-preview-spa/src/PreviewApp.tsx`** — new `handleSetAst` + replaces `noopSetAst` at line 241. Routes through + `incrementalWriteQmd(content, currentAst, newAst)` with + content-match echo-prevention (see §Hub-client / SPA + integration above). + +All migrations in one PR; no back-compat shim. The TS-side type +checker catches every call site automatically. ## Open questions for implementation -- **Inline-level Transparent**: today the writer has `InlineSplice` for - inline-level changes within a block. Does Transparent apply to inlines - too (e.g., a `Span` with Synthetic source_info containing source-bearing - inlines)? Probably yes — extend the same pattern. Confirm during - implementation. -- **Concat-with-gaps**: if a Concat's pieces resolve to non-contiguous - ranges, `preimage_in` returns None per the algorithm above. Coarsen - falls through to Rewrite. Confirm this is the right semantics. -- **The `is_atomic_custom_node` lookup — extension forward-compat**: - today's hardcoded `pub const ATOMIC_CUSTOM_NODES: &[&str]` works for - built-in atomic types. Future extensions (including the eventual - TSX-extension story) will need to register their own atomic types - without modifying `quarto-core`. 
- - The forward-compat design (deferred to a follow-up plan; commits - the *shape* now without writing implementation code): - - - **YAML schema** in `_extension.yml`: - ```yaml - contributes: - custom-nodes: - - { type: MyCustomBlock, atomic: true } - - { type: AnotherWidget } # atomic defaults to false - ``` - - **Rust runtime aggregation** mirrors `resolve_filters()`'s pattern: - `pub fn collect_atomic_custom_node_types(extensions: &[Extension]) -> HashSet` - starts from the built-in set and adds extension-contributed entries - where `atomic == true`. - - **Function signature evolution**: - `is_atomic_custom_node(name)` → - `is_atomic_custom_node(name, &registry: &HashSet)`. The - writer (in `pampa`) gets the registry from `StageContext` at coarsen - time. ~30 callers cascading; mechanical. - - **Rust→JS sync** for extension types (the genuinely-new piece — - the hand-mirror approach in Plan 7 doesn't work for extension - types because they aren't known at hub-client build time): - a `wasm_bindgen` export `get_atomic_custom_node_types()` is called - once per render after extensions are loaded; populates a React - context. The hand-mirrored TS const remains the fallback for the - no-extensions / WASM-initializing case and stays correct for - built-ins. - - **Plan 8's `IncludeExpansion`**: lands in the built-in set today - via `pub const ATOMIC_CUSTOM_NODES`. After the follow-up plan, the - set is built from a built-in's `_extension.yml` rather than - hardcoded — same effect via the same code path that user - extensions use, no privileged route. - - This sketch commits the schema choice (`contributes.custom-nodes` with - `atomic: bool`) and the function-signature migration path. Plan 7 - ships the const-based registry; the runtime aggregation, schema - parsing, and `wasm_bindgen` lookup all land in a follow-up when an - extension actually needs to register an atomic type.
-- **Sibling vs param**: Decision D was "param with default" but Plan 4 / 7 - could implement it either way. Confirm during implementation. Param is - cleaner (one fewer entry point). Sibling is more isolated. Either works. -- **Runtime user-filter idempotence detection**: split out to Plan 7a. - See `claude-notes/plans/2026-05-04-q2-preview-plan-7a-filter-idempotence.md` - for the full design — round-trip idempotence check, per-filter - attribution, `idempotent: false` opt-out, Q-3-44 / Q-3-45 - diagnostics. Plan 7a is a separable follow-up that builds on Plan 7's - `pipeline_kind: Some("preview")` machinery; it doesn't gate M3. +- **Inline-level Transparent — settled: not needed.** A worktree + scan of `crates/quarto-core/src/transforms/` and + `crates/pampa/src/` finds zero inline-level synthesizers that + produce `Generated { from: [] }` with non-atomic kind and + source-bearing children. All four Plan-6 synthesizers + (Sectionize, TitleBlock, Footnotes, Appendix) emit *block*-level + wrappers; the inlines that synthesizers do touch (e.g. the + Footnotes `` inline stack — Span / Superscript / Link / Str) + carry `Original` source_info cloned from the `Note`'s range, not + `Generated`. They hit `Verbatim` via `preimage_in`, not + Transparent. The inline-assembly path's three variants + (`KeepBefore` / `UseAfter` / `RecurseIntoContainer`) handle every + shape that reaches it today; the third already preserves + delimiters and recurses, which is what an inline Transparent + would amount to. If a future transform begins emitting inline + Generated-empty-from wrappers, reopen this question — the + case is structurally absent in Plan-6-stamped output. + +- **Concat-with-gaps semantics — settled: structurally + impossible.** `SourceInfo::concat()` computes each piece's + `offset_in_concat` as the cumulative sum of prior lengths, so a + gap would corrupt the Concat invariant. 
All in-repo + constructors (`qmd::write_with_source_info`, postprocess + coalescing, YAML scalar combining, attribute combining, inline + combining) feed adjacent pieces; the existing + `concat_piece_lengths_sum_to_buffer_length` and + `concat_covers_output_with_frontmatter` tests + (`crates/pampa/tests/qmd_writer_source_info.rs`) lock the + tile-the-buffer-with-no-gaps property. The `preimage_in` + gap branch is defensive paranoia against malformed JSON + deserialization, not against in-repo callers, and the + catch-all Rewrite fallback is a safe graceful-degradation + endpoint that should never fire on well-formed input. + +- **`is_atomic_custom_node` extension forward-compat — out of + scope for Plan 7.** The two atomic types today + (`CrossrefResolvedRef`, `IncludeExpansion`) are both + Quarto-2-internal; no extension has asked for atomic-type + registration. Quarto 1 has no public extension-author-facing + mechanism for custom AST node types either (verified against + `~/src/quarto-cli` and deepwiki) — its internal registration + is via `_quarto.ast.add_handler()` (imperative Lua call), + not declarative YAML, and `_extension.yml` has no + `custom-nodes:` key. If a future extension genuinely needs to + contribute an atomic CustomNode type, a separate plan picks + the registration shape with the right review (mirroring + Quarto 1's imperative Lua surface, or designing a YAML + surface, or both). Plan 7 ships the const-set with no + extension-side coupling; the const-set's lack of an + extension hook is intentional, not provisional. + +- **Runtime user-filter idempotence detection** — split out to + Plan 7a. See `claude-notes/plans/2026-05-04-q2-preview-plan-7a- + user-filter-idempotence.md` for the full design — round-trip + idempotence check, per-filter attribution, `idempotent: false` + opt-out, Q-3-44 / Q-3-45 diagnostics. Plan 7a is a separable + follow-up; it doesn't gate M3. 
+ +- **Content-match echo-prevention hash choice.** SHA-256 is the + obvious safe choice (already used in Plan 7a's + `filter_sources_hash`). FNV-1a or xxHash would be faster but + cryptographic strength isn't needed — we're just comparing a + freshly-emitted qmd against an arriving qmd for equality. Confirm + during SPA implementation. + +- **`pampa::pipeline::transform_ast` Rust-internal helper.** + Extracting the transform step of `render_qmd_to_preview_ast` into + a standalone `transform_ast(ast: Pandoc, ...) -> Pandoc` would + let tests exercise the transform tier in isolation. ~30 LOC of + factoring; not on Plan 7's critical path. Open beads if useful + during implementation. ## References -- `crates/pampa/src/writers/incremental.rs` — the writer to modify. - Particularly `coarsen` (line 149), `assemble` (line 228), `compute_separator` - (line 354), `block_source_span` (line 447), the helper for inline byte - ranges (line 800). -- `crates/quarto-source-map/src/source_info.rs:185-237` — accessor patterns - to extend. -- `crates/wasm-quarto-hub-client/src/lib.rs:2510` — `incremental_write_qmd` - entry point to extend (line drifted from 2166 after Plan 1's - prep refactor + new q2-preview wiring; verify exact line at - Plan 7 implementation time). -- `hub-client/src/services/wasmRenderer.ts:583` — the JS wrapper - (line drifted from 531). -- `hub-client/src/components/render/ReactPreview.tsx` — `handleSetAst` - guard to lift. Plan 1 implemented the `doRender` format switch via - `pipelineKindForFormat(format)` already; Plan 7 wires the same - helper into the edit-back path so the guard can be replaced with a - call to `incrementalWriteQmd` that passes the `pipeline_kind`. -- `hub-client/src/utils/pipelineKind.ts` — Plan 1's TS helper - (`pipelineKindForFormat`); Plan 7's JS-side call site reads it. 
-- `crates/quarto-core/src/stage/stages/ast_transforms.rs` — - `AstTransformsStage::run()` JIT branch already dispatches on - `ctx.format.pipeline_kind` (Plan 1 commit `60658a4e`); no edit - needed for Plan 7 itself, listed for context. -- `crates/quarto-core/src/format.rs` — `Format::pipeline_kind` - (Plan 1 commit `a7143cc7`); Plan 7 reads it in the - `incremental_write_qmd` body to drive the baseline-pipeline - selection. -- Plans 4 (Synthetic + By), 5 (wire format), 6 (audit) — provide the - AST shape this plan walks. +### Rust + +- `crates/pampa/src/writers/incremental.rs` — the writer. + Particularly `incremental_write` (line 80), `coarsen` (line 149), + `assemble` (line 228), `compute_separator` (line 354), + `block_source_span` (line 448), `assemble_inline_splice` (line + 602), `assemble_inline_content` (line 632), + `assemble_recursed_container` (line 672), `inline_source_span` + (line 800). +- `crates/quarto-source-map/src/source_info.rs` — `SourceInfo`, + `Generated`, `By`, `Anchor`, `AnchorRole`. Plan 7 adds the + `preimage_in` accessor. +- `crates/quarto-ast-reconcile/src/lib.rs` — + `compute_reconciliation`, `structural_eq_blocks`, + `structural_eq_inlines`, `compute_blocks_hash_fresh`, + `compute_meta_hash_fresh_excluding_rendered`. All used by the + test plan; the reconciler API itself doesn't change. +- `crates/wasm-quarto-hub-client/src/lib.rs:2947` — WASM entry + point (signature change). +- `crates/quarto-core/src/lib.rs` (or appropriate module) — + `ATOMIC_CUSTOM_NODES` const + `is_atomic_custom_node` fn (new). + +### TypeScript + +- `ts-packages/preview-runtime/src/wasmRenderer.ts:712` — JS + wrapper (signature change). Imports from this package; both + hub-client and the SPA consume. +- `ts-packages/preview-runtime/src/pipelineKind.ts` — new home + for `pipelineKindForFormat` (moved from + `hub-client/src/utils/pipelineKind.ts`). 
+- `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` — + existing TS hand-mirror of `ATOMIC_CUSTOM_NODES`. +- `ts-packages/quarto-sync-client/src/types.ts:169` — + `astOptions.incrementalWriteQmd` interface (signature change). +- `ts-packages/quarto-sync-client/src/client.ts:957` — sync-client + call site (forwards new argument). +- `hub-client/src/components/render/ReactPreview.tsx:429-440` — + `handleSetAst` guard lift + edit-back wiring. +- `q2-preview-spa/src/PreviewApp.tsx:241` — `noopSetAst` → + real handler. +- `q2-demos/kanban/src/{useSyncedAst.ts:93, wasm.ts:79}`, + `q2-demos/hub-react-todo/src/{useSyncedAst.ts:93, wasm.ts:79}` + — demo wrappers (signature forwarding). + +### Plans + +- **Plans 4 (Generated + Anchor + By + is_atomic_kind), 5 (wire + format), 6 (audit)** — provide the AST shape this plan walks. +- **Plan 3** — ships `compute_meta_hash_fresh` / + `compute_meta_hash_fresh_excluding_rendered` in + `quarto-ast-reconcile`; the writer-lossless baseline test uses + both. +- **Plan 7a** (`claude-notes/plans/2026-05-04-q2-preview-plan-7a- + user-filter-idempotence.md`) — separable follow-up; runtime + user-filter idempotence check. +- **Plan 8** — uses Plan 7's atomic infrastructure for + `IncludeExpansion`; not blocking. +- **Plan 9** (`claude-notes/plans/2026-05-22-provenance-plan-9- + valuesource-threading.md`) — ValueSource consumer wiring; + appendix synthesizer stamping that makes the Q-3-43-widened + cases fire on real data. Owns the `preimage_in` role-asymmetry + unit test and the appendix-license e2e round-trip test (Plan 9 + Phase 5). +- **Plan 10** (`claude-notes/plans/2026-05-22-provenance-plan-10- + dispatch-anchor.md`) — Dispatch anchor for Lua sources; inherits + Plan 7's `AnchorRole::Other` policy. 
## Test plan -- **Reconciler source-info-blindness foundation test** (new, lands in +- **Writer-lossless baseline test** (prerequisite for the + reconciler tests below; lands in Plan 7's first commit alongside + the foundation test). For each AST shape the writer needs to + emit (Generated-with-Invocation shortcode resolutions, Plan 8's + IncludeExpansion CustomNode wrappers, FloatRefTarget / Theorem / + Proof / Callout CustomNodes, synthesized Sectionize / Footnotes / + Appendix containers, user-edited variants of each), assert that + `parse(write(ast))` produces an AST whose + `compute_blocks_hash_fresh` + `compute_meta_hash_fresh_excluding_rendered` + equal the input's. This isolates writer bugs from reconciler + bugs. Fixtures reuse Plan 3's set under + `crates/quarto-core/tests/fixtures/q2-preview-idempotence/` plus + any Plan 7-specific shapes. + +- **Reconciler source-info-blindness foundation test** (lands in Plan 7's first commit): asserts that `structural_eq_blocks` and - `structural_eq_inlines` (in `quarto-ast-reconcile`) return `true` for - pairs of nodes that differ *only* in source_info. Cover all the new - shapes: two Original blocks with different file IDs / offsets; two - Synthetic blocks with different `By` payloads; two Derived blocks with - different `from` chains but the same content/attr/plain_data; - CustomNode pairs differing only in source_info on the wrapper or in - any slot child. The hash function already excludes source_info - (verified by Plan 3 and existing - `compute_blocks_hash_fresh::test_same_content_same_hash`); this test - covers the *equality* path too. Why it matters: the reconciler drives - KeepBefore decisions off these functions. If they leak source_info - by accident, q2-preview round-trip would degenerate to whole-doc - Rewrite without any obvious symptom — every test that doesn't inspect - the alignment plan would still pass. Catch the leak structurally - rather than discover it via correctness regressions. 
-- **`preimage_in` unit tests**: each variant (Original same/different file, - Substring chain, Concat contiguous/gappy, Synthetic, Derived). Assert - correct byte range or None. -- **Coarsen unit tests**: build mock reconciliation plans + ASTs covering: - - Verbatim (KeepBefore + preimage in target, both Original and Derived). - - Transparent (KeepBefore + non-atomic Synthetic wrapper with - source-bearing children — Sectionize case). - - Omit via atomic Synthetic (KeepBefore + Synthetic with - `by.is_atomic_synthesizer() == true` and no preimage — filter - construction case). - - Omit via Synthetic with no children (rare). - - Rewrite (UseAfter, non-atomic). - - **Soft-drop: inline UseAfter on Derived** — substitute KeepBefore - for that inline, surrounding inline plan continues; assert - `Q-3-42` warning emitted. + `structural_eq_inlines` return `true` for pairs of nodes that + differ *only* in source_info. Cover: two Original blocks with + different file IDs / offsets; two Generated blocks with different + `By` payloads; two Generated blocks with different anchor lists + but the same content / attr / plain_data; CustomNode pairs + differing only in source_info on the wrapper or in any slot child. + Why it matters: the reconciler drives KeepBefore decisions off + these functions. If they leak source_info by accident, round-trip + degenerates to whole-doc Rewrite without obvious symptom. + +- **`preimage_in` unit tests** — each variant: Original same / other + file, Substring chain, Concat contiguous / gappy, Generated with + no anchors, Generated with Invocation anchor resolving into + target, Generated with Invocation anchor resolving elsewhere. + Assert correct byte range or None. + +- **`preimage_in` skips non-Invocation roles** — Generated with + only ValueSource / Dispatch / Other anchors returns None. 
(The + full ValueSource end-to-end correctness test lives in Plan 9 + Phase 5 with real appendix-license fixtures; this Plan-7-side + test pins the unit-level behavior.) + +- **Coarsen unit tests** — build mock reconciliation plans + ASTs + covering: + - Verbatim (KeepBefore + preimage in target, both Original and + Generated-with-Invocation cases). + - Transparent (KeepBefore + non-atomic Generated wrapper with + source-bearing children — Sectionize / footnotes / appendix). + - Omit via atomic-kind Generated (KeepBefore + Generated with + `by.is_atomic_kind() == true` and no anchors — filter + construction). + - Omit via non-atomic Generated with no children (rare). + - Rewrite via catch-all (KeepBefore with no preimage and no + matching Generated shape — cross-file Original, gappy Concat). + - Rewrite (UseAfter, non-atomic, editable). + - **Soft-drop: inline UseAfter on atomic-Generated** — substitute + KeepBefore for that inline at `before_idx`; surrounding inline + plan continues; assert `Q-3-42` warning emitted. - **Soft-drop: block RecurseIntoContainer on atomic CustomNode** - (IncludeExpansion) — substitute KeepBefore for the wrapper; - assert `Q-3-43` warning emitted; assert wrapper's preimage bytes - in output. - - **Let-user-win: block UseAfter on atomic node** — Rewrite via qmd - writer; no warning. Assert qmd writer's CustomNode arm correctly - serializes a fresh user-edit-tagged IncludeExpansion (uses - `plain_data["source_path"]`). -- **Multi-inline dedupe unit tests**: build a Para with three consecutive - inlines all sharing the same Derived `from`. Reconcile against an - identical Para. Assert the writer emits the shortcode token bytes - ONCE, not three times, in the inline-assembly output. 
-- **Soft-drop interaction tests**: - - User edits one Derived inline AND a non-atomic inline in the same - Para → assert non-atomic edit is applied AND shortcode token is + (IncludeExpansion) — substitute KeepBefore; assert `Q-3-43` + warning emitted; assert wrapper's preimage bytes in output. + - **Soft-drop: block RecurseIntoContainer on no-preimage + Generated** — substitute Omit; assert `Q-3-43` warning emitted; + assert nothing emitted for the wrapper. + - **Soft-drop: block UseAfter on no-preimage Generated** — + substitute Omit; assert `Q-3-43` warning emitted. + - **Let-user-win: block UseAfter on atomic CustomNode** — Rewrite + via qmd writer; no warning. Assert qmd writer's CustomNode arm + correctly serializes a fresh user-edit-tagged IncludeExpansion + using `plain_data["source_path"]`. + +- **Multi-inline dedupe unit tests** — build a Para with three + consecutive inlines all sharing the same `Invocation` anchor + source_info (`PartialEq`-equal). Reconcile against an identical + Para. Assert the writer emits the shortcode token bytes ONCE, + not three times. Also: assert dedupe does NOT fire when anchors + differ structurally. + +- **Multi-inline dedupe + ValueSource interaction** (forward-compat + with Plan 9). Build inlines with shape `Generated { from: + [Invocation, ValueSource] }`. Two inlines whose `Invocation` + source_info matches but `ValueSource` source_info differs should + still dedupe (dedupe consults Invocation only). Add this once + Plan 9 has stamped ValueSource on a real consumer. + +- **Soft-drop interaction tests:** + - User edits one atomic-Generated inline AND a non-atomic inline in the + same Para → non-atomic edit applied AND shortcode token preserved AND `Q-3-42` warning emitted. 
- - User edits inside an include AND outside the include in same doc → - assert outside edit is applied AND include token is preserved AND + - User edits inside an include AND outside the include in same + doc → outside edit applied AND include token preserved AND `Q-3-43` warning emitted (write succeeds with warnings, not Err). -- **End-to-end round-trip tests**: - - Sectionized doc → edit one paragraph → assert the section structure - is preserved verbatim except for the edit. - - Doc with single-inline shortcode (`{{< meta title >}}`) → edit a - different paragraph → assert the shortcode token is preserved. - - Doc with multi-inline shortcode (markdown title) → edit a different - paragraph in same Para → assert the shortcode token appears once, - not multiple times. - - Doc with shortcode → attempt to edit the resolved title → assert - `Q-3-42` warning + the document text is byte-equal to a no-op edit - (i.e., the bad edit was reverted). Save succeeded. - - (Plan 8 covers includes; this plan establishes the infrastructure.) -- **Filter-construction soft-drop test**: build an AST with a - filter-constructed Str (Synthetic { by: filter }) inside a Para. User - retypes it through React → assert `Q-3-42` warning + the original - Para's source bytes (without the decoration) appear in output. Next - pipeline run regenerates the decoration. -- **Idempotence holds**: re-run Plan 3's idempotence test after this plan - lands. The AST shape changes from this plan's transforms shouldn't break - it. + +- **End-to-end round-trip tests** (hub-client): + - Sectionized doc → edit one paragraph → assert the section + structure is preserved verbatim except for the edit. + - Doc with single-inline shortcode (`{{< meta title >}}`) → edit + a different paragraph → assert the shortcode token is preserved. + - Doc with multi-inline shortcode (markdown title) → edit a + different paragraph in same Para → assert the shortcode token + appears once, not multiple times. 
+ - Doc with shortcode → attempt to edit the resolved title → + assert `Q-3-42` warning + the document text is byte-equal to + a no-op edit (the bad edit was reverted). Save succeeded. + - Plan 8 covers includes; Plan 9 Phase 5 covers appendix-license; + this plan establishes the infrastructure. + +- **End-to-end round-trip tests (SPA):** + - SPA boots against a project with a single doc; edit a paragraph + via setLocalAst; assert the qmd on disk reflects the edit and + automerge content matches. + - Single-file mode (bd-tnm3k): same test with a `.qmd` outside any + `_quarto.yml` project root; assert the original file path is + written. + - Edit a shortcode in the SPA → assert Q-3-42 warning appears in + DiagnosticStrip; assert qmd on disk is unchanged. + - Edit a non-atomic block and a shortcode-resolved inline together + → assert non-atomic edit applies, shortcode preserved, Q-3-42 + warning shows. + - **Content-match echo-prevention test**: induce a local-edit ↔ + sync-echo cycle; assert the SPA's render effect fires exactly + once after the edit completes; assert an interleaved unrelated + file's update is processed normally (not suppressed). + +- **Filter-construction soft-drop test** — build an AST with a + filter-constructed Str (`Generated { by: filter, from: [] }`) + inside a Para. User retypes it through React → assert `Q-3-42` + warning + the original Para's source bytes (without the + decoration) appear in output. Next pipeline run regenerates the + decoration. + +- **Idempotence holds** — re-run Plan 3's idempotence test after + this plan lands. The AST shape changes shouldn't break it. ## Dependencies -- Depends on: Plans 4 (Synthetic + Derived + By), 5 (wire format), 6 - (audit + Derived provenance on shortcode resolutions). -- Blocks: nothing structurally; Plan 8 builds on the atomic infrastructure - but is independent (uses `is_atomic_custom_node` for IncludeExpansion). -- Lifts the read-only mode that Plan 1 introduced for q2-preview. 
- Plan 1's render-side `pipeline_kind` dispatches in - `AstTransformsStage::run()` and `ReactPreview.tsx::doRender` are - already structured (no string-literal scaffolding remains); Plan - 7 verifies the write-side parameter threads through the same - selector. See §Scope's "Verify: structured pipeline dispatch is - already in place" item for the verification step. +### Hard dependencies + +- **Plans 4 / 5 / 6** — provide the typed `Generated { by, from }` + shape and the synthesizer stamping the writer walks. The writer + can't test Generated-with-anchor behavior without those types + existing and being produced by real transforms. +- **Plan 3** — `compute_meta_hash_fresh` / + `compute_meta_hash_fresh_excluding_rendered` (used by the + writer-lossless baseline test). + +### Soft dependencies / coordination + +- **Plan 9** — owns the `preimage_in` role-asymmetry e2e test and + the appendix-license round-trip test. Plan 7's unit-level + `preimage_in` test pins behavior; Plan 9's tests pin end-to-end + correctness once a real ValueSource consumer (the appendix + synthesizer) exists. +- **Plan 10** — inherits Plan 7's `AnchorRole::Other` policy. No + ordering constraint. +- **Plan 7a** — separable follow-up; uses Plan 7's writer + warnings + infrastructure but doesn't gate M3. +- **Plan 8** — uses Plan 7's atomic-CustomNode infrastructure but + is independent (introduces `IncludeExpansion` to + `ATOMIC_CUSTOM_NODES`; doesn't change Plan 7's logic). + +### What Plan 7 doesn't block + +- Plan 9's implementation can start in parallel; the writer-side + changes don't depend on Plan 9's consumer wiring. +- Plan 10's implementation can start in parallel; Dispatch anchors + are stamped by Plan 6's post-walk helper, which Plan 10 modifies + independently. ## Risk areas -- **`incremental.rs` is intricate**: ~1000 lines, many interlocking - functions. Adding new coarsen variants and rewiring assemble carefully - is the meat of this plan. Budget extra time for edge cases. 
-- **Plan 4 / 5 / 6 must land first**. The writer can't test Synthetic - walking without those types existing. Order matters strictly. -- **InlineSplice + Transparent interaction**: the existing InlineSplice - logic handles inline-level changes. If Transparent at the block level - recurses into a block whose inlines need splicing, the assembly logic - composes both. Test this case — it's the trickiest edge. -- **Soft-drop warning visibility**: warnings flow through the existing - `RenderResponse.warnings` channel (the same path Plan 1's pipeline - diagnostics use). ReactPreview already displays diagnostics in the - editor. Confirm Q-3-42 / Q-3-43 warnings reach the diagnostic panel - and are visually distinguishable from pipeline warnings (or are - acceptably co-mingled — TBD by hub-client UX). -- **Autosave-context spam mitigation for Q-3-42 / Q-3-43**: hub-client - uses Automerge as the source-of-truth for qmd source — there's no - discrete "save" action; every keystroke triggers a debounced render - and incremental write. So a user persistently typing over a Derived - inline (resolved shortcode) would re-fire Q-3-42 on every render, - flooding the diagnostic panel with copies of the same warning. - Same for Q-3-43 if the user keeps editing inside an include. - Mitigation: **suppress-after-3** in the diagnostic banner. The - Monaco squiggle (yellow underline at the affected source range) - remains as the persistent signal; the side-panel banner shows the - first three occurrences per source range and silently drops the - rest. Implemented at the diagnostic-ingest layer in `Preview.tsx` - (or wherever warnings are processed for display), not at the +- **`incremental.rs` is intricate** (~830 lines, many interlocking + functions). Adding new coarsen variants and rewiring assemble + carefully is the meat of this plan. Budget extra time for edge + cases. 
+ +- **Plans 4 / 5 / 6 must land first.** The writer can't test + Generated-with-anchor walking without those types existing and + being produced by real transforms. Order matters strictly. + +- **InlineSplice + Transparent interaction.** The existing + InlineSplice logic handles inline-level changes. If Transparent + at the block level recurses into a block whose inlines need + splicing, the assembly logic composes both. Test this case — + it's the trickiest edge. + +- **Baseline-AST staleness.** If the caller passes a baseline AST + that doesn't match the original qmd source (e.g., the qmd source + changed externally between render and edit), the reconciler + produces a confused diff and the writer's output is garbage. + Hub-client's existing `applyingRemoteRef` pattern + (`hub-client/src/hooks/useAutomergeSync.ts:55`) and the SPA's + content-match echo-prevention (new in this plan) keep the + baseline fresh in practice. The contract is: caller MUST pass + a baseline that's `parse_or_render(originalQmd) at the same tier + as newAst`. Document this on the WASM entry and TS wrapper. + +- **Soft-drop warning visibility.** Warnings flow through the + existing `RenderResponse.warnings` channel. Hub-client already + collates them in `ReactPreview.tsx`; Editor's + `diagnosticsToMarkers` splits into Monaco markers and the + existing `.diagnostics-banner`. SPA gets the new `DiagnosticStrip`. + +- **SPA echo-prevention correctness.** The content-match gate + must hash the qmd we're emitting exactly as the round-trip + produces it (no trailing newline differences, no encoding + variation). Implement with a fixture-based assertion: emit qmd + X, simulate the echo loop, assert the gate matches. + +- **Autosave-context spam mitigation for Q-3-42 / Q-3-43.** + Hub-client and SPA both use Automerge as the source-of-truth for + qmd source — there's no discrete "save" action; every keystroke + triggers a debounced render and incremental write. 
A user + persistently typing over an atomic-resolved inline would re-fire + Q-3-42 on every render, flooding the diagnostic surface. + + **Mitigation:** suppress-after-3 by source range. Monaco squiggles + (yellow underline at the affected source range) remain as the + persistent signal in hub-client; the side-panel banner / + DiagnosticStrip shows the first three occurrences per source + range and silently drops the rest. Implemented at the + diagnostic-ingest layer (`ReactPreview.tsx::allDiagnostics` + collation for hub-client; `DiagnosticStrip` for SPA), not at the writer. Plan 7a's Q-3-44 doesn't have this issue — it's cached - once per document per session, so it fires at most once. - Imperative message text matters here too: Q-3-42 / Q-3-43 should - read as instructions ("To edit this content, open ``") + once per document per session. + + Imperative message text matters: Q-3-42 / Q-3-43 read as + instructions ("To edit this content, open ``") rather than passive descriptions ("edit was dropped"), since the user has no discrete-save affordance to discard the bad edit. - Plan 7's soft-drop is what guarantees the qmd source-of-truth - doesn't accept the bad edit even though the in-React AST briefly - held it. 
## Estimated scope | Component | Lines (rough) | |---|---| -| `preimage_in` accessor (with Derived) + tests | ~100 | +| `preimage_in` accessor (with Generated/Invocation) + tests | ~100 | | New `CoarsenedEntry` variants (Transparent, Omit) | ~20 | -| `coarsen` logic update (atomic detection + soft-drop substitutions) | ~180 | +| `coarsen` logic update (editability gate + soft-drop substitutions) | ~200 | | `assemble` updates (Transparent walk, Omit handling) | ~80 | -| Multi-inline shortcode dedupe rule in inline assembly | ~40 | -| Inline-level soft-drop substitution in inline plan | ~50 | -| `is_atomic_custom_node` registry + TS hand-mirror | ~40 | -| Q-3-42 / Q-3-43 diagnostic codes + catalog entries | ~40 | +| Multi-inline shortcode dedupe (PartialEq on Invocation anchors) | ~40 | +| Inline-level soft-drop substitution | ~50 | +| `is_atomic_custom_node` registry (Rust side; TS hand-mirror already in place) | ~30 | +| Q-3-42 / Q-3-43 diagnostic codes + catalog entries | ~50 | | Warning channel plumbing through coarsen → incremental_write return | ~50 | -| `pipeline_kind` parameter + WASM bridge + TS wrapper | ~80 | -| ReactPreview guard lift + edit-back wiring | ~20 | -| Verify Plan 1's render-side pipeline_kind dispatch is end-to-end correct (no refactor work; Plan 1 already implemented it) | ~5 | -| Tests (unit + end-to-end round-trip + soft-drop interactions) | ~400 | -| **Total** | **~1105** | - -Two focused sessions likely. Flagged as one of the highest-complexity plans; -extend the budget if the InlineSplice + Transparent composition surfaces -unexpected interactions. 
+| `incremental_write_qmd` WASM signature change + JSON envelope | ~40 | +| TS wrapper signature change (`incrementalWriteQmd`) | ~20 | +| Three consumer migrations (ReactPreview + 2 demos) + sync-client type | ~60 | +| ReactPreview guard lift + `ast`-state baseline wiring | ~20 | +| SPA setAst handler + content-match echo-prevention | ~50 | +| `DiagnosticStrip` component for SPA (TSX + CSS) | ~70 | +| `pipelineKindForFormat` move to `ts-packages/preview-runtime` | ~10 | +| Tests (unit + end-to-end round-trip + soft-drop interactions, both surfaces) | ~500 | +| **Total** | **~1390** | + +## Session split + +Two agent sessions, one plan document, two PRs against the +`feature/provenance` integration branch. The boundary is the WASM +ABI: Session 1 settles the Rust API surface; Session 2 propagates +it across the WASM bridge into TS callers and the SPA. + +**Session 1 — Rust core (Phases 1-3 + writer-lossless baseline +test).** Branch `beads/<id>-plan7-rust-core` off +`feature/provenance`. Lands: + +- `quarto-source-map`: `preimage_in` accessor + doc-comment +- `quarto-core`: `ATOMIC_CUSTOM_NODES`, `is_atomic_custom_node`, + `editability::is_editable_inside` +- `quarto-error-reporting`: Q-3-41 / Q-3-42 / Q-3-43 catalog + entries + builder helpers +- `pampa::writers::incremental`: `CoarsenedEntry::{Transparent, Omit}` + variants, rewritten `coarsen` logic, soft-drop substitutions, + multi-inline dedupe, return-type change, debug-assert +- `quarto-ast-reconcile`: source-info-blindness foundation test +- Full unit + integration test corpus for the above +- Writer-lossless baseline test + +The WASM bridge stays **externally identical** in Session 1 — the +internal Rust signature changes, but `incremental_write_qmd` in +`wasm-quarto-hub-client/src/lib.rs:2947` keeps its two-argument +form and discards the new warnings channel temporarily (the WASM +arm calls `incremental_write(...)` and `let (qmd, _warnings) = ...`). +Browser callers see no change; existing tests still pass. 
+`cargo xtask verify` green at end of Session 1. + +**Session 2 — WASM bridge + consumers + SPA + e2e (Phases 4-9).** +Fresh agent, fresh context. Branch +`beads/<id>-plan7-wasm-and-consumers` off the new +`feature/provenance` tip (after Session 1's `--no-ff` merge). Lands: + +- WASM signature change (three-arg `incremental_write_qmd`, + warnings surfaced in `AstResponse.warnings`) +- TS wrapper signature change in `wasmRenderer.ts` +- Hand-maintained `.d.ts` files (two locations) +- Sync-client interface + call-site update +- ReactPreview guard lift + `writeBackWarningsRef` plumbing + + Q-3-41 first-edit emission +- Two demo migrations (kanban, hub-react-todo) +- TS-side `hasPreimageIn` / `isEditableInside` / `dispatch.tsx` + update (closes the partial React gate from Plan 2A) +- `pipelineKindForFormat` move to `ts-packages/preview-runtime` +- SPA `handleSetAst` + `getFileContent` wiring + + content-match echo-prevention + `DiagnosticStrip` + Q-3-41 + first-edit emission +- All Phase 8 end-to-end tests +- All Phase 9 verification + cleanup + +`cargo xtask verify` (full chain, no skip) green at end, plus +manual browser smoke per CLAUDE.md's "End-to-end verification +before declaring success". + +Flagged as one of the highest-complexity plans; extend either +session's budget if the InlineSplice + Transparent composition +(Session 1) or the soft-drop catalog + SPA echo-prevention +(Session 2) surfaces unexpected interactions. + +## Implementation checklist + +Work items grouped by phase. Each phase's items are roughly +sequential; phases themselves are mostly sequential, with some +parallelism noted. Plan 6 must land before Phase 1 starts. + +**Coordination posture.** This checklist is sized for serial +implementation in a single fresh 1M-context session — the phases +flow linearly, and the entire plan fits comfortably in one +context window. No beads-per-phase split needed. 
Open a follow-up +beads only for items that surface during implementation and are +genuinely out of scope (e.g. preexisting bugs found in adjacent +code; future-plan-bound features). + +### Phase 1 — Foundation primitives (`quarto-source-map`, `quarto-pandoc-types`, `pampa`) + +**Implementation note (2026-05-24):** Plan originally placed +`ATOMIC_CUSTOM_NODES` / `is_atomic_custom_node` in `quarto-core`, but +`quarto-core` depends on `pampa` and the writer (in `pampa`) is the +primary consumer — that direction would cycle. Moved the registry +down to `quarto-pandoc-types` (the home of `CustomNode` itself). A +cross-check test in `quarto-core::crossref` pins the literal in +lockstep with `CROSSREF_RESOLVED_REF`. + +- [x] `SourceInfo::preimage_in(target: FileId) -> Option<Range<usize>>` accessor with full match (Original, Substring, Concat, Generated) +- [x] Doc-comment on `preimage_in` stating the `Invocation`-only walking policy + asymmetry rationale +- [x] Doc-comment on `AnchorRole::Other` reiterating the policy (future roles default to non-walked) +- [x] `pub const ATOMIC_CUSTOM_NODES: &[&str] = &["CrossrefResolvedRef"]` in `quarto-pandoc-types` (not `quarto-core` — see implementation note) +- [x] `pub fn is_atomic_custom_node(type_name: &str) -> bool` in `quarto-pandoc-types` +- [x] `is_editable_inside_block` / `is_editable_inside_inline` helpers in `pampa::writers::incremental` (two functions sharing a private `is_editable_inside_source_info` core; React side will import an equivalent TS predicate in a future Phase) +- [x] `preimage_in` unit tests: Original same / different file; Substring chain; Concat contiguous / gappy / overlapping / mixed-files; Generated with no anchors; Generated with Invocation anchor resolving in / out of target; Generated with Invocation through Substring chain +- [x] `preimage_in` role-asymmetry unit test: Generated with only ValueSource / Other anchors returns None; mixed Invocation + ValueSource walks Invocation only +- [x] `is_editable_inside` 
unit tests covering each uneditable case (atomic CustomNode, atomic-kind Generated, no-preimage Generated, value-source-only Generated) plus positive cases +- [x] Reconciler source-info-blindness foundation test in `quarto-ast-reconcile` (Generated-with-different-By, Generated-with-different-anchor-lists, CustomNode wrapper and slot-child blindness) +- [x] `cargo nextest run --workspace` green (9509 tests) +- [x] `cargo xtask verify` green (full 12-step chain including WASM build + hub-client tests) + +### Phase 2 — Writer internals (`pampa::writers::incremental`) + +**Implementation notes (2026-05-24):** +- The plan's checklist item "Remove `AtomicViolation` variant" was a + residue of an earlier draft — no such variant existed in the + pre-Plan-7 code. Marked done by omission. +- The `coarsen` signature change keeps `Result` as the return: the + warning sink covers soft-drop cases, while the existing `Err` arm + (reached via `?` from `assemble_inline_splice`) stays for genuine + structural failures. +- The singleton-KeepBefore inline emit path was updated to use + `preimage_in(target_file_id)` (with `inline_source_span` fallback). + Original-SI inlines are byte-identical to the old behavior; + Generated-SI inlines now emit the Invocation anchor's preimage + bytes instead of an empty range — fixes a latent zero-length bug + in the pre-Plan-7 inline-splice path. Multi-inline dedupe sits on + top: when consecutive KeepBefore entries share an Invocation + anchor, emit the anchor's preimage *once*. + +**Repo facts that bite when constructing test fixtures:** +- `AttrSourceInfo` does **not** implement `Default`. Use + `quarto_pandoc_types::AttrSourceInfo::empty()` for `Div`/`Header`/ + `Figure`/etc. `attr_source` fields in hand-built fixtures. +- `gen` is a reserved keyword in Rust 2024 edition. Don't name a + variable `gen` (e.g. for a `SourceInfo::Generated` fixture); + `gen_info` works. 
+ +- [x] Add `CoarsenedEntry::Transparent { child_entries }` variant +- [x] Add `CoarsenedEntry::Omit` variant +- [x] Change `coarsen` signature to accept `&mut Vec` warning sink +- [x] Rewrite `coarsen` KeepBefore branch: Verbatim / Omit / Transparent / Rewrite-catch-all cascade per §"Coarsen pseudo-code" +- [x] Rewrite `coarsen` UseAfter branch: atomic-CustomNode-let-user-win, no-preimage-Generated-soft-drop +- [x] Rewrite `coarsen` RecurseIntoContainer branch: `is_editable_inside` gate + soft-drop substitution + Verbatim-or-Omit fallback +- [x] Inline-level soft-drop in `assemble_inline_content`: substitute KeepBefore via `before_idx` when `!is_editable_inside` +- [x] Multi-inline dedupe in `assemble_inline_content`: PartialEq grouping on Invocation anchor source_info +- [x] `assemble` handles Transparent (recursive child emission via `emit_entries` helper, shared `prev_entry` state across the wrapper boundary) +- [x] `assemble` handles Omit (no-op, doesn't update `prev_entry`) +- [x] ~~Remove `AtomicViolation` variant~~ — variant never existed in the codebase; checklist item was stale (see implementation note above) +- [x] Change `incremental_write` return type: `Result<(String, Vec), Vec>` (same for `compute_incremental_edits`); WASM bridge + all test callers migrated +- [x] `debug_assert!` for the shortcode-Generated-with-empty-from regression case (Plan 6 stamper invariant) — in `coarsen_keep_before_block` +- [ ] Writer-lossless baseline test (Plan 7 first-commit prerequisite): for each Generated / CustomNode shape, assert `parse(write(ast))` hash equals input via `compute_blocks_hash_fresh` + `compute_meta_hash_fresh_excluding_rendered` — **deferred to Plan 7b Phase 1** (`claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md`) +- [x] Coarsen unit tests: Verbatim, Transparent (sectionize wrapper with source-bearing children), Omit (atomic-kind filter construction), Rewrite-catch-all (cross-file Original), Rewrite (UseAfter editable) +- [x] Coarsen 
soft-drop unit tests: inline UseAfter on atomic-Generated (Q-3-42); block RecurseIntoContainer on atomic CustomNode (Q-3-43, Verbatim path); block RecurseIntoContainer on no-preimage Generated (Q-3-43, Omit path); block UseAfter on no-preimage Generated (Q-3-43, Omit path) +- [x] Let-user-win unit test: block UseAfter on atomic CustomNode → Rewrite; no warning +- [x] Multi-inline dedupe unit tests: positive (anchors PartialEq-equal → one Verbatim); negative (anchors differ → individual emits); ValueSource cross-talk (Plan 9 forward-compat — anchors match on Invocation but differ on ValueSource → still dedupes) +- [ ] Soft-drop interaction test: shortcode edit + non-atomic edit in same Para — **deferred to Plan 7b Phase 1** +- [ ] Filter-construction soft-drop test (UseAfter into a filter-constructed inline) — **deferred to Plan 7b Phase 1** + +### Phase 3 — Diagnostic catalog (`quarto-error-reporting`) + +- [x] `Q-3-42` entry in `error_catalog.json`: title "Shortcode edit dropped"; problem text; hint text; severity Warning +- [x] `Q-3-43` entry in `error_catalog.json`: title "Generated content edit dropped"; severity Warning. (Single generic `message_template`; the three emission paths supply distinct body text via the builder — per Plan 7 §"Catalog mechanics".) +- [x] Diagnostic builder helpers `diagnostic_q3_42_inline(inline)` and `diagnostic_q3_43_block(block)` used by `coarsen`'s soft-drop sites; live in `pampa::writers::incremental` (not `quarto-error-reporting`, which doesn't depend on `quarto-pandoc-types`) +- [x] Unit tests: each soft-drop unit test asserts the correct Q-3-42 / Q-3-43 code is emitted + +### Session 1 → Session 2 handoff (retrospective — superseded) + +This section originally scoped Plan 7 across two agent sessions +split at the WASM ABI boundary. 
In practice the work shipped in a +single session on `feature/provenance` (Phases 1-7 + 9 done; +Phase 8's broader e2e matrix deferred to Plan 7b; Q-3-41 + TS-side +editability predicate deferred to Plan 7c). + +The session-split scaffold, the Deltas placeholder, and the Agent 2 +launch prompt have been retired — they assumed a two-PR workflow +that never materialized. The full reshape lives in git history +(commit `561eefa0` on the `review/provenance-plan-7` branch, which +landed on `feature/provenance` via the 2026-05-26 rebase) for +anyone who wants to see the original handoff design. + +### Phase 4 — WASM bridge signature change (`wasm-quarto-hub-client`) + +**Repo facts the implementer needs:** + +- **The `wasm-quarto-hub-client` crate is NOT in the cargo workspace.** + `cargo build -p wasm-quarto-hub-client` fails with "did not match + any packages". Build via `cd hub-client && npm run build:wasm` or + implicitly via `cargo xtask verify` step 6. +- **`AstResponse.warnings` is `Option>`, not raw + serde Value.** Convert via `diagnostics_to_json(&warnings, ctx)`, + where `ctx: &SourceContext`. In the post-Phase-4 body, the + baseline AST's `ASTContext` carries this — access via + `baseline_context.source_context` (the field that `ASTContext` + exposes). Phase 2 wired this via the old `original_context` + variable; the equivalent post-Phase-4 binding is the baseline AST's. 
+ +- [x] Change `incremental_write_qmd` Rust signature: add `baseline_ast_json: &str` as second positional argument +- [x] WASM body: deserialize `baseline_ast_json` via `pampa::readers::json::read` (parallel to existing `new_ast_json` deserialization); drop the qmd-parse step +- [x] Populate `AstResponse.warnings` field from `incremental_write`'s warning vec via `diagnostics_to_json(&warnings, &baseline_context.source_context)` +- [x] Doc-comment specifies the baseline-tier contract (caller responsibility to match tier of `new_ast_json`) + +### Phase 5 — TypeScript wrapper + sync-client interface + +- [x] `ts-packages/preview-runtime/src/wasmRenderer.ts:712` — `incrementalWriteQmd(originalQmd, baselineAst, newAst): { qmd, warnings }` +- [x] Accept `baselineAst` as `RustQmdJson | string` for ergonomics; stringify internally +- [x] `ts-packages/preview-runtime/src/wasm-quarto-hub-client.d.ts:78` — new signature in WASM type declaration +- [x] `hub-client/src/types/wasm-quarto-hub-client.d.ts:69` — new signature in hub-client's WASM type declaration +- [x] `ts-packages/quarto-sync-client/src/types.ts:169` — `astOptions.incrementalWriteQmd` interface signature change +- [x] `ts-packages/quarto-sync-client/src/client.ts:957` — pass `cached.ast` as baseline; surface `result.qmd` to `updateFileContent`; warnings ignored at sync-client level (policy-free; demos consume them via wrapper) +- [x] Move `hub-client/src/utils/pipelineKind.ts` → `ts-packages/preview-runtime/src/pipelineKind.ts`; update imports in hub-client and SPA (SPA had no import yet — Phase 7) + +### Phase 6 — Consumer migrations + +- [x] `hub-client/src/components/render/ReactPreview.tsx:429-440` — `handleSetAst` updated: delete read-only guard, pass `ast` state as baseline, ingest warnings into next diagnostics push via `pendingWriteWarningsRef` +- [x] `hub-client/src/types/wasm-quarto-hub-client.d.ts:69` — type declaration updated +- [x] `q2-demos/kanban/src/wasm.ts:79` — wrapper accepts baselineAst, 
forwards to WASM +- [x] `q2-demos/kanban/src/useSyncedAst.ts:93` — astOptions lambda accepts third positional argument +- [x] `q2-demos/kanban/src/types/wasm-quarto-hub-client.d.ts:8` — type declaration updated +- [x] `q2-demos/hub-react-todo/src/wasm.ts:79` — wrapper signature update +- [x] `q2-demos/hub-react-todo/src/useSyncedAst.ts:93` — astOptions lambda update +- [x] `q2-demos/hub-react-todo/src/types/wasm-quarto-hub-client.d.ts:8` — type declaration update +- [x] Workspace `cargo build --workspace` + `cargo nextest run --workspace` green +- [x] `cd hub-client && npm run build:all` green (WASM type alignment) +- [x] `cd hub-client && npm run test:ci` green + +### Phase 7 — q2-preview SPA integration + +- [x] `q2-preview-spa/src/PreviewApp.tsx`: baseline read via `astJsonRef` mirroring `state.astJson` (avoided new state — the ref keeps `handleSetAst`'s identity stable across re-renders, which the iframe's effect-deps care about) +- [x] Replace `noopSetAst` with real `handleSetAst` that calls `incrementalWriteQmd(content, baselineJson, newAst)` +- [x] Content-match echo-prevention: hash emitted qmd via FNV-1a, stash `(path, hash)` in `lastEmittedRef`; matching incoming `onFileContent` consumes the ref and returns early +- [x] Hash algorithm decision recorded in `fnv1aHex` docstring (FNV-1a: in-process equality, 32 bits sufficient, zero-dependency, matches existing actor-color hash pattern) +- [x] `q2-preview-spa/src/components/DiagnosticStrip.tsx` component (inline styles per existing SPA convention; ~120 LOC TSX, no separate CSS file) +- [x] DiagnosticStrip ingest from `incrementalWriteQmd` result's warnings field via `writeWarnings` state +- [x] Suppress-after-3-by-source-range mitigation in DiagnosticStrip (`suppressAfterThree` helper) +- [x] Imperative message text for Q-3-42 / Q-3-43 — catalog entries already imperative from Phase 3 (`"edit the invocation token in source instead"`); DiagnosticStrip surfaces title + problem verbatim + +### Phase 8 — 
End-to-end tests + +- [x] Hub-client: WASM-level wrapper contract test (`hub-client/src/services/incrementalWrite.wasm.test.ts`) — pins the 3-arg API, identity round-trip, paragraph-edit preservation, structured error on malformed baseline JSON. Run via `npm run test:wasm`; 3/3 passing. +- [x] Plan 3's idempotence test re-run — passes within `cargo xtask verify` (9535/9535 Rust tests, includes `crates/quarto-core/tests/idempotence.rs`). +- [ ] **Deferred to Plan 7b Phases 2 + 3** (`claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md`; consolidates `bd-3izo3`) — the broader Playwright scenario matrix (sectionized round-trip in a real hub session, single/multi-inline shortcode preservation, Q-3-42 byte-equal-no-op, Q-3-43 footnotes regeneration, SPA edit-paragraph round-trip in both project and single-file modes, SPA Q-3-42 DiagnosticStrip, mixed atomic + non-atomic, echo-prevention fixture). Each spec needs ~60 LOC of fixture/server setup and runs only under `cargo xtask verify --e2e`. The Rust-side soft-drop matrix is already exhaustively covered in `crates/pampa/src/writers/incremental.rs`; the deferred work is end-to-end *delivery* coverage, not new correctness coverage. + +### Phase 9 — Verification + cleanup + +- [x] `cargo xtask verify` green (full chain: Rust workspace + hub-build + hub-tests) — see `/tmp/plan7-phase4-6-verify.log` +- [x] **Refresh `q2 preview` WASM chain before smoke testing** (per `CLAUDE.md` §"Verifying Rust changes in `q2 preview`"; addresses the 2026-05-20 stale-WASM incident): + - [x] `cd hub-client && npm run build:wasm` — rebuild WASM from Plan 7's Rust changes + - [x] `cargo xtask build-q2-preview-spa` — bundle WASM into `q2-preview-spa/dist/` + - [x] `cargo build --bin q2` — re-embed `dist/` via `include_dir!` +- [x] q2 preview boot smoke: `cargo run --bin q2 -- preview /tmp/plan7-smoke` rendered correctly; user confirmed the preview in their browser (2026-05-24 session). 
The full edit round-trip (drag-to-trigger-handleSetAst → observe DiagnosticStrip on atomic edit) is part of the deferred Playwright matrix above. +- [ ] **Deferred to the user** — hub-client manual smoke (edit sectionized doc, observe section structure in saved qmd) and SPA manual smoke with echo-prevention assertion. The user is doing these by hand; the e2e equivalents land via Plan 7b Phases 2 + 3. +- [x] Plan 7 marked complete (Phases 1-7 + 9 done; Phase 8 partially landed, remainder tracked separately). +- [x] Bump `hub-client/changelog.md` with a one-line entry per the two-commit workflow (commit `b5d6d08a`). +- [x] Plan 9's `preimage_in` role-asymmetry e2e test reference is in Plan 9 Phase 5 (added a "Plan 7 shipped 2026-05-24" status note so the deferral state is unambiguous when Plan 9 lands). ## Notes -This is the most intricate plan in the set. It's the keystone for M3 — -once this lands, q2-preview is truly editable for the common case. Take -care with the test coverage; round-trip bugs in the writer can corrupt -source silently if not caught. +This is the most intricate plan in the set. It's the keystone for +M3 — once this lands, q2-preview is truly editable for the common +case in BOTH hub-client and the q2 preview SPA. Take care with +test coverage; round-trip bugs in the writer can corrupt source +silently if not caught. -### Soft-drop replaces hard-abort (revised from earlier draft) +### Soft-drop replaces hard-abort -An earlier draft of this plan had AtomicViolation as a hard error — any -edit to atomic content aborted the entire write. We revised to soft-drop: -each bad-edit case substitutes a safe alignment in coarsen and emits a -warning, but the user's other edits go through. The user-facing contract -"this edit must be prohibited" is honored (the bad edit doesn't apply); -the user-facing failure mode "the entire save was rejected" is not. 
-React (Plan 2B) is the primary safeguard via read-only enforcement; -the writer is the contract guarantor; if React has a hole the writer -protects without losing the user's session. +Plan 7 substitutes safe alignments in coarsen and emits warnings +rather than aborting the entire write. The user-facing contract +"this edit must be prohibited" is honored (the bad edit doesn't +reach source); the user-facing failure mode "the entire save was +rejected" is not. React (Plan 2A's framework atomic gate) is the +primary safeguard via read-only enforcement; the writer is the +contract guarantor; if React has a hole the writer protects +without losing the user's session. The let-user-win exception for block-level UseAfter on atomic -(user-replaced or -deleted atomic block via React) is a deliberate -asymmetry: when the user explicitly destroys an atomic block, we trust -them. The qmd writer's CustomNode arms know how to write fresh atomic -types from `plain_data` (Plan 8's IncludeExpansion arm reads -`plain_data["source_path"]`), so this composes through the normal -Rewrite path with no special handling. +**CustomNode** (user-replaced or -deleted via React's component +menu) is a deliberate asymmetry: when the user explicitly destroys +an atomic CustomNode through an explicit affordance, we trust +them. The qmd writer's CustomNode arms know how to write fresh +atomic types from `plain_data`. The corresponding case for +no-preimage Generated containers stays soft-drop — there's no +source position to anchor a Rewrite at. ### Filter mutations are not flagged as atomic — accepted corner -Plan 4 distinguishes filter constructions (`pandoc.Str("decoration")` → -`Synthetic { by: filter }`, atomic) from filter mutations -(`Str.text = upper(Str.text)` → keeps Original source_info, NOT atomic). 
- -A user editing a filter-mutated Str through React produces an unusual -round-trip: the user types "world" over the filter-output "HELLO"; -the writer Rewrites "world" to source; the next pipeline run filters -"world" → "WORLD". For idempotent filters (uppercase) this is fine — -the typed text round-trips through filter to itself. For non-idempotent -filters (`x => upper(x) + "!"`) the typed text gets a `!` appended on -every save, which is confusing. - -We accept this corner rather than flagging filter mutations as atomic -because (a) it would require revising Plan 4 to track filter mutations -distinctly from plain Original source_info (a notable type-system -change), (b) the runtime user-filter idempotence detection (above) -catches the AST-level non-idempotence that would actually corrupt -round-trip, and (c) Plan 3's idempotence test enforces the -contract for built-in filters at CI time. Users who write -non-idempotent filters get a warning at runtime and can decide whether -the trade-off is acceptable for their workflow. - -### The byte-provenance contract (and why the writer stays infallible) - -The contract isn't "no materialization" — that phrasing is too blunt -and conflates two cases. **The writer materializes constantly** in the -neutral sense: every Rewrite path materializes new bytes through the -qmd writer; even Verbatim copies are a kind of materialization (bytes -appearing in the saved file). The contract is more precise: the writer -only emits bytes whose origin can be honestly traced to either -**existing source bytes in the target file** (Verbatim copies, slot -preimages via `preimage_in`) or **fresh AST the user constructed** -(Rewrite paths fed by user-supplied AST nodes via the qmd writer's -normal arms). +Plan 4 distinguishes filter constructions (`pandoc.Str("decoration")` +→ `Generated { by: filter, from: [] }`, atomic) from filter +mutations (`Str.text = upper(Str.text)` → keeps Original source_info, +NOT atomic). 
+ +A user editing a filter-mutated Str through React produces an +unusual round-trip: the user types "world" over the filter-output +"HELLO"; the writer Rewrites "world" to source; the next pipeline +run filters "world" → "WORLD". For idempotent filters (uppercase) +this is fine — the typed text round-trips through filter to itself. +For non-idempotent filters (`x => upper(x) + "!"`) the typed text +gets a `!` appended on every save, which is confusing. + +We accept this corner rather than flagging filter mutations as +atomic because: +- (a) it would require revising Plan 4 to track filter mutations + distinctly from plain Original source_info (a notable type-system + change); +- (b) Plan 7a's runtime user-filter idempotence detection catches + the AST-level non-idempotence that would actually corrupt + round-trip; +- (c) Plan 3's idempotence test enforces the contract for built-in + filters at CI time. + +Users who write non-idempotent filters get a warning at runtime +and can decide whether the trade-off is acceptable. + +### The byte-provenance contract + +The contract isn't "no materialization" — that phrasing is too +blunt. **The writer materializes constantly** in the neutral +sense: every Rewrite path materializes new bytes through the qmd +writer; even Verbatim copies are a kind of materialization. The +contract is more precise: the writer only emits bytes whose origin +can be honestly traced to either **existing source bytes in the +target file** (Verbatim copies, slot preimages via `preimage_in`) +or **fresh AST the user constructed** (Rewrite paths fed by +user-supplied AST nodes via the qmd writer's normal arms). What soft-drop forbids — by structural construction — is the case -where the writer would emit bytes synthesized from a wrapper's slot -children as flat content in the parent file. 
Concretely: if Plan 8's -qmd-writer arm for `IncludeExpansion` were reached in a non-Verbatim -path, it would (under the old defensive-fallback design) walk the -wrapper's content slot and emit those blocks as flat parent-file bytes -— but those blocks come from foo.qmd, not from parent.qmd source nor -from user input. Writing them into parent.qmd would put bytes there -whose provenance is the included file, which is dishonest at the +where the writer would emit bytes synthesized from a wrapper's +slot children as flat content in the parent file. Plan 8's +qmd-writer arm for `IncludeExpansion` in a non-Verbatim path +would (under an old defensive-fallback design) walk the wrapper's +content slot and emit those blocks as flat parent-file bytes — +but those blocks come from `foo.qmd`, not from `parent.qmd` source +nor from user input. Writing them into `parent.qmd` would put bytes +there whose provenance is the included file — dishonest at the parent-file boundary. Under soft-drop, coarsen substitutes KeepBefore (Verbatim of the -wrapper's parent-file include-token bytes) before the qmd writer ever -sees that case. The arm becomes `unreachable!()` — a debug assertion -for coarsen bugs, not a user-facing failure mode. Promoting the qmd -writer to a fallible `Result` interface to make the unreachable case -recoverable would be over-engineering, since correct coarsen makes the -case structurally absent. WASM panic-abort still kills the session if -the assertion fires, but that's the same risk profile as any other -writer bug; it's not specific to atomic enforcement, and it's -reachable only via a programming error in coarsen. - -The let-user-win Rewrite path is provenance-honest: when the user -constructs a fresh `IncludeExpansion` through React (with `plain_data -= { source_path: "bar.qmd" }`) and the writer materializes -`{{< include bar.qmd >}}` into source, the bytes' origin is the user's -edit. 
Plan 8's qmd-writer arm reads `plain_data`, doesn't read -`source_info`, and emits the include syntax — same arm whether the -wrapper came from `IncludeExpansionStage` (pipeline) or from React -(user). That symmetry is what makes the let-user-win case clean. +wrapper's parent-file include-token bytes) before the qmd writer +ever sees that case. The arm becomes `unreachable!()` — a debug +assertion for coarsen bugs, not a user-facing failure mode. +Promoting the qmd writer to a fallible `Result` interface to make +the unreachable case recoverable would be over-engineering, since +correct coarsen makes the case structurally absent. + +The let-user-win Rewrite path for atomic CustomNodes is +provenance-honest: when the user constructs a fresh +`IncludeExpansion` through React (with `plain_data = { source_path: +"bar.qmd" }`) and the writer materializes `{{< include bar.qmd >}}` +into source, the bytes' origin is the user's edit. Plan 8's +qmd-writer arm reads `plain_data`, doesn't read `source_info`, +and emits the include syntax — same arm whether the wrapper came +from `IncludeExpansionStage` (pipeline) or from React (user). That +symmetry is what makes the let-user-win case clean. + +The corresponding case for no-preimage Generated containers +soft-drops instead of let-user-win because those containers have +no parent-file source position — Rewrite would have nowhere to +write. The user's edit is rejected with Q-3-43; the original +content regenerates from baseline metadata on the next pipeline +run. + +### Decomposition of operations + +Plan 7's surface change — `incremental_write_qmd` takes a baseline +AST instead of parsing internally — is a small step in a larger +decomposition. The four primitives (parse / transform / reconcile / +write) are already implemented as separate Rust functions. Plan 7 +makes the WASM boundary reflect that decomposition: the writer's +WASM entry doesn't conflate the parse step with the write step +anymore. 
The caller composes parse + transform separately (or +re-uses an already-rendered AST from a prior call), then hands two +ASTs and the source bytes to the writer. + +This decomposition makes future pipeline kinds free: the writer +doesn't need a new parameter for each new kind, because it doesn't +know what a pipeline is. The caller picks which render function to +call; the writer just diffs. + +## Follow-ups closed + +- **`CoarsenedEntry::Rewrite` carried `new_idx` instead of + pre-computed text** (Phase 2 design vestige). + Closed 2026-05-25 by + [`coarsened-entry-self-contained`](./2026-05-25-coarsened-entry-self-contained.md). + The `result_idx is unused for child Rewrites (...not exercised by + today's synthesizers)` comment introduced in commit `9a473fe9` was + accurate at the time, but became reachable once Plan 7c Phase 8 + (`bdcfdc53`) added a Transparent-recursion path in `coarsen_blocks` + for changed wrappers. The fix lifts `Rewrite` to carry + `block_text: String` (matching `InlineSplice`'s precedent), making + every `CoarsenedEntry` variant self-contained. The contract is + documented in + [`incremental-writer-contract.md`](../designs/incremental-writer-contract.md) + §"`CoarsenedEntry` self-containment". 
diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-7a-filter-idempotence.md b/claude-notes/plans/2026-05-04-q2-preview-plan-7a-user-filter-idempotence.md similarity index 69% rename from claude-notes/plans/2026-05-04-q2-preview-plan-7a-filter-idempotence.md rename to claude-notes/plans/2026-05-04-q2-preview-plan-7a-user-filter-idempotence.md index 0985ab4d8..168f5492f 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-7a-filter-idempotence.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-7a-user-filter-idempotence.md @@ -1,11 +1,20 @@ # Plan 7a — Runtime user-filter idempotence check + opt-out -**Date:** 2026-05-05 +**Date:** 2026-05-05 (revised 2026-05-20) **Branch:** feature/q2-preview **Status:** Implementation plan (open questions named) **Milestone:** none directly — extends M3 with an opt-in safety check; doesn't block the milestone +## Epic context + +Part of the **provenance epic** (Plans 3–8). Plan 7a is the reliability +follow-up to Plan 7: once the writer round-trips correctly for +idempotent filters, this plan adds runtime detection for the +non-idempotent case, with attribution to the offending filter and a +declarative opt-out. The file name keeps its q2-preview-plan-N form for +continuity with the earlier discussion notes. + ## Goal Detect when a user's Lua filter chain breaks q2-preview's round-trip @@ -41,8 +50,10 @@ specifically: current test does not catch it**. Plan 7a's runtime check targets **round-trip non-idempotence** — -parse, pipeline, write, parse, pipeline, hash-compare. See §"Plan 3 -strengthening" below. +parse, pipeline, write, parse, pipeline, hash-compare. Plan 3 covers +flavor (1) at CI time for built-ins; Plan 7a covers flavor (2) at +runtime for user filters. Built-in filter round-trip is not covered +by any current plan — see §"Notes" for the rationale. ## Scope @@ -89,7 +100,9 @@ strengthening" below. 2. Serialize AST_1 via the qmd writer → qmd_1. 3. Run pipeline on qmd_1 → AST_2. 4. 
Compare `compute_blocks_hash_fresh(&AST_1.blocks)` vs - `compute_blocks_hash_fresh(&AST_2.blocks)`. + `compute_blocks_hash_fresh(&AST_2.blocks)`, and the parallel + `compute_meta_hash_fresh(&AST_1.meta)` vs `(&AST_2.meta)` (new + helper landing in Plan 3). - **Per-filter attribution**: when the whole-pipeline check fails, run the same round-trip with each filter active in isolation (others stubbed). Filters whose isolated round-trip fails are named in the @@ -101,6 +114,20 @@ strengthening" below. - Hint: `Fix the filter to produce stable output, or add idempotent: false to its config in _quarto.yml to silence this check.` - Location: filter file path; no document-side range (the warning is about the filter, not a place in the active doc). + - **Sectionize-wrapper-aware hint (optional, follow-up).** A + common Lua-author error is `pandoc.walk_block(doc.blocks[1], …)` + intending to touch the user's first paragraph — but after + `SectionizeTransform` runs, `doc.blocks[1]` is the synthesized + sectionize Div and the walk operates on the wrapper, not the + user content. Idempotence detection sees the divergence and + fires Q-3-44 correctly, but the hint doesn't help the author + diagnose. When the AST diff is concentrated under a + `is_transparent_wrapper(doc.blocks[0])` (see + [`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md)), + extend the hint with: "Note: your filter may be walking into a + sectionize wrapper. Use `doc.blocks[1].content[0]` to reach + the first user block, or iterate `doc.blocks` recursively + skipping wrapper Divs." - **`Q-3-45` diagnostic** (Info severity), three-variant body: - Title (all variants): `Filter exempted from idempotence checking` - Problem (UserConfig source): `idempotent: false set in <path>. Edits may cause unintended changes elsewhere in the document.` @@ -113,11 +140,12 @@ strengthening" below. ### Out of scope -- **Strengthening Plan 3's CI test to round-trip flavor**. 
Plan 3 - currently catches non-determinism (`pipeline(x)` twice); the - round-trip flavor (`pipeline(write(pipeline(x)))`) catches a - different class of bug. See §"Plan 3 strengthening" below — flagged - as a Plan 3 amendment, not Plan 7a. +- **Extending the runtime round-trip check to built-in filters**. + Plan 7a's check fires only for filters in `Vec<FilterMetadata>` with + `source = UserConfig` or `Extension`; ship-with-Quarto Lua filters + (today: just `video-filter.lua`) are not on that list. Built-in + filter round-trip is unverified anywhere — see §"Notes" for the + reasoning behind not closing this gap in v1. - **File watchers for filter sources**. Demand-driven invalidation via `filter_sources_hash` on next render is sufficient. The user edits a filter, opens the document, hash mismatches, check re-runs. @@ -130,8 +158,9 @@ strengthening" below. filter would block the first save by O(filter_count) pipeline passes. Acceptable for v1; revisit if reports come in. - **Idempotence checks on built-in filters at runtime**. Plan 3's CI - test (after the strengthening noted above) is the right place for - built-ins. Runtime check is for user-supplied filters specifically. + test is the right place for the pipeline-determinism property on + built-ins. The round-trip property on built-ins is unverified — see + the bullet above and §"Notes." ## Design decisions @@ -139,7 +168,8 @@ strengthening" below. check serializes the first pass's AST through the qmd writer and re-parses, mirroring the actual round-trip the writer performs. Pipeline determinism is a weaker property; we get that for free - from Plan 3's CI test (after strengthening). + from Plan 3's CI test (which covers pipeline non-determinism for + built-in transforms and the one built-in Lua filter). - **Cache verdict per session, persisted in IndexedDB**. The cache key includes `filter_sources_hash` (filter file bytes + opt-out flags). 
Surviving session boundaries is correct: if filter sources @@ -236,38 +266,6 @@ Total cost when an issue is detected: 2 + 2N pipeline runs (one whole-set check, two per filter for attribution). For 5 filters, ~12 runs. Bounded; acceptable on first edit per session, cached after. -## Plan 3 strengthening (out of scope here, flagged for the team) - -Plan 3's idempotence test (currently `run_pipeline(fixture)` twice on -the same source) catches **pipeline non-determinism** — filters that -use time, RNG, or mutable state. It does **not** catch -**round-trip non-idempotence** — the `f(f(x)) ≠ f(x)` case where the -filter is deterministic but produces different output when re-applied -to its own output through the qmd writer. - -Plan 7a's runtime check targets round-trip non-idempotence, which is -the property that actually matters for q2-preview's writer. The Plan 3 -CI test should be strengthened to also run this flavor, so that -built-in filters are CI-verified for both: - -```rust -// Existing Plan 3 test: pipeline non-determinism -let ast_1 = run_pipeline(fixture, ...); -let ast_2 = run_pipeline(fixture, ...); -assert_eq!(blocks_hash(&ast_1), blocks_hash(&ast_2)); - -// New: round-trip idempotence (the property that breaks q2-preview) -let ast_a = run_pipeline(fixture, ...); -let qmd_a = qmd_write_to_string(&ast_a); -let ast_b = run_pipeline(&qmd_a, ...); -assert_eq!(blocks_hash(&ast_a), blocks_hash(&ast_b)); -``` - -This change is small (~30 lines) and lands in Plan 3's test file. -**Recommend adding to Plan 3's scope as an amendment** rather than to -Plan 7a, since it concerns built-in filter coverage at CI time, not -runtime behavior. - ## Open questions for implementation - **Cross-session cache validity**: the profile cache persists. Should @@ -304,6 +302,30 @@ runtime behavior. flow through the same path. 
Confirm they reach the diagnostic panel and are visually distinguishable from pipeline warnings (or acceptably co-mingled — TBD by hub-client UX, same as Q-3-42/Q-3-43). +- **Per-Lua-line attribution (Plan 10 follow-up)**: Q-3-44 today + references the filter file path via `<path>` read from + `FilterMetadata.spec` (the filter spec, not from `by.data` on any + Generated node), so Plan 7a is structurally independent of `By`'s + data shape. When **Plan 10** + (`claude-notes/plans/2026-05-22-provenance-plan-10-dispatch- + anchor.md`) lands, filter-constructed nodes carry a `Dispatch` + anchor pointing at a typed + `Original{lua_file_id, line_start, line_end}`. The Q-3-44 diagnostic + can then sharpen "filter `<path>` is not idempotent" to "filter + `<path>` line `<line>` is not idempotent" — pointing at the specific + Lua-side construction site. The migration is purely additive — read + the Dispatch anchor when present, fall back to filter-spec path + when absent. Deferred until Plan 10 lands; the current + `<path>`-only diagnostic is actionable. + +- **`filter_sources_hash` coordination with Plan 10.** Plan 7a + defines `filter_sources_hash` (SHA-256 over filter file bytes + + opt-out flags) as a `Pass1KeyInputs` field. Plan 10 Phase 7 + also wants Lua-filter-file content to invalidate `pass1_key`. + Since Plan 7a lands first, **Plan 10 reuses Plan 7a's + `filter_sources_hash` field** rather than introducing a parallel + hash. Plan 10's Phase 7 task reduces to: confirm the field + exists, confirm semantics match, no new field added. ## References @@ -324,10 +346,21 @@ runtime behavior. - Plan 7 — the q2-preview pipeline + qmd writer this check supports. The check uses Plan 7's `pipeline_kind: Some("preview")` machinery for both passes. -- Plan 3 — CI-time idempotence verification for built-in filters. - Plan 3 strengthening (above) extends the test to round-trip flavor. 
-- Plan 4 — `By` types; `is_atomic_synthesizer()` is unrelated to this - plan but the runtime check shares the source-info-blind hash. +- Plan 3 — CI-time pipeline-determinism verification for built-in + transforms and the one built-in Lua filter. Plan 3 ships + `compute_meta_hash_fresh` which this plan reuses for the meta + comparison in the round-trip check. The transform/filter-author + contract Plan 3 enforces is documented at + `claude-notes/instructions/idempotence-contract.md`; new transforms + on both the built-in and user-filter sides must meet it. +- Plan 4 — `By` types; `is_atomic_kind()` is unrelated to this plan + but the runtime check shares the source-info-blind hash. +- Plan 10 (`claude-notes/plans/2026-05-22-provenance-plan-10- + dispatch-anchor.md`) — Lua-file registration in `SourceContext`; + prerequisite for the per-Lua-line attribution refinement noted + under "Open questions" above. Plan 7a lands first; Plan 10 + reuses Plan 7a's `filter_sources_hash` field per the + cross-plan coordination note in §Open questions. ## Test plan @@ -365,6 +398,47 @@ runtime behavior. Q-3-45 variants match their respective bodies; hint text mentions the opt-out path. +- **Filter-mutation round-trip behavior test** (added 2026-05-25 + from code-review pass on Plan 7). The writer contract + (`claude-notes/designs/incremental-writer-contract.md`, + §"Filter mutations versus constructions") admits this corner: + a filter that *mutates* an existing node (rather than + *constructs* a new one) leaves the input's `Original` + source_info untouched, so the editability gate treats the + resulting text as editable. When the filter is non-idempotent + (`x => upper(x) + "!"`), the user's typed text round-trips as + `TYPED!` on the first save, `TYPED!!` on the next, etc. 
+ + The Plan-7a runtime warning catches this — but the contract + doc doesn't pin *when* the warning fires: + - Does it fire on the first save (writer detects the filter + is non-idempotent at the AST level)? + - Does it fire only after a second save shows divergence + between consecutive pipeline runs? + - Does it suppress on subsequent saves to avoid flooding? + + Test plan (one fixture, three assertions): + + 1. Build a single-file doc with a non-idempotent user filter + (the canonical `f(x) = upper(x) + "!"` shape). + 2. Render once. Assert Q-3-44 fires with `filter_path = + "f.lua"`. Capture the warning ID. + 3. Simulate a user edit on a filter-mutated `Str`. Round-trip + through `incremental_write`. Re-render. Assert *either* the + same Q-3-44 fires again, *or* it's suppressed (whichever + the implementation picks) — but the test pins the choice. + 4. Repeat step 3 with no user edit (re-render of the same + content). Assert the warning behaviour matches step 3 — the + existence of a user edit doesn't change the diagnostic + surface; the filter's non-idempotence does. + + Output: the test pins behaviour and the assertion comments + document the contract. If the implementation prefers "fire + on every render" (loud, recoverable), the test asserts that. + If it prefers "fire once per cache key" (quiet, requires the + cache), the test asserts that. Either way, future contributors + read the test and know what behaviour is contracted. + ## Dependencies - **Depends on**: Plan 7 (the q2-preview transform pipeline + qmd writer @@ -374,9 +448,11 @@ runtime behavior. `Synthetic`/`Derived` content for realism). - **Blocks**: nothing structurally; this is a reliability improvement, not a milestone deliverable. -- **Related**: Plan 3 (CI-time test for built-in filters) — would - benefit from the strengthening proposed in §"Plan 3 strengthening" - above. +- **Related**: Plan 3 (CI-time pipeline-determinism test for built-in + transforms and the one built-in Lua filter). 
Plan 3 ships + `compute_meta_hash_fresh` / `compute_meta_hash_fresh_excluding_rendered` + in `quarto-ast-reconcile`; this plan reuses both for the meta + comparison in the round-trip check. ## Risk areas @@ -416,7 +492,8 @@ runtime behavior. | Q-3-44 / Q-3-45 catalog entries + builders | ~50 | | Session cache integration | ~40 | | Tests (unit + integration) | ~250 | -| **Total** | **~600** | +| Filter-mutation round-trip behavior test (added 2026-05-25) | ~60 | +| **Total** | **~660** | Single focused session. Risk: per-filter attribution may surface unexpected interactions; budget a second session if attribution proves @@ -430,10 +507,26 @@ it out keeps Plan 7 focused on the writer's coarsen + soft-drop logic gate the milestone. The check is targeted at user-supplied Lua filters. Built-in filters -that ship with Quarto are covered by Plan 3 (CI-time, with the -strengthening proposed above). User filters can't be statically -analyzed for idempotence (uncomputable for arbitrary Lua), so the -runtime check via double-pass-and-hash is the available mechanism. +that ship with Quarto are covered by Plan 3 for the +pipeline-determinism property only (`pipeline(x)` twice, same source, +hash-compare). The round-trip property +(`pipeline(write(pipeline(x))) == pipeline(x)`) is **not** verified +for built-ins anywhere in the epic. This gap is accepted in v1 +because: + +1. The built-in Lua filter universe is one filter today + (`video-filter.lua`); its idempotence is easy to read from source. +2. Round-trip is exercised in production by Plan 7's incremental + writer; a non-idempotent built-in would surface as user-visible + text drift, which we'd find via dogfooding before Plan 7 ships. +3. Extending Plan 7a's runtime check to also fire for built-in + filters is a small change to `FilterMetadata` filtering (a + `Vec::iter()` predicate), tracked as a follow-up if the gap + bites. 
+ +User filters can't be statically analyzed for idempotence +(uncomputable for arbitrary Lua), so the runtime check via +double-pass-and-hash is the available mechanism. The opt-out (`idempotent: false`) gives users intentional escape — a timestamp-emitting filter can declare itself non-idempotent and silence diff --git a/claude-notes/plans/2026-05-04-q2-preview-plan-8-include-roundtrip.md b/claude-notes/plans/2026-05-04-q2-preview-plan-8-include-roundtrip.md index 42edae14d..ca4ba72ea 100644 --- a/claude-notes/plans/2026-05-04-q2-preview-plan-8-include-roundtrip.md +++ b/claude-notes/plans/2026-05-04-q2-preview-plan-8-include-roundtrip.md @@ -1,41 +1,69 @@ # Plan 8 — Include round-trip via IncludeExpansion CustomNode -**Date:** 2026-05-04 +**Date:** 2026-05-04 (revised 2026-05-20) **Branch:** feature/q2-preview **Status:** Implementation plan (open questions named) **Milestone:** M4 (documents with `{{< include >}}` are no longer read-only; edits outside includes round-trip cleanly; edits inside are prohibited) +## Epic context + +Part of the **provenance epic** (Plans 3–8). Plan 8 is the last plan in +the epic: it lights up include round-trip via a `CustomNode` wrapper +that consumes the atomic-detection infrastructure Plan 7 ships. The +file name keeps its q2-preview-plan-N form for continuity with the +earlier discussion notes. + ## Goal -Modify `IncludeExpansionStage` to wrap each include's expanded blocks in a -`CustomNode("IncludeExpansion")` whose `source_info` points to the include -shortcode token in the parent file. This gives the incremental writer an -anchor for the include token's source bytes — round-trip preserves -`{{< include foo.qmd >}}` verbatim when the user doesn't touch it. +Modify `IncludeExpansionStage` to wrap each include's expanded blocks in +a `CustomNode("IncludeExpansion")` whose `source_info` is the original +`{{< include foo.qmd >}}` Paragraph's `source_info` (`Original` pointing +at the parent file's include-token bytes). 
This gives the incremental +writer an anchor for the include token's source bytes — round-trip +preserves `{{< include foo.qmd >}}` verbatim when the user doesn't touch +it. This plan also adds the qmd-writer arm for `CustomNode("IncludeExpansion")` and the React component (transparent passthrough that doesn't propagate -`setLocalAst` to slot children — shipped here, not in Plan 2C; see Plan 2C's -2026-05-10 third-pass amendment for the deferral rationale). The writer's -atomic-violation logic from Plan 7 enforces the "edits inside an include -are prohibited" contract — `IncludeExpansion` is registered in -`is_atomic_custom_node`. +`setLocalAst` to slot children — shipped here, not in Plan 2C; see Plan +2C's 2026-05-10 third-pass amendment for the deferral rationale). +The writer's atomic-violation logic from Plan 7 enforces the "edits +inside an include are prohibited" contract — `IncludeExpansion` is +registered in `is_atomic_custom_node`. When this plan lands, M4 is reached: documents with includes are -fully-functional in q2-preview's read+edit mode (with edits outside includes -round-tripping; edits inside surfacing as diagnostics). +fully-functional in q2-preview's read+edit mode (with edits outside +includes round-tripping; edits inside surfacing as Q-3-43 diagnostics). ## Scope ### In scope -- Modify `IncludeExpansionStage` (`crates/quarto-core/src/stage/stages/include_expansion.rs`) - to wrap inserted blocks in a `Block::Custom(CustomNode { type_name: +- Modify `IncludeExpansionStage` + (`crates/quarto-core/src/stage/stages/include_expansion.rs`) to wrap + inserted blocks in a `Block::Custom(CustomNode { type_name: "IncludeExpansion", … })` instead of splicing them flat. The wrapper's - `source_info` is the original Paragraph's source_info (the include + `source_info` is the original Paragraph's `source_info` (the include shortcode token's range in the parent file). `plain_data` carries `{ "source_path": "", "atomic": true }`. 
+- **The wrapper's `source_info` stays `Original`, NOT `Generated`** — + see "Why the wrapper is Original" below. + + This is also what keeps `IncludeExpansion` from being a + *transparent wrapper* in the sense of + [`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md). + The writer's descent helpers (`derive_target_file_id`, + `first_target_anchored_start_in`) stop at the wrapper and read + *its* `Original` source_info — which is exactly right: the + include-token bytes live in the parent qmd, that's where the + metadata region and the file id come from. If a future variant + ever emits an `IncludeExpansion` with `Generated` source_info + at the top of a parent document, descent would skip into the + *child* qmd's bytes and the parent's frontmatter would + silently mis-extract — add a debug-assert in + `IncludeExpansionStage` that the wrapper's `root_file_id()` + matches the parent. - Update the qmd writer (`pampa/src/writers/qmd.rs` CustomNode arm) to handle `"IncludeExpansion"`. Two paths: - **Verbatim path** (KeepBefore in Plan 7's coarsen): nothing to do — @@ -47,35 +75,42 @@ round-tripping; edits inside surfacing as diagnostics). The arm reads `plain_data` only — it does NOT inspect `source_info`, so it works identically for pipeline-emitted wrappers (Original source_info pointing at the parent file's include token) and - user-constructed wrappers (Synthetic { by: user_edit } source_info - from React). This is the path that fires when the user replaces or - adds an include via a React UI. + user-constructed wrappers (`Generated { by: user_edit, from: [] }` + source_info from React). This is the path that fires when the user + replaces or adds an include via a React UI. - **Unreachable path** (RecurseIntoContainer on atomic with inner changes): under Plan 7's soft-drop semantics, coarsen substitutes KeepBefore for this case before the qmd writer ever sees it. 
The arm includes `unreachable!("coarsen should have substituted KeepBefore for atomic CustomNode in RecurseIntoContainer; this branch indicates a coarsen bug")` as a debug assertion. +- Add an `IncludeExpansionResolveTransform` to the **Normalization + Phase** (symmetric with `CalloutResolveTransform`), running in the + HTML pipeline only (not q2-preview). Unwraps + `CustomNode("IncludeExpansion")` back into flat blocks for the HTML + writer to handle generically. See "HTML pipeline resolve transform" + below. - Add a React component for `IncludeExpansion` at - `hub-client/src/components/render/q2-preview/custom/IncludeExpansion.tsx` - (q2-preview's built-in custom-node registry, post-2pre / 2B / 2C). Plan - 2C deferred the placeholder per its third-pass amendment (2026-05-10): - until Plan 8 lands the AST node, `Fallback.tsx` covers the unknown - `type_name` gracefully, and Plan 8 ships the real component together - with the AST node and the `atomicCustomNodes.ts` addition: + `ts-packages/preview-renderer/src/q2-preview/custom/IncludeExpansion.tsx` + (q2-preview's built-in custom-node registry, post-2pre / 2B / 2C). + Plan 2C deferred the placeholder per its third-pass amendment + (2026-05-10): until Plan 8 lands the AST node, `Fallback.tsx` covers + the unknown `type_name` gracefully, and Plan 8 ships the real + component together with the AST node and the `atomicCustomNodes.ts` + addition: - Transparent passthrough: render the content slot's blocks normally. - Read-only: do not pass `setLocalAst` to slot children (enforced via - the framework's atomic-aware dispatcher in `framework/dispatchers.tsx` + the framework's atomic-aware dispatcher in `framework/dispatch.tsx` reading - `hub-client/src/utils/atomicCustomNodes.ts`). + `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts`). - Visual indicator (optional): subtle background tint or hover badge "from foo.qmd". 
- Register `"IncludeExpansion"` in **both** sides of the atomic registry: Rust `ATOMIC_CUSTOM_NODES` const (Plan 7 introduces the const + `is_atomic_custom_node()` function) and TypeScript hand-mirror - `hub-client/src/utils/atomicCustomNodes.ts` (Plan 2A introduces the - file with the initial `["CrossrefResolvedRef"]` set). Plan 8 amends - both to add `"IncludeExpansion"`. + `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` (Plan + 2A introduces the file with the initial `["CrossrefResolvedRef"]` + set). Plan 8 amends both to add `"IncludeExpansion"`. - Tests covering: - Untouched include: round-trip preserves `{{< include foo.qmd >}}`. - Edit outside include: that paragraph rewrites; include token preserved. @@ -99,6 +134,11 @@ round-tripping; edits inside surfacing as diagnostics). - Resolving include shortcodes outside the standard `Paragraph[Shortcode("include")]` form (current behavior preserved — only top-level paragraph-form includes are handled). +- Attributing the include line in HTML rendering. The + `IncludeExpansionResolveTransform` unwraps the wrapper before + `AttributionRenderTransform` runs, so HTML output has no DOM anchor + for the include-line author. See "HTML attribution" below — this is + intentional v1 behavior. ## Design decisions (settled in conversation) @@ -114,22 +154,154 @@ round-tripping; edits inside surfacing as diagnostics). synthesized from the wrapper's slot children into the parent file, because those bytes' origin is the included file, not parent.qmd source nor user input. -- **Source_info on the wrapper points to the original Paragraph**, not to - the inner Shortcode. The Paragraph's range covers the whole `{{< include >}}` - line (including any whitespace/newline padding); the Shortcode's covers - just the token. Paragraph gives a cleaner verbatim copy. -- **Nested includes produce nested wrappers naturally**. 
When the include - expansion processes a child file that itself has includes, recursion - produces nested CustomNode wrappers. Each wrapper anchors at its own - parent-file include token. Round-trip semantics compose: untouched at any - level → preserved; touched at any level → atomic-violation at the deepest - affected wrapper. +- **Source_info on the wrapper points to the original Paragraph**, not + to the inner Shortcode. The Paragraph's range covers the whole + `{{< include >}}` line (including any whitespace/newline padding); + the Shortcode's covers just the token. Paragraph gives a cleaner + verbatim copy. +- **Nested includes produce nested wrappers naturally**. When the + include expansion processes a child file that itself has includes, + recursion produces nested CustomNode wrappers. Each wrapper anchors + at its own parent-file include token. Round-trip semantics compose: + untouched at any level → preserved; touched at any level → + atomic-violation at the deepest affected wrapper. - **React component is read-only** (Plan 8 ships the per-type IncludeExpansion component, deferred from Plan 2C per its third-pass - amendment; Plan 2B's framework atomic gate enforces read-only behavior; - Plan 2A introduces the atomic-registry hand-mirror). The IncludeExpansion - component does not pass `setLocalAst` to children. This is the primary - enforcement; the writer's atomic-violation is the contract guarantor. + amendment; Plan 2B's framework atomic gate enforces read-only + behavior). The IncludeExpansion component does not pass + `setLocalAst` to children. This is the primary enforcement; the + writer's atomic-violation is the contract guarantor. +- **Render-side resolve, not writer arm.** The HTML writer stays + generic — it doesn't grow knowledge of `IncludeExpansion`.
The + `IncludeExpansionResolveTransform` unwraps in the Normalization Phase + (symmetric with `CalloutResolveTransform`), and the unwrapped blocks + flow through the rest of the HTML pipeline normally. This preserves + the Pandoc / Quarto convention of "resolve to standard AST before + writers" — see "Considered alternatives" below. + +## Why the wrapper is Original + +The wrapper's `source_info` is `Original`, inherited from the original +Paragraph it substitutes for. This may look inconsistent with Plan 6's +audit (which puts other transform-synthesized wrappers like Sectionize +into `Generated`), but it follows a principled rule: + +**Two pieces of provenance information** need to land somewhere when a +transform synthesizes a node: + +1. **Generator identity** — "which transform produced me." +2. **Source anchor** — "which source bytes are this node's canonical preimage." + +For non-CustomNode synthesized nodes (Sectionize's Section Div, +filter-constructed Str, footnotes container Div), there's no other slot +for (1), so `source_info` carries both via `Generated { by, from }`. + +For CustomNode synthesized nodes, (1) is **already encoded** in +`CustomNode.type_name`. The wrapper *is* an `IncludeExpansion` by +virtue of `type_name`; there's no need for `source_info` to also say +"I was made by IncludeExpansionStage." So `source_info` only has to do +(2) — and the natural shape for (2) when the wrapper substitutes 1:1 +for a source-mapped parser node is the inherited `Original`. + +This isn't a Plan 8 invention — it's the existing pattern for every +source-mapped CustomNode in the codebase: + +| CustomNode `type_name` | Source-mapped from | `source_info` shape | +|---|---|---| +| `Callout` | `:::{.callout-warning} … :::` Div | Original (inherited) | +| `Theorem` / `Proof` / etc. 
| `:::{.theorem #thm-foo} … :::` Div | Original (inherited) | +| `CrossrefResolvedRef` | `@thm-foo` Cite | Original (inherited) | +| `FloatRefTarget` | Figure / table / listing Div | Original (inherited) | +| `IncludeExpansion` (Plan 8) | `{{< include foo.qmd >}}` Paragraph | Original (inherited) | + +In contrast, Sectionize's Section Div is NOT a CustomNode (it's a +plain Div) AND it doesn't 1:1-substitute for a source-mapped parser +node (it's a structural grouping over a Header + its body). So its +`source_info` has to carry generator identity via `Generated { by: +sectionize, from: [] }`. + +**The rule, in one sentence**: a synthesized node uses **Original** +`source_info` if and only if it is a CustomNode whose 1:1 source +preimage is a parser-emitted node. Everything else uses **Generated**. + +See Plan 4's "Original vs Generated on synthesized nodes" section for +the full taxonomy. + +## HTML pipeline resolve transform + +The wrapper change applies to the `IncludeExpansionStage`, which runs +in BOTH the HTML and q2-preview pipelines. For HTML output, the +wrapper would otherwise reach the HTML writer with no native rendering +arm for `IncludeExpansion`. The cleanest fix: + +Add `IncludeExpansionResolveTransform` that runs ONLY in the HTML +pipeline (not q2-preview, where the React `IncludeExpansion` component +handles rendering directly). Unwraps `CustomNode("IncludeExpansion")` +back into flat blocks — the slot's `content` Blocks become siblings of +the surrounding content. The HTML writer then processes the flat +blocks generically. + +**Placement**: Normalization Phase, symmetric with +`CalloutResolveTransform` (`crates/quarto-core/src/pipeline.rs:988`). +Like Callout, the resolve fires early so the rest of the pipeline sees +standard AST. `Q2_PREVIEW_TRANSFORM_EXCLUDED` lists +`"callout-resolve"`; add `"include-expansion-resolve"` to that list so +q2-preview keeps the wrappers for React rendering. 
+ +## HTML attribution + +When the resolve transform unwraps the wrapper, the wrapper's +`source_info` (Original pointing at the parent's include token) is +gone before `AttributionRenderTransform` runs at the tail of the +Finalization Phase. Consequences: + +- The unwrapped included blocks have `file_id != 0` (foo.qmd's + FileId). `query_attribution` skips them per the v1 single-doc + invariant. **No attribution on included content in HTML.** +- The include-line author has no node to be attributed against. The + Paragraph that previously held `{{< include foo.qmd >}}` was deleted + by `IncludeExpansionStage`. **No attribution on the include line in + HTML.** + +This matches what current main produces (without Plan 8, the include +line and its content are also un-attributed in HTML output), so it's +not a regression. It's *intentional* v1 behavior: in the rendered HTML, +there's no DOM element that represents "the include line" — those +source bytes don't appear in the rendered output. Attributing them +would require synthesizing a wrapping HTML element, which is +inconsistent with the "resolve to standard AST" convention. + +**q2-preview attributes the include line correctly.** q2-preview +excludes the resolve transform, so the wrapper survives all the way to +JSON serialization and React. `AttributionRenderTransform` visits the +wrapper, resolves its `Original` source_info to a byte range, and +records the include-line author via the existing `query_byte_range` +max-time logic. The React `IncludeExpansion` component receives the +attribution record and surfaces it as the authorship pill on the +wrapper region. + +When v2 multi-file blame lands (`crates/quarto-core/src/attribution/types.rs:58` +flags this as v1-only), the unwrapped HTML children gain attribution +from foo.qmd's blame. The HTML include-line itself remains +un-attributed because there's still no DOM anchor — that's a structural +property of HTML output, not a v1 limitation we plan to remove. 
+ +## Considered alternatives + +**Option C — render `IncludeExpansion` natively in the HTML writer.** +Investigated during the 2026-05-20 design discussion. Cleaner for v2 +attribution (the wrapper would survive to the HTML writer, which could +emit a `
` with the include-line author's +`data-attr-*`). Rejected because it breaks the Pandoc / Quarto +convention of resolving CustomNodes to standard AST before writers see +them. The convention is load-bearing: it lets each new output format +(future Typst, future PDF) work generically without growing +CustomNode-specific arms. + +The decision is recoverable if needed — the type definitions and +wrapper shape don't change. Switching to native rendering later means +dropping the resolve transform and adding writer arms; it doesn't +require revising Plan 8's type design. ## The wrapper structure @@ -145,7 +317,7 @@ Block::Custom(CustomNode { "atomic": true, }), attr: ("".to_string(), vec![], LinkedHashMap::new()), - source_info: paragraph.source_info.clone(), // include token's parent-file bytes + source_info: paragraph.source_info.clone(), // Original{parent, include_token_range} }) ``` @@ -163,19 +335,19 @@ walk — same visual outcome, just no per-type styling). - Both pipeline runs (live and baseline) produce identical wrappers. - Reconciler picks `KeepBefore` for the wrapper. - Plan 7's coarsen sees `is_atomic_custom_node("IncludeExpansion") == true` - → goes the Verbatim path because `preimage_in(parent_file)` returns the - include token's byte range (the wrapper's source_info is `Original{parent, - start, end}`). + → goes the Verbatim path because `preimage_in(parent_file)` returns + the include token's byte range (the wrapper's source_info is + `Original{parent, start, end}`). - `assemble` copies `original_qmd[start..end]` — the literal `{{< include foo.qmd >}}` text. ✓ Source preserved. **Case 2 — edit outside include, untouched include in same doc**: -- Reconciler's plan has `KeepBefore` for the include wrapper, mixed alignments - for other blocks. -- The include wrapper goes through the Verbatim path (case 1 above). Other - blocks are handled per their own alignment. The include token in source - is preserved verbatim. Edit outside is rewritten. 
+- Reconciler's plan has `KeepBefore` for the include wrapper, mixed + alignments for other blocks. +- The include wrapper goes through the Verbatim path (case 1 above). + Other blocks are handled per their own alignment. The include token + in source is preserved verbatim. Edit outside is rewritten. **Case 3 — edit inside the include (somehow)**: @@ -192,7 +364,8 @@ walk — same visual outcome, just no per-type styling). the include's `source_path` from `plain_data`: "Edit inside `{{< include foo.qmd >}}` was not saved. To edit this content, open foo.qmd directly." Save **succeeded** (other edits applied); - warning surfaces in the diagnostic panel. + warning surfaces in the diagnostic panel (hub-client) or + DiagnosticStrip (SPA). **Case 3b — user replaces or deletes the include via React**: @@ -204,10 +377,10 @@ walk — same visual outcome, just no per-type styling). qmd writer; the include is gone from output. - If the user replaced the include with a fresh IncludeExpansion (e.g., changed `foo.qmd` to `bar.qmd` via a hypothetical UI), the new - wrapper has Synthetic { by: user_edit } source_info and - `plain_data["source_path"] = "bar.qmd"`. The qmd writer's arm reads - `plain_data` and emits `{{< include bar.qmd >}}`. No warning — the - user's intent is clear. + wrapper has `Generated { by: user_edit, from: [] }` source_info + and `plain_data["source_path"] = "bar.qmd"`. The qmd writer's arm + reads `plain_data` and emits `{{< include bar.qmd >}}`. No warning + — the user's intent is clear. **Case 4 — nested includes**: @@ -227,50 +400,65 @@ walk — same visual outcome, just no per-type styling). ... ] ``` -- The outer wrapper's source_info points to parent.qmd's bytes. The inner - wrapper's source_info points to foo.qmd's bytes (via the FileId remap). -- Round-trip in parent.qmd: outer's `preimage_in(parent_file)` returns the - parent's include token range. Verbatim copy preserves - `{{< include foo.qmd >}}` in parent.qmd. 
The inner wrapper's bytes never - get serialized because the outer's Verbatim wins. +- The outer wrapper's source_info is Original pointing at parent.qmd's + bytes. The inner wrapper's source_info is Original pointing at + foo.qmd's bytes (via the FileId remap). +- Round-trip in parent.qmd: outer's `preimage_in(parent_file)` returns + the parent's include token range. Verbatim copy preserves + `{{< include foo.qmd >}}` in parent.qmd. The inner wrapper's bytes + never get serialized because the outer's Verbatim wins. ## Open questions for implementation -- **`source_path` accuracy**: the literal arg from the shortcode (`"foo.qmd"`) - is what we re-emit on save. Plan 7's Verbatim copy path doesn't use it - (we copy bytes), but the Rewrite path (let-user-win for fresh - user-constructed IncludeExpansion) does. Make sure the - IncludeExpansionStage stores the literal arg verbatim — including - any whitespace or quoting the user typed — so a round-trip through - React preserves the user's syntactic choices when possible. -- **Recorded includes side-channel**: today's `IncludeExpansionStage` writes - to `doc.recorded_includes` for cache invalidation. The wrapper change - shouldn't affect this — confirm. -- **`extract_include_path` recognition**: today the function recognizes a - Paragraph containing exactly one include Shortcode inline. After the - wrapper change, the structure is unchanged at that recognition point - (the wrapper is built from the recognized Paragraph). The recognition - logic continues to work. +- **`source_path` accuracy**: the literal arg from the shortcode + (`"foo.qmd"`) is what we re-emit on save. Plan 7's Verbatim copy path + doesn't use it (we copy bytes), but the Rewrite path + (let-user-win for fresh user-constructed IncludeExpansion) does. 
+ Make sure the IncludeExpansionStage stores the literal arg verbatim + — including any whitespace or quoting the user typed — so a + round-trip through React preserves the user's syntactic choices when + possible. +- **Recorded includes side-channel**: today's `IncludeExpansionStage` + writes to `doc.recorded_includes` for cache invalidation. The wrapper + change shouldn't affect this — confirm. +- **`extract_include_path` recognition**: today the function recognizes + a Paragraph containing exactly one include Shortcode inline. After + the wrapper change, the structure is unchanged at that recognition + point (the wrapper is built from the recognized Paragraph). The + recognition logic continues to work. ## References -- `crates/quarto-core/src/stage/stages/include_expansion.rs:80-278` — the - stage implementation. The splicing logic at lines 215-220 is what gets - replaced. +- `crates/quarto-core/src/stage/stages/include_expansion.rs:80-278` — + the stage implementation. The splicing logic at lines 215-220 is + what gets replaced with wrapper construction. - `crates/quarto-pandoc-types/src/custom.rs` — CustomNode struct. - `crates/pampa/src/writers/qmd.rs` — qmd writer's CustomNode arm (existing for Callout etc. — extend with IncludeExpansion). -- Plan 6 — provenance audit. Sets the precedent for "preserve source info - on transform output." (Plan 6 uses Derived for shortcodes; Plan 8 uses - the wrapper-CustomNode pattern for includes, since cross-file FileId - prevents Derived from working.) +- `crates/quarto-core/src/transforms/callout_resolve.rs` — pattern to + mirror for `IncludeExpansionResolveTransform`. Note: Callout's + resolve runs in the Normalization Phase + (`crates/quarto-core/src/pipeline.rs:988`), not the Finalization + Phase. Plan 8's resolve runs at the same point in the HTML + pipeline. 
+- `crates/quarto-core/src/pipeline.rs:1181` — + `Q2_PREVIEW_TRANSFORM_EXCLUDED` const; add + `"include-expansion-resolve"` to skip the unwrap in the q2-preview + pipeline. +- Plan 6 — provenance audit. Sets the precedent for "preserve source + info on transform output." Plan 6 uses `Generated` with + `Invocation` anchors for shortcodes; Plan 8 uses the + wrapper-CustomNode pattern for includes, since cross-file FileId + prevents shortcode-style anchoring from working. - Plan 7 — coarsen logic (Verbatim, Transparent, Omit, soft-drop substitutions, is_atomic_custom_node registry). -- Plan 2A — `hub-client/src/utils/atomicCustomNodes.ts` (the JS-side - atomic registry that Plan 8 amends to add `"IncludeExpansion"`). -- Plan 2B — framework recursion + atomic gate that the IncludeExpansion - component runs through; CustomNode unwrap/rewrap walks that produce - the JS-native shape Plan 2C's component consumes. +- Plan 2A — `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` + (the JS-side atomic registry that Plan 8 amends to add + `"IncludeExpansion"`). +- Plan 2B — framework recursion + atomic gate that the + IncludeExpansion component runs through; CustomNode unwrap/rewrap + walks that produce the JS-native shape Plan 2C's component + consumes. - Plan 2C — React component infrastructure (registers IncludeExpansion component as a transparent read-only passthrough; Plan 2C already ships the placeholder component as dormant wiring before Plan 8 @@ -280,9 +468,9 @@ walk — same visual outcome, just no per-type styling). - **Untouched-include round-trip**: parse a parent.qmd with an include, run pipeline, write back without modification, assert byte-equal. -- **Edit-outside round-trip**: edit a paragraph outside the include in the - AST, write back, assert the include token is byte-equal-preserved and - the edited paragraph is rewritten. 
+- **Edit-outside round-trip**: edit a paragraph outside the include in + the AST, write back, assert the include token is byte-equal-preserved + and the edited paragraph is rewritten. - **Edit-inside soft-drop**: programmatically modify a Str inside the IncludeExpansion's content slot (bypass the React layer), call `incremental_write_qmd_for_preview`, assert the result is `Ok` with @@ -292,68 +480,93 @@ walk — same visual outcome, just no per-type styling). IncludeExpansion with a fresh user-constructed IncludeExpansion (new source_path), call the writer, assert the output contains `{{< include >}}` with no warning. The qmd writer's - CustomNode arm hit the Rewrite path with Synthetic { by: user_edit } - source_info and read `plain_data["source_path"]`. + CustomNode arm hit the Rewrite path with `Generated { by: user_edit, + from: [] }` source_info and read `plain_data["source_path"]`. - **Delete-include let-user-win**: replace an IncludeExpansion with a Para in the new AST, call the writer, assert the include token is gone from output and the Para's text appears in its place. No warning. -- **Nested includes round-trip**: parent → foo → bar. Untouched: all three - preserved. Edit inside bar: `Q-3-43` warning with bar's wrapper source - range; the inner edit is reverted via Plan 7's soft-drop, parent.qmd - byte-equal to no-op edit. -- **Plan 2C component test**: render an IncludeExpansion wrapper; assert - setLocalAst is not propagated to children (no edit affordance). +- **Nested includes round-trip**: parent → foo → bar. Untouched: all + three preserved. Edit inside bar: `Q-3-43` warning with bar's + wrapper source range; the inner edit is reverted via Plan 7's + soft-drop, parent.qmd byte-equal to no-op edit. +- **HTML pipeline resolve test**: render a doc with an include through + the HTML pipeline; assert the resulting HTML contains the included + content flat (not wrapped in a `<div class="include-expansion">
` + or similar) — the resolve transform unwrapped it before the HTML + writer ran. +- **q2-preview pipeline preservation test**: render the same doc + through the q2-preview pipeline; assert the resulting AST contains + the IncludeExpansion CustomNode wrapper (not unwrapped). The JSON + writer emits it; the React component renders it. +- **q2-preview attribution test**: with a `PreBuiltAttributionProvider` + installed, render a doc with an include through q2-preview; assert + the wrapper's `astContext.attribution` record references the + include-line author (the latest author of the parent's include line + bytes). HTML output of the same doc has no attribution on the + include line (intentional v1 behavior). +- **Plan 2C component test**: render an IncludeExpansion wrapper; + assert setLocalAst is not propagated to children (no edit + affordance). - **Idempotence**: re-run Plan 3's idempotence test with includes. The wrapper should be deterministic across runs. +- **Shortcode-inside-include provenance shape** (cross-reference to + Plan 6): Plan 6 owns the test that asserts a shortcode resolving + inside include-spliced content gets an Invocation anchor with + `file_id != 0` (pointing into the included file, not the parent). + Plan 8's wrapper carries the parent-file `Original` independently; + the two anchors compose correctly. Plan 8's tests above exercise + the wrapper round-trip in isolation; the composition shape lives + in Plan 6's §Test plan ("Shortcode-inside-include composition + test"). +- **Cross-file Invocation in resolve transform**: after the + `IncludeExpansionResolveTransform` (Plan 8) unwraps the wrapper, + any shortcode-resolved Generated children retain + `Invocation -> Original{foo.qmd, ...}` source_info. The unwrapped + HTML pipeline sees these children with `file_id != 0`, and + `query_attribution` skips them per the v1 single-doc invariant + (matching §HTML attribution above). 
Round-trip-equivalent to a + fixture without includes: the HTML writer doesn't care about + source_info; it just renders the nodes. Regression test: render + parent + foo (foo contains `{{< meta title >}}`) through the HTML + pipeline, grep for the resolved title in the output, assert the + rendered text matches `meta.title` from foo.qmd's metadata. ## Dependencies -- Depends on: Plans 4, 6, 7 (Synthetic types not strictly needed since the - wrapper uses Original; the audit pattern; the writer's atomic logic). -- Plan 2C also depends on this for the IncludeExpansion component (which - Plan 8 confirms is needed; Plan 2C ships the placeholder dormant). +- Depends on: Plans 4, 6, 7 (Generated types not strictly needed for + the wrapper's source_info since it stays Original; the audit pattern + for what kinds of nodes get Generated vs Original; the writer's + atomic logic). +- Plan 2C also depends on this for the IncludeExpansion component + (which Plan 8 confirms is needed; Plan 2C ships the placeholder + dormant). - Final plan in the sequence; nothing depends on it. ## Risk areas -- **The include's wrapper source_info uses the *parent* file's FileId**. - The included blocks inside the slot have a *different* FileId. Plan 7's - `preimage_in(parent_file)` correctly returns None for the children - (because their FileId differs). This is the intended behavior — children - contribute nothing to the verbatim-copy path; only the wrapper does. - Confirm by walking through the test cases. -- **Existing tests for `IncludeExpansionStage`**: the existing tests assert - the spliced-flat behavior (e.g., `assert_eq!(doc.ast.blocks.len(), 2)` - after expanding one include). Update these tests for the wrapper - behavior (`assert_eq!(doc.ast.blocks.len(), 1)` — the Paragraph is - replaced by one wrapper). -- **The `recorded_includes` side-channel**: existing pipeline-cache logic - reads this. 
The wrapper change shouldn't affect it because we still call - `record_include` at the same point. Confirm during implementation. -- **Existing HTML pipeline tests**: the wrapper change applies to the HTML - pipeline too (we're modifying `IncludeExpansionStage`, which runs in - both HTML and q2-preview pipelines). For HTML output, the wrapper - passes through subsequent transforms unchanged and gets serialized to - HTML by `RenderHtmlBodyStage`. Confirm the HTML writer's CustomNode - arm handles `"IncludeExpansion"` (or that we don't need it because the - HTML pipeline runs `CrossrefRenderTransform` etc. that don't touch this - type — but ensure HTML output looks right). - - Actually: the HTML pipeline doesn't have a transform that materializes - IncludeExpansion. So the HTML writer SEES the wrapper at render time. - The simplest fix: make the HTML writer's CustomNode arm transparently - render the slot content (effectively materializing into HTML, which is - the right thing for HTML output). Or: add a render-side resolve transform - for IncludeExpansion that runs only in the HTML pipeline. - - This is the one significant complication. Worth investigating during - implementation. The cleanest answer is probably: a small render-side - transform `IncludeExpansionResolveTransform` that runs ONLY in the HTML - pipeline (not q2-preview), unwraps `CustomNode("IncludeExpansion")` - back into flat blocks for the HTML writer to handle normally. - - Symmetric with `CalloutResolveTransform`. Same shape. +- **The include's wrapper source_info uses the *parent* file's + FileId**. The included blocks inside the slot have a *different* + FileId. Plan 7's `preimage_in(parent_file)` correctly returns None + for the children (because their FileId differs). This is the intended + behavior — children contribute nothing to the verbatim-copy path; + only the wrapper does. Confirm by walking through the test cases. 
+- **Existing tests for `IncludeExpansionStage`**: the existing tests + assert the spliced-flat behavior (e.g., + `assert_eq!(doc.ast.blocks.len(), 2)` after expanding one include). + Update these tests for the wrapper behavior + (`assert_eq!(doc.ast.blocks.len(), 1)` — the Paragraph is replaced + by one wrapper). +- **The `recorded_includes` side-channel**: existing pipeline-cache + logic reads this. The wrapper change shouldn't affect it because we + still call `record_include` at the same point. Confirm during + implementation. +- **Existing HTML pipeline tests**: the wrapper change applies to the + HTML pipeline too (we're modifying `IncludeExpansionStage`, which + runs in both HTML and q2-preview pipelines). The + `IncludeExpansionResolveTransform` in the Normalization Phase + unwraps before the HTML writer sees it, so HTML output is + byte-equivalent to current main. Verify with a regression test. - **Extension-registration forward-compat**: Plan 8 adds `IncludeExpansion` to the hardcoded `pub const ATOMIC_CUSTOM_NODES` set in `quarto-core`. After the future extension-registration @@ -371,32 +584,37 @@ walk — same visual outcome, just no per-type styling). 
|---|---| | IncludeExpansionStage modification (wrap inserted blocks) | ~40 | | qmd writer arm for IncludeExpansion (atomic) | ~30 | -| HTML pipeline resolve transform (unwrap before HTML writer) | ~50 | -| `is_atomic_custom_node` registration | ~5 | +| `IncludeExpansionResolveTransform` (Normalization Phase, HTML only) | ~50 | +| Adding `"include-expansion-resolve"` to `Q2_PREVIEW_TRANSFORM_EXCLUDED` | ~5 | +| `is_atomic_custom_node` registration (Rust + TS hand-mirror) | ~10 | | React component (transparent passthrough, read-only) | ~30 | | Test updates for existing IncludeExpansionStage tests | ~50 | -| New round-trip tests | ~200 | -| **Total** | **~405** | +| New round-trip tests (untouched, edit-outside, soft-drop, replace, nested, HTML, attribution) | ~250 | +| **Total** | **~465** | -Two focused sessions likely. The HTML pipeline resolve transform is the -piece I didn't fully account for in my earlier estimates — confirm scope -during implementation kickoff. +Two focused sessions likely. ## Notes +The wrapper-CustomNode pattern is the right shape for includes because +the included content lives in a *different file* than the parent. +Their source_info points into foo.qmd, not parent.qmd. There's no +`Generated`-with-`Invocation`-anchor chain that can connect those +blocks back to the parent file's include token bytes (the anchor's +chain would need to resolve into the target file, and foo.qmd is a +different FileId). So we need a wrapper at the parent-file level whose +`source_info` is `Original{parent_file, include_token_range}` to serve +as the writer's anchor. That's what `CustomNode("IncludeExpansion")` +provides. + +Shortcodes (Plan 6) don't have this issue (they resolve in the same +file) which is why they use `Generated { by: shortcode, from: [Invocation -> ...] }` +instead of a wrapper. The genuine cross-file case is the only one that +warrants the wrapper. 
+ The HTML-pipeline-resolve-transform finding is the kind of thing the -research plan exists to surface. The wrapper change has implications for -HTML output that aren't immediately visible from the q2-preview-only lens. -Plan 8's research plan should make this explicit so that the -implementation session doesn't get blindsided. - -Why a wrapper for includes (different from shortcodes): includes pull in -content from a *different file*. The included blocks have a different -FileId than the parent file. Their source_info points into foo.qmd, not -parent.qmd. There's no `Derived` chain that can connect those blocks -back to the parent file's include token bytes — Derived requires a `from` -that resolves into the target file. So we need a wrapper at the parent-file -level whose source_info is `Original{parent_file, include_token_range}` to -serve as the writer's anchor. That's what `CustomNode("IncludeExpansion")` -provides. Shortcodes don't have this issue (they resolve in the same file) -which is why they use Derived (Plan 6) instead of a wrapper. +design discussion exists to surface. The wrapper change has +implications for HTML output that aren't immediately visible from the +q2-preview-only lens. Plan 8's implementation kickoff should land the +resolve transform alongside the wrapper change to keep HTML +byte-equivalent across the transition. diff --git a/claude-notes/plans/2026-05-18-q2-preview-project-replay-engine.md b/claude-notes/plans/2026-05-18-q2-preview-project-replay-engine.md index 9941f1696..06aac08cd 100644 --- a/claude-notes/plans/2026-05-18-q2-preview-project-replay-engine.md +++ b/claude-notes/plans/2026-05-18-q2-preview-project-replay-engine.md @@ -189,6 +189,24 @@ class="{r}">` in the iframe DOM). The engine-name check is against the `engine_name` field of the capture itself, so we only attempt to splice cells belonging to the captured engine. 
+### Pre-engine timing — why a flat walk is safe + +Both `derive_cell_outputs(A1, B1)` and `splice(A2, output_map)` +above iterate `.blocks` flatly. This is correct **only because the +splice runs at the pre-engine checkpoint** — strictly before +`SectionizeTransform` (and the other "sugar phase" synthesizers) +add the top-level transparent wrapper Div that the writer learned +about the hard way in commits `bdcfdc53` / `b9f64b56` / `2bf92664`. +At the pre-engine checkpoint, `A2.blocks[0]` is a real user block. + +If a future variant ever moves the splice point past the sugar +phase (or runs it on a post-pipeline AST for any other reason), +the flat walk would miss every cell inside the wrapper. Route the +walker through `first_in_user_tree` / a `visit_user_blocks` +sibling per +[`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md) +in that case. + ## Where the splice lives in the pipeline Two viable insertion points; the v1 picks the simpler: diff --git a/claude-notes/plans/2026-05-21-vfs-url-write-root-split.md b/claude-notes/plans/2026-05-21-vfs-url-write-root-split.md new file mode 100644 index 000000000..015bbce2f --- /dev/null +++ b/claude-notes/plans/2026-05-21-vfs-url-write-root-split.md @@ -0,0 +1,287 @@ +# Plan — Split `vfs_root` into write-root + url-root in `ResourceResolverContext` + +**Date:** 2026-05-21 +**Branch:** `beads/bd-rz2we-plan-3-q2-preview` → integrates into `feature/provenance` +**Status:** Implementation plan +**Beads:** bd-rz2we +**Blocks:** closing Plan 3 (q2-preview idempotence gate) + +## Goal + +Decouple two roles `ResourceResolverContext::vfs_root_mode` currently +plays as a single `PathBuf`: + +1. **Disk-write root** — where `runtime.file_write` / `OutputSink` + put artifacts (theme CSS, copied resources, site libs). +2. **URL prefix** — what gets embedded in HTML link / asset URLs. 
+ +In production WASM these are intentionally identical +(`"/.quarto/project-artifacts"` for both, a synthetic VFS path the +service worker serves from memory). On native test runs they have to +diverge: the write root has to be a real tempdir so the runtime can +actually write, but URLs must be path-independent so the AST is +idempotent across runs. + +## Why we can't defer this to a later plan + +Plan 3 locks in the idempotence + structural-hash-stability contract. +Right now `website_links` produces: + +```text +target: ("/private/var/folders/.../T/.tmpXXX/.quarto/project-artifacts/other.html", "") +``` + +Two runs in two tempdirs → two distinct URLs → block-hash divergence. + +Plans 4–8 (typed source-info, wire format, audit, incremental writer, +include round-trip) all assume Plan 3's gate is green on the +fixtures they care about. None of them name URL canonicalization in +scope. Unlike bd-3odjm (whose fix-owner is Plan 5 because Plan 5 +rewrites the wire format anyway), bd-rz2we has no natural fix-owner +downstream of Plan 3. Fixing it here is the right scope. + +It's also wrong-output, not just non-determinism. Any in-process +caller of `RenderToPreviewAstRenderer::new(real_disk_path)` (test +helpers today, anything else that wants to host the q2-preview +pipeline natively tomorrow) gets links whose URLs leak the host +machine's tempdir into the AST. The browser's iframe service worker +doesn't intercept `/private/var/...`, so those links would 404 if +served. + +## Where the bug lives (verified 2026-05-21) + +- `LinkRewriteTransform` calls + `resolve_doc_relative_href("other.qmd", "index.qmd", resolver, idx, …)` + which delegates to `resolver.page_url_for(profile.output_href)`. +- In **VFS-root mode**, `page_url_for` is just + `rel_to_url(&root.join(target))` where `root` is whatever was passed + to `ResourceResolverContext::vfs_root(...)` + (`crates/quarto-core/src/resource_resolver.rs:210-218`). 
No + relativization, no synthetic prefix — the URL is literally the + joined path. +- `RenderToPreviewAstRenderer` builds its per-doc resolver with + `ResourceResolverContext::vfs_root(self.vfs_root.clone())` + (`pass2_renderer.rs:661`). It also writes theme CSS to + `self.vfs_root.join("styles.css")` directly via + `runtime.file_write` (`pass2_renderer.rs:739`). +- WASM caller passes `"/.quarto/project-artifacts"` + (`wasm-quarto-hub-client/src/lib.rs:1512,1696,1786`) — synthetic + string, identity URL. +- Native test helpers pass `project.dir.join(".quarto/project-artifacts")` + (`tests/render_page_in_project.rs:80`, + `tests/idempotence.rs:243`) — real tempdir, leaks into URL. + +A naive fix in the test (pass `"/.quarto/project-artifacts"` for both +roles) fails because `runtime.file_write("/.quarto/project-artifacts/styles.css")` +hits the read-only root filesystem (verified empirically: `os error +30`). So the split must really be a split, not a single-arg switch. + +## Existing pinned contract + +`crates/quarto-core/src/project/website_post_render.rs:638-653`: +> On VFS-root mode the html_url is absolute (`/<vfs_root>/<p>`) +> and the on-disk path is the same with the leading `/` dropped. +> The browser fetches the URL and the hub-client serves from VFS +> at the matching synthetic path. + +This is a **WASM-only** invariant. After the split, the single-arg +`vfs_root(path)` constructor preserves it (write_root == url_root by +construction). The two-arg form intentionally breaks it (write to +tempdir, URL stays synthetic) — but only the native test helpers +take that form, so no production code is affected. + +## Design + +### Resolver field + +Replace the single `Option<PathBuf>` field with a small struct: + +```rust +struct VfsRootMode { + /// Absolute disk path. `runtime.file_write` and + /// `OutputSink::allowed_roots` use this. In WASM this is a + /// synthetic VFS path (the runtime serves it from memory); in + /// native tests it's a real tempdir subdirectory. + write_root: PathBuf, + /// URL prefix embedded in HTML links / asset srcs. In WASM this + /// matches `write_root`. In native tests it's a fixed synthetic + /// string (e.g. `/.quarto/project-artifacts`) so URLs don't + /// capture the host machine's tempdir. + url_root: String, +} +``` + +`page_url_for`, `html_url_for`, `page_url_for_site_root_dir` use +`url_root`; `on_disk_path_for` and `allowed_output_roots` use +`write_root`. `is_vfs_root_mode` is unchanged. + +### Resolver constructor + +Existing: +```rust +pub fn vfs_root(vfs_root: impl Into<PathBuf>) -> Self { … } +``` +keeps its signature and semantics. Internally it stores the path as +both `write_root` and `url_root` (via `to_string_lossy().replace('\\', '/')`). +Production WASM callers don't change. + +New constructor: +```rust +pub fn vfs_root_with_url_root( + write_root: impl Into<PathBuf>, + url_root: impl Into<String>, +) -> Self { … } +``` + +Native test helpers switch to this form. + +### Renderer side + +`RenderToPreviewAstRenderer` and `RenderToHtmlRenderer` each currently +hold a single `vfs_root: PathBuf` and pass it verbatim to the +resolver constructor + theme-CSS write.
Add: + +```rust +pub struct RenderToPreviewAstRenderer { + vfs_root: PathBuf, // unchanged — used for disk writes + vfs_url_root: Option<String>, // None → derive from vfs_root (today's behavior) + … +} + +impl RenderToPreviewAstRenderer { + pub fn with_url_root(mut self, url_root: impl Into<String>) -> Self { + self.vfs_url_root = Some(url_root.into()); + self + } + + fn build_resolver(&self) -> ResourceResolverContext { + match &self.vfs_url_root { + Some(url) => ResourceResolverContext::vfs_root_with_url_root( + self.vfs_root.clone(), url.clone(), + ), + None => ResourceResolverContext::vfs_root(self.vfs_root.clone()), + } + } +} +``` + +Same shape on `RenderToHtmlRenderer` for symmetry (its native callers +aren't currently testing URL determinism, but the API stays consistent +and the surface area is identical). + +The four `ResourceResolverContext::vfs_root(self.vfs_root.clone())` +call sites in `pass2_renderer.rs` (lines 437, 552, 661, 798) all +become `self.build_resolver()`. + +The theme-CSS write at `pass2_renderer.rs:739` keeps `self.vfs_root.join("styles.css")` +unchanged — that's the disk write, write_root is correct. + +### Test-helper updates + +`crates/quarto-core/tests/idempotence.rs:243`: +```rust +let vfs_root = project.dir.join(".quarto/project-artifacts"); +let renderer = RenderToPreviewAstRenderer::new(&vfs_root) + .with_url_root("/.quarto/project-artifacts"); +``` + +`crates/quarto-core/tests/render_page_in_project.rs:80-81` gets the +same treatment so the HTML-test path produces deterministic link +URLs too. (Not required by Plan 3, but matches the resolver-level +guarantee. Optional in this plan; do it if regression-cheap.) + +## Phases + +### Phase 1 — Regression tests (failing first) + +- [x] Run `cargo nextest run -p quarto-core --test idempotence website_links` + and confirm it fails today with the absolute-path symptom (already + verified; record in the plan and move on).
+- [x] Add a unit test in `resource_resolver.rs` that asserts: given + `ResourceResolverContext::vfs_root_with_url_root("/tmp/abc", "/synthetic")`, + `html_url_for(Project, p)` returns `"/synthetic/<p>"` and + `on_disk_path_for(Project, p)` returns `"/tmp/abc/<p>"`. Confirm + it fails to compile (the constructor doesn't exist yet). + +### Phase 2 — Resolver split + +- [x] Define the private `VfsRootMode` struct inside `resource_resolver.rs`. +- [x] Change `vfs_root_mode` field from `Option<PathBuf>` to + `Option<VfsRootMode>`. +- [x] Update the four match sites (`html_url_for`, `page_url_for`, + `allowed_output_roots`, `on_disk_path_for`) to read the right field. +- [x] Add the `vfs_root_with_url_root` constructor. +- [x] Update the existing `vfs_root` constructor to populate both + fields from the single arg (preserves the WASM identity contract). +- [x] Run the Phase-1 unit test — should pass. +- [x] Re-run the existing pinned contract test + (`vfs_root_resolver_url_matches_on_disk_path` in `website_post_render.rs`). + Should still pass — single-arg constructor still gives URL == disk. + +### Phase 3 — Renderer split + +- [x] Add `vfs_url_root: Option<String>` field + `with_url_root` builder + to `RenderToPreviewAstRenderer`. +- [x] Mirror on `RenderToHtmlRenderer`. +- [x] Replace the four `ResourceResolverContext::vfs_root(self.vfs_root.clone())` + call sites with `self.build_resolver()`. +- [x] `cargo build --workspace` should succeed — no callers have + changed yet, the new field defaults to `None` which derives the + URL root from `vfs_root` exactly as before. + +### Phase 4 — Wire up test helpers + +- [x] `tests/idempotence.rs::render_active_page_preview` adds + `.with_url_root("/.quarto/project-artifacts")`. +- [x] `tests/render_page_in_project.rs::render_active_page` adds the + same (optional but consistent). +- [x] Re-run `cargo nextest run -p quarto-core --test idempotence website_links`. + Should now pass. +- [x] Re-run the full idempotence suite — confirm no other fixtures + regress. + +### Phase 5 — Workspace verification + +- [x] `cargo build --workspace` clean. +- [x] `cargo nextest run --workspace`. +- [x] `cargo xtask verify --skip-hub-build` (matches CI's `-D warnings` + strictness on the Rust leg).
+- [x] Cross-check: WASM hub-client callers still pass single-arg + `vfs_root("/.quarto/project-artifacts")` and produce identical + URLs to today (no behavior change). The + `vfs_root_resolver_url_matches_on_disk_path` test in + `website_post_render.rs` is the regression sentinel — it stays + green by construction. + +### Phase 6 — Beads housekeeping + +- [x] `br close bd-rz2we --reason "fixed: split vfs_root into write-root + url-root in ResourceResolverContext + per-renderer override"`. +- [x] Update Plan 3's Phase-4 checklist line for `website_links` (mark + green, drop the queue note). +- [x] `br sync --flush-only`, then commit `.beads/` from the main + repo. + +## Out of scope + +- `RenderToHtmlRenderer`'s native HTML-output tests aren't currently + asserting on link URLs; this plan touches them only for API + symmetry. If they have latent path-leakage in their assertions + (unlikely — they test HTML content shape), that's a separate ticket. +- The wider `vfs_root` naming question (whether the field should be + renamed from `vfs_root` to `vfs_write_root` everywhere). Holding off + to keep the diff small; rename is a no-op refactor that can land + separately. +- bd-3odjm (FilterProvenance wire-format bug). Owned by Plan 5, + unrelated. + +## Touch list + +- `crates/quarto-core/src/resource_resolver.rs` — field, constructor, + 4 match-site updates, 1 new unit test. +- `crates/quarto-core/src/project/pass2_renderer.rs` — 2 renderers × + (1 new field, 1 builder method, 1 helper, 4 call-site swaps). +- `crates/quarto-core/tests/idempotence.rs` — 1 helper line. +- `crates/quarto-core/tests/render_page_in_project.rs` — 1 helper + line (optional). + +No production-code callers change. 
diff --git a/claude-notes/plans/2026-05-22-provenance-plan-10-dispatch-anchor.md b/claude-notes/plans/2026-05-22-provenance-plan-10-dispatch-anchor.md new file mode 100644 index 000000000..78f0a4fff --- /dev/null +++ b/claude-notes/plans/2026-05-22-provenance-plan-10-dispatch-anchor.md @@ -0,0 +1,588 @@ +# Provenance Plan 10 — Dispatch anchor + Lua source registration in SourceContext + +**Date:** 2026-05-22 +**Branch:** feature/provenance +**Status:** Research plan (pre-implementation; API surface not yet pinned) +**Milestone:** none directly — improves source-pointing diagnostics + and attribution for Lua-driven content; does not gate M3. + +## Epic context + +Part of the **provenance epic** (Plans 3–10). Lua filter files and +Lua-shortcode handler files contribute source-side bytes to +`Generated` nodes (a filter constructed an `Str("HELLO")` somewhere +in `upper.lua`; a `{{< kbd >}}` handler ran code at `kbd.lua:14`). +Today, that source identity lives in `By.data` as a stringly-typed +`{filter_path, line}` payload constructed via `debug.getinfo()`. +It belongs in the `from` anchor list, attached via a new +`AnchorRole::Dispatch` role. + +Same asymmetry contract as `ValueSource` (settled by Plan 9): +**Dispatch is diagnostic-only**, never walked by the writer's +`preimage_in`. The point is attribution and source-pointing +diagnostics — "this rendered text came from line 14 of `kbd.lua`" — +not round-trip. + +## Goal + +Migrate Lua-driven `Generated` shapes from string-keyed +`by.data: {filter_path, line}` to typed source_info pointers in the +anchor list: + +- **Filter constructions**: `Generated { by: filter(), from: + [Dispatch -> lua_si] }` (was `Generated { by: filter(path, line), + from: [] }`). +- **Lua-handler shortcode resolutions**: `Generated { by: + shortcode(name), from: [Invocation -> token_si, Dispatch -> lua_si] }` + (was `Generated { by: shortcode{name, lua_path, lua_line}, from: + [Invocation -> token_si] }`). 
+ +To make those source_info pointers meaningful, **register Lua filter +files and Lua-shortcode-handler files in `SourceContext`** so they +get `FileId`s and their content is available for byte-range +resolution. + +When this plan lands, source-pointing diagnostics from Lua land +("at line 14 of upper.lua, column 5–10") use the same SourceContext +machinery as qmd / YAML diagnostics. Attribution tooling can chase +the `Dispatch` anchor back to the Lua function that produced a node. + +### Lua filters that wrap user content + +A Lua filter that **wraps existing user content** in a Div (e.g. +the canonical "page-shell" filter, or a Lua reimplementation of +`.callout` wrapping) does not need any registration or opt-in to +participate in the visual editor. If the filter emits a block +container (Div / BlockQuote / Figure / NoteDefinitionFencedBlock) +whose `source_info` is `Generated { by: filter(), from: [Dispatch +-> lua_si] }` (no Invocation anchor) and whose children preserve +their original source positions, the wrapper meets the structural +definition of a *transparent wrapper* in +[`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md): + +1. Generated, no Invocation anchor — ✓ (Dispatch is anchor-only + for diagnostics; doesn't count as a source token). +2. Block-container kind — ✓. +3. Children carry `preimage_in(target)` — ✓ by construction + (the filter mutates rather than constructs). + +The writer's `first_in_user_tree` walker sees through it +automatically; the React dispatcher's editability gate (Plan 7c +Phase 2 — `isEditableInside`) treats its children as editable; +edits inside the wrapped content round-trip cleanly. The filter +author writes idiomatic Lua and gets working visual-editor +support — no contract to satisfy beyond "don't strip +source_info from the children you wrap." 
+ +A Lua filter that **constructs** a fresh block container from +metadata (no source-bearing children) is implicitly atomic via +condition (3) — `first_in_user_tree` doesn't descend into it, +editor treats it as a unit, edits inside soft-drop with Q-3-43. +That's also the right behaviour: there are no source bytes to +edit. + +This works regardless of Plan 10's `Dispatch` migration: the +predicate looks at the shape of the AST, not at the kind of +filter. Plan 10 makes filter diagnostics *better*; the transparent- +wrapper machinery makes the **editing contract** that filter +authors can rely on. + +## Scope + +### In scope + +#### Phase 1 — `AnchorRole::Dispatch` + +- Add `Dispatch` variant to `AnchorRole` enum in + `crates/quarto-source-map/src/source_info.rs:91-118` alongside the + existing `Invocation`, `ValueSource`, `Other`. +- Doc-comment on `Dispatch` references the + Plan-7-established / Plan-9-confirmed policy: `preimage_in` walks + `Invocation` only; `Dispatch` is diagnostic-only and never + consulted by the writer. Plan 7 documents the policy in terms of + "all non-`Invocation` roles, present and future, are skipped"; + Plan 10 names `Dispatch` in the exclusion list once the variant + exists. +- Add `Anchor::dispatch(source_info: Arc) -> Self` + constructor parallel to `Anchor::invocation` / `Anchor::value_source`. + +#### Phase 2 — SourceContext extension for Lua files + +- Extend `SourceContext::add_file` (currently + `crates/quarto-source-map/src/context.rs:59`) to support Lua files. + Two possible extensions: + - (A) Add a `FileKind { Qmd, Yaml, Lua, … }` discriminator on + `FileInformation`. `add_file` stays signature-compatible; + callers passing Lua files use a new `add_lua_file` helper or + pass `FileKind::Lua` explicitly. + - (B) Reuse `add_file` as-is (Lua files are just files; + path/content are sufficient). + - Recommendation: (B) for v1; (A) only if a downstream consumer + needs to distinguish kind (e.g. 
line-numbering rules differ for + Lua vs. qmd, which they don't today). +- Confirm `FileInformation::compute_line_breaks` handles Lua source + correctly (it should — it just indexes `\n` positions). + +#### Phase 3 — Lua engine bridge: pass FileId through callbacks + +- `apply_lua_filters` + (`crates/pampa/src/lua/filter.rs:158-200` and surrounding) reads + the filter path from `FilterSpec::Lua(path)` and the filter file + bytes from disk. **Register the file in `SourceContext` at that + point**, capturing the returned `FileId`. +- Thread the `FileId` into the Lua closure context so callbacks that + introspect `debug.getinfo()` can resolve `(source: path, line: + line_num)` into `SourceInfo::Original { file_id, start, end }` + where `start..end` covers the line's bytes (via + `FileInformation`'s line-break index). +- Update `get_caller_source_info` + (`crates/pampa/src/lua/diagnostics.rs:255`) — currently constructs + `Generated { by: By::filter(path, line), from: SmallVec::new() }`. + New shape: `Generated { by: By::filter(), from: + [Dispatch(Arc::new(Original{file_id, start, end}))] }`. + +#### Phase 4 — `By::filter` signature shrinks + +- Change `By::filter(path: impl Into, line: usize)` + (currently at `crates/quarto-source-map/src/source_info.rs:458`) + to `By::filter()`. The path/line move to the Dispatch anchor's + source_info; `by.data` becomes `null`. +- All call sites in `crates/pampa/src/lua/types.rs:1830`, + `crates/pampa/src/lua/diagnostics.rs:203,262,847`, + `crates/pampa/src/readers/json.rs:305,2764` migrate. Most are + diagnostic-side; the json reader has a legacy-back-compat path + reading `"FilterProvenance"` tag. +- **No backward-compat carve-out for `By::filter`.** Same reasoning + as Plan 9's `By::appendix` change: + 1. `By::filter` is workspace-internal Rust — no FFI, no extension + SDK, no TS mirror. + 2. Plan 5 has shipped `By::filter(path, line)` to the JSON wire + format. 
**Clean break** (see §Phase 6 below) — writers emit + the new shape after Plan 10 lands; the old shape disappears + from the codebase in the same PR. No dual-reader window. No + on-disk artifacts hold the old shape, so no migration path is + needed. +- `By::as_filter()` accessor (currently returns + `Option<(&str, usize)>` from `by.data`) gets removed or + repurposed. Callers needing path/line read the Dispatch anchor's + source_info and resolve via `SourceContext`. + +#### Phase 5 — Lua-handler shortcode resolutions + +- The shortcode resolver + (`crates/quarto-core/src/transforms/shortcode_resolve.rs:380-460`) + dispatches to Lua handlers via `dispatch_to_lua_engine`. When the + handler is Lua-backed, attach a `Dispatch` anchor pointing at the + handler function's source line. +- Built-in (Rust) handlers like `MetaShortcodeHandler` stay with + `from: [Invocation]` only — no Dispatch. +- The Lua engine needs to know which file each handler is registered + in (already known via the registration call site). Stash that + alongside the handler binding. + +#### Phase 6 — Wire format clean break + +- Plan 5 emits `Generated { by: filter, by.data: {filter_path, line} }` + to JSON wire code 4. After Plan 10: + - Writers emit `Generated { by: filter, by.data: null }` plus a + `Dispatch` anchor in the `from` list. + - Readers accept the new shape only. The old shape disappears from + the workspace in the same PR. +- **Clean break, no dual reader.** Same rationale as `By::appendix` + and `By::filter`: this is a workspace-internal wire format with no + on-disk artifacts holding the old shape. The IndexedDB profile + cache is invalidated by `pass1_key` (Phase 7 below); any in-flight + WASM bundles rebuild from source. The CI build chain ensures Rust + and TS rebuild in lockstep — no in-the-wild client holds an old + WASM expecting the old shape. 
+- Equivalent break on the Lua-shortcode-handler shape (currently + `by.data: {name, lua_path, lua_line}` → `by.data: {name}` + + Dispatch anchor). Same one-PR migration. + +#### Phase 7 — Cache-key surface (reuses Plan 7a's field) + +- Lua filter file content becomes Pass1 cache input. **Plan 7a + lands first** and introduces `filter_sources_hash` on + `Pass1KeyInputs` (SHA-256 over filter file bytes + opt-out flags). + Plan 10 **reuses** that field — no new field, no parallel hash. +- Plan 10 Phase 7 reduces to: + - Confirm the existing `filter_sources_hash` semantics cover + Plan 10's needs (cache invalidates when a Lua filter file's + bytes change). They do — both plans hash the same files. + - Add a smoke test: register a Lua filter file in SourceContext, + edit its bytes, confirm `pass1_key` changes accordingly. Likely + Plan 7a's existing tests already cover this; verify during + implementation. +- If Plan 7a hasn't landed when Plan 10 starts (reversed order), + Plan 10 introduces the field itself with Plan 7a's semantics, and + Plan 7a later reuses it. The structural answer is the same; the + PR that lands first owns the field. + +### Out of scope + +- **Lua hot-reload / file-watcher integration** — a Lua file editing + experience that re-runs the filter on save. Demand-driven + invalidation via cache-key hashing is sufficient for v1. +- **Lua-LSP cross-references** (jump-to-definition into filter code + from a rendered diagnostic) — UX work that consumes Plan 10's + output but isn't part of it. Likely a future hub-client plan. +- **Non-Lua extension-contributed handlers** (future WASM-shortcode, + native-Rust-shortcode). The `Dispatch` role is Lua-flavored — the + source_info pointer assumes a file with byte ranges. WASM / + native handlers may want a different anchor role (e.g. + `Other("wasm-handler")` carrying a handler URI). Defer until those + handler kinds exist. +- **Citeproc / JSON-filter source pointers**. 
Citeproc is a built-in + Rust filter (no Lua); JSON filters are external processes (no + source we can register). `FilterSpec::Citeproc` / `FilterSpec::Json` + variants stay with `Generated { by: filter(), from: [] }` — + diagnostic source pointing isn't meaningful for them. +- **Lua-engine-side restructuring** (e.g. moving the mlua bridge to + a separate crate). Plan 10 changes the contract at the bridge + boundary; it does not refactor the bridge. +- **bd-2mxo / `AttrSourceInfo` fixes** — separate concerns. + +## Design decisions (settled) + +- **`AnchorRole::Dispatch` is diagnostic-only.** Follows Plan 9's + `AnchorRole::Other` policy: `preimage_in` walks `Invocation` only. + Dispatch is consumed by attribution / diagnostic UI, not by the + writer's Verbatim path. + +- **`By::filter` becomes nullary.** Path/line move to Dispatch. + `By.data` for filter-kind is `null`. Wire format migrates (Phase 6 + above). + +- **Lua-handler shortcode keeps `name` in `by.data`.** The shortcode + name is part of the *identity* (which shortcode resolution + produced this node), not the *dispatch source* (which file + resolved it). The two are distinguishable: name is a parameter of + the `By` shape (`shortcode("meta")` vs `shortcode("kbd")`); dispatch + source is an anchor pointing at the handler's location. + +- **Source range of a Dispatch anchor: line-covering `Original`.** + `debug.getinfo()` gives line numbers, not byte ranges. Once Lua + file content is in SourceContext, we compute the byte range of the + named line via `FileInformation`'s line-break index. The Dispatch + anchor's source_info is `Original { file_id: lua_file, start: + line_start, end: line_end }`. Sub-line precision (specific + function or expression) is out of scope for v1 — `debug.getinfo()` + doesn't provide it without parsing the Lua source. 
+ +- **Filter files are registered eagerly at `apply_lua_filters` + entry.** Not lazily on first `debug.getinfo()` call — eager + registration ensures the FileId is stable across multiple + callbacks and accessible without thread-safety gymnastics in the + Lua-closure context. + +- **Lua-shortcode handler files are registered at handler + registration time** (when `_extension.yml` loads). Same eager + pattern as filter files. The handler registry maps handler + name → `(FileId, line_in_file)`. + +- **No backward-compat carve-out for the wire format.** Plan 5's + emitted shape (`by.data: {filter_path, line}`) has shipped, but + appears only in WASM-internal AST flow and IndexedDB profile + cache. The cache is invalidated by `pass1_key` (Phase 7); no + on-disk artifacts hold the old shape. Clean break in one PR — + no dual-reader window. Same rationale as `By::appendix` (Plan 9) + and `By::filter` (Phase 4 above). + +- **Plan posture: research plan.** This document settles the API + shape (the Dispatch role, the `By::filter` migration, the + SourceContext extension); it does not yet commit to the + implementation order. A subsequent review pass converts it to a + development plan with checklisted phases. + +## API surface to settle (research-plan deliverables) + +By the time this plan converts to a development plan, the following +must be pinned: + +1. **`AnchorRole::Dispatch` doc-comment text** — exact wording of + "diagnostic-only, never consulted by `preimage_in`" policy. + +2. **`SourceContext` Lua-file kind discrimination** — option (A) + with `FileKind` enum vs. option (B) reuse `add_file` as-is. + Recommend (B); revisit if downstream needs (A). + +3. **Lua engine bridge: how the `FileId` is threaded into the + closure context.** mlua's app-data slot (`Lua::set_app_data`) is + the obvious answer. Confirm during implementation. + +4. **`Pass1KeyInputs` field shape** — option (A) `lua_filter_files` + field vs. option (B) SourceContext-referenced. 
Recommend (A) for + v1; Plan 7a coordinates by reading the same field. + +5. **Wire-format clean break** — confirm that no dual-reader + window is needed (per the settled "no backward-compat carve-out" + decision above). Stated in Phase 6's commit message; + propagated to wire-format documentation. + +6. **`By::as_filter()` deprecation** — remove vs. repurpose to + read from the Dispatch anchor. Recommend: remove; callers + needing path/line read the Dispatch source_info directly. + +## Open questions for implementation + +- **Pre-registration vs. on-demand registration of Lua files.** + Eager (Phase 3) means every render pays the SourceContext cost + even if `debug.getinfo()` never fires. On-demand registration is + cheaper but introduces order-dependence in the closure context. + Recommend eager; benchmark to confirm cost is negligible. + +- **`debug.getinfo` performance.** Calling + `debug.getinfo` on every constructed node may dominate filter + runtime. Verify against a filter-heavy fixture during + implementation; if it's expensive, batch source-info attachment to + the post-walk helper (`enrich_or_create` in Plan 6's design). + +- **Coordination with Plan 7a's `filter_sources_hash`.** Plan 7a + proposes hashing filter files for idempotence verdicts; Plan 10 + hashes them for cache invalidation. Recommend: settle on one hash + computation owned by Plan 10's Phase 7; Plan 7a reuses it. Confirm + during the Plan 7a → Plan 10 sequencing discussion. + +- **Lua-shortcode-handler file registration timing.** Extension + loading (`_extension.yml` parsing) happens before filter pipeline + setup. Need to ensure SourceContext is available at extension + load — likely via the existing `StageContext`-style threading. + Confirm. + +- **Migration of existing Plan 4 tests.** The unit tests in + `crates/quarto-source-map/src/source_info.rs:715-770` exercise + `By::filter("foo.lua", 42)` extensively. They migrate to + `By::filter()` + a Dispatch anchor; the path/line assertions move + to the anchor's `source_info`.
Mechanical but ~10 test changes. + +- **Plan 6's Lua post-walk shape (`enrich_or_create`).** Plan 6 + Phase 6's post-walk helper (per the diff in Plan 6 §"The post-walk + helper") promotes Lua-attached source_info to the canonical + `Generated { by: filter, ... }` form. After Plan 10 the canonical + form is `Generated { by: filter(), from: [Dispatch] }`. The + helper updates accordingly. Confirm Plan 6 lands before Plan 10 + implementation (or that Plan 6 is amended to anticipate the + shape change). + +## References + +- `crates/quarto-source-map/src/source_info.rs:91-118` — + `AnchorRole` enum (Phase 1 extends). +- `crates/quarto-source-map/src/source_info.rs:458-466` — + `By::filter` constructor (Phase 4 signature change). +- `crates/quarto-source-map/src/source_info.rs:582-594` — + `By::as_filter` accessor (Phase 4 removes / repurposes). +- `crates/quarto-source-map/src/context.rs:59-130` — + `SourceContext::add_file*` family (Phase 2 extends). +- `crates/quarto-source-map/src/file_info.rs:12-58` — + `FileInformation`; line-break index used in Phase 3 for byte-range + resolution. +- `crates/pampa/src/lua/filter.rs:158-200,270` — + `apply_lua_filters` entry; Phase 3's eager-registration site. +- `crates/pampa/src/lua/types.rs:1820-1840` — `debug.getinfo()` + consumer (Phase 3 migrates to FileId-backed shape). +- `crates/pampa/src/lua/diagnostics.rs:195-265,847` — Generated + construction sites; Phase 3 + 4 migrate. +- `crates/pampa/src/readers/json.rs:305,2764` — wire-format + decoder; Phase 6's clean-break site (old-shape reader removed). +- `crates/quarto-core/src/project/cache_key.rs:108-141` — + `Pass1KeyInputs`; Phase 7 extends. +- `crates/quarto-core/src/transforms/shortcode_resolve.rs:380-460` — + Lua shortcode dispatch; Phase 5's stamping site. +- Plan 6 §"Dispatch follow-up" — Plan 10's scope-pickup point. +- Plan 9 §"Settled `AnchorRole::Other` policy" — Plan 10 inherits the + policy for Dispatch.
+- Plan 5 (wire format) — Phase 6's migration is on top of Plan 5's + code-4 emission. +- Plan 7a — coordinates on filter-source hashing (Phase 7). +- bd-36fr9 (closes). + +## Test plan + +### Phase 1 (`AnchorRole::Dispatch`) + +- Constructor unit tests parallel to `Anchor::invocation` / + `Anchor::value_source`. +- Serde round-trip test for a `Generated` carrying a `Dispatch` + anchor. +- `preimage_in` asymmetry test: `Generated { by: filter(), from: + [Dispatch(lua_si)] }` → `preimage_in` returns None (Lua bytes are + not body bytes; the writer must not copy them into the parent + file). +- `anchors_with_role(&AnchorRole::Dispatch).count()` returns 1 on + the above shape. + +### Phase 2 (SourceContext Lua-file extension) + +- `add_file` with a `.lua` path produces a FileId; content is + retrievable. +- `FileInformation::map_offset` resolves byte offsets to (row, col) + for Lua source. + +### Phase 3 (Lua bridge FileId threading) + +- A filter that constructs a node (via `pandoc.Str(...)`) produces + a `Generated { by: filter(), from: [Dispatch] }` shape; the + Dispatch anchor's source_info chain-resolves to the filter + file's FileId and the constructed line's byte range. +- `get_caller_source_info` returns the new shape; legacy callers + failing to find a `(path, line)` in `by.data` get a + doc-commented migration message. + +### Phase 4 (`By::filter` signature shrinkage) + +- All migrated unit tests pass with the nullary constructor. +- `By::filter().is_atomic_kind()` still returns true (atomicity + unchanged). + +### Phase 5 (Lua-handler shortcode) + +- A Lua-handler shortcode resolution produces `Generated { by: + shortcode(name), from: [Invocation, Dispatch] }`. Built-in + shortcode resolutions (meta, var) stay `from: [Invocation]` only. + +### Phase 6 (wire format clean break) + +- Writer: emits the new shape (`by.data: null` + Dispatch anchor). +- Reader: accepts only the new shape; old shape removed entirely. 
+- Snapshot test asserting byte-for-byte stability of Lua-filter- + emitting fixtures under the new shape. +- Compile-time confirmation that no reader code references the old + `filter_path` / `line` keys in `by.data`. + +### Phase 7 (cache-key surface) + +- Cache key invalidates when a Lua filter file's content changes. +- Cache key stable when Lua filter file content is unchanged. + +### End-to-end + +- Lua filter raising a `quarto.warn(...)` from line 14 of `foo.lua` + produces a diagnostic whose source range + chain-resolves (via `SourceInfo::resolve_byte_range`) to + `(foo_lua_file_id, line_14_start, line_14_end)`. +- A document with a Lua-handler shortcode (`{{< kbd Alt-X >}}`): + - Resolved inline carries Dispatch anchor pointing at the + handler's Lua source. + - Edit-back round-trip preserves the `{{< kbd Alt-X >}}` token + in the qmd source (Plan 7 Verbatim via the Invocation anchor; + Dispatch is not consulted). + +## Dependencies + +### Hard dependencies + +- **Plan 4** — `AnchorRole` enum. +- **Plan 6** — `Generated`-stamping post-walk helper + (`enrich_or_create`) is the natural point to migrate to the new + shape. Plan 6 must land before Plan 10 implementation, OR Plan 6 + is amended to anticipate the Dispatch shape during + implementation. Recommend the former. +- **Plan 5** — Plan 10's wire-format migration is on top of Plan + 5's code-4 emission. + +### Soft dependencies + +- **Plan 9** — establishes the `AnchorRole::Other` policy that + Dispatch inherits. Doesn't strictly block Plan 10 implementation + (the policy is doc-only), but Plan 9 lands the policy in writing + first. +- **Plan 7a** — coordinates on filter file hashing (Phase 7). + Recommend Plan 10's Phase 7 lands the cache-input shape; Plan 7a's + idempotence cache reuses it. + +### Does not block + +- **Plan 7 implementation** can ship without Plan 10. Plan 7's + writer consults `Invocation` only; Dispatch lands in the + diagnostic UX cycle. 
+ +### Blocks + +- Future Lua-LSP / hub-client diagnostic-clicks-to-source UX work. +- Future extension-author-facing handler-trace tooling. + +## Risk areas + +- **Lua engine bridge complexity.** Touches mlua interop, app-data + context threading, debug.getinfo behavior across Lua versions + (5.1 vs. 5.4 — verify what we use). The mlua side has historically + been a source of subtle bugs; budget extra time for edge cases. + +- **`debug.getinfo` performance.** Calling it on every constructed node + could dominate filter runtime. Mitigation: batch via Plan 6's + post-walk helper if necessary; benchmark. + +- **Wire-format clean-break coordination.** Plan 10's PR must + rebuild WASM and TS in lockstep — the WASM emits the new shape; + TS expects only the new shape. No in-flight client holds an old + WASM expecting the old shape (no npm-published consumer). CI's + `cargo xtask verify` chain catches drift if the rebuild is + incomplete. + +- **SourceContext lifetime / sharing.** Lua files registered eagerly + at `apply_lua_filters` entry need to be available for the + duration of the pipeline. The existing SourceContext sharing + pattern (likely `Arc<Mutex<SourceContext>>` or `&mut` through the pipeline) + must accommodate Lua-file additions mid-pipeline. Verify. + +- **Coordination friction with Plan 7a — resolved.** Both plans + touch `cache_key.rs` and want to hash filter files. Resolved by + agreement: Plan 7a lands first and owns `filter_sources_hash` on + `Pass1KeyInputs`; Plan 10 reuses it. The order is also the + natural one — Plan 7a is independent of Plan 10's Lua-source + registration; Plan 10 benefits from Plan 7a's hashing already + being in place. + +- **Migration tests that touch `By::filter("foo.lua", 42)`.** ~10 + unit tests in `source_info.rs` migrate mechanically; if any are + missed during the signature change, the workspace fails to + compile. Mitigation: the compiler is the safety net here — `cargo + build --workspace` will name every offending site.
+ +## Estimated scope + +| Phase | Lines (rough) | +|---|---| +| 1: `AnchorRole::Dispatch` + Anchor constructor + tests | ~80 | +| 2: SourceContext Lua-file support (probably minimal) | ~40 | +| 3: Lua bridge FileId threading + byte-range computation | ~200 | +| 4: `By::filter` signature shrinkage + call-site migration | ~120 | +| 5: Lua-handler shortcode Dispatch attachment | ~80 | +| 6: Wire-format clean break + tests | ~80 | +| 7: Cache-key smoke test (reuses Plan 7a's `filter_sources_hash`) | ~30 | +| Tests across phases | ~350 | +| **Total** | **~980** | + +Two focused sessions likely; high-complexity due to mlua interop +and the wire-format migration. The Lua engine bridge work in +Phase 3 is the riskiest piece — if `debug.getinfo` ergonomics or +performance surprise, the design changes. + +## Notes + +This plan is the "Lua-source pointing" wing of the provenance epic. +Plan 9 covers metadata-derived attribution; Plan 10 covers +Lua-derived attribution. Both rely on the `AnchorRole::Other` +policy Plan 9 commits to writing. + +After Plan 10, the `Generated.by.data` payload shrinks across all +known kinds: +- `filter`: `{filter_path, line}` → `null` (Plan 10). +- `shortcode`: `{name, lua_path, lua_line}` for Lua handlers → + `{name}` (Plan 10). Built-in handlers unchanged. +- `appendix`: `null` → serialized `AppendixSection` enum (Plan 9). +- `sectionize`, `title-block`, `footnotes`, `appendix-container`, + `tree-sitter-postprocess`, `user-edit`, `include`: `null` + (unchanged). + +The trajectory is "By.data shrinks; the anchor list grows." That's +the right direction — typed source_info pointers in `from` are +strictly more powerful than untyped strings in `by.data`, and they +follow the established `Invocation` / `ValueSource` / `Dispatch` +role discipline. + +### Naming convention + +Uses the `provenance-plan-N-<slug>.md` naming (no `q2-preview-` +prefix) established by Plan 9. The provenance epic has outgrown the +original q2-preview framing.
diff --git a/claude-notes/plans/2026-05-22-provenance-plan-9-valuesource-threading.md b/claude-notes/plans/2026-05-22-provenance-plan-9-valuesource-threading.md new file mode 100644 index 000000000..8f9d7204c --- /dev/null +++ b/claude-notes/plans/2026-05-22-provenance-plan-9-valuesource-threading.md @@ -0,0 +1,561 @@ +# Provenance Plan 9 — ValueSource threading for metadata-derived content + +**Date:** 2026-05-22 +**Branch:** feature/provenance +**Status:** Research plan (pre-implementation; API surface not yet pinned) +**Milestone:** none directly — improves attribution / round-trip provenance + reporting; does not gate M3. + +## Epic context + +Part of the **provenance epic** (Plans 3–10). Plan 6 stamps every +pipeline-synthesized node with `Generated { by, from }`; for most +synthesizers the `from` list is non-empty only when there's a +body-source token to anchor at (shortcode resolutions → `Invocation`). +**Several synthesizers consume metadata values (frontmatter, +`_quarto.yml`, `_metadata.yml`) and currently emit `from: []`** because +the value-side source info is discarded somewhere between the YAML +parser and the synthesizer's stamping point. Plan 9 threads it the +last hop and stamps `ValueSource` anchors on those consumers, so +attribution tooling can trace rendered content back to the YAML keys +that produced it. + +Plan 9 is the **consumer wiring** half of the provenance epic. Plan 6 +stamps the identity (`by`); Plan 9 stamps the origin (`ValueSource` in +`from`) on the metadata-derived subset. Together they make every +pipeline-produced metadata-derived node fully attributable. + +## Goal + +Thread per-value `SourceInfo` to where synthesizers can stamp it as +`ValueSource` anchors. Three target consumers: + +1. **Meta/var shortcode resolutions** (closes bd-129m3) — `{{< meta + footer >}}` → `Generated { by: shortcode("meta"), from: + [Invocation -> token_si, ValueSource -> value_si] }`. +2. 
**DocumentProfile.title → nav-text** (closes bd-8pmq3) — sidebar / + navbar entries built from `profile.title` carry a `ValueSource` + anchor pointing at the source qmd's title metadata bytes. +3. **Appendix container metadata-derived sections** (currently + unowned in beads) — per-section sub-Divs for license, copyright, + citation each stamped with `ValueSource` pointing at + `meta.license` / `meta.copyright` / `meta.citation`. + +Plus the **Plan 7 deferred invariant tests** that depend on at least +one ValueSource consumer existing (the `preimage_in` role-asymmetry +unit test and the appendix-license end-to-end round-trip test). + +When this plan lands, the `Invocation` vs `ValueSource` asymmetry +contract Plan 7 documents has real exercise — there are producers, +the writer correctly walks only the `Invocation` anchors, the +attribution machinery can light up the `ValueSource` data without any +writer changes. + +## Scope + +### In scope + +#### Phase 1 — Infrastructure + +- A provenance-aware conversion API alongside the existing + `config_value_to_inlines(value: &ConfigValue) -> Vec<Inline>` in + `crates/quarto-core/src/transforms/shortcode_resolve.rs:167`. + **API shape (settled per user direction):** + + ```rust + /// Convert a ConfigValue to inline content, returning both the + /// inlines and the source_info pointing at the value's definition + /// site. The caller decides how to stamp the source_info (typically + /// as an `AnchorRole::ValueSource` on a surrounding `Generated`). + /// + /// For `PandocInlines` content, the returned source_info is the + /// outer ConfigValue's; per-leaf source_info is preserved on the + /// inlines themselves and is not flattened. + fn config_value_to_inlines_with_provenance( + value: &ConfigValue, + ) -> (Vec<Inline>, SourceInfo); + ``` + + The existing `config_value_to_inlines` stays for legacy callers + (template values, non-provenance contexts). New consumers route + through the provenance-aware version.
+ +- `DocumentProfile` gains `title_source_info: Option<SourceInfo>` + (per bd-8pmq3's detailed plan: ~30–50 LOC including `extract` + change + `Default` impl at `crates/quarto-core/src/document_profile.rs`). + Uses `#[serde(default, skip_serializing_if = "Option::is_none")]` + — same pattern as `order: Option<_>`. **No + `DOCUMENT_PROFILE_VERSION` bump** (additive `Option<_>` with + default; per document-profile-contract §"Serialization and + versioning"). Update the contract's §Change log. + + **Transparent-wrapper invariant.** `DocumentProfile::extract` + runs at the pre-sugar checkpoint, so it never sees the + sectionize wrapper — `blocks[0]` here is the user's real first + block. If the extractor is later moved past + `SectionizeTransform`, or extended with a "fall back to the + first H1" rule, it MUST descend through transparent wrappers + via `first_in_user_tree` + (`crates/pampa/src/writers/incremental.rs`). See + [`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md) + for the contract. + +- New typed enum `AppendixSection { License, Copyright, Citation }` + in `crates/quarto-source-map/src/source_info.rs`, with serde + derive. Discriminator for `By::appendix` (see Phase 4). + +#### Phase 2 — Meta/var shortcode ValueSource (closes bd-129m3) + +- `MetaShortcodeHandler::resolve` + (`crates/quarto-core/src/transforms/shortcode_resolve.rs:148`) and + the matching `var` handler look up via + `ctx.metadata.get_nested(&key)` which returns a `&ConfigValue` + whose `.source_info` is the value's definition site.
+- Construct resolved inlines via + `config_value_to_inlines_with_provenance`, then stamp the + surrounding `Generated` with both anchors: + + ```rust + let (inlines, value_si) = config_value_to_inlines_with_provenance(value); + let mut gen = SourceInfo::generated(By::shortcode(name)); + gen.append_anchor(AnchorRole::Invocation, Arc::new(token_si)); + gen.append_anchor(AnchorRole::ValueSource, Arc::new(value_si)); + // attach `gen` to each resolved inline + ``` + +- Belt-and-suspenders for `ConfigValueKind::PandocInlines` + (markdown-rich metadata like `title: "**Bold**"`): the `ValueSource` + is attached on the wrapping shape, **not** pushed into every leaf + — keeps Plan 7's multi-inline dedupe rule (which compares + `invocation_anchor()` source_info structurally) trivially correct + with no ValueSource cross-talk. + +#### Phase 3 — DocumentProfile.title → nav-text (closes bd-8pmq3) + +- Update `DocumentProfile::extract` + (`crates/quarto-core/src/document_profile.rs:529`): replace + `title: plain_text_field(meta, "title")` with code that also + captures `meta.get("title")?.source_info.clone()` into the new + `title_source_info` field. +- Three Plan-6 Phase-5 consumer sites attach + `ValueSource(profile.title_source_info)` when present: + - `crates/quarto-core/src/transforms/sidebar_generate.rs:228` + - `crates/quarto-core/src/transforms/sidebar_auto.rs:311` (only + when reading from `profile.title`; file-stem fallback at line 318 + keeps `from: smallvec![]`) + - `crates/quarto-core/src/transforms/navigation_enrich.rs:59` +- Subtitle / description / date / image fields stay out-of-scope + (not consumed by nav sites today). Inline-rich titles + (`ConfigValue::PandocInlines`) preserved by Phase 1's API design. 
+ +#### Phase 4 — Appendix metadata-derived sub-Divs + +- `create_license_section` / `create_copyright_section` / + `create_citation_section` in + `crates/quarto-core/src/transforms/appendix.rs` (lines 270–) read + `meta.get("license")` / `.get("copyright")` / `.get("citation")` — + the source_info is on those `ConfigValue` references and just + needs to ride along. +- **Per-section sub-Div stamping (option A):** each per-section Div + carries + `Generated { by: By::appendix(AppendixSection::License), from: [ValueSource(license_si)] }`, + with the outer container kept at + `Generated { by: By::appendix_container(), from: [] }`. +- **`By::appendix` becomes parameterized** (settled per user + direction): drops the existing nullary `By::appendix()` + constructor in favor of `By::appendix(section: AppendixSection)`. + See §Design decisions for backward-compat rationale (no + production callers; no persisted wire artifacts). +- Need a separate `By::appendix_container()` (or similar) for the + outer wrapper Div, since the wrapper isn't tied to a single + metadata key. Tentative name `By::appendix_container()` — + discriminate during implementation. +- Missing-key cases (no `license` in meta) gracefully skip — no + ValueSource attempt, no synthesizer fires. + +#### Phase 5 — Plan-7 invariant tests (deferred from Plan 7) + +Status: Plan 7 shipped on `feature/provenance` 2026-05-24 (phases +1-7 + 9; Playwright e2e matrix carried separately in `bd-3izo3`). +These tests are now unblocked — they need a real `ValueSource` +consumer (Phase 4's appendix synthesizer) to exercise the +`Invocation`-vs-`ValueSource` asymmetry that Plan 7's writer +implements. Until Phase 4 stamps `ValueSource` anchors on the +appendix synthesizer, the structural-only versions of these tests +remain in Plan 7's `quarto-source-map` test module (the `preimage_in +skips non-Invocation roles` unit test, lines 982-986 of Plan 7). 
+ + +- **`preimage_in` appendix-specific role-asymmetry unit test**: + build `Generated { by: By::appendix(AppendixSection::License), from: [ValueSource(meta_si)] }` + where `meta_si` is `Original { file_id: 0, start: 10, end: 25 }`. + Call `preimage_in(FileId(0))` and assert it returns `None` (NOT the + byte range of the meta-key — that would copy YAML into the body). + Belt-and-suspenders companion to Plan 7's Phase 1 structural test + (which uses generic `By` + `value_source()` and ships without + Plan 9 types); this version pins the same invariant against the + real `By::appendix(...)` shape that Plan 9 introduces. Lives in + `quarto-source-map`'s test module. + +- **Appendix-license end-to-end round-trip test**: build a project + fixture with frontmatter `license: MIT` and a synthesized + appendix (no user-written `:::{.appendix}` block). Run the full + q2-preview pipeline → write back to qmd. Assert: + - (a) no `license: MIT` bytes outside the YAML frontmatter range + (the meta YAML must not leak into the body); + - (b) output qmd is byte-identical to input qmd (round-trip + stability — the synthesized appendix Div is dropped from + output and re-synthesized next pipeline run). + + Covers the Phase 4 shape end-to-end. Belt-and-suspenders against + a future refactor that "leniently" tries `value_source_anchor()` + when `invocation_anchor()` returns None. + +- **Multi-inline dedupe-by-Invocation test**: build a Para with + three inlines each carrying + `Generated { by: shortcode("meta"), from: [Invocation -> token_si, ValueSource -> value_si] }` + (Phase 2 shape). Reconcile against an identical Para. Assert + Plan 7's writer emits the shortcode token bytes ONCE — confirms + dedupe consults `Invocation` only, not the full anchor list, and + doesn't mis-fire if ValueSource source_infos differ. + +- **Inline-level role-asymmetry test**: similar to the unit test + but at the inline level, e.g. 
a `Span` synthesized by some + metadata-aware transform with `[ValueSource only]`. Assert + `preimage_in` returns None at the inline level too. + +#### Phase 6 — Plan 7 cross-reference cleanup + +- Reword Plan 7's §`Invocation` vs `ValueSource` consumer asymmetry + subsection (added by commit `6a2797b6`) to point at Plan 9's + Phase 4 as the canonical example, rather than asserting that the + shape "is stamped today." Small docs change; closes the + wording-vs-reality gap. +- Cross-link Plan 7's §Test plan to Phase 5's tests' new homes. + +### Out of scope (rationale per item) + +- **bd-36fr9 (Dispatch anchor for Lua filter / handler-shortcode)** — + Conceptually adjacent (another anchor role for diagnostic-only + attribution), but the precondition is *register Lua filter files in + `SourceContext` and assign them `FileId`s*, which touches the Lua + engine bridge, cache-key surface, and SourceContext interning. + Sized for its own plan: **Plan 10**. Plan 9 stays + metadata-flavored. + +- **bd-12vrr (callout default-title)** — Callout titles ("Note", + "Tip", "Warning") come from a static list, not from metadata. The + work needs `By::callout()` and an atomicity decision but doesn't + fit the "thread source-info from metadata" thesis. Standalone + follow-up — see bd-12vrr's comment on the popup-menu use case. + +- **bd-1inj0 (code-block decoration synthesizers)** — Filenames and + captions come from chunk options / Attr, not from `ConfigValue`. + Different threading path (`AttrSourceInfo`, currently broken at the + merge layer per bd-1e6a5 / bd-3aolj). Wait for those preexisting + `Attr` bugs to land before doing decoration ValueSource. Standalone + follow-up. + +- **bd-2mxo (MergedConfig::materialize() strips source_info)** — + Real P2 bug, but per the issue itself "Scalar values are preserved + correctly." 
Plan 9's consumers read scalar values (`license: "MIT"`, + `title: "Foo"`); the bug affects map and array container + source_info, which Plan 9 doesn't need at the leaf level. Stays as + a parallel P2 fix that doesn't block Plan 9. (See §Risk areas + for the one corner where map-shaped metadata interacts.) + +- **bd-z2j7o (`WithSourceInfo` wrapper audit)** — Phase 1's + threading work may surface a third or fourth ad-hoc `(value, + source_info)` pair. If so, that's evidence for the audit but Plan 9 + doesn't pre-decide the refactor. + +- **bd-hjv5o (source-location-driven path resolution)** — Different + problem: uses `SourceInfo` to *change behavior* (resolving paths + relative to declaration site), not to *stamp anchors*. + +- **Hub-client UI consumption of ValueSource anchors** (hover-preview + showing "this title came from `_quarto.yml:title`"). The + Rust-side correctness is independently verifiable via tests; the + hover-UX is a separate hub-client plan. + +- **Subtitle / description / date / image source_info on + DocumentProfile** — extend when a consumer needs them; this plan + ships title only (the only field the three nav sites consume). + +## Design decisions (settled) + +- **Per-section sub-Div appendix attribution (option A).** Each of + license, copyright, citation gets its own typed `By::appendix` + variant carrying its own `ValueSource`. Enables fine-grained + hover-attribution UX. Trade-off: more sub-Divs, but the + structural cost is small. + +- **`By::appendix(AppendixSection)` typed enum constructor.** Settled + over the alternatives (string-keyed `by.data`, `&'static str` + parameter) because the discriminator is load-bearing and a typed + enum is checked at the compiler. Adding new appendix-section + variants in the future is a deliberate enum change — the right + kind of friction. + +- **No backward-compat carve-out for `By::appendix`.** The shape + change is clean. Reasons (verified): + 1. 
No production callers today — only test sites in + `source_info.rs` itself. `transforms/appendix.rs` still emits + `SourceInfo::default()`; Plan 6 will add stamping after this + plan finalizes the constructor. + 2. `By` is workspace-internal Rust — no FFI, no extension SDK, + no TS-side mirror. The hub-client's TS hand-mirror is + `atomicCustomNodes` for `CustomNode` types, not `By` kinds. + 3. Wire format: `By` serializes to `{kind, data}` via serde. No + persisted artifact contains `By::appendix` today (Plan 6 + stamping hasn't shipped). No migration needed. + +- **Idiomatic API: `(inlines, source_info)` returned for caller to + wrap.** `config_value_to_inlines_with_provenance` does not stamp + `Generated` itself, because `by` varies by caller (meta-shortcode + vs. appendix sub-Div have different `By` kinds). Parallels how + other source-info helpers in this codebase work. + +- **`AnchorRole::Other` policy explicit (per user direction):** the + `preimage_in` walker walks `Invocation` only; **all other roles, + existing or future, are not consulted by the writer.** Documents + the intent so an extension introducing `AnchorRole::Other("preimage-source")` + knows it'll be ignored. Stated in the doc-comment on + `preimage_in` and re-asserted in §`Invocation` vs `ValueSource` + consumer asymmetry in Plan 7. + +- **`ValueSource` is wrapper-level for `PandocInlines`-shaped + metadata, not per-leaf.** Phase 2 attaches ValueSource on the + surrounding `Generated` (one wrapping each resolved inline), not + inside the rich-content inlines themselves. Two reasons: + (a) keeps Plan 7's multi-inline dedupe rule clean (it consults + Invocation, not anchors on inlines); + (b) maps the user mental model: "this shortcode resolution came + from there" — not "this individual Str came from there". 
+ +- **Plan posture: research plan.** This document settles the API + shape (constructors, function signatures, enum variants); it does + not yet commit to the implementation order or unit-test names. + A subsequent review pass converts it to a development plan with + checklisted phases. + +## API surface to settle (research-plan deliverables) + +By the time this plan converts to a development plan, the following +must be pinned: + +1. **`config_value_to_inlines_with_provenance` signature** — exact + return type, behavior for nil values, behavior for + `PandocInlines` (returns `(inlines, value.source_info.clone())`, + confirmed). Edge: `Concat`-shaped ConfigValue source_info — does + the consumer get the Concat or just the start range? Recommend + passing the full `source_info` regardless of shape; consumers + that need a single range can call `resolve_byte_range`. + +2. **`AppendixSection` enum variants** — `License`, `Copyright`, + `Citation` are the three sections `transforms/appendix.rs` knows + about today. If there are more synthesized sections (or planned + ones), enumerate them now. Verify against + `crates/quarto-core/src/transforms/appendix.rs:135-170`. + +3. **`By::appendix_container` (or equivalent) for the outer + wrapper** — name and signature. `By::appendix_container()` is + tentative; could also be `By::appendix(AppendixSection::Container)` + if treating "container" as a section variant feels right. Pick. + +4. **`DocumentProfile.title_source_info` field placement and + accessor surface** — direct field access (current convention) or + a typed accessor (`fn title_with_source(&self) -> Option<(&str, + &SourceInfo)>`)? + +5. **`AnchorRole::Other` doc-comment text** — exact wording of the + "future roles default to non-walked" policy. Lives on + `AnchorRole::Other` and on `SourceInfo::preimage_in`. 
+ +## Open questions for implementation + +- **Granularity of `ValueSource` for nested `meta.license` shapes.** + YAML like `license: {name: MIT, url: ...}` produces a + `ConfigValueKind::Map`. bd-2mxo notes the merge step strips map + container source_info. Recommended approach for Phase 4: anchor + at the **first scalar leaf** (`name`) when the value is map-shaped, + falling back to the outer key when materialize has already + stripped the container. Notes the limitation; full fix waits for + bd-2mxo. + +- **Multi-anchor cost on Phase 2's two-anchor shape.** Every + meta-shortcode resolution gains a second anchor. Memory: 2 × + `Anchor` per inline. With `SmallVec<[Anchor; 2]>` already in place + (Plan 4), this stays on the stack. Verify no allocation regression + in a perf-sensitive document benchmark. + +- **Cross-reference test fixtures for Phase 4.** The + appendix-license e2e fixture needs to exercise the + YAML-meta-only form (not user-written `:::{.appendix}`). Phase 4 + needs to ensure the synthesizer fires only on the metadata path, + not on user-written appendix blocks. Confirm by reading + `appendix.rs:135-170` carefully during implementation. + +- **`PandocInlines`-shaped metadata behavior in Phase 2.** When + `title: "**Bold**"` resolves to `[Strong[Str], Space, Str]`, each + resolved inline gets a wrapping `Generated` with the ValueSource + on the wrapper. The Bold's children (Str) themselves carry their + own source_info (the parsed positions inside the YAML string). + Test: an edit to the resolved Bold inline goes through Plan 7's + soft-drop because the wrapper is atomic-kind (shortcode); the + user-edit is reverted with Q-3-42. Confirm. + +## References + +- `crates/quarto-pandoc-types/src/config_value.rs:155,170` — + `ConfigValue.source_info` and `ConfigMapEntry.key_source` (already + in place; Plan 9 just propagates them to consumers). 
+- `crates/quarto-core/src/transforms/shortcode_resolve.rs:148-167` — + `MetaShortcodeHandler::resolve` and `config_value_to_inlines`; + Phase 1/2's primary edit site. +- `crates/quarto-core/src/transforms/appendix.rs:135-260` — + `create_*_section` functions; Phase 4's edit site. +- `crates/quarto-core/src/document_profile.rs:271,487,529` — + `DocumentProfile` field declaration, Default impl, `extract` + helper; Phase 3's edit site (+ doc-contract Change Log). +- `crates/quarto-core/src/transforms/sidebar_generate.rs:228`, + `sidebar_auto.rs:311,318`, `navigation_enrich.rs:59` — Plan-6 + Phase-5 nav consumers; Phase 3's stamping sites. +- `crates/quarto-source-map/src/source_info.rs:91-118` — + `AnchorRole` enum (`Invocation`, `ValueSource`, `Other`); + Phase 1 adds `AppendixSection` here. +- `crates/quarto-source-map/src/source_info.rs:529` — `By::appendix` + constructor; Phase 4 modifies (signature change). +- Plan 6 §"ValueSource follow-up" (line 509-547) — Plan 9's + scope-pickup point. +- Plan 7 §`Invocation` vs `ValueSource` consumer asymmetry + (added by commit `6a2797b6`, not yet on `feature/provenance`) + — Plan 9 Phase 5 lands the tests; Phase 6 cleans up wording. +- bd-129m3 (closes), bd-8pmq3 (closes). + +## Test plan + +(See Phase 5 above for Plan-7-deferred tests.) Additional unit / +integration tests by phase: + +- **Phase 1**: `config_value_to_inlines_with_provenance` unit tests + for scalar, bool, int, `PandocInlines`, `PandocBlocks` (rejection + in inline context), missing key (None returned), nested via + `get_nested`. + +- **Phase 2**: meta-shortcode resolver produces two-anchor shape; + `Invocation` source_info matches the token range; `ValueSource` + source_info matches the metadata-key value range. `var` shortcode + symmetrically. Test with both flat-string and PandocInlines + metadata values. 
+ +- **Phase 3**: each of the three nav consumer sites produces + `Generated` with `from: [ValueSource(profile.title_source_info)]` + when title is present; produces `from: []` when title is None. + Fixture extends Plan 6's multi-page audit-completion test. + +- **Phase 4**: each per-section sub-Div carries its own ValueSource; + missing-key cases gracefully degrade (no Div, no panic); + outer-container Div carries `Generated { by: + By::appendix_container(), from: [] }`. Audit-completion test + (Plan 6) extended. + +- **Phase 5**: see Phase 5 description above. + +## Dependencies + +### Hard dependencies + +- **Plan 6** — establishes `Generated` stamping convention; Plan 9 + builds the consumer wiring on top. Plan 6 stamps with `from: []`; + Plan 9 enriches to `from: [ValueSource]` (Phases 3 and 4) or + `from: [Invocation, ValueSource]` (Phase 2). +- **Plan 4** — `AnchorRole::ValueSource` already exists; this plan + consumes it. + +### Soft dependencies + +- **Plan 7** — Phase 5's appendix-license e2e round-trip test needs + Plan 7's writer + soft-drop infrastructure. The unit-level + asymmetry test (Phase 5 first bullet) doesn't. +- **bd-2mxo** — affects map/array container source_info; relevant + only for nested metadata shapes (`license: {name: MIT, ...}`). + Workaround in Phase 4 lets Plan 9 ship without bd-2mxo. + +### Blocks + +- Future hub-client hover-attribution UX work (separate plan, not + yet scoped). + +### Does not block + +- **Plan 7 implementation** can start without Plan 9 — Plan 7 ships + without ValueSource anywhere; its `Invocation` vs `ValueSource` + asymmetry section is forward-looking. Plan 9 Phase 6 retroactively + cleans up Plan 7's wording. + +## Risk areas + +- **API shape churn between Phases 1, 2, 4.** All three depend on + the `config_value_to_inlines_with_provenance` decision. If the + API shape changes mid-implementation, all three phases revisit. 
+ Mitigation: settle the API as part of this research plan (above); + the development plan starts with the API frozen. + +- **Map-shaped metadata interaction with bd-2mxo.** Phase 4's + "first scalar leaf" workaround degrades gracefully but produces a + less-precise ValueSource for nested licenses. Acceptable for v1; + bd-2mxo's fix tightens later. Document as a known limitation in + the `By::appendix` doc-comment. + +- **Two-anchor cost in Phase 2.** Every meta-shortcode resolution + gains a second anchor. `SmallVec<[Anchor; 2]>` keeps it on the + stack. Add a perf-sensitivity check during implementation if a + document heavy in meta-shortcodes regresses. + +- **Forgetting `AnchorRole::Other` policy in extensions.** A future + extension that adds `Other("attribution-source")` and expects + `preimage_in` to walk it would silently be ignored. Mitigation: + the policy is doc-commented at multiple sites; reviewers catch + the case if it comes up. + +## Estimated scope + +| Phase | Lines (rough) | +|---|---| +| 1: Infrastructure (`config_value_to_inlines_with_provenance` + `DocumentProfile.title_source_info` + `AppendixSection` enum) | ~150 | +| 2: Meta/var shortcode (bd-129m3) | ~80 | +| 3: Nav-text ValueSource (bd-8pmq3) | ~60 | +| 4: Appendix sub-Div ValueSource | ~180 | +| 5: Plan-7 invariant tests | ~120 | +| 6: Plan 7 docs reword | ~20 | +| Tests across phases | ~250 | +| **Total** | **~860** | + +One focused session, possibly two if Phase 4's per-section +discrimination surfaces unexpected interactions. Comparable scope to +Plan 6. + +## Notes + +This plan is the "consumer wiring" half of the provenance epic. Plan 6 +stamped the *identity* (`by`) on synthesizers; Plan 9 stamps the +*origin* (`ValueSource` in `from`) on the metadata-derived subset. +Together they make every pipeline-produced metadata-derived node +fully attributable. + +Future plans in the same family: +- **Plan 10** — Dispatch anchor for Lua filter / handler-shortcode + (closes bd-36fr9). 
Requires Lua-file registration in SourceContext. +- **bd-12vrr** and **bd-1inj0** — standalone follow-ups for + synthesizers whose source isn't metadata-shaped. + +### File naming convention + +This is the first plan to use the `provenance-plan-N-<slug>.md` +naming convention (dropping the `q2-preview-` prefix). The +provenance epic has outgrown the original q2-preview framing — it +serves attribution, round-trip writing, error reporting, and (via +the Dispatch role in Plan 10) Lua-source pointing. Plans 3–8 keep +their existing q2-preview filenames for git-history continuity; +plans 9+ adopt the new convention. diff --git a/claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md b/claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md new file mode 100644 index 000000000..095feb422 --- /dev/null +++ b/claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md @@ -0,0 +1,511 @@ +# Plan 7b — Provenance test-o-rama (Plan 7 deferred-tests consolidation) + +**Date:** 2026-05-24 +**Branch:** feature/provenance (or fresh worktree branched from it) +**Status:** Implementation plan +**Milestone:** none directly — quality bar for M3 (Plan 7 already + shipped the functionality); intended to run **before** Plan 7a so + the writer's round-trip contract is fully exercised before + layering runtime idempotence detection on top. + +## Epic context + +Part of the **provenance epic** (Plans 3–8 + 7a + this). Plan 7 +landed the incremental writer + bridge + consumer migrations on +`feature/provenance` (2026-05-24 session). During that session, +four test items were deferred — three Rust-side unit tests where +the plan author hedged about fixture construction cost, and the +Playwright e2e scenario matrix that was scoped out for context +budget. Plan 7b consolidates all four into a single deliberate +testing pass so the deferrals don't decay into permanent gaps. + +Plan 7a (runtime user-filter idempotence detection + `Q-3-44`) +layers on top of Plan 7's writer contract.
Running Plan 7b first +means Plan 7a's contract surface is fully pinned before new +detection mechanisms get added. + +## Hand-off start point + +1. Worktree: `feature/provenance` at + `/Users/gordon/src/q2/.worktrees/provenance/` (or create a fresh + one branched off it — full `cargo xtask verify` is green at + `2f91ee0e`). +2. `claude-notes/plans/CURRENT.md` points here. +3. **Start with Phase 1** — Rust unit tests, ~120 LOC, self-contained. + It's the cheapest and derisks the Playwright phases. Don't open + Phase 2 or 3 until Phase 1 is green. +4. The work is *test coverage*, not new behaviour. Each Plan 7b + test confirms an invariant Plan 7 already implements; an + unexpected test failure is a regression — file a beads ticket + and don't change Plan 7's code without escalating. +5. Don't push without explicit user permission. + +## Goal + +Bring Plan 7's invariants under full delivery coverage: + +1. **Rust unit tests** — close three gaps in + `crates/pampa/src/writers/incremental.rs`'s coarsen test module. +2. **Playwright e2e specs** — the ten scenarios in `bd-3izo3`, + spread across `hub-client/e2e/` and `q2-preview-spa/e2e/`, so + the write-back path is exercised through a real browser session. +3. **Cleanup** — close `bd-3izo3` once the e2e matrix lands; flip + Plan 7's three deferred checkboxes to done. + +No new product code. No new diagnostics. No new design surface. +Pure test coverage. + +## Scope + +### In scope + +#### Phase 1 — Rust unit tests (`crates/pampa/src/writers/incremental.rs`) + +**Repo facts the implementer needs:** + +- Test module lives at the bottom of + `crates/pampa/src/writers/incremental.rs` (search for + `#[cfg(test)]`). Mirror the existing tests' style and import set. +- Reference test to model after: + `keep_before_with_atomic_kind_generated_no_anchor_emits_omit` + (around line 1589 — search for the function name, line numbers + drift). 
It builds a `Generated { by: filter(...), from: [] }`, + a one-block AST, a `ReconciliationPlan { block_alignments: + vec![BlockAlignment::KeepBefore(0)], .. }`, calls `coarsen(qmd, + ast, ast, &plan, &mut warnings)`, asserts the entry shape. +- Soft-drop reference tests (asserting Q-3-42 / Q-3-43 codes): + search the same file for `warnings[0].code.as_deref(), Some("Q-3-42")` + and `Q-3-43`; ~6 sibling tests cover the existing alignment + paths. +- `compute_blocks_hash_fresh` + `compute_meta_hash_fresh_excluding_rendered` + live in `crates/quarto-ast-reconcile/src/hash.rs`; the + Plan-3 idempotence tests at `crates/quarto-core/tests/idempotence.rs` + use them — that's the pattern for the writer-lossless test. +- Run `cargo nextest run -p pampa coarsen` after each new test; + full pampa suite when the phase is ready. + +All three follow the existing test-module conventions in that +file. Each is self-contained, no fixtures outside the test module. + +- **Writer-lossless baseline test for Plan-6/7 shapes**. The + existing `parse(write(ast))` integration tests cover the common + Original-SI shapes but not Generated / atomic-CustomNode / + synthesized-footnotes-container shapes (those don't appear from + raw qmd parse). Construct each shape directly via + `SourceInfo::generated(...)` + manual node builders, write it, + parse, then assert `compute_blocks_hash_fresh` + the + `_excluding_rendered` meta variant match. Lives in + `quarto-source-map`'s test module (or alongside Plan 6's writer + tests). 
One assertion per shape: + - `Generated { by: shortcode("meta"), from: [Invocation] }` + (atomic resolution) + - `Generated { by: filter(...), from: [] }` (no Invocation — + atomic-kind drop) + - `Generated { by: sectionize, from: [] }` with source-bearing + children (non-atomic wrapper — Transparent walk) + - Atomic CustomNode container (no preimage) + + **Transparent-wrapper-at-top fixture (added 2026-05-25 from the + closure-gap audit).** The shapes above all assume the wrapper is + *not* `blocks[0]`. Add one more fixture where the sectionize + wrapper IS `blocks[0]` (the production q2-preview shape), with + the user's real content nested inside, AND with an inline edit + to a wrapped child. This exercises the descent helpers + (`derive_target_file_id`, `first_target_anchored_start_in`, + `coarsen`'s RecurseIntoContainer Transparent-recursion arm — the + three sites the closure-gap fixes touched). The existing test + `sectionize_wrapper_preserves_frontmatter_after_inner_edit` in + `crates/pampa/tests/incremental_writer_tests.rs` is the + reference shape; extend it into a writer-lossless variant if + Plan 7b lands a separate test module. See + [`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md) + for the underlying invariant. + +- **Soft-drop interaction test (compound case)**. Build a Para + with two inlines: one atomic-resolved (Generated by shortcode, + Invocation in target) and one editable (Original). New AST + mutates *both*: the atomic resolution to a different + `Str("new")` and the editable inline's text. Assert: + - non-atomic edit lands in output qmd, + - atomic edit becomes `Q-3-42`, + - emitted qmd preserves the shortcode token bytes verbatim, + - exactly one warning emitted. + +- **Filter-construction soft-drop test (UseAfter path)**. 
Mirror + of `keep_before_with_atomic_kind_generated_no_anchor_emits_omit` + but exercising the *edit* path: original inline is `Generated { + by: filter("emoji.lua", 14), from: [] }`; new inline is a + different `Str`. Build a single-Para AST per side, build a + `ReconciliationPlan` with `BlockAlignment::RecurseIntoContainer + { container: 0, inline_alignments: vec![InlineAlignment::UseAfter(0)] }`, + call `coarsen`, assert `Omit` + `Q-3-43`. ~40 LOC; structurally + identical to the sibling KeepBefore test at line 1589. + +- **Idempotence: write-twice-byte-equal**. The existing tests + pin a single round-trip (`parse(write(ast)) == ast`). They do + *not* pin that running the writer on its own output produces + the same bytes again. This is load-bearing for sectionized and + footnotes-container documents because `compute_separator` + silently drops its "use the original gap" optimization across + `Transparent` children: child entries carry `orig_idx: None` + (see `incremental.rs`'s `clear_orig_idx_for_transparent_child`), + so the second write recomputes separators on standard-newline + fallback. The result is still semantically correct; the + question is whether it's *byte-identical* to the first write. + + Three fixtures, each tested under + `tests/incremental_idempotence.rs`: + + 1. **Sectionized document.** Original qmd: a doc with at least + two headings so `SectionizeTransform` synthesizes two + section-Divs. Pass through `parse → transform → write` + to get `qmd_1`. Repeat: parse `qmd_1` → transform → + write → `qmd_2`. Assert `qmd_1 == qmd_2`. + 2. **Footnotes container document.** Doc with at least one + footnote so the footnotes-container Div synthesizes. + Same two-pass assertion. + 3. **Adjacent Transparent wrappers.** Doc whose first two + blocks both synthesize as Transparent (sectionize at + blocks[0], footnotes-container at blocks[1]). 
Exercises + the case where two consecutive top-level entries are both + `Transparent`, making the separator-between-blocks question + non-trivial. + + Test driver pattern: use `pampa::parse_qmd_to_ast` for parse, + the existing q2-preview pipeline for transform, and + `incremental_write(qmd, ast, ast, trivial_plan)` for write + (where `trivial_plan` is the all-KeepBefore plan against the + same AST on both sides). Trivial plan ensures we're testing the + writer's *idempotence*, not the reconciler's behaviour. + + ~120 LOC, including the test helper for building a trivial + reconciliation plan from an AST. + +- **Cross-file `Original` Rewrite catch-all coverage.** Today's + test `keep_before_cross_file_original_falls_back_to_rewrite` + (`incremental.rs:1687`) exercises the catch-all minimally — a + hand-constructed `Original` pointing at a different file, no + children. Add richer fixtures so the catch-all path that Plan 8 + will land on top of has explicit pre-Plan-8 coverage. + + **Why this matters structurally** (added 2026-05-25): the + catch-all is now the only producer of `Rewrite` entries reached + via `coarsen_keep_before_block`. After the + [`CoarsenedEntry` self-contained refactor](./2026-05-25-coarsened-entry-self-contained.md) + it pre-computes block text at coarsen time — that change closed + a latent panic in the Transparent-recursion path, so the + writer-lossless baseline above should include at least one + fixture that fires this catch-all *inside* a Transparent wrapper + (e.g. a sectionize Div containing a cross-file `Original` child). + Without that fixture, regressions in the self-contained invariant + would pass the tests but break q2-preview. + + 1. **Substring rooted outside target.** `Substring` whose + `parent` is `Original` in a non-target file. `preimage_in` + walks the parent → returns `None`. Assert Rewrite catch-all. + 2. **Concat with one piece outside target.** `Concat` of two + pieces, one in target and one in a non-target file.
+ `preimage_in` requires *every* piece to resolve → + `None`. Assert Rewrite catch-all. + 3. **Paragraph with mixed-file inline content.** Para whose + `source_info` is `Original` in target but whose inlines have + `Original` in a non-target file (no wrapper). Coarsen + classifies the Para as Verbatim at the block level + (preimage in target), so the inline mixing doesn't trip the + catch-all — but the test still pins that current behaviour + and documents what Plan 8 will change. + + ~80 LOC across three small tests; each ~25 LOC. Place + alongside `keep_before_cross_file_original_falls_back_to_rewrite`. + +#### Phase 2 — Hub-client Playwright specs (`hub-client/e2e/`) + +**Repo facts the implementer needs:** + +- Existing specs to model on: `hub-client/e2e/smoke.spec.ts`, + `hub-client/e2e/preview-extraction.spec.ts`, + `hub-client/e2e/q2-debug-render-components.spec.ts`. +- Config: `hub-client/playwright.config.ts`. CI gating: these run + under `cargo xtask verify --e2e`; they are NOT in the default + verify pipeline (per `playwright.config.ts` design — keep it + that way). +- The hub-client preview uses Monaco; AST mutations happen via + iframe `setAst` (now wired through Plan 7's `handleSetAst`). + Drive edits by sending postMessage from the spec, or by + programmatically calling `setAst` on the preview iframe. +- Soft-drop diagnostics surface in the existing hub-client + diagnostics panel (Plan 7 commit `a0a4c7c8` plumbed warnings + into `pendingWriteWarningsRef` → `onDiagnosticsChange`). +- Saved qmd is observable from the spec via the project's file + system (Playwright tests spawn a real hub against a tempdir + fixture project). + +Five specs. Each spawns the hub, opens a fixture project, performs +an AST mutation through the live preview, and reads the round-tripped +qmd back from disk (or via `getFileContent`) to assert the +preservation contract. Each spec is ~60 LOC including fixture setup. + +- **sectionized doc + edit paragraph**. 
Fixture has H1 + H2 + + multiple Paras. Edit one paragraph through the preview. Assert + saved qmd preserves heading structure verbatim and contains the + edited text. +- **single-inline shortcode + edit different paragraph**. Fixture + has `{{< meta title >}}` inline in Para 1; edit Para 2. Assert + Para 1's qmd is byte-identical to the original (token preserved). +- **multi-inline shortcode + edit different paragraph in same Para**. + Two `{{< meta title >}}` inlines in Para 1; an editable inline + also in Para 1; edit only the editable inline. Assert the + shortcode token appears exactly once at each position (dedupe + by-Invocation) and the editable inline reflects the change. +- **edit resolved shortcode title → Q-3-42 + byte-equal no-op**. + Fixture has resolved shortcode title. Edit the rendered title + through the preview. Assert: `Q-3-42` surfaces in the + diagnostics panel, saved qmd is byte-equal to original. +- **edit inside synthesized footnotes container → Q-3-43 + + container regenerates**. Fixture has footnote refs in body that + produce a synthesized appendix footnotes Div. Edit a Str inside + the synthesized container. Assert: `Q-3-43` surfaces, the saved + qmd doesn't carry the synthesized container text, the next + render re-synthesizes the container. + +#### Phase 3 — SPA Playwright specs (`q2-preview-spa/e2e/`) + +**Repo facts the implementer needs:** + +- Server-spawn helper: `q2-preview-spa/e2e/helpers/previewServer.ts` + (`startPreviewServer({ fixtureFiles: [...] })`). Each test + spawns its own server against a tempdir; tests run in parallel. +- Reference spec: `q2-preview-spa/e2e/basic-preview.spec.ts` — + full pattern: server lifecycle, `waitForInnerHeading` polling + the iframe DOM, edit-then-rerender flow. +- Render-counter gauge: `window.__renderTicks` (Phase D.3 / + bd-0mji per `PreviewApp.tsx`). The echo-prevention spec depends + on it specifically; assert it bumps exactly once per edit. 
+- Soft-drop warnings surface in `DiagnosticStrip` (Plan 7 commit + `20f4b0ff`) — a small overlay anchored bottom-right of the + preview pane. Assert by querying for the strip's text content. +- SPA write-back path: `handleSetAst` in `q2-preview-spa/src/PreviewApp.tsx` + → `incrementalWriteQmd(originalQmd, baselineAst, newAst)` → + `updateFileContent(path, qmd)`. Echo-prevention via FNV-1a hash + in `lastEmittedRef`. The hash function (`fnv1aHex`) lives in + the same file. +- Driving edits from the spec: send a postMessage with + `{ type: 'SET_AST', ast: ... }` into the iframe (whatever + channel the existing iframe handler reads). + +Five specs. Use the existing `startPreviewServer` pattern. Each ~60 LOC. + +- **project + edit paragraph round-trip**. Multi-file project, + active page edited via the iframe's `setAst` postMessage. Assert + the edited qmd lands on disk + `__renderTicks` bumped exactly + once (no echo loop). +- **single-file mode + edit paragraph round-trip** (single-file + mode is `bd-tnm3k`). Same shape, single-file fixture. +- **edit shortcode → Q-3-42 in DiagnosticStrip**. Programmatically + drive `setAst` to mutate a shortcode-resolved inline. Assert + `DiagnosticStrip` mounts and shows the Q-3-42 entry; saved qmd + byte-equal. +- **mixed atomic + non-atomic edit**. Fixture has both an + atomic-resolved inline and an editable inline in the same Para. + setAst mutates both. Assert non-atomic edit lands on disk + + Q-3-42 in the strip + atomic source unchanged. +- **content-match echo-prevention fixture**. Fixture: edit a Para + through setAst. Assert exactly one `__renderTicks` bump (the + emitted qmd's FNV-1a hash matches the echoed `onFileContent`, + the ref is consumed, no re-render). Also: write a *different* + file in the same project while the echo is pending — assert that + one *does* trigger a re-render (the gate is per-(path, hash), + not global). Pins the `lastEmittedRef` contract. 
+ +#### Phase 4 — Cleanup + +- Close `bd-3izo3` with reason "landed via Plan 7b Phase 2 + 3". +- Flip the three Phase-2 deferred checkboxes in Plan 7 to `[x]` + with one-line references to the Plan-7b commits. +- Update `hub-client/changelog.md` with a single 7b entry. + +### Out of scope + +- **Item #5 from the Plan 7 review (manual smokes)** — the user + is running these by hand. Not a Claude-driven test. +- **Plan 7a's runtime idempotence detection** — that's a separate + plan with its own test surface. +- **Plan 9's `preimage_in` role-asymmetry e2e test** — owned by + Plan 9 Phase 5; depends on a real `ValueSource` consumer that + doesn't exist until Plan 9 lands. +- **Refactor of any test infrastructure** — use the patterns + already established in `hub-client/e2e/` and + `q2-preview-spa/e2e/`; don't introduce a new harness. + +## Design decisions (settled) + +- **Three phases, separable commits.** Each phase ships as one + commit (or one per spec if a phase author wants more granularity). + Phase 1 is the cheapest and most self-contained; ship it first + to derisk the rest. + +- **No new fixtures library.** Each spec inlines its qmd fixture + in the test file. Plan 7b's tests are self-documenting via the + literal fixtures right next to the assertions. A shared fixture + library would be premature abstraction. + +- **`__renderTicks` is the SPA's truth gauge.** The render counter + on `window.__renderTicks` already exists (Phase D.3 / bd-0mji + per `PreviewApp.tsx` comments). The echo-prevention spec relies + on it specifically. If a spec needs counts at a different layer + (e.g. samod patches received), add a new gauge — don't reuse + `__renderTicks` for two distinct meanings. + +- **E2e gating stays `--e2e` on CI.** Per `q2-preview-spa/playwright.config.ts`'s + existing convention; specs run under `cargo xtask verify --e2e` + but not the default verify pipeline. Plan 7b doesn't change this. + +## Test plan + +This *is* the test plan. 
Each Phase-1 entry asserts a Rust-side +invariant; each Phase-2/3 entry asserts an end-to-end delivery +invariant. The phase descriptions above name the exact assertion +per spec. + +TDD order within each phase: + +- Phase 1: write the test, run it, watch it fail (it won't — + Phase 1's mechanics are already shipped; the test confirms + coverage, not new behavior). If a test unexpectedly *fails*, + that's a regression in Plan 7's writer and gets a beads issue + before the test lands. +- Phase 2/3: same — Plan 7's surfaces already produce the right + end-to-end behavior; these specs pin it. Unexpected failures + block the spec and surface as regression beads. + +## Dependencies + +- **Plan 7 (shipped, commit `4ee51e4a` on `feature/provenance`).** + The writer + bridge + consumer migrations + manual smoke are + the *unit under test* for this plan. +- **`bd-3izo3`** — already-filed beads for the Playwright matrix. + Plan 7b consolidates and replaces it; close on landing Phase 2+3. + +No upstream dependencies — every contract Plan 7b exercises is +already in `feature/provenance`. + +## Risk areas + +- **Browser-flake on the SPA echo-prevention spec.** samod's + echo latency varies; the assertion "exactly one + `__renderTicks` bump after an edit" needs a generous timeout + + a poll-stable-for-Nms shape. Use the same harness pattern + as `basic-preview.spec.ts`'s 30s `waitForFunction`. + +- **Synthesized-footnotes Q-3-43 fixture is fragile.** The footnotes + container is generated by an AppendixTransform that runs late + in the pipeline; the fixture needs at least one `[^1]` ref in + the body. Confirm via `q2 render` before relying on the shape + in a test. + +- **Multi-inline dedupe spec depends on shortcode reuse.** The + fixture needs two `{{< meta title >}}` invocations in the same + Para; assert dedupe operates on `Invocation` source-info + identity, not full anchor list. 
If the spec doesn't catch a + regression because the dedupe condition is over-eager, add a + second spec where two shortcodes have different + `value_source` anchors but identical `invocation` anchors and + assert dedupe still fires (forward-compat with Plan 9). + +## Estimated scope + +| Component | LOC (rough) | +|------------------------------------------------------------------------|-------------| +| Phase 1 — 3 Rust unit tests + helpers | ~120 | +| Phase 1 — idempotence (write-twice) tests, 3 fixtures + plan helper | ~120 | +| Phase 1 — cross-file Original Rewrite catch-all, 3 small tests | ~80 | +| Phase 2 — 5 hub-client Playwright specs | ~300 | +| Phase 3 — 5 SPA Playwright specs | ~300 | +| Phase 4 — cleanup (checkboxes + changelog) | ~10 | +| **Total** | **~930** | + +Time estimate: one focused session per phase (Phase 1 a few hours, +Phase 2 and 3 a day each). Total ~3 days of work, parallelizable +across phases. + +## Implementation checklist + +### Phase 1 — Rust unit tests + +- [ ] Writer-lossless baseline fixture test for Generated / + atomic-CustomNode / synthesized-footnotes shapes — one + sub-test per shape; assert + `compute_blocks_hash_fresh(parse(write(ast))) == compute_blocks_hash_fresh(ast)`. +- [ ] Soft-drop interaction test: Para with one atomic-resolved + inline + one editable inline; new AST mutates both; assert + non-atomic edit lands + exactly one Q-3-42 + atomic source + preserved. +- [ ] Filter-construction soft-drop UseAfter test in + `crates/pampa/src/writers/incremental.rs` test module; + mirror of `keep_before_with_atomic_kind_generated_no_anchor_emits_omit` + but via `RecurseIntoContainer` + `InlineAlignment::UseAfter`; + assert `Omit` + `Q-3-43`. +- [ ] Idempotence (write-twice-byte-equal) tests: + `idempotence_sectionized_document`, + `idempotence_footnotes_container`, + `idempotence_adjacent_transparent_wrappers`. Test file + `crates/pampa/tests/incremental_idempotence.rs`. 
Each: + `qmd → ast → write1; parse(write1) → ast' → write2; + assert write1 == write2`. Use a trivial all-`KeepBefore` + plan against the same AST on both sides so we're testing + the writer's idempotence, not the reconciler's. +- [ ] Cross-file `Original` Rewrite catch-all tests: + `keep_before_substring_outside_target_falls_back_to_rewrite`, + `keep_before_concat_with_outside_target_piece_falls_back_to_rewrite`, + `para_with_mixed_file_inline_content_keeps_block_verbatim`. + Place alongside the existing + `keep_before_cross_file_original_falls_back_to_rewrite` + (`incremental.rs:1687`). Each ~25 LOC. +- [ ] `cargo nextest run -p pampa` green (incremental writer + tests pass; no regression elsewhere). +- [ ] `cargo xtask verify --skip-hub-build` green. + +### Phase 2 — Hub-client Playwright specs + +- [ ] `hub-client/e2e/plan7-sectionized-edit.spec.ts` +- [ ] `hub-client/e2e/plan7-shortcode-preserve.spec.ts` +- [ ] `hub-client/e2e/plan7-multi-shortcode-dedupe.spec.ts` +- [ ] `hub-client/e2e/plan7-q342-shortcode-edit.spec.ts` +- [ ] `hub-client/e2e/plan7-q343-footnotes-regenerate.spec.ts` +- [ ] `cargo xtask verify --e2e` green on the hub-client leg. + +### Phase 3 — SPA Playwright specs + +- [ ] `q2-preview-spa/e2e/plan7-project-edit.spec.ts` +- [ ] `q2-preview-spa/e2e/plan7-single-file-edit.spec.ts` +- [ ] `q2-preview-spa/e2e/plan7-q342-diagnostic-strip.spec.ts` +- [ ] `q2-preview-spa/e2e/plan7-mixed-atomic-nonatomic.spec.ts` +- [ ] `q2-preview-spa/e2e/plan7-echo-prevention.spec.ts` +- [ ] `cargo xtask verify --e2e` green on the SPA leg. + +### Phase 4 — Cleanup + +- [ ] `br close bd-3izo3 --reason "landed via Plan 7b Phase 2+3"` +- [ ] Plan 7 (`claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md`): + flip the three Phase-2 deferred checkboxes at lines 1282 / 1287 / 1288 + to `[x]` with a one-line reference to the Plan 7b commit. 
+- [ ] `hub-client/changelog.md` entry: one line under the + Phase-4 commit date noting "Plan 7's incremental-write + round-trip is now under e2e coverage (sectionized docs, + shortcode preservation, soft-drop diagnostics, echo + prevention)." + +## Notes + +This is the "boring but important" plan. Nothing here is research; +everything is a follow-through. The temptation is to skip it +because Plan 7 already works. Don't — the deferrals were honest +scope calls, not impossibility claims, and leaving them open +means the round-trip contract has only Rust-unit and one +JS-wrapper test pinning it. A regression in Plan 7a (runtime +idempotence detection) or Plan 9 (ValueSource threading) could +silently break the e2e behaviour and tests would still pass. + +Running 7b before 7a means Plan 7a starts with a fully-pinned +baseline and any new failures in 7a's CI runs point clearly at +7a's changes, not at latent gaps. diff --git a/claude-notes/plans/2026-05-25-coarsened-entry-self-contained.md b/claude-notes/plans/2026-05-25-coarsened-entry-self-contained.md new file mode 100644 index 000000000..25fdc373c --- /dev/null +++ b/claude-notes/plans/2026-05-25-coarsened-entry-self-contained.md @@ -0,0 +1,384 @@ +# Plan — Make `CoarsenedEntry::Rewrite` self-contained + +**Status:** Drafted 2026-05-25. +**Branch:** `feature/provenance`. +**Trigger:** Panic discovered 2026-05-25 during the q2-preview gate-bypass +UX experiment (see §History below). Index-out-of-bounds in +`emit_entries` when a `Rewrite` entry produced inside the Transparent +recursion (added in commit `bdcfdc53`) carried a child-relative +`new_idx` but was looked up against the top-level `new_ast.blocks`. + +## Goal + +Lift the existing implicit invariant — *every `CoarsenedEntry` variant +must be self-contained (carry its own emit-time bytes)* — to be an +explicit, enforced architectural rule. Today four of the five variants +already satisfy this: + +| Variant | Self-contained? 
| How | +|---|---|---| +| `Verbatim` | ✓ | `byte_range` into `original_qmd` | +| `InlineSplice` | ✓ | pre-computed `block_text: String` | +| `Transparent` | ✓ | list of self-contained child entries | +| `Omit` | ✓ | emits nothing | +| `Rewrite` | ✗ | `new_idx: usize` — a deferred index into `new_ast.blocks` | + +`Rewrite` is the outlier. Make it match its siblings by carrying +pre-computed `block_text: String`. The qmd writer call moves from +emit time (`assemble`) to coarsen time (`coarsen`); the work is the +same, the timing changes; the entry becomes self-describing. + +Behaviour does not change. Tests stay green. The Transparent-recursion +panic disappears. + +## History — why was `Rewrite` written context-dependently? + +`git log -S CoarsenedEntry -- crates/pampa/src/writers/incremental.rs` +(top of file, latest 4 entries): + +1. **`eb81cbc5`** ("Add incremental QMD writer with idempotence and + round-trip tests") — original commit. `CoarsenedEntry` had **two** + variants: + ```rust + enum CoarsenedEntry { + Verbatim { byte_range: Range<usize>, orig_idx: usize }, + Rewrite { new_idx: usize }, + } + ``` + The writer was top-level only. Every entry corresponded directly + to one top-level block. `Verbatim` carried its own bytes; `Rewrite` + deferred to `assemble`-time via an index into `new_ast.blocks` — + correct because indices were unambiguous and the deferral saved + a `write_block_to_string` call when the entry was never emitted + (defensive). Behaviour invariant: `new_idx` is always a *top-level* + index. Honoured by construction at this point. + +2. **`ab10f37b`** ("Implement inline splicing for incremental writer + (bd-1hwd)") — added `InlineSplice` variant for partial block + rewrites: + ```rust + InlineSplice { block_text: String, orig_idx: usize } + ``` + Inline splicing builds *bespoke* block text by mixing original + bytes with newly-serialized inlines. 
There's no `new_idx` that + would reconstruct it — the text is necessarily pre-computed at + coarsen time. This was the first variant to break the "defer to + emit time" pattern, **out of necessity**, but no one refactored + `Rewrite` to match. The asymmetry was introduced silently. + +3. **`9a473fe9`** ("plan-7 phase 2+3a: writer internals — soft-drop, + Transparent/Omit, multi-inline dedupe") — Plan 7 added the + `Transparent` and `Omit` variants. `Transparent { child_entries }` + allows recursive emission for non-atomic Generated wrappers + (sectionize, footnotes, appendix). `orig_idx` became `Option<usize>` + so children inside `Transparent` could opt out of the + `compute_separator` original-gap optimization. The commit + **explicitly flagged** the latent Rewrite issue: + + > // result_idx is unused for child Rewrites (a child Rewrite + > // would need a different lookup mechanism; not exercised by + > // today's synthesizers). + + Accurate at the time — coarsen_keep_before_block was the only + producer of child entries (under static Transparent recursion + for unchanged wrappers), and its catch-all hit Rewrite only on + cross-file Original / gappy Concat / Generated-without-source-bearing-children + shapes that the pipeline didn't produce in practice. + +4. **`bdcfdc53`** ("recurse into non-atomic Generated wrappers in + RecurseIntoContainer") — *this PR's* fix from earlier today. + Added a Transparent-recursion path in `coarsen_blocks` for the + *changed-wrapper* case (RecurseIntoContainer with a + `block_container_plans` entry). For the first time, **`coarsen_blocks` + runs on child slices**, and any `Rewrite` it produces carries a + child-relative index. The "not exercised by today's synthesizers" + caveat from `9a473fe9` no longer holds. + +The takeaway: `Rewrite`'s context-dependent design was a vestige of +the original Phase-1 top-level-only writer. 
It survived because every +expansion since (`InlineSplice`, then `Transparent`) sidestepped it +rather than refactoring. Today's panic is the bill coming due. + +## Behavioural equivalence — coarsen-time vs emit-time + +**Question:** does pre-computing `block_text` at coarsen time produce +byte-identical output to deferred emit-time computation? + +**Answer:** yes. `write_block_to_string` +(`crates/pampa/src/writers/incremental.rs:1089`) is a pure function of +its `Block` argument: + +```rust +fn write_block_to_string(block: &Block) -> Result { + let mut buf = Vec::new(); + qmd::write_single_block(block, &mut buf)?; + String::from_utf8(buf).map_err(…) +} +``` + +`qmd::write_single_block` (`writers/qmd.rs:2392`) constructs a fresh +`QmdWriterContext::new()` per call. The context's mutable fields +(`emphasis_stack`, `prev_emitted_alnum`) accumulate state only +**within** a single `write_single_block` invocation — they're created, +used, and dropped per call. No state leaks across calls. + +There is no global state in `crates/pampa/src/writers/qmd.rs` (verified +by `git grep 'static\|thread_local' crates/pampa/src/writers/qmd.rs` +returning empty). No file I/O, no environment reads, no system clock. +The function depends only on the input `Block`. + +Therefore: `write_block_to_string(b)` is referentially transparent. +Calling it at coarsen time vs emit time produces identical output. + +Performance: `Rewrite` is the catch-all path — when we get an entry we +*always* emit. No coarsened plan keeps Rewrite entries it doesn't use +(emit_entries walks every non-Omit entry). The qmd-write work is +performed exactly once either way; only its timing changes. No extra +allocations. + +## Consumers — confirming the scope + +`CoarsenedEntry` is private to `crates/pampa/src/writers/incremental.rs` +(lowercase `enum`, no `pub`). Two internal consumers: + +1. `assemble`'s `emit_entries` — concatenates bytes per entry. +2. 
`compute_edits_from_coarsened` — currently calls `assemble` + internally and returns a single full-document edit. + +No external consumers. The refactor is fully local to one file. + +Future consumers (Phase 3 minimal-edit diffing, Plan-X-WIP) will benefit +from the self-contained invariant: every entry carries its own *intended +text* and (where applicable) its *intended source range*, which is the +right shape to derive minimal Monaco edits without re-deriving a +post-assemble diff. Mentioned in §Out-of-scope but worth noting as +direction-of-travel. + +## Work items + +### Phase 1 — Pin the panic with a failing test + +- [x] Add `sectionize_wrapper_with_shortcode_child_edit_does_not_panic` + to `crates/pampa/tests/incremental_writer_tests.rs`. The current + draft (commit `5f2bbab0`'s working tree) reaches the panic via + a cross-file Original child shape; alternative is a synthesized + empty section Div or a Lua-filter-emitted Generated wrapper + with no source-bearing children. Either reproduces the + `Rewrite { new_idx: child_idx }` → `new_ast.blocks[child_idx]` + out-of-bounds. +- [x] Run; confirm the test panics with "index out of bounds" on + `incremental.rs:890` (the `Rewrite` arm of `emit_entries`). +- [x] Added `sectionize_wrapper_shortcode_child_edit_soft_drops` — + goes further than no-panic by asserting on output bytes + + Q-3-43 warning. This caught a *second* bug Phase 1's no-panic + test would have hidden: the `UseAfter` arm fell through to + let-user-win Rewrite for atomic-Generated with preimage, + writing the resolved bytes (the edit applied to generated + content) back into the source qmd. The architectural Rewrite + fix made this newly visible by replacing the panic with silent + wrong-bytes; see Phase 2 below for the additional soft-drop + branch that closes the gap. + +### Phase 2 — Lift `Rewrite` to self-contained + +- [x] Change the variant to carry pre-computed `block_text: String`. + Drop the `new_idx: usize` field. 
+- [x] Update every `Rewrite` producer to pre-compute (four sites: + `coarsen_blocks` UseAfter, two RecurseIntoContainer sub-branches, + and `coarsen_keep_before_block`'s catch-all). +- [x] Convert `coarsen_keep_before_block` to + `→ Result>`. Both call + sites updated to `?`. +- [x] Update `emit_entries` to `block_text.clone()`. `new_ast` is now + unused for byte production in any variant (kept in signature for + now; removal is a tidying follow-up). +- [x] Delete the "result_idx is unused for child Rewrites" comment. + +#### Phase 2b — Soft-drop for atomic-Generated in UseAfter (scope expansion) + +Discovered during Phase 3 verification: the user reported that with +the dispatch.tsx bypass in place, clicking +react on a paragraph +inside `{{< lipsum 3 >}}` produced wrong qmd output — the resolved +lipsum bytes + reactji were being written back into source. The +architectural Rewrite refactor made this newly observable by +replacing the panic with silent wrong-bytes. + +Root cause: when the user edits inside an atomic-Generated block +with realistic content delta, the reconciler can emit +`KeepBefore` (Header) + `UseAfter` (new lipsum) at the +sectionize-child level — implicit deletion of the original lipsum +Para. The `UseAfter` arm filtered atomic-CustomNode and +no-preimage-Generated but had no branch for atomic-Generated-with- +preimage, so it fell through to let-user-win Rewrite (write the +new bytes). + +- [x] Add an `atomic_generated_preimage` check at the head of the + `UseAfter` arm in `coarsen_blocks`. If the new block is + `Generated` with `is_atomic_kind() == true` AND has preimage + in target → emit `Verbatim` of the preimage range + a + Q-3-43 soft-drop warning. The pattern: when an entry's *new* + block looks like an attempt to edit content the user can't + actually edit, refuse the edit at the writer regardless of + what the reconciler's alignment said. 
+- [x] Test: `sectionize_wrapper_shortcode_child_edit_soft_drops` — + asserts on output bytes (token preserved, reactji NOT + emitted) and the Q-3-43 warning. + +### Phase 3 — Tests + verification + +- [x] Re-run the Phase 1 test; passes (Ok, no panic). +- [x] `cargo nextest run -p pampa` — 3902 / 3902 passing + (one new soft-drop test added). +- [x] `cargo xtask verify --skip-hub-build --skip-hub-tests` — Rust + workspace 9655 / 9655 passing. (The + `ts-packages/preview-renderer` integration tests fail under the + bypass; expected — they assert the atomic-aware NOOP gate + fires, which the bypass disables. They pass once the bypass + is reverted.) +- [x] Rebuild WASM (`hub-client && npm run build:wasm`) — exit 0. +- [ ] Playwright e2e `q2-preview-render-components-write` — *blocked + by a dev server holding port 5173 in this worktree; deferred, + see "scaffolding cleanup" task*. +- [x] Manual: user confirmed the no-panic + soft-drop behavior in + their local browser session after rebuilding. Initial report + flagged wrong-bytes (resolved lipsum text in qmd), which led + to discovering Phase 2b. After Phase 2b lands, the + regression test `sectionize_wrapper_shortcode_child_edit_soft_drops` + asserts: token bytes preserved, reactji NOT emitted, Q-3-43 + warning fires. +- [ ] Restore the dispatch.tsx gate before this plan's commits ship + (it was a one-shot UX experiment; the proper TS-side intercept + signal is separate work — see §Out of scope). + +### Phase 4 — Design doc + +- [x] Write `claude-notes/designs/incremental-writer-internals.md` + (new file). **Note (2026-05-26):** the file was later absorbed + into `incremental-writer-contract.md` (Task 3 of the rebase + reconciliation) and deleted — the self-containment material + now lives in the contract doc's "`CoarsenedEntry` self- + containment" section. Sections originally: + - *Purpose*. The incremental writer takes `(original_qmd, + original_ast, new_ast, plan)` and produces `(new_qmd, + warnings)`. 
It does so by *coarsening* the hierarchical + reconciliation plan into a flat list of self-contained + emit instructions, then *assembling* the result by walking + the instructions in order. + - *The `CoarsenedEntry` contract* — the rule this plan + enforces. Every variant carries enough information to + produce its emit bytes *without further context*. No + index-into-an-ambient-slice deferral. Each variant + documented with its payload and self-containment property. + - *Why this matters* — the panic story, the Transparent + recursion composition story, the minimal-edit-diffing + future story. + - *Anti-patterns* — "don't add a variant that defers to a + named slice"; "don't add a variant that depends on context + not encoded in the variant itself"; "if you need timing of + side effects, that's a sign the entry shape is wrong." + - *History* — pointer to this plan; pointer to the historical + commits (`eb81cbc5`, `ab10f37b`, `9a473fe9`, `bdcfdc53`). + - *Promotion path* — same shape as + `transparent-wrappers.md`'s "where the code lives + when + to promote it" — `CoarsenedEntry` is private today; if a + second crate ever wants to consume the coarsened plan + (e.g. minimal-edit-diffing in a separate crate), promote + the type and its emission helpers to `quarto-pandoc-types` + or a new module. +- [x] Cross-link from `transparent-wrappers.md` §"Reference + primitive" — added a "Sibling primitive on the emission side" + preamble that points to the new doc. +- [x] Cross-link from `provenance-contract.md` §7 "Atomic-kind set + and consumer impact" — added a closing paragraph pointing to + the new doc as the place where the writer's internal shape is + pinned. + +### Phase 5 — Plan annotations + +Plans whose work would build on the self-contained invariant: + +- [x] `claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md` + — added a "Follow-ups closed" section pointing here. 
+- [x] `claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md` + — its Phase 1 writer-lossless fixtures should include at least + one shape where the writer's catch-all Rewrite path fires + (cross-file Original child, or empty Generated wrapper). Already + flagged from the sectionize-wrapper audit; this plan supplies + the structural reason such fixtures matter. + +## Out of scope + +- The TS-side gate's silent NOOP (lipsum-paragraph clicks produce no + user feedback today). Separate plan; the temporary + `dispatch.tsx` bypass exists only to surface the writer-side + diagnostic UX once and must be reverted as part of Phase 3. +- The proper TS-side "edit rejected at the gate" signal — needs + its own design (synthetic diagnostic shape, framework emit + callback, location resolution via the source pool). Tracked + separately. +- Removing `new_ast: &Pandoc` from the `emit_entries` signature. + Once Rewrite no longer reads it, the parameter might be fully + removable (audit the other arms). Defer to a tidying commit + unrelated to this plan's correctness work. +- Eventual minimal-edit diffing from `CoarsenedEntry` directly + (rather than `assemble` + post-diff). The self-contained + invariant is a precondition; the actual diff-emitting work is + its own plan. + +## Risk assessment + +**Low risk overall.** Three reasons: + +1. **No behaviour change.** `write_block_to_string` is referentially + transparent (§Behavioural equivalence). The refactor moves a + pure-function call earlier in the pipeline; emit bytes are + byte-identical. +2. **Fully local.** `CoarsenedEntry` is private to one file; two + internal consumers; no FFI; no wire format. +3. **Mirrors an existing precedent.** `InlineSplice` already carries + pre-computed `block_text`. The new `Rewrite` is structurally + identical. 
+ +Risks worth naming: + +- **Tests pass but production hits a path we missed.** Mitigation: + the Plan-7b §"writer-lossless baseline" call-out for adding a + catch-all Rewrite fixture; verify with the e2e + manual browser + repro before committing. +- **Coarsen-time errors surface differently.** Before: `write_block_to_string` + errored at emit time, propagated up through `assemble` to + `incremental_write`. After: errors propagate from `coarsen_blocks` + (via the `?` in producer sites) — same overall return path + (`Result<_, Vec>`), but the *order* of error + vs. soft-drop-warning emission could shift. Verify the existing + error tests still produce the same diagnostic ordering. +- **Increased coarsen-time allocations.** Each Rewrite producer now + allocates a `String` immediately. Negligible at typical document + sizes; flagged for awareness rather than as a real concern. + +## Estimated scope + +| Phase | Lines (rough) | +|-------|---------------| +| 1 — pin panic with failing test | ~80 | +| 2 — Rewrite self-contained refactor | ~60 net change (delete + add) | +| 3 — verification (test runs, e2e) | 0 LOC (verification only) | +| 4 — design doc | ~200 | +| 5 — plan annotations | ~30 | +| **Total** | **~370** | + +## References + +- This plan's panic: `2026-05-25` session transcript; stack trace shows + `Rewrite { new_idx: 8 }` against `new_ast.blocks.len() == 1`. +- Plan 7's original `CoarsenedEntry` design: + `claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md` + §"Coarsen step". +- Plan 7c's transparent-wrapper fix: + `claude-notes/plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md` + §Phase 8. +- The "not exercised by today's synthesizers" landmine comment: + `crates/pampa/src/writers/incremental.rs` around line ~640 (after + the `coarsen_keep_before_block` Transparent recursion). +- Existing precedent for pre-computed text: + `CoarsenedEntry::InlineSplice` (introduced in commit `ab10f37b`). 
diff --git a/claude-notes/plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md b/claude-notes/plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md new file mode 100644 index 000000000..d80a59ea6 --- /dev/null +++ b/claude-notes/plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md @@ -0,0 +1,1522 @@ +# Plan 7c — Plan 7 closure gaps (Q-3-41, TS editability gate, per-kind tests) + +**Date:** 2026-05-25 +**Branch:** `feature/provenance` (or fresh worktree branched from it). +The contract docs the plan references — `provenance-contract.md` and +`incremental-writer-contract.md` — currently live on +`review/provenance-plan-7` and merge into `feature/provenance` as +part of the review-pass merge that is the same prerequisite for this +plan. +**Status:** Implementation plan +**Milestone:** none directly — closes correctness/coverage gaps in +the writer surface Plan 7 already shipped. + +## Epic context + +Part of the **provenance epic** (Plans 3–8 + 7a + 7b + this). When +the Plan-7 implementation agent ran on 2026-05-24, the post-review +Plan-7 doc had not yet been merged into `feature/provenance`. Four +correctness/coverage gaps survived as a result. Plan 7c closes them: + +1. **Q-3-41 "Edit dropped — render not ready yet"** — the + first-edit-before-render diagnostic the review pass introduced. + Neither the catalog entry nor the React/SPA emission landed. +2. **TS-side `hasPreimageIn` + `isEditableInside`** — the predicate + pair that closes Plan 2A's React framework gate. The Rust side + has the canonical version (`pampa::writers::incremental::is_editable_inside_*`); + the TS side at `ts-packages/preview-renderer/src/utils/sourceInfo.ts` + only exports the atomicity half. +3. **`cfg(debug_assertions)` `#[should_panic]` test** for the + shortcode-Generated-with-empty-`from` debug-assert at + `crates/pampa/src/writers/incremental.rs:448`. +4. 
**Per-kind soft-drop test symmetry** — explicit tests for each + atomic kind (filter / title-block / tree-sitter-postprocess) on + the Omit and inline UseAfter paths; the multi-inline dedupe + filter case. + +Plan 7b (`claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md`) +already covers two adjacent test gaps — the writer-lossless baseline +test and the filter-construction-UseAfter test. Plan 7c is the +*disjoint* gap; do not duplicate Plan 7b's items here. + +## Hand-off start point + +1. Worktree: `feature/provenance` at + `/Users/gordon/src/q2/.worktrees/provenance/` (the integration + branch). `cargo xtask verify` is green there at the current tip; + confirm before starting. +2. The review-pass commits that introduced the missing design — `00222099`, + `bfb40962`, `561eefa0`, plus the cross-link commit `7c03be64` — + live on `review/provenance-plan-7`. Either merge that branch + into `feature/provenance` before starting (preferred — gives the + contract docs to consult) or work from the audit summary + below. +3. The audit that produced this plan: see the conversation transcript + on 2026-05-25 (Claude session resolving the rebase of + `review/provenance-plan-7` onto `feature/provenance`). +4. **Phase order matters.** Do Phase 1 (catalog) first so Phases 2 + and 3 can reference `Q-3-41`. Phase 4 (TS gate) is independent + of Phase 1 in code but conceptually pairs with Phase 3 (Q-3-41 + is the visible signal for the gate's "no baseline yet" branch). +5. Don't push without explicit user permission. + +## Goal + +Bring Plan 7's user-visible surface back into alignment with the +post-review contract, and close two correctness/UX issues that the +post-implementation code review surfaced: + +- The user always sees *some* signal when an edit is dropped — + Q-3-42 for atomic-content edits, Q-3-43 for no-preimage edits, + Q-3-41 for first-edit-before-render. No silent drops. 
+- The React framework's read-only gate matches the writer's + editability predicate, so edits that the writer would soft-drop + are gated at the DOM rather than reverting after a round-trip. +- The writer's debug-assert + each atomic kind's soft-drop path + has explicit regression coverage. +- **Q-3-43's diagnostic body actually names what was dropped** — + include path, metadata key, or container kind — instead of three + emission sites sharing one generic message. (Code-review item, + not part of the original closure audit; see Phase 6.) +- **Inline-level soft-drop looks up the original by the + reconciler's index**, not by the result-side positional proxy + that today's code uses. Today's proxy is exact for in-place + retypings (the shortcode case the tests cover) but misfires on + any inline insert/delete before the soft-drop site. (Code-review + item; see Phase 7.) +- **`target_file_id` derivation walks past synthesized first + blocks** instead of falling back to `FileId(0)` on a title-block- + first document. Dormant bug today (single-file fixtures + happen to land on `FileId(0)`); pre-empts Plan 8's multi-file + story. (Code-review item; see Phase 8.) + +Behaviour outside these items is unchanged. The code-review phases +tighten the writer's existing contract — they don't add new +contract surface, new diagnostic semantics, or a new pipeline tier. + +## Scope + +### In scope + +#### Phase 1 — `Q-3-41` catalog entry (`quarto-error-reporting`) + +**Repo facts the implementer needs:** + +- Catalog file: `crates/quarto-error-reporting/error_catalog.json`. + Q-3-42 / Q-3-43 entries at lines 527–541 are the shape to mirror + (`subsystem`, `title`, `message_template`, `docs_url`, + `since_version`). +- Subsystem for writer-side codes is `"writer"`. `since_version` + is `"99.9.9"` for unreleased entries. +- Q-3-41 is unallocated today (Q-3-40 is taken; Q-3-42/Q-3-43 are + the Plan-7 codes). Slot Q-3-41 between them. 
+- Q-3-41 is **TS-emitted** — there is no Rust caller (the writer + isn't invoked when the baseline is missing). No diagnostic + builder needed on the Rust side. The catalog entry exists so the + docs URL and version metadata are consistent. + +- [ ] Add Q-3-41 entry to `error_catalog.json` between Q-3-40 and + Q-3-42. Title: `"Edit dropped — render not ready yet"`. + `message_template`: `"Your edit was dropped because the + document hasn't finished rendering. Try again in a moment."` + `docs_url`: `"https://quarto.org/docs/errors/Q-3-41"` (same + shape as the Q-3-42 entry); `since_version`: `"99.9.9"`. +- [ ] Build: `cargo xtask verify --skip-hub-build --skip-hub-tests` + green (the catalog has a unit test that asserts every entry + parses). + +#### Phase 2 — TS-side `hasPreimageIn` + `isEditableInside` + +**Repo facts the implementer needs:** + +- Target file: `ts-packages/preview-renderer/src/utils/sourceInfo.ts` + (59 lines today; will roughly double). +- Wire-format types: `ts-packages/preview-renderer/src/types/sourceInfo.ts` + documents codes 0/1/2/3/4. Walk pattern: `entryFor(node, pool)` + for the entry; `entry.t` discriminates. +- Rust reference: `crates/pampa/src/writers/incremental.rs:113-162` + (`is_editable_inside_block` / `_inline` / `_source_info`) + + `crates/quarto-source-map/src/source_info.rs:406-442` + (`preimage_in`). +- Anchor roles on the wire: `Generated` entries (code 4) carry + `from?: AnchorRef[]` where `role: "invocation" | "value-source" + | "other:<…>"`. Walk only `role === "invocation"`. +- `targetFileId` derivation, Rust-side: `original_ast.blocks.first() + .and_then(|b| b.source_info().root_file_id()).unwrap_or(FileId(0))` + (`incremental.rs:289-293`). On the TS side, look up the first + block's `s`-index in the pool, walk to its root Original, take + its `d` (file id). Default to `0` if absent. +- React context to extend: `ts-packages/preview-renderer/src/framework/RegistryContext.tsx`. + Add an optional `targetFileId?: number`. 
Default `0` when absent + (mirrors the Rust default, and covers callers that don't pass + the field yet). +- React dispatcher gate to update: + `ts-packages/preview-renderer/src/framework/dispatch.tsx:404-411`. + Replace the `isAtomic` check with `!isEditableInside(...)`. +- The Ast provider that builds the context value: + `ts-packages/preview-renderer/src/framework/Ast.tsx:121`. + Compute `targetFileId` once and pass it alongside `sourceInfoPool`. + +**Implementation sketch:** + +```ts +// In ts-packages/preview-renderer/src/utils/sourceInfo.ts + +/** Walk an entry's preimage chain in the pool; return [start, end] + * if the chain resolves to bytes in `targetFileId`, else undefined. + * Mirrors Rust `SourceInfo::preimage_in`. */ +export function hasPreimageIn( + node: { s?: number }, + pool: SourceInfoPool | undefined, + targetFileId: number, +): [number, number] | undefined { + const entry = entryFor(node, pool); + if (!entry) return undefined; + return preimageInEntry(entry, pool, targetFileId); +} + +function preimageInEntry( + entry: SourceInfoEntry, + pool: SourceInfoPool | undefined, + targetFileId: number, +): [number, number] | undefined { + if (entry.t === 0) { + return entry.d === targetFileId ? entry.r : undefined; + } + if (entry.t === 1) { + const parent = pool?.[entry.d]; + if (!parent) return undefined; + const parentRange = preimageInEntry(parent, pool, targetFileId); + if (!parentRange) return undefined; + return [parentRange[0] + entry.r[0], parentRange[0] + entry.r[1]]; + } + if (entry.t === 2) { + // Concat: every piece must resolve in target AND be byte-contiguous. 
+ const ranges: Array<[number, number]> = []; + for (const [si_id, _offset, _len] of entry.d) { + const piece = pool?.[si_id]; + if (!piece) return undefined; + const r = preimageInEntry(piece, pool, targetFileId); + if (!r) return undefined; + ranges.push(r); + } + if (ranges.length === 0) return undefined; + for (let i = 1; i < ranges.length; i++) { + if (ranges[i - 1][1] !== ranges[i][0]) return undefined; + } + return [ranges[0][0], ranges[ranges.length - 1][1]]; + } + if (entry.t === 4) { + // Generated: walk the Invocation anchor only. + const inv = entry.d.from?.find((a) => a.role === 'invocation'); + if (!inv) return undefined; + const anchored = pool?.[inv.si_id]; + if (!anchored) return undefined; + return preimageInEntry(anchored, pool, targetFileId); + } + // t === 3 (legacy) and any future codes — not consulted. + return undefined; +} + +/** Combined editability gate. Mirrors Rust + * `pampa::writers::incremental::is_editable_inside_*`. */ +export function isEditableInside( + node: { s?: number; t?: string; type_name?: string }, + pool: SourceInfoPool | undefined, + targetFileId: number, + atomicKinds: ReadonlySet<string>, +): boolean { + // Atomic CustomNodes — never editable inside. + const isCustom = node.t === 'CustomBlock' || node.t === 'CustomInline'; + if (isCustom && isAtomicCustomNode(node.type_name ?? '')) return false; + // Atomic-kind Generated — never editable inside. + if (isAtomicSourceInfo(node, pool, atomicKinds)) return false; + // No preimage in target — never editable inside. + return hasPreimageIn(node, pool, targetFileId) !== undefined; +} +``` + +- [ ] Implement `hasPreimageIn` per the sketch above. Export from + `sourceInfo.ts`. +- [ ] Implement `isEditableInside`. Place the + `isAtomicCustomNode` import alongside the existing + `entryFor` / `isAtomicSourceInfo` imports + (`ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts`). 
+- [ ] Add unit tests for `hasPreimageIn` mirroring the Rust ones + at `crates/quarto-source-map/src/source_info.rs:1614-1750`: + Original same / different file; Substring composes offsets; + Concat contiguous / gappy / empty; Generated with Invocation + / with ValueSource only / no anchors. New test file: + `ts-packages/preview-renderer/src/utils/sourceInfo.test.ts`. +- [ ] Add unit tests for `isEditableInside` covering the three + uneditable reasons (atomic CustomNode, atomic-kind Generated, + no-preimage Generated) plus positive cases. +- [ ] Extend `RegistryContext` to carry optional `targetFileId?: number` + with default `0` in the empty-registry initial value. +- [ ] In `Ast.tsx`, compute `targetFileId` from the pool's first + block (walk to root Original; default `0`) and pass it + through the provider value. +- [ ] Update `framework/dispatch.tsx:404-411`'s `Node` gate: + replace the `isAtomic` check with + `!isEditableInside(node, sourceInfoPool, targetFileId, ATOMIC_KINDS)`. + Keep `NOOP_SET_LOCAL_AST` as the substituted callback. + +**Cross-language parity test — keeping TS in sync with Rust.** + +Hand-mirrored unit tests catch most desync the day the desync +happens, but they rely on a contributor noticing that "I changed +the Rust walker; I should update the TS one too." That discipline +fails the first time someone forgets. We need a structural check. + +The mechanism: a corpus of `(SourceInfoPool, node_s, target_file_id, +expected_preimage_or_null)` cases that's **generated from Rust** and +**consumed from TS**. Rust is the source of truth; if the Rust +`preimage_in` semantics change, the corpus regenerates; the TS test +runs against the new corpus and fails until the TS walker is +updated to match. 
+ +Corpus shape (single JSON file, committed): + +```json +{ + "schema_version": 1, + "generated_from": "crates/quarto-source-map/src/source_info.rs", + "cases": [ + { + "name": "original_same_file", + "pool": [ /* SourceInfoEntry wire-format entries, code 0/1/2/4 */ ], + "node_s": 0, + "target_file_id": 0, + "expected": [10, 25] + }, + { + "name": "generated_with_value_source_only_no_invocation", + "pool": [ ... ], + "node_s": 2, + "target_file_id": 0, + "expected": null + } + ] +} +``` + +Location: `crates/quarto-source-map/test-fixtures/preimage-parity/cases.json`. +Lives with the producer of truth (the Rust walker), consumed by +the verifier (the TS walker). The TS test reads the file via +Vite's `import.meta.glob` or a path-relative fetch in test +config. + +**Rust side — generator + freshness gate.** + +Rust generates the fixture from a hand-written enumeration of +cases that mirror the existing `preimage_in` unit tests at +`crates/quarto-source-map/src/source_info.rs:1614-1750`. The +generator runs as a Rust integration test: + +```rust +// crates/quarto-source-map/tests/preimage_parity_fixture.rs +// +// Generates the cross-language parity corpus consumed by +// ts-packages/preview-renderer/src/utils/sourceInfo.parity.test.ts. +// Run with `cargo nextest run -p quarto-source-map preimage_parity`. +// Fails if `cases.json` is stale relative to the in-code corpus — +// re-run with `QUARTO_BLESS_PREIMAGE_PARITY=1` to regenerate. 
+ +#[test] +fn preimage_parity_fixture_is_up_to_date() { + let cases = build_corpus(); // hand-written enumeration + let expected = serialize_corpus(&cases); + let path = "test-fixtures/preimage-parity/cases.json"; + if std::env::var("QUARTO_BLESS_PREIMAGE_PARITY").is_ok() { + std::fs::write(path, &expected).unwrap(); + return; + } + let actual = std::fs::read_to_string(path).unwrap_or_default(); + assert_eq!( + actual.trim(), + expected.trim(), + "preimage parity fixture is stale; rerun with \ + QUARTO_BLESS_PREIMAGE_PARITY=1 to regenerate" + ); +} +``` + +The corpus enumeration covers, at minimum: + +- `Original` in target file (positive) +- `Original` in non-target file (None) +- `Substring` composing offsets through a parent in target +- `Substring` rooted outside target (None) +- `Concat` of contiguous pieces in target (positive) +- `Concat` with a gap (None) +- `Concat` empty (None) +- `Generated` with `Invocation` anchor resolving in target (positive) +- `Generated` with `Invocation` anchor in non-target (None) +- `Generated` with only `ValueSource` anchor (None — role-asymmetry) +- `Generated` with only `Other("…")` anchor (None — forward-compat) +- `Generated` with empty `from[]` (None) +- Nested cases: `Substring` of a `Generated`'s Invocation; + `Generated` whose Invocation is itself a `Substring`. + +Every shape `preimage_in` matches on must appear at least once; +every "None" reason must appear at least once. The +`role-asymmetry` cases are load-bearing — they're the contract +that Plans 9/10 inherit. 
+ +**TS side — consumer test.** + +```ts +// ts-packages/preview-renderer/src/utils/sourceInfo.parity.test.ts +import cases from + '../../../../crates/quarto-source-map/test-fixtures/preimage-parity/cases.json'; +import { hasPreimageIn } from './sourceInfo'; + +describe('preimage parity with Rust', () => { + for (const c of cases.cases) { + test(c.name, () => { + const node = { s: c.node_s }; + const actual = hasPreimageIn(node, c.pool, c.target_file_id); + expect(actual ?? null).toEqual(c.expected); + }); + } +}); +``` + +The test relies on the TS wire-format types +(`ts-packages/preview-renderer/src/types/sourceInfo.ts`) deserializing +the corpus `pool` entries directly — that is the same wire format +the runtime consumes, so if the corpus deserializes, the runtime +contract holds. + +**Atomic-kinds parity (belt-and-suspenders).** + +Separately from the walker corpus, a small text-level check +keeps the atomicity sets in sync. Add a Rust integration test +that generates a JSON file listing the `is_atomic_kind` kinds: + +```rust +// crates/quarto-source-map/tests/atomic_kinds_fixture.rs +#[test] +fn atomic_kinds_fixture_is_up_to_date() { + let kinds = ["filter", "shortcode", "title-block", + "tree-sitter-postprocess"]; + // (in-code enumeration is the source of truth; assert + // every kind here is_atomic_kind-true and no other kind + // we synthesize is true) + for k in kinds { assert!(By::raw(k, json!(null)).is_atomic_kind()); } + // ... write to test-fixtures/atomic-kinds.json with bless flag ... +} +``` + +And a TS test that asserts `ATOMIC_KINDS` equals the fixture's +set. Same bless-flag freshness gate, same desync-loud failure. + +**Implementation steps for the parity work.** + +- [ ] Create `crates/quarto-source-map/tests/preimage_parity_fixture.rs` + with the corpus builder per the sketch above. Enumerate the + cases listed in §"corpus enumeration." 
+- [ ] Run with `QUARTO_BLESS_PREIMAGE_PARITY=1` to generate + `crates/quarto-source-map/test-fixtures/preimage-parity/cases.json`. + Commit the fixture. +- [ ] Create `ts-packages/preview-renderer/src/utils/sourceInfo.parity.test.ts` + per the sketch. Configure the test runner to find the + `cases.json` path (relative import works under Vitest's + default config; confirm `npm run test:ci` picks it up). +- [ ] Create the atomic-kinds parity fixture + Rust generator + + TS consumer test. The TS consumer test imports + `ATOMIC_KINDS` from `utils/sourceInfo.ts` and asserts + set-equality with the fixture. +- [ ] Document the bless flag in `crates/quarto-source-map/README.md` + (create if missing): a single paragraph on when to bless + the fixtures (any Rust-side change that affects + `preimage_in`'s behaviour or the atomic-kinds enumeration). +- [ ] CI: `cargo nextest run` already runs the freshness gate; + no CI changes needed. The TS parity test runs under + `npm run test:ci`, which is already in `cargo xtask verify`. + +**Why the freshness gate matters.** + +Without the gate, a Rust-side change (say, adding `By::callout()` +to `is_atomic_kind`'s matches arm) would silently leave the TS +fixture stale, and the TS parity test would pass against the +stale fixture. The gate makes that change a Rust test failure — +loud, immediate, easy to fix by re-running with the bless flag. +The TS side then trips when the contributor regenerates the +fixture without updating `ATOMIC_KINDS` to match. Two-step +diagnosis, but both steps fail loudly. + +- [ ] `cd hub-client && npm run build:all` green (hits the + preview-renderer build via project references). +- [ ] `cd hub-client && npm run test:ci` green. + +#### Phase 3 — First-edit gates emit `Q-3-41` + +**Repo facts the implementer needs:** + +- ReactPreview no-baseline branch: + `hub-client/src/components/render/ReactPreview.tsx:444-446`. + Currently `console.warn` + bare `return`. 
+- SPA no-baseline branch: + `q2-preview-spa/src/PreviewApp.tsx:437-440`. Currently + `console.warn` + bare `return`. +- SPA already has a Q-3-42/Q-3-43 surface — `DiagnosticStrip` at + `q2-preview-spa/src/components/DiagnosticStrip.tsx` and the + `setWriteWarnings` state in `PreviewApp.tsx:392`. Push Q-3-41 + through the same channel. +- ReactPreview already drains write-back warnings into + `pendingWriteWarningsRef` (line 320) and flushes via + `onDiagnosticsChange` on the next render (line 361-366). Push + Q-3-41 into `pendingWriteWarningsRef.current` so it surfaces in + the existing diagnostics panel. Per the autosave-context + suppress-after-3 policy, the merging already de-dupes by source + range; Q-3-41 has no range so it'll just repeat — acceptable + for v1 because the user will keep retrying until the render + catches up. +- TS `Diagnostic` shape: + `ts-packages/preview-renderer/src/types/diagnostic.ts:28-49`. + Required fields: `kind: 'warning'`, `title`, `hints: string[]`, + `details: DiagnosticDetail[]` (can be empty). Optional: `code`, + `problem`, `start_line` / `start_column` / `end_line` / + `end_column` (omit — no source range), `rendered`. + +**Helper sketch** — shared between both call sites. Live in +`ts-packages/preview-runtime/src/firstEditDiagnostic.ts` (new file; +both ReactPreview and the SPA already import from this package): + +```ts +import type { Diagnostic } from '@quarto/preview-renderer/types/diagnostic'; + +/** Construct a Q-3-41 warning for the "edit before first render + * produced a baseline AST" case. Body text mirrors the catalog + * entry; the helper is the TS counterpart to a Rust + * `diagnostic_q3_41()` builder that doesn't exist (the writer is + * never called in this branch). */ +export function diagnosticQ3_41(): Diagnostic { + return { + kind: 'warning', + code: 'Q-3-41', + title: 'Edit dropped — render not ready yet', + problem: + "Your edit was dropped because the document hasn't " + + "finished rendering. 
Try again in a moment.", + hints: [], + details: [], + }; +} +``` + +- [ ] Create `ts-packages/preview-runtime/src/firstEditDiagnostic.ts` + with `diagnosticQ3_41()` per the sketch. Export from + `ts-packages/preview-runtime/src/index.ts`. +- [ ] Co-located unit test + `ts-packages/preview-runtime/src/firstEditDiagnostic.test.ts`: + assert `diagnosticQ3_41()` returns the expected shape (kind, + code, title, problem present). +- [ ] In `ReactPreview.tsx`'s `handleSetAst`, replace the + `console.warn` + return in the no-baseline branch + (`!baseline`) with: + `pendingWriteWarningsRef.current = [...pendingWriteWarningsRef.current, diagnosticQ3_41()];` + followed by the early return. Trigger a re-render so the + pending warnings flush — pass through `onDiagnosticsChange` + directly with the merged set rather than waiting for the + next render, since no qmd content change happens here. + (Implementation detail: store `pendingWriteWarningsRef` flush + logic in a small helper if duplicated from the post-render + drain.) +- [ ] In `PreviewApp.tsx`'s `handleSetAst`, replace the + `console.warn` + return in the `!path || !baselineJson` branch + with `setWriteWarnings((prev) => [...prev, diagnosticQ3_41()]);` + followed by the early return. +- [ ] In ReactPreview: assert the diagnostic still surfaces if the + user fixes the underlying issue (render eventually completes, + baseline becomes available, next edit succeeds — the Q-3-41 + from the dropped edit remains in the diagnostics panel until + the next successful render's drain clears it). Document this + in the call-site comment. +- [ ] Hub-client integration test (Vitest): mount ReactPreview + with `ast=''` (no baseline), call `handleSetAst({})`, assert + `onDiagnosticsChange` is called with a list containing + `code: 'Q-3-41'`. Place alongside the existing ReactPreview + tests; if there's no test file for ReactPreview yet, model + on `hub-client/src/services/incrementalWrite.wasm.test.ts`'s + structure. 
+- [ ] SPA integration test + (`q2-preview-spa/src/PreviewApp.integration.test.tsx`): + drive `handleSetAst` before the first successful render + completes; assert `DiagnosticStrip` renders a row with the + Q-3-41 title. +- [ ] `cd hub-client && npm run build:all && npm run test:ci` green. + +#### Phase 4 — Per-kind soft-drop test symmetry (Rust) + +**Repo facts the implementer needs:** + +- Existing test module at the bottom of + `crates/pampa/src/writers/incremental.rs` (search `#[cfg(test)]`). + Models to mirror: + - Omit on atomic-kind: `keep_before_with_atomic_kind_generated_no_anchor_emits_omit` + (line ~1590; uses `By::filter("upper.lua", 14)`). + - Inline UseAfter soft-drop: + `inline_use_after_on_atomic_generated_soft_drops_to_keep_before_with_q3_42` + (line ~2028; uses `By::shortcode("meta")`). + - Multi-inline dedupe positive: + `multi_inline_dedupe_emits_token_once_when_invocation_shared` + (line ~1909; shortcode case). +- The code paths in question (`coarsen_keep_before_block` for + Omit, `assemble_inline_content` for inline UseAfter) do not + branch on `by.kind` — they branch on `by.is_atomic_kind()`. New + per-kind tests exercise the same code, but a regression in + `is_atomic_kind`'s enumeration (e.g. dropping `"title-block"` + from the match) would be caught here whereas the generic test + alone wouldn't. + +**The block-Omit path is `is_atomic_kind`-driven, not kind-specific.** + +The block-level Omit branch (`coarsen_keep_before_block`) and the +inline soft-drop branch (`assemble_inline_content`) both consult +`by.is_atomic_kind()` — they don't pattern-match on kind. A +hand-written per-kind test exercises the same `matches!` arm +through a different constructor; the only regression it catches +is "someone dropped a kind from the `matches!` arm at +`source_info.rs:647`." That's a real but narrow failure mode. + +A single enumeration property test catches the same failure with +less scaffolding and stays correct as the atomic-kind set grows. 
+The hand-written inline-soft-drop pair is more justified — the +inline path has subtle wiring (diagnostic-location selection in +`diagnostic_q3_42_inline`, dedupe interaction with `Invocation` +equality) that isn't a function of kind alone. + +**Block-level: one property test, not three hand-written tests.** + +```rust +#[test] +fn every_atomic_kind_emits_omit_under_keep_before_with_empty_from() { + // Drives every kind in the documented atomic-kind set through + // coarsen and asserts the Omit verdict. New kinds added to + // `By::is_atomic_kind()` must be added here too; if a kind + // ever leaves the set without leaving this test, the test + // either fails (kind no longer atomic) or false-passes + // (regression) — the latter is caught by the corresponding + // freshness gate in Plan 7c Phase 2's atomic-kinds parity + // fixture. + let atomic_kinds: Vec<By> = vec![ + By::filter("upper.lua", 14), + // shortcode is excluded — its empty-`from` case trips + // the debug-assert (see Phase 5); the property below + // only enumerates kinds whose empty-`from` is "normal." 
+ By::title_block(), + By::tree_sitter_postprocess(), + ]; + for by in atomic_kinds { + assert!(by.is_atomic_kind(), "kind {:?} no longer atomic", by); + let block = para(vec![], SourceInfo::generated(by.clone())); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let entries = coarsen("", &ast, &ast, &plan, &mut warnings).unwrap(); + assert!( + matches!(entries[0], CoarsenedEntry::Omit), + "expected Omit for kind {:?}, got {:?}", by, entries[0], + ); + assert!(warnings.is_empty(), "KeepBefore branch should not warn"); + } +} +``` + +**Inline-level: keep the hand-written pair.** The inline path is +worth exercising once per kind because the diagnostic builder +and the soft-drop substitution have distinct behaviour beyond +the `is_atomic_kind()` gate. + +- [ ] Add `every_atomic_kind_emits_omit_under_keep_before_with_empty_from` + per the sketch above. ~30 LOC, replaces the three + block-Omit per-kind tests. +- [ ] Add `inline_use_after_on_filter_constructed_inline_soft_drops`: + mirror the shortcode test at line ~2028, build + `By::filter("emoji.lua", 9)` on the original inline, assert + Q-3-42 + KeepBefore. ~25 LOC. (Complements Plan 7b's Phase-1 + *block-level* filter UseAfter test by exercising the inline + path.) +- [ ] Add `inline_use_after_on_title_block_inline_soft_drops`: + same shape, `By::title_block()`. ~25 LOC. +- [ ] Add `multi_inline_dedupe_filter_case`: shape-equivalent to + `multi_inline_dedupe_emits_token_once_when_invocation_shared` + but using `By::filter("decoration.lua", 12)`. Filter + constructions rarely produce multi-inline output in practice, + but the dedupe rule consults `Invocation` regardless of + kind, so the test pins the regression shape. ~30 LOC. 
+- [ ] `cargo nextest run -p pampa -E 'test(/coarsen|inline_use_after|multi_inline|every_atomic_kind/)'` + green. + +#### Phase 5 — `cfg(debug_assertions)` `#[should_panic]` test + +**Repo facts the implementer needs:** + +- The debug-assert site: + `crates/pampa/src/writers/incremental.rs:448-455`. Panic message + starts with `"Generated { by: shortcode, from: [] } reached the + writer — Plan 6's stamper must always attach an Invocation + anchor for shortcode resolutions."` +- `#[should_panic(expected = "…")]` matches on a substring. Use + the unique prefix `"Generated { by: shortcode, from: [] } reached"` + to avoid false positives. +- Release builds compile `debug_assert!` out. The test must be + cfg-gated to `debug_assertions` so release-profile test runs + don't trip the `should_panic` reverse-failure. + +**Sketch:** + +```rust +#[test] +#[cfg(debug_assertions)] +#[should_panic(expected = "Generated { by: shortcode, from: [] } reached")] +fn shortcode_with_empty_from_trips_debug_assert() { + // The Plan-6 stamper invariant: every Generated{by:shortcode} + // carries an Invocation anchor. A hand-constructed shape that + // skips the anchor must trip the writer's debug_assert. + let gen_info = SourceInfo::generated(By::shortcode("meta")); + let block = para(vec![], gen_info); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + // Coarsen panics inside `coarsen_keep_before_block` via the + // debug-assert at incremental.rs:448. + let _ = coarsen("", &ast, &ast, &plan, &mut warnings); +} +``` + +- [ ] Add the test per the sketch above to the same test module. + Document the `cfg(debug_assertions)` gating with a one-line + comment so release-profile runners aren't confused. 
+- [ ] `cargo nextest run -p pampa shortcode_with_empty_from` green + (default profile = `debug_assertions` on). +- [ ] `cargo nextest run --release -p pampa shortcode_with_empty_from` + green (test is compiled out, suite still passes). If the filter + then matches zero tests and nextest reports "no tests to run", + add `--no-tests=pass` to the command. + +#### Phase 6 — Differentiated `Q-3-43` builder via `Q343Reason` enum + +**Repo facts the implementer needs:** + +- Current builder: + `crates/pampa/src/writers/incremental.rs:552-563` + (`diagnostic_q3_43_block`). Returns a single generic message — + `"An edit to pipeline-generated content was reverted."` — and a + single generic hint that lists three possible upstreams + ("an include, a metadata key, or other source"). +- Three call sites, each currently calls + `diagnostic_q3_43_block(block)` with no case discriminator: + - `incremental.rs:320` — block `UseAfter` on a no-preimage + Generated container (user wholesale-replaced a synthesized + container via React). + - `incremental.rs:344` — block `RecurseIntoContainer` on an + atomic CustomNode whose wrapper has preimage in target + (typically `IncludeExpansion` / `CrossrefResolvedRef`; soft-drop + substitutes Verbatim). + - `incremental.rs:350` — block `RecurseIntoContainer` on a + no-preimage Generated container (synthesized + footnotes / appendix / etc.; soft-drop substitutes Omit). +- The post-review contract doc + (`claude-notes/designs/incremental-writer-contract.md`, + §"User-facing diagnostic surface") promises body text that names + the upstream: `"To edit this content, open `<path>` directly."` + for includes; `"This content is generated from metadata; edit + `_quarto.yml` to change it."` for metadata-derived containers. + Today's code delivers neither. +- For the include-recurse case, the include path lives in the + atomic CustomNode's `plain_data["source_path"]`. Look at + `crates/quarto-pandoc-types/src/custom.rs` for the `plain_data` + shape; use `.as_str()` on the `Value` to extract. 
+- For metadata-derived containers, the synthesizer's `By::kind` + string (`"footnotes"`, `"appendix"`, etc.) is the only stable + identifier today — there is no metadata-key anchor in v1. Plan 9 + (`ValueSource`) will give us the actual metadata range; until + then, naming the kind is the best the diagnostic can do. + +**Design — `Q343Reason` enum at the call boundary.** + +The three emission sites collapse to one builder that takes a +typed reason. The enum forces every new emission site to pick a +case (compile-time exhaustiveness) and centralises the body-text +choices for the message catalog. + +```rust +/// Why a Q-3-43 was emitted. One variant per emission path in +/// `coarsen`; new soft-drop sites must extend this enum so the +/// match in `diagnostic_q3_43_block` covers them at compile time. +enum Q343Reason<'a> { + /// User edited inside an atomic CustomNode whose wrapper has + /// preimage in target — typically an `IncludeExpansion` or a + /// `CrossrefResolvedRef`. `include_path` is the wrapper's + /// `plain_data["source_path"]` if present (Plan 8); `None` for + /// CustomNodes without a source-path field. + IncludeRecurse { include_path: Option<&'a str> }, + /// User edited inside a no-preimage Generated container + /// (footnotes / appendix / sectionize / etc.). `kind` is the + /// `by.kind` string of the container. + MetadataContainerRecurse { kind: &'a str }, + /// User wholesale-replaced a no-preimage Generated container + /// via React. `kind` is the new-side block's `by.kind`. 
+ NoPreimageReplacement { kind: &'a str }, +} + +fn diagnostic_q3_43_block( + block: &Block, + reason: Q343Reason, +) -> quarto_error_reporting::DiagnosticMessage { + let (title, problem, hint): (&str, String, String) = match reason { + Q343Reason::IncludeRecurse { include_path: Some(path) } => ( + "Include content edit dropped", + format!("An edit inside `{{{{< include {} >}}}}` was reverted.", path), + format!("To edit this content, open `{}` directly.", path), + ), + Q343Reason::IncludeRecurse { include_path: None } => ( + "Generated content edit dropped", + "An edit inside an atomic block was reverted.".into(), + "This block is read-only; edit its upstream source instead.".into(), + ), + Q343Reason::MetadataContainerRecurse { kind } => ( + "Generated content edit dropped", + format!("An edit inside the synthesized `{}` container was reverted.", kind), + "This content is generated from metadata; edit `_quarto.yml` to change it.".into(), + ), + Q343Reason::NoPreimageReplacement { kind } => ( + "Generated content edit dropped", + format!("A replacement of the synthesized `{}` container was reverted.", kind), + "Generated containers must be changed by editing their metadata source.".into(), + ), + }; + quarto_error_reporting::DiagnosticMessageBuilder::warning(title) + .with_code("Q-3-43") + .with_location(block.source_info().clone()) + .problem(problem) + .add_hint(hint) + .build() +} +``` + +The `Block` parameter stays so `with_location` can anchor the +warning at the original wrapper's source range (atomic CN paths) +or fall through to a no-range diagnostic (no-preimage container +paths — `with_location` accepts a `SourceInfo::Generated` whose +`preimage_in` returns `None`; the resulting warning lands without +a Monaco squiggle and surfaces in the diagnostics banner only). + +**Catalog reconciliation.** The catalog entry +(`crates/quarto-error-reporting/error_catalog.json`) currently +carries one Q-3-43 with a generic `message_template`. 
The +builder is now responsible for the per-case body text (matching +Plan 7's already-established "builder picks body text, catalog +holds metadata" convention for Q-3-43), so no catalog change is +needed. Confirm by grepping the catalog entry's `since_version` +is still `"99.9.9"`; if a later catalog reformat tries to pin +body text, that's the point to push back on. + +**Implementation steps.** + +- [ ] Add the `Q343Reason` enum next to `diagnostic_q3_43_block` + in `incremental.rs`. Keep it `pub(super)` or module-private; + it's a call-boundary type, not part of the writer's external + API. +- [ ] Replace the body of `diagnostic_q3_43_block` per the sketch + above. Title, problem, hint per variant. +- [ ] Update the three call sites in `coarsen`: + - `incremental.rs:320`: pass `Q343Reason::NoPreimageReplacement + { kind: kind_of(new_block) }` where `kind_of` reads + `Generated.by.kind` (use the existing `.is_kind(...)` helper + family or write a small `by_kind_of_block(&Block) -> Option<&str>`). + - `incremental.rs:344`: pass `Q343Reason::IncludeRecurse + { include_path: include_path_of(orig_block) }` — write a + small helper that downcasts `Block::Custom(cn)` and reads + `cn.plain_data.get("source_path").and_then(|v| v.as_str())`. + - `incremental.rs:350`: pass `Q343Reason::MetadataContainerRecurse + { kind: by_kind_of_block(orig_block).unwrap_or("generated") }`. +- [ ] Adjust the existing soft-drop tests in + `coarsen_plan7_tests` (`incremental.rs:1525+`) so they assert + the *new* per-case problem text: + - `recurse_into_atomic_custom_node_soft_drops_to_verbatim` + (line ~1807): wrap the original `CrossrefResolvedRef` + CustomNode with `plain_data` containing + `{"source_path": "foo.qmd"}`; assert the warning's problem + contains `"foo.qmd"`. Add a `_no_source_path` variant that + omits `plain_data` and asserts the fallback wording. 
+ - `recurse_into_no_preimage_generated_soft_drops_to_omit` + (line ~1851): assert the problem contains `"appendix"` + (the `By::appendix()` kind used by the fixture). + - `use_after_on_no_preimage_generated_soft_drops_to_omit` + (line ~1769): assert the problem contains the new-side + block's kind. +- [ ] Add a Phase-6-specific test that exercises all three + `Q343Reason` variants through `diagnostic_q3_43_block` + directly (skipping `coarsen`); compact regression pin for + the message text. + +**Location anchoring — what `with_location` should resolve to.** + +The current code passes `new_block` at the UseAfter→Omit site +(line 394) and `orig_block` at the two RecurseIntoContainer +sites (lines 427, 467). The new-side block in the UseAfter case +is React-constructed — its `source_info` is typically +`Generated { by: user_edit, from: [] }` or a `SourceInfo::default()`. +`preimage_in` returns `None` on either, so the Monaco squiggle +doesn't land anywhere useful. The original-side block in this +case is a no-preimage Generated container, whose `source_info` +also has no useful preimage — so the squiggle problem is intrinsic +to the case, not a fixable bug. + +Two things follow from that: + +- **For the two RecurseIntoContainer sites, `orig_block` is the + right anchor and the code already does it.** The IncludeRecurse + case has a useful range (the include token); the + MetadataContainerRecurse case doesn't, but choosing `orig_block` + over `new_block` is still correct because the warning is *about* + the original wrapper, and downstream attribution layers + (`resolve_byte_range`, etc.) prefer original-side info. +- **For the UseAfter→Omit site, switch from `new_block` to the + original block's source_info IF available.** Today the call + site doesn't bind any `orig_block` — `BlockAlignment::UseAfter` + has no `displaced_before_idx`. Two options: + - **v1 fix (cheap):** pass the new block (current behavior), + accept that the diagnostic carries no useful location. 
Pin the + behavior with a test so future contributors don't accidentally + "fix" it without parallel work on the alignment type. + - **v2 fix (parallel to Phase 7):** extend + `BlockAlignment::UseAfter` the same way Phase 7 extends + `InlineAlignment::UseAfter`, then pass `original_blocks[displaced_before_idx]`. + Out of scope for Plan 7c — file a follow-up beads issue. + +The v1 fix is what Phase 6 ships. Tests pin current behavior, +the v2 follow-up is a beads-issue note. + +- [ ] Add `q3_43_location_anchors_to_original_block_on_recurse`: + assert that for `recurse_into_atomic_custom_node_soft_drops_to_verbatim` + and `recurse_into_no_preimage_generated_soft_drops_to_omit`, + the emitted warning's `location` matches the *original* + block's `source_info`, not the new block's. Cheap pin + against accidental regression. +- [ ] Add `q3_43_location_falls_back_to_new_block_on_use_after`: + for `use_after_on_no_preimage_generated_soft_drops_to_omit`, + assert that the warning's `location` is the new block's + `source_info` (current v1 behavior). Comment block explains + the v2 follow-up. +- [ ] File a follow-up beads issue: "Block-level UseAfter soft-drop: + extend `BlockAlignment::UseAfter` to carry + `displaced_before_idx` (parallel to Plan 7c Phase 7's inline + fix)." Reference Plan 7c Phase 6 location-anchoring v2. + Priority 3 (polish — no user-visible squiggle today either + way; affects attribution metadata downstream). + +- [ ] `cargo nextest run -p pampa` green. +- [ ] `cargo xtask verify --skip-hub-build --skip-hub-tests` + green. + +**Why an enum, not three top-level helpers.** + +A reasonable alternative is three named helpers +(`q3_43_include_recurse`, `q3_43_metadata_recurse`, +`q3_43_no_preimage_replace`) instead of one builder taking an +enum. The enum is preferred here because: + +1. The failure mode we're fixing — "someone added a new soft-drop + site and reused the generic message" — is exactly what landed + in Plan 7. 
The enum's exhaustiveness check makes the regression + structural: a new `Q343Reason::Foo` is a compile error until + the builder handles it. +2. The catalog has one Q-3-43 entry; modelling the call sites as + one builder mirrors that shape and avoids future drift between + the catalog and the emission code. +3. Adding a fourth Q-3-43 emission site (likely from Plan 8's + IncludeExpansion work) means one new enum variant and one new + match arm — no scaffolding to copy-paste. + +If a future case grows wildly different message structure (e.g. +a multi-paragraph body), peel it off into its own helper at that +point. + +#### Phase 7 — Inline soft-drop carries the displaced original index + +**Repo facts the implementer needs:** + +- Soft-drop site: + `crates/pampa/src/writers/incremental.rs:1069-1080` + (`assemble_inline_content`, the `UseAfter(_)` arm of the + effective-alignment-rewriting loop). The current code reaches + for `orig_inlines.get(result_idx)` to find the original inline + whose editability gates the soft-drop. +- The comment in the code is honest about the proxy: + > "exact for in-place retypings (the common shortcode-edit + > case), approximate for arbitrary insertions/deletions." +- Reconciler type: + `crates/quarto-ast-reconcile/src/types.rs:112-124` + (`InlineAlignment`). The relevant variant is + `UseAfter(usize)` — tuple variant carrying only `after_idx`. +- The same shape exists for blocks: + `BlockAlignment::UseAfter(usize)` at line 100. The block + soft-drop path does **not** consult an original-side index + (it checks the new-side block's editability via + `new_block.source_info().preimage_in(...)`), so this phase is + inline-only. Block soft-drop is correct as-is. +- Today's test suite for inline soft-drop: + `inline_use_after_on_atomic_generated_soft_drops_to_keep_before_with_q3_42` + at `incremental.rs:2027`. All inline-soft-drop fixtures align + `orig_inlines[i]` with `new_inlines[i]` 1:1, so the proxy + bug is invisible to CI. 
+ +**The fix — extend `InlineAlignment::UseAfter` to a struct variant.** + +The reconciler is the only place that knows which original inline +(if any) the `UseAfter` is replacing. Today's tuple variant +throws that information away; the fix is to keep it. Change to: + +```rust +// crates/quarto-ast-reconcile/src/types.rs +pub enum InlineAlignment { + KeepBefore(usize), + + /// Use the after-side inline. `displaced_before_idx` is + /// `Some(i)` when the reconciler treated this as a replacement + /// of `orig_inlines[i]` (the common positional-edit case); + /// `None` for genuine inserts where no original aligns with + /// this slot. Consumers that gate on the original inline's + /// editability (e.g. the writer's soft-drop) MUST use this + /// field rather than deriving it from the alignment index. + #[serde(rename = "use_after")] + UseAfter { + after_idx: usize, + #[serde(default)] + displaced_before_idx: Option<usize>, + }, + + RecurseIntoContainer { before_idx: usize, after_idx: usize }, +} +``` + +`Option<usize>` rather than `usize` because inserts (no +displaced original) and replacements (displaced original known) +both need to be expressible. The `#[serde(default)]` makes the +new field absent-friendly on the wire — pre-existing JSON +serializations of `UseAfter` deserialize cleanly with +`displaced_before_idx = None`, which is the "be conservative, +don't soft-drop" answer. Caveat: `#[serde(default)]` only fills a +field that is missing from a map payload. A tuple variant's +payload is a bare integer while a struct variant's payload is a +map, so old `UseAfter` payloads only round-trip if the enum's +tagging keeps the payload shape unchanged — confirm against the +enum's serde attributes (or add a compat test) before relying on +this. + +**Why a struct variant, not a new enum variant.** + +A less-invasive alternative is to add `UseAfterReplacing +{ after_idx, before_idx }` alongside `UseAfter(usize)` and leave +the existing variant for genuine inserts. Rejected because: + +- Every consumer of `InlineAlignment` then has to handle two + variants that mean almost the same thing. The writer's match + arms double. 
+- The reconciler still has to decide which variant to emit on + every alignment, and that decision *is* the + `displaced_before_idx` Option — just expressed in two enum + variants instead of one struct variant with an `Option`. + +Struct-variant migration is mechanical: `cargo build` will list +every pattern match that needs updating. + +**Reconciler-side: populate `displaced_before_idx`.** + +The reconciler at +`crates/quarto-ast-reconcile/src/inline.rs` (or wherever +inline alignment is decided — locate via `git grep +'InlineAlignment::UseAfter'`) produces `UseAfter` from its +positional alignment loop. In practice: + +- LCS-style alignment: when `UseAfter(j)` is emitted at result + position `r`, the reconciler has just consumed `orig_inlines[i]` + on the original side (or hasn't, in which case this is an + insert). The `displaced_before_idx` is `Some(i)` in the + consumed case, `None` in the insert case. +- Positional alignment: `displaced_before_idx = Some(r)` when + `r < orig_inlines.len()`, `None` otherwise. + +The exact derivation depends on the reconciler's algorithm. +Locate the alignment loop and add the index alongside the +existing `after_idx` emission. + +**Writer-side: consume `displaced_before_idx`.** + +```rust +// crates/pampa/src/writers/incremental.rs (assemble_inline_content) +InlineAlignment::UseAfter { after_idx, displaced_before_idx } => { + if let Some(orig_idx) = displaced_before_idx + && let Some(orig) = orig_inlines.get(*orig_idx) + && !is_editable_inside_inline(orig, target_file_id) + { + warnings.push(diagnostic_q3_42_inline(orig)); + effective.push(InlineAlignment::KeepBefore(*orig_idx)); + continue; + } + effective.push(alignment.clone()); +} +``` + +When `displaced_before_idx` is `None` (a genuine insert), there +is no original to gate against, and the alignment passes through +unchanged. That is the correct behaviour — inserts can't soft-drop +because there's nothing they're displacing. 
+ +**Implementation steps.** + +- [ ] In `crates/quarto-ast-reconcile/src/types.rs`: change + `InlineAlignment::UseAfter` from tuple variant `(usize)` to + struct variant `{ after_idx, displaced_before_idx }` per + the sketch. Update the serde rename and add + `#[serde(default)]` on the new field. +- [ ] `cargo build --workspace` and walk every compile error; + update each pattern match. Reconciler tests in the same + crate will surface most of them. Writer call sites in + `pampa::writers::incremental` will surface the rest. +- [ ] Reconciler: populate `displaced_before_idx` in the inline + alignment loop. Add a test in + `quarto-ast-reconcile` asserting the field is populated for + a fixture where `UseAfter` replaces an original inline, + and is `None` for a fixture that inserts a fresh inline. +- [ ] Writer: replace `orig_inlines.get(result_idx)` at + `incremental.rs:1074` with the `displaced_before_idx`-aware + logic. Remove the `result_idx` positional proxy and its + explanatory comment. +- [ ] Add a regression test: + `inline_use_after_with_insert_before_shortcode_does_not_misfire`. + Construct an inline plan with `[Insert("X"), UseAfter + (over-shortcode)]` so the result-side index `1` and the + original-side index `0` differ. Assert the soft-drop fires + against the original shortcode inline (the + `displaced_before_idx`), not against + `orig_inlines.get(result_idx=1)` (which would be out of + bounds, or wrong). +- [ ] Add a complementary test: + `inline_use_after_pure_insert_does_not_soft_drop`. A new + inline with `displaced_before_idx = None` must not consult + `orig_inlines` at all. Assert no Q-3-42 is emitted. +- [ ] `cargo xtask verify --skip-hub-build --skip-hub-tests` + green. +- [ ] `cargo xtask verify` (full) — the WASM bridge passes + `ReconciliationPlan` JSON over the wire; the + `#[serde(default)]` makes the change wire-compatible, but + a full verify confirms nothing else broke. 
+ +**Wire-format compatibility.** + +The TS side at +`ts-packages/quarto-sync-client/src/types.ts` does not currently +deserialize `ReconciliationPlan` itself — the plan is computed +inside WASM and never crosses the boundary as JSON. Confirm with +`git grep -l 'InlineAlignment'` in `ts-packages/` and +`hub-client/`; if any TS consumer turns up, the same +`#[serde(default)]` semantics apply on the parsing side (new +field absent ⇒ `null`/`undefined` ⇒ "don't soft-drop"). + +#### Phase 7b — Inline `UseAfter` soft-drop checks the new-side inline's atomicity + +**Discovered 2026-05-26** during the algebraic-soundness research +that produced today's block-level UseAfter fix (commit +`e584428d`). The block-level cascade in `coarsen_blocks` had a gap +for `BlockAlignment::UseAfter(j)` where `new_blocks[j]` was +atomic-Generated *with preimage* — the let-user-win Rewrite +fell through and emitted the resolved bytes back into source. The +fix added a branch that detects atomic-Generated-with-preimage on +the *new* block and substitutes `Verbatim` of preimage + Q-3-43. + +The inline cascade in `assemble_inline_content` +(`crates/pampa/src/writers/incremental.rs:1325-1362`) has the +exact analogue gap. Today's Phase 1 of `assemble_inline_content` +only checks the **original-side** inline's editability (via the +positional proxy that Phase 7 above fixes). It does not check +whether the *new* inline at `after_idx` is atomic-Generated with +preimage. If a reconciler emits `InlineAlignment::UseAfter(j)` +where `new_inlines[j]` carries `Generated{by:shortcode, from: +[Invocation -> token_si in target]}`, the cascade lets it through +to splice/rewrite and the resolved bytes leak. 
+ +**The fix mirrors the block-level fix shipped today.** Add a new +check at the head of the `InlineAlignment::UseAfter` arm: + +```rust +InlineAlignment::UseAfter { after_idx, displaced_before_idx } => { + let new_inline = &new_inlines[*after_idx]; + let new_si = new_inline.source_info(); + let atomic_generated_preimage = match new_si { + SourceInfo::Generated { by, .. } if by.is_atomic_kind() => + new_si.preimage_in(target_file_id), + _ => None, + }; + if let Some(_range) = atomic_generated_preimage { + // User edited inside an atomic-kind Generated inline + // (typically a shortcode-resolved Str). The new inline + // still carries the token's Invocation anchor; emit the + // token bytes verbatim by substituting KeepBefore of the + // displaced original (if known) or the positional proxy. + let orig_idx = displaced_before_idx + .or_else(|| Some(*after_idx).filter(|i| *i < orig_inlines.len()))?; + warnings.push(diagnostic_q3_42_inline(&orig_inlines[orig_idx])); + effective.push(InlineAlignment::KeepBefore(orig_idx)); + continue; + } + // ... existing original-side check follows. +} +``` + +This fix and Phase 7's `displaced_before_idx` enrichment compose +naturally: Phase 7 gives us the precise original-side index; +Phase 7b uses it (or falls back to the positional proxy) when +emitting `KeepBefore`. The two phases can land in either order; +Phase 7 lands first if it's already scoped as drafted, then +Phase 7b layers on top. + +**Why this is a separate phase, not folded into Phase 7.** Phase 7 +fixes an *accuracy* bug in the existing original-side check (the +positional proxy misfires on inserts/deletes). Phase 7b adds a +*new branch* (new-side atomicity) that doesn't exist in any form +today. Both are denylist tightenings; both become moot once Plan +7d's algebraic refactor lands. + +- [ ] Add the atomic-Generated-with-preimage check at the head of + `InlineAlignment::UseAfter` in `assemble_inline_content`. 
+- [ ] Regression test: + `inline_use_after_on_atomic_generated_shortcode_with_preimage_soft_drops`. + Construct an inline plan with `UseAfter` targeting a Span + whose `source_info` is `Generated{by:shortcode, from: + [Invocation -> token_si]}` and whose content differs from the + original. Assert the qmd output preserves the token bytes + verbatim and one Q-3-42 warning fires. Mirrors today's + block-level `sectionize_wrapper_shortcode_child_edit_soft_drops`. +- [ ] `cargo xtask verify --skip-hub-build --skip-hub-tests` green. + +#### Phase 8 — `target_file_id` derivation skips no-`root_file_id` first blocks + +**Repo facts the implementer needs:** + +- Current derivation site: + `crates/pampa/src/writers/incremental.rs:289-293`. The current + shape: + ```rust + let target_file_id = original_ast + .blocks + .first() + .and_then(|b| b.source_info().root_file_id()) + .unwrap_or(quarto_source_map::FileId(0)); + ``` +- `root_file_id()` lives at + `crates/quarto-source-map/src/source_info.rs:487-498`. For + `Generated`, it walks the `Invocation` anchor; for an empty + `from[]` it returns `None`. So a document whose first block is + a synthesized title-block (no Invocation) gets `None` → + fallback to `FileId(0)`. +- `FileId(0)` is the wire-format default — the same FileId the + parser stamps on a fresh single-file parse. So on a one-file + document, `FileId(0)` happens to be correct by coincidence, + and the bug only surfaces when there's a real cross-file + story (Plan 8's IncludeExpansion, the q2-preview-spa's project + mode addressing multiple files). +- Today the bug is dormant. We don't ship multi-file editing + in this writer pass yet; Plan 8 will. But the test is cheap + and the fix is cheap, and shipping them now means Plan 8 doesn't + have to rediscover the issue. 
+ +**The fix — `iter().find_map(...)` over `first().and_then(...)`.** + +```rust +let target_file_id = original_ast + .blocks + .iter() + .find_map(|b| b.source_info().root_file_id()) + .unwrap_or(quarto_source_map::FileId(0)); +``` + +`find_map` walks blocks in order, returning the first block whose +`root_file_id()` resolves to `Some`. Synthesized title-blocks, +sectionize wrappers, footnotes containers — anything Generated +with empty `from[]` — get skipped. The fallback to `FileId(0)` +remains for the genuinely-empty-document case (no blocks at all, +or every block is no-`root_file_id` Generated). + +**Implementation steps.** + +- [x] Write the failing test first: + `target_file_id_skips_synthesized_first_block`. Build a + Pandoc whose `blocks[0]` is a synthesized title-block (e.g. + `Block::Header` with + `SourceInfo::generated(By::title_block())` and empty `from[]`) + and whose `blocks[1]` is a real `Original` paragraph with + `FileId(7)`. Drive `coarsen` and assert that the editability + check on `blocks[1]` returns `true` (i.e. `target_file_id` + resolved to `FileId(7)`, not `FileId(0)`). The pre-fix + coarsen sees `target_file_id == FileId(0)`, + `preimage_in(FileId(0))` on a `FileId(7)`-Original returns + `None`, and the block is gated as non-editable — the test + fails. +- [x] Apply the `find_map` fix at `incremental.rs:289-293`. + Implemented as a recursive `derive_target_file_id` helper + that descends through `block_block_children` as well, so a + sole-top-level sectionize wrapper (with the user's real + blocks inside) also yields the right file id rather than + `FileId(0)` by accident. The implementation note in §"Why + this isn't already broken in CI" below remains accurate: + single-file fixtures with `Original`-first blocks hit the + fast path; the wrapper-first variant required descent. +- [x] Re-run the test; assert it passes. +- [x] Add a fully-empty-document test: + `target_file_id_defaults_to_zero_for_empty_document`. 
The + `FileId(0)` fallback only kicks in when every block returns + `None` from `root_file_id()` — or there are no blocks. +- [x] `cargo nextest run -p pampa target_file_id` green. +- [x] `cargo xtask verify --skip-hub-build --skip-hub-tests` + green. + +**Why this isn't already broken in CI.** + +The existing test suite uses fixtures with `Original`-first +blocks: `keep_before_with_original_in_target_emits_verbatim` +at `incremental.rs:1565` builds a `Paragraph` with +`SourceInfo::original(TARGET, 10, 25)` at `blocks[0]`, so +`root_file_id()` returns `Some(TARGET)` immediately and the +fallback path is never hit. A title-block-first fixture +exposes it. The Plan 8 single-file include story would hit +it too — pre-empting that discovery is the value here. + +#### Phase 9 — Verification + +- [ ] `cargo xtask verify` (full) green. +- [ ] End-to-end exercise: open `q2-preview` against a small + fixture, type into a `{{< meta foo >}}`-resolved region + *before* the first render completes (or use the dev server + with artificial render delay), confirm Q-3-41 appears in the + `DiagnosticStrip`. Record the invocation + observed + diagnostic in the plan body under §"Verification" per + `CLAUDE.md`'s end-to-end rule. +- [ ] End-to-end exercise for the framework gate: open a fixture + with a no-preimage Generated container (e.g. the synthesized + footnotes Div from Plan 6 + a single inline edit), confirm + the React dispatcher's gate now intercepts the typing before + the writer's soft-drop fires (no `Q-3-43` flashes through). +- [ ] Plan-7 doc gets a "Closed via Plan 7c" footnote on the four + open items (do not flip the checkboxes — they describe + Plan-7 scope; Plan 7c is a follow-up). + +### Out of scope + +- Anything in Plan 7b (writer-lossless baseline test; + filter-construction *block-level* UseAfter test; e2e Playwright + matrix). +- `is_editable_inside` migration to `quarto_core::editability`. 
+ The Rust module lives in `pampa::writers::incremental` for + documented dependency-cycle reasons (see Plan 7 Phase 1 + implementation note). The TS-side predicate goes into + `preview-renderer`, mirroring the consumer placement; no + attempt is made to unify the module names. +- Plan 9 (`ValueSource`) / Plan 10 (`Dispatch`) work. The role- + asymmetry rule (`preimage_in` walks `Invocation` only) is + already in place on both sides; future anchor roles inherit + the gate behaviour for free. +- New diagnostic codes beyond Q-3-41. The codes for the gate + surfaces (Q-3-42, Q-3-43) are already implemented. +- Suppressing Q-3-41 spam in autosave contexts. The current + `suppressAfterThree` helper in `DiagnosticStrip` keys by source + range; Q-3-41 has no range so will repeat per keystroke. If + this proves noisy in practice, file a follow-up to extend the + helper to also key by code. + +## Design decisions (settled in conversation) + +- **Q-3-41 is TS-constructed, not Rust-constructed.** The writer + is never invoked in the no-baseline branch — the gate intercepts + before the bridge. A Rust `diagnostic_q3_41()` builder would be + dead code; the catalog entry exists for docs URL / version + consistency only. (Plan 7 §"Catalog mechanics" already + established that the writer's Q-3-43 emission picks its body + text via the builder, not the catalog template; Q-3-41 takes + the same path with the builder on the TS side.) +- **`targetFileId` defaults to `0`.** Both sides default the + target FileId to 0 when the AST lacks a first-block root + FileId — see `incremental.rs:289-293` for the Rust precedent. + The default is safe for empty documents (won't match any real + source bytes; `hasPreimageIn` returns `undefined`; gate + conservatively denies editing). 
+- **TS predicate placement.** `hasPreimageIn` / + `isEditableInside` go into the existing `utils/sourceInfo.ts` + rather than a new module — they're a natural extension of the + atomicity helpers already there, and the `ATOMIC_KINDS` set is + next to them. +- **No new context fields.** `targetFileId` joins the existing + `RegistryContext`; no new context type is introduced. The + default-`0` semantics matter: dispatchers that don't pass it + fall through to the same "no preimage anywhere" behaviour they + had before (since the wire-format default `d` is FileId 0, + which matches the gate). The only practical regression is if a + caller relies on editing happening inside a non-zero-FileId + AST without setting `targetFileId` — that's a Plan 8 / include + story and not regressed today. +- **Phase ordering inside Phase 2.** The implementation order + inside Phase 2 is: predicate + tests → context plumbing → + dispatcher gate. The predicate is independently testable; the + context plumbing only matters when the gate consumes it; the + gate is the integration point. + +## References + +- Audit transcript (2026-05-25 Claude session): the four items + numbered 1–4 in §Goal map to that audit's items 1, 2, 3, and 4. +- `claude-notes/designs/incremental-writer-contract.md` — + consumer-side contract; §"Role-asymmetry" and §"Unified + editability predicate" pin the rules this plan implements. +- `claude-notes/designs/provenance-contract.md` — producer-side + contract; §4 "Role-asymmetry" and §7 "Atomic-kind set" + cross-reference the editability work. +- `claude-notes/designs/transparent-wrappers.md` — sibling + contract introduced 2026-05-25 alongside Phase 8's fix. Names + the descent pattern that `derive_target_file_id` implements + and lifts it into a reusable primitive (`first_in_user_tree`) + that future plans (8/9/10/replay) can cite without + rediscovering. 
+- `claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md` + — Phase 1 implementation note documents the + `pampa::writers::incremental` placement (the deliberate + deviation from the post-review `quarto_core::editability` + pin). +- `claude-notes/plans/2026-05-24-q2-preview-plan-7b-test-orama.md` + — the *other* Plan-7-followup test pass. Plan 7c is disjoint; + scan Plan 7b before adding any test to make sure it's not + already covered there. +- `crates/pampa/src/writers/incremental.rs:113-162` — Rust + reference for the editability predicate. +- `crates/quarto-source-map/src/source_info.rs:406-442` — Rust + reference for `preimage_in` (Original / Substring / Concat / + Generated walk). +- `ts-packages/preview-renderer/src/utils/sourceInfo.ts` — TS + target file for the new predicates. +- `ts-packages/preview-renderer/src/framework/dispatch.tsx:404-411` + — the gate to update. + +## Estimated scope + +| Phase | Lines (rough) | +|-------|---------------| +| 1 — Q-3-41 catalog entry | ~15 | +| 2 — TS predicates + context + gate + unit tests | ~250 | +| 2 — Cross-language parity fixture + tests (Rust gen + TS consumer + atomic-kinds belt-suspenders) | ~200 | +| 3 — First-edit Q-3-41 emission + helper + tests | ~120 | +| 4 — Rust per-kind tests | ~120 | +| 5 — `cfg(debug_assertions)` `#[should_panic]` test | ~25 | +| 6 — Differentiated `Q-3-43` builder + call-site updates + test adjustments + location-anchoring tests | ~180 | +| 7 — Inline soft-drop: extend `InlineAlignment::UseAfter` to struct variant + reconciler population + writer consumption + regression tests | ~180 | +| 8 — `target_file_id` derivation: `find_map` over `first()` + regression tests | ~40 | +| 9 — Verification | (no code) | +| **Total** | **~1130** | + +Roughly the size of Plan 7 itself. Phase 6 and Phase 7 add real +correctness fixes (Phase 6 closes a doc-vs-code drift on Q-3-43 +body text; Phase 7 fixes a positional-proxy hole in inline +soft-drop). 
The new parity-test work in Phase 2 adds a structural +sync check so the TS↔Rust walker pair can't drift silently. + +No new diagnostic codes beyond Q-3-41. No new pipeline tier. The +`InlineAlignment::UseAfter` shape change in Phase 7 is the only +type-surface change; `#[serde(default)]` keeps it wire-compatible. + +## Risk areas + +- **Q-3-41 spam in autosave.** Without a code-keyed suppression + rule, every keystroke before first-render emits a fresh + warning. The DiagnosticStrip's `suppressAfterThree` keys on + source range and Q-3-41 has none. Acceptable for v1 — the + pre-render window is short — but document the limitation in + the strip's comment so a future contributor can extend the + helper. +- **`targetFileId` derivation under include.** Plan 8's + IncludeExpansion wrapper introduces source content from a + non-zero FileId. The default-`0` derivation in Phase 2 is + conservative: nodes whose root FileId is the included file + fail `hasPreimageIn(target=0)`, so the gate denies editing. + This is the *correct* behavior for v1 (editing inside an + included child should require the user to open the child), + but worth confirming with a fixture once Plan 8 lands. +- **Gate desync between Rust and TS.** The two predicates must + agree on which kinds are atomic, which roles are walked, and + how `preimage_in` chains resolve. The parity fixture work + inside Phase 2 makes this structural: Rust generates + `test-fixtures/preimage-parity/cases.json` from its in-code + corpus, a Rust test fails when that fixture is stale, and the + TS test fails when its walker disagrees with the regenerated + fixture. Future-walker changes either re-bless both sides + (matching) or trip one of the two gates (loud). The atomic- + kinds belt-and-suspenders fixture catches the simpler "added a + kind on one side only" drift in one Rust + one TS assertion.
+- **Reconciler change in Phase 7 ripples through pattern + matches.** Changing `InlineAlignment::UseAfter` from + `(usize)` to `{ after_idx, displaced_before_idx }` is a + breaking change for every consumer of the type. The mechanical + fix is `cargo build --workspace` until clean; the risk is a + consumer that silently ignores the new field (e.g. wildcards + the variant). After Phase 7, audit for `InlineAlignment::UseAfter + { .. }` matches that don't bind `displaced_before_idx`; any such + match outside test code should be reviewed. +- **`Q343Reason::IncludeRecurse { include_path: None }` fallback.** + Atomic CustomNodes without a `source_path` field in `plain_data` + (e.g. `CrossrefResolvedRef` today) fall back to a generic + message. That's worse than the catalog promise but better than + Plan 7's all-cases-identical text. Plan 8's IncludeExpansion + will give us the include path universally for include cases; + CrossrefResolvedRef would need its own `Q343Reason` variant + later (e.g. `Q343Reason::CrossrefRecurse { ref_id: &str }`) if + the message text needs to differ. + +## Notes + +This is the third Plan-7 follow-up alongside Plan 7a (runtime +filter idempotence, `bd-bk3y` / Q-3-44/45) and Plan 7b +(test-o-rama). Each addresses a different gap left by the +2026-05-24 implementation session; together with this plan, the +post-Plan-7 surface is closed. + +Phases 1–5 close gaps where the implementation drifted from the +post-review intent — no contract change. Phases 6 and 7 close +correctness/UX issues that the post-implementation code review +surfaced: + +- **Phase 6** brings Q-3-43's body text up to the contract the + doc already promises. Mechanical fix; the contract itself is + unchanged. +- **Phase 7** narrows the inline soft-drop's positional proxy by + threading the displaced original index through + `InlineAlignment::UseAfter`. 
This is a small reconciler-type + contract change (struct variant + `Option` field) — and + the only contract change in the plan. The semantics it adds + (the reconciler tells consumers which original was displaced) + is what the writer already needed and approximated; the type + now expresses it honestly. + +If a reviewer reads this and thinks "this needs a design +discussion," the only candidate is Phase 7's reconciler-type +change, which is the kind of small structural sharpening that +fits inside this plan rather than a separate design doc. The +other phases are wiring + test work + a single-file +diagnostic refactor. + +Update the contract docs alongside the implementation: + +- `claude-notes/designs/incremental-writer-contract.md` — + §"User-facing diagnostic surface" should note that Q-3-43 + body text differentiates by reason (include / metadata / + replacement), with the wording the builder produces. +- `claude-notes/designs/incremental-writer-contract.md` — + §"Soft-drop semantics" should note that the inline-level + case consults `InlineAlignment::UseAfter`'s + `displaced_before_idx` (the reconciler's truth) rather than + the alignment's result-side index. diff --git a/claude-notes/plans/2026-05-26-q2-preview-plan-7d-algebraic-soundness.md b/claude-notes/plans/2026-05-26-q2-preview-plan-7d-algebraic-soundness.md new file mode 100644 index 000000000..9bfd36dea --- /dev/null +++ b/claude-notes/plans/2026-05-26-q2-preview-plan-7d-algebraic-soundness.md @@ -0,0 +1,442 @@ +# Plan 7d — Algebraic soundness of the coarsen / incremental-writer + +**Date:** 2026-05-26 +**Branch:** feature/provenance (research; refactor work happens on a child branch once Phase 0 validation closes) +**Status:** Research plan — Phase 0 ("Validate the algebra") gates every subsequent phase. The technical description in this file's introduction is the artifact under review. +**Milestone:** none directly.
Pre-condition for any future "minimal-edit diffing" work that would consume the coarsened plan to derive per-region Monaco edits rather than full-document saves. + +## Epic context + +Fourth sibling follow-up to Plan 7 in the provenance epic: + +| Sibling | Axis | Status | +|---|---|---| +| Plan 7 | Incremental writer + soft-drop + bridge migration | shipped on `feature/provenance` | +| Plan 7a | Runtime user-filter idempotence (input-side validation) | open | +| Plan 7b | Test-coverage consolidation | open | +| Plan 7c | Closure gaps in the existing soft-drop cascade | open | +| Plan 7d | Algebraic soundness of the coarsen/write step | this plan | + +7d differs from 7c in *disposition*. Plan 7c tightens the existing denylist cascade — each phase adds a branch the cascade should have caught but didn't, or repairs a per-arm predicate that drifts from accuracy. Plan 7d replaces the cascade with an allowlist algebra: every emission is allowed by construction rather than by the absence of a denylist match. The two plans are parallel, not sequenced. If 7d lands first, two of 7c's phases (Phase 7 and the newly-added Phase 7b) become *defense-in-depth* rather than load-bearing — they patch producer-side hygiene failures the algebra would tolerate but does not strictly require. If 7c lands first, the denylist gets more complete in the meantime; 7d's refactor still proceeds against whatever state 7c leaves behind. + +The implementation in this plan starts from the current HEAD of `feature/provenance`, after the rebase that landed `incremental-writer-contract.md` (today's Task 1) and after the soft-drop fix that prompted this whole line of investigation (commit `e584428d`). The CoarsenedEntry self-containment property that fix established — every variant produces its emit bytes from its own payload without ambient context — is a precondition for the algebra to compose correctly. Plan 7d is the next step on top. 
+ +## Goal + +Bring the writer's coarsen step under a soundness proof. The property to prove is the **byte-provenance invariant (BP)**: every byte the writer emits is either (i) copied verbatim from `Source` at a position some AST node identifies as its source-side knob, or (ii) produced by serializing a single AST leaf whose own immediate content is the user's authored content. The invariant rules out, by structural induction over the AST: resolved shortcode bytes leaking back into source; filter-output bytes leaking back into source; synthesized container chrome leaking back into source; in general, any byte derived from pipeline output that the user could not have authored at the position it lands. + +Today's writer satisfies BP only by enumeration: a list of per-alignment-arm predicates that have grown branch-by-branch as bugs surfaced. The list is incomplete by construction (the lipsum fix on `e584428d` was one example; the new Plan 7c Phase 7b is the inline-level analogue). The goal is to replace the enumeration with a *total* dispatch on `(alignment_kind, source_info_shape)` whose inductive soundness argument discharges BP without per-arm checking. + +This is a refactor of one layer of the system. The reconciler, the AST types, `apply_reconciliation`, and the diagnostic catalog are not touched. + +## Status + +Phase 0 — Validate the algebra — is the gating phase. The text under "The proposed algebra" below is the artifact under review. No code changes happen in Phase 0; the user reads, asks questions, and either approves the algebra (allowing Phases 1-6 to proceed) or sends it back for revision. + +Phases 1-6 are *sketched* below, not specified in fine detail. Their concrete shape depends on decisions made during Phase 0 (which trust points to tighten, which `CoarsenedEntry` variants survive the refactor, how the qmd writer's per-container arms decompose). 
The sketches exist so Phase 0 can judge the *scope* of work the algebra implies, not so a fresh agent can implement them cold. + +--- + +## Reconciler / coarsen architecture (brief) + +The system has three layers between the user's edit and the bytes that hit disk: + +**Layer 1 — Reconciler (`crates/quarto-ast-reconcile`).** +Input: two ASTs (`original`, `executed`). Output: a hierarchical `ReconciliationPlan` of alignment decisions describing how `executed` relates structurally to `original`. The output operations are three per AST level: `KeepBefore(orig_idx)` (this position's content matches the original), `UseAfter(exec_idx)` (this position's content is new), `RecurseIntoContainer { before, after }` (the container at this position was structurally paired; descend into its children for the diff). Same three variants exist for `BlockAlignment` and `InlineAlignment`; lists use a separate three-variant `ListItemAlignment` enum. + +The reconciler's algorithm is three-phase (`crates/quarto-ast-reconcile/src/compute.rs:37-215`): +1. Phase 1: exact hash matches anywhere in the original block list → `KeepBefore`. +2. Phase 2: positional type matches at the same index → `RecurseIntoContainer` (or `KeepBefore` for inline-content blocks when the inline plan finds at least one matching inline). +3. Phase 3: fallback for unmatched blocks → `UseAfter`. + +A `ReconciliationPlan` is a flat `Vec<BlockAlignment>` plus side-tables of nested plans (one per recursion target): `block_container_plans`, `inline_plans`, `inline_container_plans`, `note_block_plans`, `custom_node_plans`, `table_plans`, `list_item_alignments`, `list_item_plans`. The plan tree mirrors the AST's tree structure.
+ +**Layer 2 — `apply_reconciliation` (`crates/quarto-ast-reconcile/src/apply.rs`).** +Consumes a `ReconciliationPlan` and produces a new `Pandoc` by *moving* original blocks where the plan says `KeepBefore`, *moving* executed blocks where it says `UseAfter`, and recursively reconciling where it says `RecurseIntoContainer`. The output is an AST, not bytes. + +The writer does not use `apply_reconciliation`. The two consumers of a `ReconciliationPlan` — `apply_reconciliation` and the writer's `coarsen` — sit side by side, each interpreting the same plan into a different output medium. + +**Layer 3 — Writer's `coarsen` (`crates/pampa/src/writers/incremental.rs`).** +Inputs: `original_qmd: &str`, `original_ast: &Pandoc`, `new_ast: &Pandoc`, `plan: &ReconciliationPlan`, `target_file_id`. Output: `Vec<CoarsenedEntry>` — a flat list of byte-emission instructions that `assemble` walks to produce `Source'`. + +The `CoarsenedEntry` variants are the writer's internal byte-emission language: +- `Verbatim { byte_range, orig_idx }` — copy `original_qmd[byte_range]`. +- `InlineSplice { block_text, orig_idx }` — emit pre-computed text mixing original-source prefix/suffix with newly-serialized inline content. +- `Rewrite { block_text }` — emit pre-computed text from re-serializing an AST subtree via the qmd writer. +- `Transparent { child_entries }` — inline children's bytes (wrapper contributes nothing). +- `Omit` — emit nothing. + +`Rewrite` is a Layer-3 concept, not a reconciler output. The reconciler never says "rewrite anything"; the writer's coarsen translates each alignment, in context with the node's source_info, into a `CoarsenedEntry`. The current mapping (alignment + source_info → CoarsenedEntry) is the table the algebra reorganizes. + +## Cumulative delta — `main` → `feature/provenance` → 7d + +The coarsen step has three states of interest: + +**State A — `main` (pre-Plan-7).** The incremental writer exists in skeletal form.
`CoarsenedEntry` has three variants: `Verbatim`, `Rewrite`, `InlineSplice`. There is no soft-drop infrastructure; non-editable content isn't recognized as such; the writer's dispatch is essentially "try Verbatim, otherwise Rewrite the whole block." No `Transparent` (synthesized wrappers don't get descended-through), no `Omit` (no soft-drop), no editability predicate, no source-info-aware dispatch. + +**State B — `feature/provenance` HEAD (Plan 7 shipped + recent fixes).** `CoarsenedEntry` has five variants: `Verbatim`, `Rewrite`, `InlineSplice`, `Transparent`, `Omit`. The soft-drop cascade exists with six cases (per `incremental-writer-contract.md` §"Soft-drop semantics"). `is_editable_inside` predicate exists. Source-info-aware dispatch exists within each alignment arm. The cascade has known structural weaknesses (`Rewrite`-as-subtree-serializer is the catch-all; per-arm predicate duplication; the lipsum-paragraph and inline-UseAfter-with-atomic-source_info gaps Plan 7c and our recent commit `e584428d` patched). + +**State C — after Plan 7d.** `CoarsenedEntry` has four variants: `Verbatim`, `Omit`, `Recurse`, `Leaf`. The dispatch is a single table over `(alignment_kind, source_info_shape)`. `Rewrite` is gone (its work decomposes into `Recurse` over containers + `Leaf` over actual leaves). `Transparent` is gone (subsumed by `Recurse` with empty shells). `InlineSplice` is gone (subsumed by `Recurse` where shells come from the original block's prefix/suffix). The byte-provenance invariant is provable by induction on the dispatch table. + +The net effect from State A to State C is significant but not radical: the *concept* of an incremental writer with a coarsened intermediate language is preserved from `main`; what changes is the variant set, the dispatch shape, and the soundness property. The reconciler is unchanged across all three states; only Layer 3 of the system moves. 
+ +## The proposed algebra — technical description + +### Setup + +The writer is a function + +``` +write : (Source, AST_old, AST_new, Plan) → (Source', Warnings) +``` + +where `Source` is the user's qmd bytes; `AST_old` is the AST produced by parsing `Source` through some pipeline tier; `AST_new` is the same AST after a structural edit applied by some upstream layer (React framework, programmatic edit, etc.); `Plan` is the reconciler's diff between `AST_old` and `AST_new`; `Source'` is the qmd bytes the writer produces, intended to round-trip through the pipeline back to `AST_new` (modulo non-meaningful whitespace). + +The job is non-trivial because the pipeline `Source → AST` is non-injective. Some bytes of `Source` produce multiple AST nodes (shortcode resolution: a token of seventeen bytes produces three paragraphs). Some AST nodes have no `Source` bytes (sectionize wrapper; title-block synthesis). Writing back a mutated AST means deciding, for every byte in `Source'`, what source-side identity it has. + +### Provenance + +Every AST node carries a `SourceInfo` value with one of four physical shapes: + +- `Original{file, start, end}` — bytes come from `file[start..end]`. +- `Substring{parent, start, end}` — bytes are a contiguous restriction of `parent`'s bytes. +- `Concat[pieces]` — bytes are the concatenation of `pieces`, each itself a `SourceInfo`. +- `Generated{by, from}` — bytes were synthesized by an operation tagged `by`; `from` is a list of `Anchor` values that record diagnostically-useful source positions without claiming byte-equivalence. + +The derived operation `preimage_in(node, target_file_id) → Option<Range<usize>>` answers: *given a node, what contiguous byte range in `target_file_id` corresponds to its source-side identity?* The walk rules: + +- `Original{f, s, e}` returns `Some(s..e)` if `f == target`, else `None`. +- `Substring{parent, s, e}` walks `parent`, restricts the returned range.
+- `Concat[pieces]` returns `Some(union)` iff every piece resolves contiguously in `target`; otherwise `None`. +- `Generated{by, from}` walks `from` looking for an `Anchor` whose `role` is `Invocation`. Other roles (`ValueSource`, `Dispatch`, `Other`) are diagnostic-only and *do not* contribute to byte-traceability. This is the role-asymmetry contract (see `incremental-writer-contract.md` §"The role-asymmetry contract on `Generated.from`"). + +`preimage_in` is total (every node returns either `Some` or `None`) and side-effect-free. + +### The user-authorable predicate + +Define `editable_inside(node, target) : Bool` as `true` iff all three hold: + +1. `node` is not an atomic `Custom` block or inline (its type name is not in `ATOMIC_CUSTOM_NODES`). +2. `node.source_info` is not `Generated{by, _}` with `by.is_atomic_kind()`. +3. `preimage_in(node, target).is_some()`. + +A node where `editable_inside(node, target) = true` represents content the user can directly edit at the position `preimage_in` identifies. The negations matter: + +- (1) Atomic `Custom` nodes (`IncludeExpansion`, `CrossrefResolvedRef`) are replaceable wholesale via UI but not editable byte-by-byte. +- (2) Atomic-kind `Generated` (shortcode, filter, title-block, tree-sitter-postprocess) are pipeline outputs whose user-side knob is the invocation token, not the resolved content. +- (3) No preimage means there are no bytes in the target file to map back to. + +### The byte-provenance invariant (BP) + +> **(BP)** For every byte `b` in `Source'`, exactly one of: +> +> **(P1)** `b = Source[i]` for some position `i`, where `i ∈ preimage_in(n, target)` for some AST node `n` in `AST_old`. *Read:* `b` was lifted verbatim from the user's source file, at a position the writer identified as the source-side knob for some AST node. 
+> +> **(P2)** `b` was produced by `serialize_leaf(n)` for some AST node `n`, where `n` has no children to recurse into and `serialize_leaf` emits only bytes derived from `n`'s own immediate content. *Read:* `b` was generated by re-serializing a single AST node that has no descendants whose serialization could also contribute bytes. + +Two notes on (P1): + +- (P1) does not condition on `editable_inside(n, target)`. It says only that the bytes came from some node's source-side identity. Whether the user is *currently* allowed to edit that position is a separate matter; it determines whether a warning rides along with the emission, but it does not affect whether the emission satisfies BP. Atomic-Generated soft-drop emits Verbatim of the token bytes — the bytes satisfy (P1) (they're at the Invocation anchor's range in target), and a Q-3-43 warning is emitted alongside. + +- `preimage_in` only returns Some when the source bytes are recoverable as a contiguous range in target. For Generated nodes, that range is the *Invocation* token, not the resolved content. So Verbatim of a Generated node's preimage emits the token bytes — exactly what the writer wants when the user attempts to edit resolved content. + +Two notes on (P2): + +- The "no descendants to recurse into" clause is what makes the algebra's recursion structural. Today's writer has paths that call `write_block_to_string` on a non-leaf, and that function walks the entire subtree, emitting bytes from every node it traverses. Under BP, that path is forbidden. The only way bytes can be generated (rather than copied) is through `serialize_leaf` on a node with no children. + +- The recursion that produces a container's bytes happens *outside* the serialization, in the algebra's dispatch step, which independently classifies each child via the rules below. 
The container's *shell* bytes (the `:::` of a Div, the `> ` of a BlockQuote, the `- ` of a list item) are also user-authorable bytes — the user could have typed them — but they are emitted by the recursion's compositional step, not by leaf serialization. We will revisit this distinction in the "Where user edits land" subsection below. + +What BP rules out: every byte in `Source'` must come from either the user's existing source (P1) or a single-node leaf serialization (P2). There is no way for the writer to emit bytes derived from walking a subtree that includes atomic-Generated descendants the user can't author. The unsoundness today's writer has — that the catch-all `Rewrite` path can serialize a subtree whose descendants haven't been individually classified — is structurally absent under BP. + +What BP does not promise: position correctness (whether the bytes land at the right place in `Source'`; that's `assemble`'s job), warning fidelity (whether the right diagnostics are emitted; that's the diagnostic layer's job, on top of BP), and producer-side hygiene (whether AST leaves with no provenance are actually user-authored; that's the producer contract, on which BP relies as a narrow trust point — see "Open design judgment" below). + +### The coarsened-tree algebra + +The algebra reorganizes the writer's output language. The proposed `CoarsenedEntry` shape: + +```rust +enum CoarsenedEntry { + Verbatim { + byte_range: Range<usize>, + orig_idx: Option<usize>, // separator hint, never byte-production context + }, + Omit { + warning: Option<Warning>, + }, + Recurse { + shell_open: Bytes, + children: Vec<CoarsenedEntry>, + shell_close: Bytes, + separator: SeparatorRule, + }, + Leaf { + block_text: Bytes, + }, +} +``` + +This unifies today's five variants (`Verbatim`, `Omit`, `Transparent`, `InlineSplice`, `Rewrite`) into four. The renames and consolidations: + +- Today's `Transparent { child_entries }` becomes the special case `Recurse { shell_open: "", shell_close: "", children, separator }`.
The wrapper contributes no bytes; only the children's compositions do. Sectionize Div, footnotes container, appendix container all use this shape today and continue to under the algebra. + +- Today's `InlineSplice { block_text, orig_idx }` becomes a more general case: `Recurse { shell_open, children, shell_close, separator }` where `shell_open` and `shell_close` are the *original-qmd prefix and suffix bytes* of the block being spliced, and `children` are the inline children. The pre-computed `block_text` field is derived: `block_text = shell_open + assemble(children) + shell_close`. Today's `InlineSplice` is essentially "the container's wrapper is the same as the original; only the inside changed" — exactly what `Recurse` models when `shell_open` and `shell_close` come from `Source`. + +- Today's `Rewrite { block_text }` does *not* survive in its current form. Its substance — "serialize this subtree as text" — is decomposed. For container blocks, that work becomes a `Recurse` whose `shell_open` and `shell_close` come from the qmd writer's container-shell helpers and whose `children` come from the algebra's recursion. For leaf blocks, that work becomes a `Leaf` whose `block_text` is `serialize_leaf(node)` — `serialize_leaf` being `write_block_to_string` restricted to nodes with no recursable descendants. + +`assemble : CoarsenedEntry → Bytes` is the fold: `Verbatim` returns `Source[byte_range]`; `Omit` returns `""`; `Recurse` returns `shell_open ++ join(separator, [assemble(c) for c in children]) ++ shell_close`; `Leaf` returns `block_text`. + +### The dispatch table + +`coarsen : (Node, target, align_ctx) → CoarsenedEntry`. Total recursive function over the AST, dispatched on the pair `(align_ctx.alignment_kind, node.source_info_shape)`. 
The table: + +| Alignment | Source-info / structure | Rule | Operation | +|---|---|---|---| +| `KeepBefore(i)` | preimage in target | R1 | `Verbatim(preimage)` | +| `KeepBefore(i)` | atomic-kind Generated, no preimage | R2 | `Omit` (no warning; content regenerates from baseline) | +| `KeepBefore(i)` | non-atomic, no preimage, container with source-bearing children | R3 (Transparent-form) | `Recurse{ "", children-coarsened, "", separator }` | +| `KeepBefore(i)` | non-atomic, no preimage, no recursable children | R5 | `Leaf{ serialize_leaf(node) }` (rare; cross-file-rooted leaf, etc.) | +| `UseAfter(j)` | atomic-kind Generated with preimage | R1' (soft-drop) | `Verbatim(preimage)` + Q-3-43 | +| `UseAfter(j)` | atomic-kind Generated, no preimage | R2' (soft-drop) | `Omit` + Q-3-43 | +| `UseAfter(j)` | atomic `Custom` | R5-special (let-user-win) | `Leaf{ serialize_leaf(node) }` via `plain_data`; no warning | +| `UseAfter(j)` | non-atomic, no preimage, container | R3 | `Recurse{ shell_open, children, shell_close, separator }` where shells come from the qmd writer's per-container syntax helpers | +| `UseAfter(j)` | non-atomic, no preimage, leaf | R5 | `Leaf{ serialize_leaf(node) }` | +| `UseAfter(j)` | non-atomic with preimage | R1 | `Verbatim(preimage)` (paste-from-elsewhere case; trust the source_info producer marked) | +| `RecurseIntoContainer{ before, after }` | non-editable inside | R1' / R2' (soft-drop) | `Verbatim(preimage)` + Q-3-43 (if preimage exists), or `Omit` + Q-3-43 (no preimage). Recursion stops here. 
| +| `RecurseIntoContainer{ before, after }` | editable inside, block container | R3 | `Recurse{ shell_open, children-coarsened-per-`block_container_plans`, shell_close, separator }` | +| `RecurseIntoContainer{ before, after }` | editable inside, inline container | R4 | `Recurse{ shell_open-from-original-prefix, inlines-coarsened-per-`inline_plans`, shell_close-from-original-suffix, separator }` — the inline-splice case generalized | + +The dispatch is total: every `(align, source_info)` pair matches exactly one row. R3 and R4 are structurally the same operation (recurse with shells); they're listed separately because R3 dispatches on `block_container_plans` while R4 dispatches on `inline_plans`, and the shell sources differ (R4 takes shells from the *original* block's source bytes for the inline-splice case; R3 takes shells from the new container's syntax helpers). + +The rule R1' / R2' on `RecurseIntoContainer` for non-editable nodes is the soft-drop substitution: even though the reconciler said "recurse into this container," the writer overrides the recursion because the container itself isn't editable. The recursion would emit user-side bytes the user can't actually author; the substitution emits the wrapper's preimage instead. This is what the existing soft-drop cascade does today; the algebra preserves the behavior under R1' / R2'. + +### Soundness sketch + +**Claim.** For any well-formed input `(Source, AST_old, AST_new, Plan)` and any node `n` in `AST_new` with alignment context `align`, `assemble(coarsen(n, target, align))` produces bytes that satisfy BP. + +**Proof.** By structural induction on `n`. + +- **R1 base case.** Emits `Source[range]` where `range = preimage_in(node, target)`. By definition, `Source[range]` consists of bytes of `Source` at positions inside `range`, and `range` is the preimage of an AST node. (P1) holds for every byte. The atomicity of the node doesn't enter; (P1) doesn't require editability. + +- **R2 base case.** Emits no bytes. Vacuously satisfies BP.
+ +- **R5 base case.** Emits `serialize_leaf(node)` where `node` is a leaf (no children to recurse into). `serialize_leaf` emits bytes derived from `node`'s own immediate content; by R5's precondition (no recursable descendants), the "no descendants whose serialization would contribute" clause of (P2) is vacuously true. (P2) holds. The trust point: we trust `node`'s immediate content represents user-authored bytes; if a producer creates such a leaf without it being user content, the trust is misplaced. This is the producer-side contract's job, narrowed by the algebra to leaves only. + +- **R3 / R4 inductive case.** Emits `shell_open ++ join(separator, [assemble(c) for c in children]) ++ shell_close`. By inductive hypothesis, each `assemble(c)` satisfies BP. Concatenation preserves BP per-byte (each byte still satisfies (P1) or (P2)). `shell_open` and `shell_close` are user-authorable syntax for the container kind — for R3 with new containers, they come from the qmd writer's syntax helpers (e.g., `:::{.foo}\n` for a Div, `> ` per line for a BlockQuote); for R4, they come from `Source[original_prefix_range]` and `Source[original_suffix_range]` of the original block. In the R3 case, the shell bytes satisfy (P2) (they're emitted by a structural leaf operation — the syntax helper — that has no descendants of its own to walk). In the R4 case, the shell bytes satisfy (P1) (they're copied from `Source` at the original block's range). Separator bytes are user-authorable whitespace; treated like shell bytes. + +QED, informally. + +### Properties enforced + +The algebra implies the following properties as theorems (some require Phase 0 design judgment to fully nail down; flagged below): + +1. **(BP) Byte-provenance soundness.** For every byte of `Source'`, (P1) or (P2) holds. Proven by the structural induction above. + +2. **Totality of dispatch.** Every `(node, align)` pair matches exactly one row in the table; the writer is a total function on well-formed inputs. + +3. 
**Compositionality.** `coarsen(container) = Recurse(shells, [coarsen(child)])`. The writer's behavior on a subtree is a function of its behavior on the components. This is what makes inductive reasoning over the AST possible. + +4. **Source-info-driven dispatch within alignment kind.** Given a fixed alignment kind, the rule that fires depends only on `node.source_info` and structural shape. No ambient context. No per-arm duplication of predicates. + +5. **Leaf-only serialization.** `serialize_leaf` is invoked only on nodes that the algebra classifies as R5 leaves. Subtree serialization is structurally absent — there is no path through the algebra that calls a serialization function on a non-leaf AST node without first recursing into its children via R3 / R4. + +6. **Termination.** `coarsen` recurses only on strictly smaller substructures (children). The AST is finite. Termination is by structural induction on AST size. + +7. **Diagnostic determinism.** The set of warnings produced is a function of the input ASTs alone — the warning a `(alignment, source_info)` cell emits is fixed by the table; no order-dependence, no cascade-arm-dependence. + +8. **Reconciler-independence of rule choice.** The reconciler's `Plan` informs *which* node is coarsened at each position and *what alignment context* applies, but the rule selection within a row is determined by source_info and structural shape alone. New reconciler outputs (a hypothetical fourth alignment kind, a new sub-plan type) would require a new row block in the table but would not require changes to existing rows. + +### What the refactor concretely changes (starting from `feature/provenance` HEAD) + +The architectural delta from the implementation starting point is concentrated in three places: + +**1. `write_block_to_string` decomposes into two functions.** Today, `write_block_to_string(block) → text` walks the block's entire subtree and emits text via the qmd writer's per-container arms. 
Under the algebra, it splits: + +- `serialize_block_shell_open(block) → Bytes` and `serialize_block_shell_close(block) → Bytes` emit the wrapper bytes of a container — the open and close syntax — without consulting the container's children. For Div: `:::{.foo}\n` and `:::\n`. For BlockQuote: per-line `> ` prefix (modelled as a SeparatorRule that prefixes each child's lines). For NoteDefinitionFencedBlock: the fenced-block syntax. For list shapes (`BulletList`, `OrderedList`): the per-item marker (a per-item shell within a list-Recurse). + +- `serialize_leaf(node) → Bytes` is `write_block_to_string` restricted to leaves. It is structurally guaranteed not to recurse: the function panics (or is enforced via the type system if we want to be strict) if invoked on a non-leaf node. + +The qmd writer's per-container arms are refactored to expose this decomposition. The unified-pass version (`write_block_to_string` as it exists today) becomes a derived convenience function that the rest of the codebase can still call for native rendering — it just isn't used by the incremental writer's coarsen step anymore. + +**2. `CoarsenedEntry::Rewrite` ceases to be a subtree operation.** The variant `Leaf { block_text }` replaces it for genuine leaves. The cases today's `Rewrite` covered: + +- `coarsen_keep_before_block` catch-all (cross-file Original, gappy Concat, no-preimage Generated without source-bearing children) → becomes either R3 (for containers — the catch-all recurses into children) or R5 (for leaves — emit the leaf's serialization). +- `coarsen_blocks::UseAfter` let-user-win on non-atomic, no-preimage → R3 for containers, R5 for leaves. +- `coarsen_blocks::RecurseIntoContainer` with inline_plan but not splice-safe → R4 with shells from the *new* block's syntax (since the splice-safety failure means we can't preserve the original's shell verbatim across the rewrite). 
+- `coarsen_blocks::RecurseIntoContainer` no inline_plan, block-children case → R3 with shells from the new container's syntax. + +**3. `coarsen_blocks` dispatch becomes source-info-aware uniformly.** Today's `coarsen_blocks` matches on `BlockAlignment` first, then dispatches on source_info within each arm. The same predicates (atomic-Generated check, preimage check, editability check) appear in two or three arms with slightly different surrounding logic in each. Under the algebra, the dispatch flips: for each block we intend to emit, compute `(alignment_kind, source_info_shape)`, look up the row, apply the rule. The per-arm duplication of the soft-drop cascade disappears. + +The inline cascade in `assemble_inline_content` undergoes the analogous restructuring. Today's two-phase shape (Phase 1: substitute safe alignments; Phase 2: emit with multi-inline dedupe) becomes a single recursive coarsen over the inline-level R1–R5 table. The multi-inline dedupe optimization (`compute_separator` checking that consecutive `KeepBefore` entries share an `Invocation` anchor and emitting their shared token once) is preserved as a separator rule. + +`compute_separator` itself becomes a method on the new `SeparatorRule` value carried by `Recurse`. The "consecutive-in-original" optimization (today's `orig_idx: Option<usize>` on `Verbatim` and `InlineSplice`) survives as a separator-rule variant; the indices remain `Option`-typed because children inside a `Recurse` don't have top-level positional identity. + +### Where user edits land + +A practical clarification, because the discussion that led to this plan kept conflating "where bytes get serialized" with "where user edits land in the output." They are different questions. + +The algebra has three base cases that produce bytes: + +- **R1 (Verbatim).** Emits bytes from `Source`. These bytes were authored by the user *at the position they came from*.
R1 fires for unchanged content (KeepBefore on a node with preimage) AND for atomic-content soft-drop (UseAfter or RecurseIntoContainer on a non-editable node with preimage — substitute the preimage as the safe alternative). Same emission operation, different alignment contexts. + +- **R3 / R4 shell bytes.** Emitted by `Recurse`'s shell-emission step. These bytes are the *syntax* of a container — the `:::` of a Div, the `> ` of a BlockQuote, the `- ` of a list item, the `:::{.callout-note}` of a callout. The bytes are user-authorable because the user could have typed them directly in qmd. R3's shells come from the qmd writer's syntax helpers when the container is newly constructed; R4's shells come from `Source` when the container is being inline-spliced (preserving the original block's wrapping bytes). + +- **R5 (Leaf serialization).** Emits bytes from `serialize_leaf(node)` — the leaf node's own content rendered as text. `Str("hello")` becomes `hello`. `Code` block emits its code-fence syntax plus content (treated as a leaf because its content is bytes, not children needing recursion). Atomic `Custom` (via let-user-win) emits its qmd syntax derived from `plain_data`. + +User edits land at all three: + +| Kind of edit | Rule(s) that produce bytes for the edit | Example | +|---|---|---| +| User reorders / wraps / moves existing content | R1 (copies preserved bytes from `Source` at original positions) + R3/R4 shells (for new containers wrapping the moved content) | Wrap three paragraphs in a blockquote: R1 copies the paragraph bytes, R3 emits `> ` prefixes. | +| User constructs a new structural parent | R3 / R4 (shells of the new container) + recursion through children | Add a new list item: R3 emits the list's per-item iteration, the new item's R3 emits `- `, the item's children fire their own rules. | +| User types new leaf content | R5 (serialize the new leaf) | Type a word in a Para: R5 emits the new `Str`'s text. 
| +| User replaces atomic Custom via component picker | R5-special (let-user-win on atomic Custom) | Pick a different include source: R5 emits the new `{{< include … >}}` syntax derived from `plain_data`. | +| User attempts to edit atomic-Generated content | R1' soft-drop or R2' soft-drop (emit preimage + warning, OR omit + warning) | Type into a lipsum-resolved paragraph: R1' emits the `{{< lipsum 3 >}}` token, Q-3-43 warns. | + +A single user edit typically produces bytes from *multiple* rules in combination. The algebra's recursion walks down the new AST shape, choosing the right rule at each level based on `(alignment, source_info)`. The structural property is: every byte produced satisfies BP, regardless of which rule produced it. + +The reason R5 *appears* central in the soundness story: R5 is the only rule that emits bytes by *serializing AST content*, so it's where the algebra's "trust point" sits (the residual assumption that a non-atomic leaf with no preimage represents user-authored content). R3/R4's shell bytes are emitted by syntax helpers that don't carry trust (they emit fixed syntax based on the container kind). R1's bytes are copied from `Source` (trust derives from `Source` being the user's file). So when proving BP, R5 is the place to worry. When asking "where does user-typed content land in `Source'`," the answer is "R3 + R4 + R5 in combination, distributed across the recursion." + +### Open design judgment + +Each item below is a decision Phase 0 resolves before Phases 1-6 proceed. They are not implementation-prescriptive; they're design questions whose answer constrains the refactor's shape. + +**1. R5's trust point.** Today's writer trusts `Original`-source_info content's preimage. The algebra inherits that trust. It also adds a trust point at R5: nodes that reach R5 are assumed user-authored even if their source_info doesn't strictly say so (e.g., `SourceInfo::default()` on a freshly-React-typed leaf). 
The producer contract (`provenance-contract.md`) is the safeguard. Two possible tightenings: + + (a) **Permissive** (what the table above proposes): any leaf with `source_info` that isn't atomic-Generated reaches R5. Trust the producer to mark synthesized leaves correctly. + + (b) **Strict**: R5 fires only on leaves whose source_info is `Original`-rooted-in-target OR `Generated{by: user_edit, _}` OR equivalent explicit user-content markers. Anything else (including `SourceInfo::default()`) becomes R2' (Omit + warning) or R5'-with-warning. Tightens the trust surface but requires the React framework to attach explicit user-content source_info on edits. + + Phase 0 picks (a) or (b). The tradeoff is producer hygiene burden vs. residual writer trust. + +**2. Custom node treatment.** Today, `CoarsenedEntry::Rewrite` on a non-atomic CustomNode (like Callout) serializes the whole node via the qmd writer's CustomNode arm, which reads `plain_data` and walks the slot contents. Under the algebra, this needs to decompose: + + (a) The CustomNode's "shell" is its `plain_data`-derived open and close syntax (e.g., `:::{.callout-note}\n` and `:::\n`). + + (b) The slot contents are coarsened independently via the `custom_node_plans` side-table. + + This requires the qmd writer's CustomNode arm to expose `serialize_custom_shell_open(plain_data)` and `serialize_custom_shell_close(plain_data)`. Mechanically straightforward; needs a per-Custom-type sweep. + +**3. List shapes.** `BulletList` / `OrderedList` carry `Vec<Vec<Block>>` — a list of items, each item itself a `Vec<Block>`. The per-item marker (`- `, `1. `, etc.) is per-item-syntactic, not per-list. Two modelling choices: + + (a) `Recurse` carries a `SeparatorRule::ListItem { marker_fn }` that emits the marker before each child's bytes. The list itself is `Recurse { shell_open: "", children, shell_close: "", separator: ListItem }`. Each item is itself a `Recurse` over its blocks.
+ + (b) List items become a separate `CoarsenedEntry` variant (`ListItem { marker, content }`) the recursion handles specially. + + (a) is more uniform; (b) is more explicit. Phase 0 picks. + +**4. Separator state threading.** Today's `compute_separator` looks at adjacent `Verbatim`/`InlineSplice` entries' `orig_idx` values to decide blank-line vs. newline-only spacing. Under the algebra, separators are per-`Recurse`-level. The right model is probably: `SeparatorRule` carries enough information to reproduce today's decisions *given the local children*, without consulting global state. The exact rule needs to be written out in Phase 0's design pass. + +**5. Inline-cascade alignment with Plan 7c.** Plan 7c Phase 7's `displaced_before_idx` enrichment of `InlineAlignment::UseAfter` becomes *defense-in-depth* under the algebra rather than load-bearing: the algebra dispatches on the *new* inline's source_info, and a UseAfter on atomic-Generated-with-preimage fires R1' regardless of whether the displaced original is tracked. But if React strips source_info during edits (replaces an atomic inline with a fresh inline carrying `SourceInfo::default()`), R5 fires on the fresh leaf and the atomic content is overwritten. Phase 7's reconciler-side tracking is the second line of defense against that producer-side failure mode. Phase 0 decides whether to retain Phase 7 (defense-in-depth) or drop it (trust the producer contract). + +**6. Cost.** Today's `write_block_to_string` is a single function call that walks a subtree once. The algebra's R3/R4 recursion through every container layer is potentially O(layers × per-layer-work) more invocations. In practice R1 (Verbatim) short-circuits most unchanged subtrees, so the actual cost likely matches today's. Worth measuring before committing; Phase 4's property tests provide a natural benchmarking harness. 
+ +--- + +## Phases + +### Phase 0 — Validate the algebra + +The reader (user) confirms: + +- [ ] The byte-provenance invariant (P1 + P2) is the right invariant for the writer. +- [ ] The dispatch table covers all `(alignment, source_info)` pairs the system today produces. +- [ ] The decomposition of `Rewrite` into `Recurse` + `Leaf` matches the desired structure. +- [ ] R5's trust point is acceptable as stated (or is tightened per "Open design judgment" #1). +- [ ] Custom-node treatment, list-shape treatment, and separator-state threading land at the decisions made in "Open design judgment" #2–#4. +- [ ] The relationship to Plan 7c Phases 7 / 7b (defense-in-depth vs drop) is settled per "Open design judgment" #5. +- [ ] The scope of changes (refactor `write_block_to_string`; restructure `coarsen_blocks` and `assemble_inline_content`; retire `Rewrite`) is acceptable. + +Open questions raised here. No code changes happen until Phase 0 closes. + +### Phase 1 — Decompose `write_block_to_string` + +- [ ] Identify every per-container arm in `crates/pampa/src/writers/qmd.rs` that produces output for a container block (Div, BlockQuote, Figure, NoteDefinitionFencedBlock, OrderedList, BulletList, DefinitionList, Table, Custom block). +- [ ] For each, extract `serialize_block_shell_open(block) → Bytes` and `serialize_block_shell_close(block) → Bytes`. Move the children-emitting code out of the arm; the arm emits only the wrapper. +- [ ] Do the same for inline containers (Emph, Strong, Link, Image, Span, Cite, Note, …): `serialize_inline_shell_open(inline) → Bytes` and `serialize_inline_shell_close(inline) → Bytes`. +- [ ] Define `serialize_leaf(node) → Bytes` as `write_block_to_string` restricted to leaves. Type-enforce or runtime-assert that the function panics on non-leaf input. +- [ ] Preserve `write_block_to_string` as a public convenience function that the rest of the codebase (native rendering, snapshot tests, etc.) can call. 
Its implementation becomes `shell_open + assemble(children-coarsened) + shell_close` — but the incremental writer no longer calls it. +- [ ] Tests: each shell-helper has a unit test that asserts its output for a known node. + +### Phase 2 — Restructure `coarsen_blocks` dispatch + +- [ ] Define the new `CoarsenedEntry` shape with `Verbatim`, `Omit`, `Recurse`, `Leaf` variants. Delete `Transparent`, `InlineSplice`, `Rewrite` (their roles are absorbed). +- [ ] Implement the dispatch table from "The proposed algebra" as a single `dispatch(node, align, target) → Rule` function. Each rule has a small implementation: R1 packages `Verbatim`; R2 packages `Omit`; R3 / R4 package `Recurse` with shells from Phase 1's helpers and children recursed via `coarsen`; R5 packages `Leaf { serialize_leaf(node) }`. +- [ ] `coarsen_blocks` becomes a thin wrapper that iterates the `block_alignments`, calls `dispatch` for each, threads separator context. +- [ ] Delete `coarsen_keep_before_block` (its logic is absorbed into the dispatch table). +- [ ] Verify against today's regression tests: every existing test in `crates/pampa/tests/incremental_writer_tests.rs` must still pass byte-for-byte. The refactor doesn't change observable behavior on the inputs the tests cover. + +### Phase 3 — Restructure `assemble_inline_content` + +- [ ] Define inline `Rule` dispatch analogous to block-level. R1-inline, R2-inline, R3-inline (inline `Recurse` for nested inline containers), R5-inline (leaf inline). +- [ ] `assemble_inline_content` becomes a recursive coarsen over the inline cascade. Today's two-phase shape in `assemble_inline_content` (soft-drop substitution, then emit-with-dedupe) collapses to a single pass. +- [ ] Multi-inline dedupe (today's `compute_separator` shared-`Invocation`-anchor optimization) becomes a `SeparatorRule::InlineDedupe` carried by the parent `Recurse`. +- [ ] Verify: every existing inline-cascade test passes byte-for-byte.
+ +### Phase 4 — Property tests for BP + +The algebra is sound by construction, but a property test pins the invariant against bugs in the implementation. + +- [ ] Write a proptest generator `gen_pandoc_with_atomic_descendants` that produces ASTs with atomic-Generated descendants at varying depths inside non-atomic containers, plus arbitrary user edits applied. +- [ ] Write the property `bp_holds`: given a generated `(AST_old, AST_new, Source)` and a reconciler plan, run the writer. Assert: the output `Source'` does not contain any of the resolved bytes of atomic-Generated descendants. (Implementation: tag the generator's atomic-resolved content with a recognizable marker string; assert the marker doesn't appear in `Source'`.) +- [ ] Add property tests for individual rule soundness: R1 emits bytes from `Source`; R5 emits bytes derived only from the leaf's own content; R3 / R4 emit bytes that are concatenations of shell + children. +- [ ] Run under `cargo nextest run -p pampa` with high iteration counts. Save regression seeds if any fail. + +### Phase 5 — Retire denylist branches obviated by the algebra + +- [ ] Audit Plan 7c's open phases. For each phase that becomes defense-in-depth under the algebra (Phase 7's `displaced_before_idx`, Phase 7b's inline atomic-Generated check), decide per Phase 0's "Open design judgment" #5 whether to retain or drop. +- [ ] Remove obsolete branches from the codebase. Update tests to match. + +### Phase 6 — Update design docs + +- [ ] `claude-notes/designs/incremental-writer-contract.md`: the six-case soft-drop enumeration in §"Soft-drop semantics" is replaced by a pointer to this plan's dispatch table. The §"Non-soft-drop branches in the same cascade" sub-section is rewritten to reflect the algebra's uniform handling rather than the present-day cascade asymmetry. The §"`CoarsenedEntry` self-containment" sub-section is updated to reflect the new variant set (`Verbatim`, `Omit`, `Recurse`, `Leaf`). 
+- [ ] Add a §"Algebraic soundness" section to the contract doc that states BP, the dispatch table, and the soundness sketch. +- [ ] Cross-link from `provenance-contract.md` §7 (atomic-kind set and consumer impact). +- [ ] Add a "Follow-ups closed" entry to Plan 7 pointing here, retiring the algebraic-soundness item from its open tail. + +## What 7d does not change + +Explicit non-changes, for clarity: + +- **The reconciler's algorithm.** `compute_reconciliation` and its helpers stay as they are. Three-phase pass; same hash-match / positional / fallback logic. +- **`BlockAlignment` / `InlineAlignment` / `ListItemAlignment` types.** Same variants. No payload changes. +- **`apply_reconciliation` (AST-level reconciliation).** Independent of the writer; not touched. +- **`ReconciliationPlan` shape.** All sub-plan tables (`block_container_plans`, `inline_plans`, etc.) stay. +- **The wire format.** The plan is computed inside WASM and never crosses the boundary as JSON; nothing in `ts-packages/quarto-sync-client/src/types.ts` changes. +- **The diagnostic catalog.** Q-3-41, Q-3-42, Q-3-43 stay. The algebra reorganizes which dispatch row emits which code; the codes themselves don't change. +- **The producer-side contract (`provenance-contract.md`).** The role-asymmetry rule, the `By::` catalog, the atomic-kind set — all stay. The algebra inherits these as preconditions on its input. + +## Relationship to siblings + +- **Plan 7** (shipped): provides the existing writer the algebra refactors. 7d's implementation phases (1-6) start from `feature/provenance` HEAD. + +- **Plan 7a** (open): runtime user-filter idempotence detection. Orthogonal — concerns the validity of *inputs* to the writer (whether filters break round-trip), not the writer itself. 7a's work is independent of 7d. + +- **Plan 7b** (open): test-coverage consolidation. The property tests in 7d Phase 4 *complement* Plan 7b's per-shape regression tests. 
7d's properties are coarser (input-distribution-driven; assert structural properties of output); 7b's are finer (specific shapes, specific assertions). Both are useful; neither obviates the other. + +- **Plan 7c** (open): closure gaps in the denylist cascade. + - Phases 1-6 of 7c remain useful regardless of 7d. They address concrete present-day bugs (Q-3-41 catalog gap, TS-side gate, per-kind soft-drop coverage, `Q343Reason` typing, `target_file_id` descent). + - Phases 7 and 7b of 7c become defense-in-depth under 7d (the algebra catches the cases they protect against, *provided* the producer contract is satisfied). Phase 0's design judgment #5 decides whether to retain them. + - 7c can ship before 7d, after 7d, or in parallel. The two plans are independent in scope; only the defense-in-depth question links them. + +## Risks + +- **Refactor scope.** The decomposition of `write_block_to_string` touches every per-container arm in the qmd writer. Each arm is small but there are many of them (Div, BlockQuote, OrderedList, BulletList, DefinitionList, Figure, NoteDefinitionFencedBlock, Table, Custom block, Header, Paragraph, Plain, …). Estimating 500-1000 LOC of mechanical refactor work, plus 200-400 LOC of dispatch-table consolidation. + +- **Behavioral compatibility.** Every existing test must pass byte-for-byte after the refactor. The algebra is designed to preserve behavior on today's inputs; any deviation is a refactor bug. Phase 4's property tests guard against regressions on inputs today's tests don't cover. + +- **Cost.** Recursive emission may be slower than today's single-pass `write_block_to_string`. Phase 4 includes benchmarking; if cost regresses significantly, Phase 0 may need to revisit (e.g., add memoization, or keep `write_block_to_string` as an optimized path for trees the algebra has already verified safe). + +- **Producer-contract drift.** The algebra leans on producer hygiene at R5's trust point. 
If a producer (a new transform, a Lua filter, a future synthesizer) introduces a leaf with non-default source_info that doesn't fit the algebra's classifications, R5 may emit bytes the algebra trusts but shouldn't. The mitigation is the producer contract's pre-existing rule ("new kinds default to non-atomic; promote deliberately") combined with the property tests catching obvious violations. + +- **The CustomNode decomposition.** Phase 1's split of CustomNode arms into shell helpers may surface CustomNode types that resist the decomposition (e.g., where the open syntax depends on the slot content, or where serialization isn't naturally separable into shell + children). These are spot-fixable but may add work. + +## References + +- This plan's algebraic content was developed across the 2026-05-25 / 2026-05-26 sessions on `feature/provenance` after the lipsum-paragraph regression (commit `e584428d`) prompted reconsideration of the writer's structural soundness. +- Today's writer: `crates/pampa/src/writers/incremental.rs` (~2700 LOC; `coarsen_blocks`, `coarsen_keep_before_block`, `assemble`, `assemble_inline_content`, `write_block_to_string`). +- Reconciler: `crates/quarto-ast-reconcile/src/compute.rs` (algorithm), `src/types.rs` (alignment types), `src/apply.rs` (AST-level apply, not used by the writer). +- Contract doc: [`claude-notes/designs/incremental-writer-contract.md`](../designs/incremental-writer-contract.md) — the byte-provenance contract this plan makes provable. +- Producer-side contract: [`claude-notes/designs/provenance-contract.md`](../designs/provenance-contract.md) — the rules producers must satisfy for the algebra's trust points to hold. +- Sibling primitive: [`claude-notes/designs/transparent-wrappers.md`](../designs/transparent-wrappers.md) — the traversal-side analogue (`first_in_user_tree`) of the writer's emission-side recursion. 
+- Plan 7 (shipped): [`claude-notes/plans/2026-05-04-q2-preview-plan-7-incremental-writer.md`](./2026-05-04-q2-preview-plan-7-incremental-writer.md) — the writer the algebra refactors. +- Plan 7c (open): [`claude-notes/plans/2026-05-25-q2-preview-plan-7c-closure-gaps.md`](./2026-05-25-q2-preview-plan-7c-closure-gaps.md) — the denylist-tightening sibling plan. diff --git a/claude-notes/research/2026-05-22-plan-6-audit.md b/claude-notes/research/2026-05-22-plan-6-audit.md new file mode 100644 index 000000000..422935d0d --- /dev/null +++ b/claude-notes/research/2026-05-22-plan-6-audit.md @@ -0,0 +1,184 @@ +# Plan 6 audit: `SourceInfo::default()` sites in transforms + +**Date:** 2026-05-22 +**Branch:** feature/provenance +**Plan:** `claude-notes/plans/2026-05-04-q2-preview-plan-6-provenance-audit.md` + +Comprehensive grep of `SourceInfo::default()` in `crates/quarto-core/src/transforms/` +and `crates/pampa/src/` (excluding test code). 682 total occurrences across +50+ files. This report categorizes the **production** (non-test, non-reader) +sites and decides Plan 6's disposition for each. + +## A. In Plan 6 scope — fix in this pass + +These are the sites the plan body enumerates. Each gets either +`Generated { by: By::<kind>(), from: smallvec![] }` (true synthesizers), +`Generated { by: By::shortcode(name), from: [Invocation] }` (shortcode +results via the stamper), or threaded source info (theorem/proof +name-attr, error/literal call sites). + +### Shortcode resolver +`crates/quarto-core/src/transforms/shortcode_resolve.rs` — 12 production +sites, all funnelled through `resolve_shortcode`'s dispatch: + +| Line | Site | Stamper covers?
| +|------|---------------------------------------------|-----------------| +| 172 | `config_value_to_inlines` Str | yes | +| 179 | `config_value_to_inlines` Str (bool) | yes | +| 186 | `config_value_to_inlines` Str (int) | yes | +| 203 | `config_value_to_inlines` Str (plain) | yes | +| 208 | `config_value_to_inlines` Str (empty) | yes | +| 215 | `config_value_to_inlines` Str (invalid) | yes | +| 222 | `config_value_to_inlines` Str (Path/Glob) | yes | +| 238 | `flatten_blocks_to_inlines` Space | yes | +| 470 | `lua_result_to_shortcode_result::Text` Str | yes | +| 1034 | `make_error_inline` inner Str | no — call-site threading | +| 1036 | `make_error_inline` outer Strong | no — call-site threading | +| 1109 | `shortcode_to_literal` Str | no — call-site threading | + +Lines 1468 / 1473 / 1576 / 1578 are inside test modules — out of scope. + +### True synthesizers +- `crates/quarto-core/src/transforms/title_block.rs:183, 185` — h1 + + Str. `By::title_block()`. +- `crates/pampa/src/transforms/sectionize.rs:96, 148` — Section Div on + the two close-section paths. `By::sectionize()`. +- `crates/quarto-core/src/transforms/footnotes.rs:495` — footnotes + container Div. `By::footnotes()`. (Per plan: synthesized `` + markers and footnote backlinks are *not* added here; they reuse the + Note's source_info or are inline overlaps Plan 7 covers.) +- `crates/quarto-core/src/transforms/appendix.rs:230, 265, 286, 335, 376` + — five `let source_info = SourceInfo::default()` synthesizer headers + for `wrap_bibliography`, `create_appendix_container`, + `create_license_section`, `create_copyright_section`, + `create_citation_section`. All get `By::appendix()` (the plan only + enumerates `create_appendix_container`; the four other Appendix + helpers are structurally identical — see decisions below). +- `crates/pampa/src/pandoc/treesitter_utils/postprocess.rs:1348` — + synthetic Space between citation and suffix. `By::tree_sitter_postprocess()`. 
+ +### Threaded source info (not Generated) +- `crates/quarto-core/src/transforms/theorem.rs:313` — name-attr title + Str. Thread `attr_source.attributes[idx].1` from + `extract_name_attr`'s caller, with positional-alignment guard. +- `crates/quarto-core/src/transforms/proof.rs:167` — parallel site in + `proof.rs`. Same fix. + +### Decisions on plan-adjacent sites +- **Appendix's four helper functions** (`wrap_bibliography`, + `create_license_section`, `create_copyright_section`, + `create_citation_section`) — the plan only enumerated the container + Div. Including the four helpers extends the scope by ~16 LOC of + trivial mechanical change and keeps the appendix-pipeline output + free of `SourceInfo::default()` for the audit-completion test. + Decision: include in Plan 6. + +## B. Out-of-scope synthesizers (follow-ups) + +These ARE true AST synthesizers that today emit `SourceInfo::default()` +and would benefit from a `Generated` shape, but the plan doesn't +enumerate them and they each require either a new `By::` constructor +or a design decision (atomicity classification). Open as follow-up +beads issues; do not block Plan 6 on them. + +- **`crates/quarto-core/src/transforms/callout_resolve.rs:267`** — + default callout title (e.g. "Note", "Tip" when the user didn't write + one). One synthesizer site; needs `By::callout()` + an + `is_atomic_kind` decision. Open beads. + +## C. Out of scope — website chrome / project-level + +These are transforms that generate website chrome (TOC, navigation, +sidebars, etc.) from metadata, not from the qmd body. They run *after* +the document profile checkpoint (per CLAUDE.md "Document profile +checkpoint" section) and consume the profile rather than processing +source-tracked content. Source attribution for these synthesizers is a +separate design problem (likely tied to the website-project epic). 
+Plan 6 explicitly defers them: + +- `categories_sidebar.rs`, `footer_generate.rs`, `footer_render.rs`, + `listing_generate.rs`, `listing_render.rs`, `navbar_generate.rs`, + `navbar_render.rs`, `navigation_active.rs`, `navigation_enrich.rs`, + `navigation_href.rs`, `page_nav_generate.rs`, `page_nav_render.rs`, + `sidebar_auto.rs`, `sidebar_generate.rs`, `sidebar_render.rs`, + `toc_generate.rs`, `toc_render.rs`, `website_canonical_url.rs`, + `website_favicon.rs`, `website_title_prefix.rs`. + +Most of these construct `ConfigValue` instances (with `source_info` +fields) rather than `Inline`/`Block` AST nodes; they're typed as +config rather than as user content. + +## D. Out of scope — non-synthesizer code + +- **`crates/pampa/src/readers/json.rs`** — JSON reader. Per the doc + comment at line 80, `SourceInfo::default()` is intentional here: + Pandoc JSON files have no source location data. Out of scope. +- **`crates/pampa/src/writers/{html,json}.rs`** — output writers; any + `SourceInfo::default()` here is for output-only intermediate AST + shaping. Out of scope. +- **`crates/pampa/src/lua/*`** — Lua infrastructure. Plan 4 already + introduced `filter_source_info` (the canonical auto-attach for + typed Lua filter constructions). The remaining + `SourceInfo::default()` in this directory is either deep + type-construction plumbing (`pandoc.X()` wrappers that + `filter_source_info` overrides on the way out) or bare-string + result fallbacks. Out of scope for Plan 6; the Dispatch follow-up + (bd-36fr9) will revisit. +- **`crates/pampa/src/{citeproc_filter,json_filter,filters}.rs`** — + filter execution paths. Constructions inside Lua filters are + already handled by `filter_source_info`; the bare + `SourceInfo::default()` here is for filter-internal scaffolding + (containers spliced around filter output). Out of scope. +- **`crates/pampa/src/template/*`** — Pandoc-compatible template + engine. Doctemplate output is not source-tracked through this + pipeline. 
Out of scope. +- **`crates/pampa/src/pandoc/{meta,shortcode}.rs`** — type-level + defaults / data-shape conversions, not pipeline-level synthesis. + Out of scope. +- **`crates/pampa/src/toc.rs`** — TOC generation. Same scope note as + the website-chrome transforms in §C. +- **`crates/quarto-core/src/transforms/{code_block_generate,code_block_render}.rs`** + — code-block decoration (filename labels, captions). Possibly + in-scope for a future audit pass; defer for now. Open beads. +- **`crates/quarto-core/src/transforms/config.rs`** — config-merge + bookkeeping. Constructs ConfigValues, not user-content AST nodes. + Out of scope. +- **`crates/quarto-core/src/transforms/link_rewrite.rs`** — link + rewriting (URL canonicalization). The 13 sites are mostly + test-helper code; the production sites construct intermediate + Link/Image nodes whose `source_info` is then overwritten with the + original node's `source_info` later in the rewrite. The default + acts as a placeholder. Audit shows no genuine synthesis; out of + scope. + +## E. Test-only sites + +Filter list: `dummy_source_info`, `#[cfg(test)]` modules, +`fn test_*`. These are intentional test scaffolding and out of scope. + +## Audit summary + +| Category | Count (production) | +|----------------------------------|--------------------| +| A. In Plan 6 scope (will fix) | 22 | +| B. Plan-adjacent synthesizers | 1 | +| C. Website chrome (deferred) | ~120 | +| D. Non-synthesizer code | ~80 | +| E. Test scaffolding | ~459 | +| **Total** | **~682** | + +The audit-completion test (Plan 6 test plan) asserts the §A sites +all become `Generated` (or threaded `Original`) shapes after the +pass. It does not assert that §B/C/D become Generated — those are +out of scope. + +## Follow-up beads to open + +- **Callout default-title synthesizer** (callout_resolve.rs:267). + Needs `By::callout()` constructor + atomicity decision. +- **Code-block decoration** (code_block_generate / code_block_render). 
+ Audit pass for codeblock chrome. + +These are opened as discovered-from links to whatever beads issue +tracks Plan 6's umbrella work (or left as standalone follow-ups +since Plan 6 is plan-driven, not beads-driven). diff --git a/crates/pampa/Cargo.toml b/crates/pampa/Cargo.toml index a8398dd6c..b19f22dad 100644 --- a/crates/pampa/Cargo.toml +++ b/crates/pampa/Cargo.toml @@ -58,6 +58,7 @@ regex = { version = "1.12.3", features = ["unicode"] } clap = { version = "4.5", features = ["derive"] } serde = { workspace = true, features = ["derive"] } serde_json = "1.0" +smallvec.workspace = true glob = "0.3" paste = "1.0.15" once_cell = "1.21.3" diff --git a/crates/pampa/src/lua/diagnostics.rs b/crates/pampa/src/lua/diagnostics.rs index 5a16262f2..a5bac7c30 100644 --- a/crates/pampa/src/lua/diagnostics.rs +++ b/crates/pampa/src/lua/diagnostics.rs @@ -10,7 +10,8 @@ use mlua::{Error, Lua, MultiValue, Result, Table, Value}; use quarto_error_reporting::DiagnosticMessage; -use quarto_source_map::{FileId, SourceInfo, SourcePiece}; +use quarto_source_map::{Anchor, AnchorRole, By, FileId, SourceInfo, SourcePiece}; +use smallvec::SmallVec; use std::sync::Arc; use super::types::{LuaBlock, LuaInline}; @@ -57,7 +58,12 @@ pub fn register_quarto_namespace(lua: &Lua) -> Result<()> { /// - Original: { t = "Original", file_id = N, start_offset = N, end_offset = N } /// - Substring: { t = "Substring", parent = {...}, start_offset = N, end_offset = N } /// - Concat: { t = "Concat", pieces = [{source_info = {...}, offset_in_concat = N, length = N}, ...] } -/// - FilterProvenance: { t = "FilterProvenance", filter_path = "...", line = N } +/// - Generated: { t = "Generated", by = { kind = "...", data = "..." (JSON-encoded) }, +/// from = [{role = "Invocation" | "ValueSource" | "Other:<name>", +/// source_info = {...}}, ...] } +/// +/// The reader also accepts the legacy `"FilterProvenance"` tag for back-compat, +/// mapping it onto `Generated { by: filter, from: [] }`.
fn source_info_to_lua_table(lua: &Lua, si: &SourceInfo) -> Result { let table = lua.create_table()?; match si { @@ -96,15 +102,50 @@ fn source_info_to_lua_table(lua: &Lua, si: &SourceInfo) -> Result
{ } table.set("pieces", pieces_table)?; } - SourceInfo::FilterProvenance { filter_path, line } => { - table.set("t", "FilterProvenance")?; - table.set("filter_path", filter_path.clone())?; - table.set("line", *line)?; + SourceInfo::Generated { by, from } => { + table.set("t", "Generated")?; + table.set("by", by_to_lua_table(lua, by)?)?; + let from_table = lua.create_table()?; + for (i, anchor) in from.iter().enumerate() { + let anchor_table = lua.create_table()?; + anchor_table.set("role", anchor_role_to_lua_string(&anchor.role))?; + anchor_table.set( + "source_info", + source_info_to_lua_table(lua, &anchor.source_info)?, + )?; + from_table.set(i + 1, anchor_table)?; + } + table.set("from", from_table)?; } } Ok(table) } +/// Serialize a [`By`] to a Lua table: `{ kind = "...", data = "" }`. +/// +/// `data` is JSON-encoded as a string because Lua tables don't carry the +/// `serde_json::Value` discriminator; readers decode it back via +/// [`serde_json::from_str`]. +fn by_to_lua_table(lua: &Lua, by: &By) -> Result
{ + let table = lua.create_table()?; + table.set("kind", by.kind.clone())?; + if !by.data.is_null() { + let encoded = serde_json::to_string(&by.data) + .map_err(|e| Error::runtime(format!("By.data serialize failed: {e}")))?; + table.set("data", encoded)?; + } + Ok(table) +} + +/// Serialize an [`AnchorRole`] to a Lua string. +fn anchor_role_to_lua_string(role: &AnchorRole) -> String { + match role { + AnchorRole::Invocation => "Invocation".to_string(), + AnchorRole::ValueSource => "ValueSource".to_string(), + AnchorRole::Other(name) => format!("Other:{name}"), + } +} + /// Deserialize a SourceInfo from a Lua table fn source_info_from_lua_table(table: &Table) -> Result { let t: String = table.get("t")?; @@ -136,14 +177,61 @@ fn source_info_from_lua_table(table: &Table) -> Result { } Ok(SourceInfo::Concat { pieces }) } - "FilterProvenance" => Ok(SourceInfo::FilterProvenance { - filter_path: table.get("filter_path")?, - line: table.get("line")?, + "Generated" => { + let by_table: Table = table.get("by")?; + let by = by_from_lua_table(&by_table)?; + let mut from: SmallVec<[Anchor; 2]> = SmallVec::new(); + // The `from` field is optional in serialization; absent means empty. + if let Ok(from_table) = table.get::
("from") { + for i in 1..=from_table.raw_len() { + let anchor_table: Table = from_table.get(i)?; + let role_str: String = anchor_table.get("role")?; + let role = anchor_role_from_lua_string(&role_str); + let si_table: Table = anchor_table.get("source_info")?; + from.push(Anchor { + role, + source_info: Arc::new(source_info_from_lua_table(&si_table)?), + }); + } + } + Ok(SourceInfo::Generated { by, from }) + } + // Legacy back-compat: read the old "FilterProvenance" tag as + // `Generated { by: filter(...), from: [] }`. Writers never emit + // this tag after Plan 4 Phase 4. + "FilterProvenance" => Ok(SourceInfo::Generated { + by: By::filter( + table.get::("filter_path")?, + table.get::("line")?, + ), + from: SmallVec::new(), }), _ => Err(Error::runtime(format!("Unknown SourceInfo type: {}", t))), } } +/// Deserialize a [`By`] from `{ kind = "...", data = "" }`. +fn by_from_lua_table(table: &Table) -> Result { + let kind: String = table.get("kind")?; + let data = match table.get::("data") { + Ok(encoded) => serde_json::from_str(&encoded) + .map_err(|e| Error::runtime(format!("By.data parse failed: {e}")))?, + Err(_) => serde_json::Value::Null, + }; + Ok(By { kind, data }) +} + +/// Inverse of [`anchor_role_to_lua_string`]. 
+fn anchor_role_from_lua_string(s: &str) -> AnchorRole { + if let Some(rest) = s.strip_prefix("Other:") { + AnchorRole::Other(rest.to_string()) + } else if s == "ValueSource" { + AnchorRole::ValueSource + } else { + AnchorRole::Invocation + } +} + // ============================================================================ // Helper Functions for Extracting SourceInfo from Elements // ============================================================================ @@ -170,7 +258,10 @@ fn extract_source_info_from_element(lua: &Lua, elem: &Value) -> Result SourceInfo { let (source, line) = get_caller_location(lua); let source_path = source.strip_prefix('@').unwrap_or(&source); - SourceInfo::filter_provenance(source_path, line.max(0) as usize) + SourceInfo::Generated { + by: By::filter(source_path, line.max(0) as usize), + from: SmallVec::new(), + } } /// Add a diagnostic to the quarto._diagnostics table @@ -441,16 +532,18 @@ mod tests { // Verify source location was captured assert!(diagnostics[0].location.is_some()); - if let Some(SourceInfo::FilterProvenance { filter_path, line }) = &diagnostics[0].location { + if let Some(SourceInfo::Generated { by, .. }) = &diagnostics[0].location + && let Some((filter_path, line)) = by.as_filter() + { // The path should contain the filter name (@ prefix is stripped) assert!( filter_path.contains("test_filter.lua"), "Expected path to contain 'test_filter.lua', got '{}'", filter_path ); - assert_eq!(*line, 1); + assert_eq!(line, 1); } else { - panic!("Expected FilterProvenance source info"); + panic!("Expected filter-kind Generated source info"); } } @@ -506,9 +599,10 @@ mod tests { assert_eq!(*start_offset, 100, "start_offset should be preserved"); assert_eq!(*end_offset, 110, "end_offset should be preserved"); } - Some(SourceInfo::FilterProvenance { filter_path, line }) => { + Some(SourceInfo::Generated { by, .. 
}) if by.is_kind("filter") => { + let (filter_path, line) = by.as_filter().unwrap(); panic!( - "Expected SourceInfo::Original, but got FilterProvenance({}, {}). \ + "Expected SourceInfo::Original, but got filter-Generated({}, {}). \ This is the bug we're fixing!", filter_path, line ); @@ -749,11 +843,45 @@ mod tests { let roundtrip = source_info_from_lua_table(&table).unwrap(); assert_eq!(concat, roundtrip); - // Test FilterProvenance - let filter_prov = SourceInfo::filter_provenance("/path/to/filter.lua", 42); + // Test filter-kind Generated round-trip + let filter_prov = SourceInfo::generated(By::filter("/path/to/filter.lua", 42)); let table = source_info_to_lua_table(&lua, &filter_prov).unwrap(); let roundtrip = source_info_from_lua_table(&table).unwrap(); assert_eq!(filter_prov, roundtrip); + + // Test shortcode Generated with an Invocation anchor + let mut shortcode = SourceInfo::generated(By::shortcode("meta")); + shortcode.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::Original { + file_id: FileId(3), + start_offset: 1, + end_offset: 9, + }), + ); + let table = source_info_to_lua_table(&lua, &shortcode).unwrap(); + let roundtrip = source_info_from_lua_table(&table).unwrap(); + assert_eq!(shortcode, roundtrip); + } + + #[test] + fn test_legacy_filter_provenance_tag_reads_as_filter_generated() { + // Plan 4 Phase 4: writers never emit "FilterProvenance" anymore, but + // the reader still accepts the legacy tag and maps it to a + // filter-kind Generated with empty anchor list. 
+ let lua = Lua::new(); + let table = lua.create_table().unwrap(); + table.set("t", "FilterProvenance").unwrap(); + table.set("filter_path", "legacy.lua").unwrap(); + table.set("line", 7usize).unwrap(); + let parsed = source_info_from_lua_table(&table).unwrap(); + match parsed { + SourceInfo::Generated { by, from } => { + assert_eq!(by.as_filter(), Some(("legacy.lua", 7))); + assert!(from.is_empty()); + } + other => panic!("Expected filter-kind Generated, got {:?}", other), + } } // ========================================================================= @@ -797,11 +925,11 @@ mod tests { // Should still work, falling back to stack location let diagnostics = extract_lua_diagnostics(&lua).unwrap(); assert_eq!(diagnostics.len(), 1); - // Should have FilterProvenance since the element wasn't recognized + // Should have filter-Generated since the element wasn't recognized match &diagnostics[0].location { - Some(SourceInfo::FilterProvenance { .. }) => {} + Some(SourceInfo::Generated { by, .. 
}) if by.is_kind("filter") => {} other => panic!( - "Expected FilterProvenance for non-userdata element, got {:?}", + "Expected filter-Generated for non-userdata element, got {:?}", other ), } diff --git a/crates/pampa/src/lua/filter_tests.rs b/crates/pampa/src/lua/filter_tests.rs index 30c4d2b30..3e0518b96 100644 --- a/crates/pampa/src/lua/filter_tests.rs +++ b/crates/pampa/src/lua/filter_tests.rs @@ -737,7 +737,7 @@ end } #[tokio::test] -async fn test_filter_provenance_tracking() { +async fn test_filter_generated_tracking() { // Test that elements created by filters capture their source location let dir = TempDir::new().unwrap(); let filter_path = dir.path().join("provenance_test.lua"); @@ -777,17 +777,23 @@ end .unwrap() .pandoc; - // The filtered Str should have FilterProvenance source info + // The filtered Str should have filter-kind Generated source info match &filtered.blocks[0] { Block::Paragraph(p) => match &p.content[0] { Inline::Str(s) => { assert_eq!(s.text, "created-by-filter"); - // Check that the source_info is FilterProvenance + // Check that the source_info is Generated { by: filter, .. 
} match &s.source_info { - quarto_source_map::SourceInfo::FilterProvenance { - filter_path: path, - line, - } => { + quarto_source_map::SourceInfo::Generated { by, from } + if by.is_kind("filter") => + { + assert!( + from.is_empty(), + "Filter-constructed Generated nodes carry no anchors yet" + ); + let (path, line) = by + .as_filter() + .expect("filter-kind Generated should expose path/line"); // The filter_path should contain our filter file name assert!( path.contains("provenance_test.lua"), @@ -796,13 +802,16 @@ end ); // The line should be around line 5 where pandoc.Str is called assert!( - *line >= 4 && *line <= 7, + (4..=7).contains(&line), "Expected line to be between 4-7, got: {}", line ); } other => { - panic!("Expected FilterProvenance source info, got: {:?}", other) + panic!( + "Expected filter-kind Generated source info, got: {:?}", + other + ) } } } @@ -1799,13 +1808,13 @@ end ); // Check source location - if let Some(quarto_source_map::SourceInfo::FilterProvenance { filter_path, line }) = - &diagnostics[0].location + if let Some(quarto_source_map::SourceInfo::Generated { by, .. 
}) = &diagnostics[0].location + && let Some((filter_path, line)) = by.as_filter() { assert!(filter_path.contains("warn_test.lua")); - assert!(*line > 0, "Line should be positive"); + assert!(line > 0, "Line should be positive"); } else { - panic!("Expected FilterProvenance source info"); + panic!("Expected filter-kind Generated source info"); } } @@ -6826,7 +6835,9 @@ end attr: (String::new(), vec![], hashlink::LinkedHashMap::new()), attr_source: crate::pandoc::AttrSourceInfo::empty(), content: vec![], - source_info: quarto_source_map::SourceInfo::filter_provenance("test.lua", 1), + source_info: quarto_source_map::SourceInfo::generated( + quarto_source_map::By::filter("test.lua", 1), + ), })], source_info: quarto_source_map::SourceInfo::default(), })], diff --git a/crates/pampa/src/lua/types.rs b/crates/pampa/src/lua/types.rs index f2a67e70f..4c5ed92fc 100644 --- a/crates/pampa/src/lua/types.rs +++ b/crates/pampa/src/lua/types.rs @@ -15,7 +15,8 @@ use mlua::{ Error, IntoLua, Lua, MetaMethod, Result, Table, UserData, UserDataFields, UserDataMethods, UserDataRef, Value, Variadic, }; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::SmallVec; use crate::pandoc::{Block, Inline}; @@ -723,8 +724,8 @@ impl UserData for LuaInline { /// `:byte_range()` and `:file_id()` accessors that chain-resolve the /// underlying `SourceInfo` to a `(file_id, start, end)` tuple in the /// root source file. Both return `nil` when the chain resolves to -/// `SourceInfo::Concat` or `SourceInfo::FilterProvenance` — the same -/// rule applied by `AttributionRenderTransform`. +/// `SourceInfo::Concat` or a `Generated` node without an `Invocation` +/// anchor — the same rule applied by `AttributionRenderTransform`. 
/// /// This is the building block of the `quarto.attribution.lookup(el)` /// convenience: it reads `el.source_info:byte_range()` then calls @@ -1825,7 +1826,10 @@ pub fn filter_source_info(lua: &Lua) -> SourceInfo { // The source often starts with "@" for file paths let path: &str = src.strip_prefix("@").unwrap_or(&src); let line_num = line.unwrap_or(0); - return Some(SourceInfo::filter_provenance(path.to_string(), line_num)); + return Some(SourceInfo::Generated { + by: By::filter(path.to_string(), line_num), + from: SmallVec::new(), + }); } None }) { diff --git a/crates/pampa/src/pandoc/location.rs b/crates/pampa/src/pandoc/location.rs index f97a7d1ef..9c5846c47 100644 --- a/crates/pampa/src/pandoc/location.rs +++ b/crates/pampa/src/pandoc/location.rs @@ -325,24 +325,6 @@ pub fn empty_source_info() -> quarto_source_map::SourceInfo { ) } -/// Extract filename index from quarto_source_map::SourceInfo by walking to Original mapping -pub fn extract_filename_index(info: &quarto_source_map::SourceInfo) -> Option { - match info { - quarto_source_map::SourceInfo::Original { file_id, .. } => Some(file_id.0), - quarto_source_map::SourceInfo::Substring { parent, .. } => extract_filename_index(parent), - quarto_source_map::SourceInfo::Concat { pieces } => { - // Return first non-None filename_index from pieces - pieces - .iter() - .find_map(|p| extract_filename_index(&p.source_info)) - } - quarto_source_map::SourceInfo::FilterProvenance { .. 
} => { - // Filter provenance doesn't have a filename index - None - } - } -} - #[cfg(test)] mod tests { use super::*; @@ -568,93 +550,6 @@ mod tests { assert_eq!(si.length(), 0); } - #[test] - fn test_extract_filename_index_original() { - let si = quarto_source_map::SourceInfo::from_range( - quarto_source_map::FileId(42), - quarto_source_map::Range { - start: quarto_source_map::Location { - offset: 0, - row: 0, - column: 0, - }, - end: quarto_source_map::Location { - offset: 10, - row: 0, - column: 10, - }, - }, - ); - assert_eq!(extract_filename_index(&si), Some(42)); - } - - #[test] - fn test_extract_filename_index_substring() { - let parent = quarto_source_map::SourceInfo::from_range( - quarto_source_map::FileId(99), - quarto_source_map::Range { - start: quarto_source_map::Location { - offset: 0, - row: 0, - column: 0, - }, - end: quarto_source_map::Location { - offset: 100, - row: 5, - column: 0, - }, - }, - ); - let substring = quarto_source_map::SourceInfo::substring(parent, 10, 50); - assert_eq!(extract_filename_index(&substring), Some(99)); - } - - #[test] - fn test_extract_filename_index_concat() { - let piece1 = quarto_source_map::SourceInfo::from_range( - quarto_source_map::FileId(7), - quarto_source_map::Range { - start: quarto_source_map::Location { - offset: 0, - row: 0, - column: 0, - }, - end: quarto_source_map::Location { - offset: 10, - row: 0, - column: 10, - }, - }, - ); - let piece2 = quarto_source_map::SourceInfo::from_range( - quarto_source_map::FileId(8), - quarto_source_map::Range { - start: quarto_source_map::Location { - offset: 0, - row: 0, - column: 0, - }, - end: quarto_source_map::Location { - offset: 20, - row: 1, - column: 0, - }, - }, - ); - // concat takes Vec<(SourceInfo, usize)> - pairs of source info and length - let concat = quarto_source_map::SourceInfo::concat(vec![(piece1, 10), (piece2, 20)]); - // Should return the first piece's file_id - assert_eq!(extract_filename_index(&concat), Some(7)); - } - - #[test] - fn 
test_extract_filename_index_filter_provenance() { - // filter_provenance takes filter_path and line number - let filter_prov = quarto_source_map::SourceInfo::filter_provenance("test-filter.lua", 42); - // FilterProvenance doesn't have a filename index - assert_eq!(extract_filename_index(&filter_prov), None); - } - #[test] fn test_source_info_combine_takes_self_start_when_smaller() { // Test case where self.range.start < other.range.start (covers line 53) diff --git a/crates/pampa/src/pandoc/treesitter_utils/pipe_table.rs b/crates/pampa/src/pandoc/treesitter_utils/pipe_table.rs index 391a5d602..c422e6842 100644 --- a/crates/pampa/src/pandoc/treesitter_utils/pipe_table.rs +++ b/crates/pampa/src/pandoc/treesitter_utils/pipe_table.rs @@ -252,31 +252,11 @@ pub fn process_pipe_table( let table_start = node_source_info_with_context(node, context); let start_offset = table_start.start_offset(); let end_offset = cap_info.end_offset(); - // Extract file_id from the table's source info - let file_id = match &table_start { - quarto_source_map::SourceInfo::Original { file_id, .. } => *file_id, - quarto_source_map::SourceInfo::Substring { parent, .. } => { - // Recursively extract from parent (should always reach Original eventually) - match **parent { - quarto_source_map::SourceInfo::Original { file_id, .. } => file_id, - _ => quarto_source_map::FileId(0), // Fallback - } - } - quarto_source_map::SourceInfo::Concat { pieces } => { - // Use first piece's file_id - if let Some(piece) = pieces.first() { - match &piece.source_info { - quarto_source_map::SourceInfo::Original { file_id, .. } => *file_id, - _ => quarto_source_map::FileId(0), // Fallback - } - } else { - quarto_source_map::FileId(0) // Fallback - } - } - quarto_source_map::SourceInfo::FilterProvenance { .. 
} => { - quarto_source_map::FileId(0) // Fallback - filter-created tables shouldn't reach this - } - }; + // Extract file_id from the table's source info; root_file_id walks + // every nesting level, so this works for arbitrarily deep Substrings. + let file_id = table_start + .root_file_id() + .unwrap_or(quarto_source_map::FileId(0)); // Create a new SourceInfo spanning from table start to caption end quarto_source_map::SourceInfo::original(file_id, start_offset, end_offset) } else { diff --git a/crates/pampa/src/pandoc/treesitter_utils/postprocess.rs b/crates/pampa/src/pandoc/treesitter_utils/postprocess.rs index bdbfebb54..019b3d9b3 100644 --- a/crates/pampa/src/pandoc/treesitter_utils/postprocess.rs +++ b/crates/pampa/src/pandoc/treesitter_utils/postprocess.rs @@ -49,7 +49,8 @@ use quarto_pandoc_types::AttrSourceInfo; use quarto_pandoc_types::table::{ Alignment, Cell, ColSpec, ColWidth, Row, Table, TableBody, TableFoot, TableHead, }; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::smallvec; use std::cell::RefCell; use std::collections::HashMap; @@ -1349,8 +1350,12 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut DiagnosticCollector) -> Re // bracket attached to the first word and closing bracket to the last word // e.g., "@knuth [p. 33]" becomes: Str("@knuth"), Space, Str("[p."), Space, Str("33]") cite.content.push(Inline::Space(Space { - // Synthetic Space: inserted to separate citation from suffix - source_info: quarto_source_map::SourceInfo::default(), + // Synthetic Space: inserted to separate citation from suffix. + // Plan 6 §"tree-sitter postprocess" — Generated, no preimage. 
+ source_info: SourceInfo::Generated { + by: By::tree_sitter_postprocess(), + from: smallvec![], + }, })); // The span content may have been merged into a single string, so we need to diff --git a/crates/pampa/src/pandoc/treesitter_utils/section.rs b/crates/pampa/src/pandoc/treesitter_utils/section.rs index 21f6b4e2b..fd180862c 100644 --- a/crates/pampa/src/pandoc/treesitter_utils/section.rs +++ b/crates/pampa/src/pandoc/treesitter_utils/section.rs @@ -125,31 +125,13 @@ pub fn process_section( // Extend table's source_info to include the caption let table_start_offset = table.source_info.start_offset(); let caption_end_offset = caption_source_info.end_offset(); - // Extract file_id from table's source info - let file_id = match &table.source_info { - quarto_source_map::SourceInfo::Original { file_id, .. } => *file_id, - quarto_source_map::SourceInfo::Substring { parent, .. } => { - match **parent { - quarto_source_map::SourceInfo::Original { file_id, .. } => file_id, - _ => quarto_source_map::FileId(0), // Fallback - } - } - quarto_source_map::SourceInfo::Concat { pieces } => { - if let Some(piece) = pieces.first() { - match &piece.source_info { - quarto_source_map::SourceInfo::Original { file_id, .. } => { - *file_id - } - _ => quarto_source_map::FileId(0), // Fallback - } - } else { - quarto_source_map::FileId(0) // Fallback - } - } - quarto_source_map::SourceInfo::FilterProvenance { .. } => { - quarto_source_map::FileId(0) // Fallback - filter-created tables - } - }; + // Extract file_id from table's source info; root_file_id + // walks every nesting level, so nested Substrings resolve + // correctly (the previous shallow match returned FileId(0)). 
+ let file_id = table + .source_info + .root_file_id() + .unwrap_or(quarto_source_map::FileId(0)); table.source_info = quarto_source_map::SourceInfo::original( file_id, table_start_offset, diff --git a/crates/pampa/src/readers/json.rs b/crates/pampa/src/readers/json.rs index 587ab09f3..32d1b65f0 100644 --- a/crates/pampa/src/readers/json.rs +++ b/crates/pampa/src/readers/json.rs @@ -16,8 +16,9 @@ use crate::pandoc::{ }; use hashlink::LinkedHashMap; use quarto_pandoc_types::{ConfigMapEntry, ConfigValue, ConfigValueKind}; -use quarto_source_map::FileId; +use quarto_source_map::{Anchor, AnchorRole, By, FileId}; use serde_json::Value; +use smallvec::SmallVec; use std::sync::Arc; #[derive(Debug)] @@ -250,36 +251,132 @@ impl SourceInfoDeserializer { quarto_source_map::SourceInfo::Concat { pieces: pieces? } } 3 => { - // Transformed variant no longer exists in SourceInfo - // Convert to approximate Substring pointing to parent - // This loses the transformation mapping but preserves the parent relationship + // Legacy reader for code 3 — accepts both old Transformed + // numeric-array and buggy FilterProvenance string-array; new + // writers (post-Plan-5) never emit code 3. Two shapes are + // possible and dispatch is by `data[0]`'s JSON type, which + // is unambiguous: + // - Numeric-headed `[parent_id, ...]` → legacy Transformed, + // approximated as Substring pointing to that parent + // (preserves today's back-compat). + // - String-headed `[filter_path, line]` → latent + // FilterProvenance, recovered as + // `Generated { by: filter, from: [] }` (closes bd-3odjm). + // Strict on every other shape — same convention as the + // Substring / Concat arms above (no `unwrap_or(0)`). let data_array = data .as_array() .ok_or(JsonReadError::MalformedSourceInfoPool)?; if data_array.is_empty() { return Err(JsonReadError::MalformedSourceInfoPool); } - let parent_id = data_array[0] - .as_u64() - .ok_or(JsonReadError::MalformedSourceInfoPool)? 
- as usize; - // Check for circular/forward references - if parent_id >= current_index { - return Err(JsonReadError::CircularSourceInfoReference(parent_id)); - } + if let Some(parent_id) = data_array[0].as_u64() { + // Legacy Transformed path. + let parent_id = parent_id as usize; - let parent = pool - .get(parent_id) - .ok_or(JsonReadError::InvalidSourceInfoRef(parent_id))? - .clone(); + // Check for circular/forward references + if parent_id >= current_index { + return Err(JsonReadError::CircularSourceInfoReference(parent_id)); + } - // Approximate with Substring - quarto_source_map::SourceInfo::Substring { - parent: Arc::new(parent), - start_offset, - end_offset, + let parent = pool + .get(parent_id) + .ok_or(JsonReadError::InvalidSourceInfoRef(parent_id))? + .clone(); + + // Approximate with Substring + quarto_source_map::SourceInfo::Substring { + parent: Arc::new(parent), + start_offset, + end_offset, + } + } else if let Some(filter_path) = data_array[0].as_str() { + // Latent FilterProvenance shape: must be exactly + // [path, line]; no `unwrap_or(0)` on the line. + if data_array.len() != 2 { + return Err(JsonReadError::MalformedSourceInfoPool); + } + let line = data_array[1] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize; + quarto_source_map::SourceInfo::Generated { + by: By::filter(filter_path.to_string(), line), + from: SmallVec::new(), + } + } else { + return Err(JsonReadError::MalformedSourceInfoPool); + } + } + 4 => { + // Generated { by, from }. The outer `r` field is parsed + // by the caller and *ignored here* — Generated entries + // don't carry their own offsets; ranges come from + // chain-walking the Invocation anchor via + // `resolve_byte_range`. The writer hard-codes + // `r: [0, 0]` for code-4 entries, but `r != [0, 0]` from + // an older/future writer is silently accepted (precedent: + // today's Concat arm also parses `r` but doesn't use it). 
+ // + // Strict on every other shape: missing `by`, missing + // `by.kind`, `from` not an array, `from` entry not an + // object, `from` entry missing `role`/`si_id`, + // unrecognized role string, `Other("")` role with empty + // suffix → `MalformedSourceInfoPool`. Same convention as + // the Substring / Concat arms above. + let obj = data + .as_object() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + let by_obj = obj + .get("by") + .and_then(|v| v.as_object()) + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + let kind = by_obj + .get("kind") + .and_then(|v| v.as_str()) + .ok_or(JsonReadError::MalformedSourceInfoPool)? + .to_string(); + let by_data = by_obj.get("data").cloned().unwrap_or(Value::Null); + let by = By { + kind, + data: by_data, + }; + + let mut from: SmallVec<[Anchor; 2]> = SmallVec::new(); + if let Some(from_val) = obj.get("from") { + let from_arr = from_val + .as_array() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + for entry in from_arr { + let entry_obj = entry + .as_object() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + let role_str = entry_obj + .get("role") + .and_then(|v| v.as_str()) + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + let role = parse_anchor_role(role_str)?; + let si_id = entry_obj + .get("si_id") + .and_then(|v| v.as_u64()) + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize; + if si_id >= current_index { + return Err(JsonReadError::CircularSourceInfoReference(si_id)); + } + let target = pool + .get(si_id) + .cloned() + .ok_or(JsonReadError::InvalidSourceInfoRef(si_id))?; + from.push(Anchor { + role, + source_info: Arc::new(target), + }); + } } + + quarto_source_map::SourceInfo::Generated { by, from } } _ => { return Err(JsonReadError::MalformedSourceInfoPool); @@ -306,6 +403,33 @@ impl SourceInfoDeserializer { } } +/// Decode a wire-format anchor role string into its typed `AnchorRole`. 
+/// +/// Recognized strings: +/// - `"invocation"` → [`AnchorRole::Invocation`] +/// - `"value-source"` → [`AnchorRole::ValueSource`] +/// - `"other:<name>"` → [`AnchorRole::Other(<name>)`], where `<name>` must +/// be non-empty. +/// +/// Anything else — including the bare `"other:"` with an empty suffix — +/// is rejected as `MalformedSourceInfoPool`. +#[allow(dead_code)] // Used by the Phase 2 code-4 reader. +fn parse_anchor_role(s: &str) -> Result { + match s { + "invocation" => Ok(AnchorRole::Invocation), + "value-source" => Ok(AnchorRole::ValueSource), + _ => { + let name = s + .strip_prefix("other:") + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + if name.is_empty() { + return Err(JsonReadError::MalformedSourceInfoPool); + } + Ok(AnchorRole::Other(name.to_string())) + } + } +} + /// Convert from old JSON format (filename_index, range) to new SourceInfo fn make_source_info(filename_index: Option, range: Range) -> quarto_source_map::SourceInfo { let file_id = FileId(filename_index.unwrap_or(0)); @@ -2631,4 +2755,521 @@ mod tests { _ => panic!("Expected CircularSourceInfoReference error"), } } + + // ---------------------------------------------------------------- + // Plan 5 Phase 1 — Legacy code-3 dual-shape reader + // ---------------------------------------------------------------- + + /// Filter-provenance recovery: code-3 with `[filter_path, line]` payload + /// must decode to `Generated { by: By::filter(path, line), from: [] }`. + /// Closes bd-3odjm — this is the latent FilterProvenance shape that + /// today's reader misinterprets as a legacy Transformed parent_id. 
+ #[test] + fn test_deserialize_code3_filter_provenance_recovery() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": ["/path/to/filter.lua", 42] + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + assert_eq!(deserializer.pool.len(), 1); + match &deserializer.pool[0] { + SourceInfo::Generated { by, from } => { + assert!(from.is_empty()); + let (path, line) = by.as_filter().expect("expected filter By"); + assert_eq!(path, "/path/to/filter.lua"); + assert_eq!(line, 42); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Legacy Transformed back-compat: code-3 with `[parent_id, ...]` + /// numeric payload must continue to decode as a Substring approximation. + #[test] + fn test_deserialize_code3_legacy_transformed_back_compat() { + let pool_json = json!([ + { + "r": [0, 100], + "t": 0, + "d": 0 // Original + }, + { + "r": [10, 20], + "t": 3, + "d": [0] // Legacy Transformed -> Substring(parent_id=0) + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + assert_eq!(deserializer.pool.len(), 2); + match &deserializer.pool[1] { + SourceInfo::Substring { + parent, + start_offset, + end_offset, + } => { + assert_eq!(*start_offset, 10); + assert_eq!(*end_offset, 20); + assert!(matches!(&**parent, SourceInfo::Original { .. })); + } + other => panic!("Expected Substring, got {:?}", other), + } + } + + /// Strict rejection: code-3 with `[path]` (missing line) is malformed. + /// Guards the no-`unwrap_or(0)` rule. + #[test] + fn test_deserialize_code3_filter_missing_line_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": ["/path/to/filter.lua"] + } + ]); + + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + /// Strict rejection: code-3 with `[path, "not-a-number"]` is malformed. + /// Guards the no-`unwrap_or(0)` rule. 
+ #[test] + fn test_deserialize_code3_filter_non_numeric_line_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": ["/path/to/filter.lua", "oops"] + } + ]); + + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + /// Strict rejection: code-3 with `[path, line, extra]` (too many items) + /// is malformed — the filter shape must be exactly two elements. + #[test] + fn test_deserialize_code3_filter_too_many_elements_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": ["/path/to/filter.lua", 42, "extra"] + } + ]); + + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + /// Strict rejection: code-3 with empty array is malformed. + #[test] + fn test_deserialize_code3_empty_array_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": [] + } + ]); + + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + /// Strict rejection: code-3 with a non-array payload is malformed. + #[test] + fn test_deserialize_code3_non_array_payload_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 3, + "d": 7 + } + ]); + + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + // ---------------------------------------------------------------- + // Plan 5 Phase 2 — Code-4 (Generated) reader + // ---------------------------------------------------------------- + + /// Forward-compat: code-4 with no `from` array decodes as + /// `Generated { by: <by>, from: [] }`. Pure synthesis. 
+ #[test] + fn test_deserialize_code4_generated_no_anchors() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { "by": { "kind": "sectionize" } } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[0] { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "sectionize"); + assert!(by.data.is_null()); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Forward-compat: code-4 with `by.data` round-trips arbitrary JSON. + #[test] + fn test_deserialize_code4_generated_with_by_data() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { + "by": { + "kind": "filter", + "data": { "filter_path": "/x.lua", "line": 7 } + } + } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[0] { + SourceInfo::Generated { by, from } => { + let (path, line) = by.as_filter().expect("expected filter By"); + assert_eq!(path, "/x.lua"); + assert_eq!(line, 7); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Code-4 with a single Invocation anchor — every known role is + /// recoverable. + #[test] + fn test_deserialize_code4_with_invocation_anchor() { + let pool_json = json!([ + { + "r": [0, 5], + "t": 0, + "d": 0 // Original (target of the anchor) + }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode", "data": { "name": "meta" } }, + "from": [ + { "role": "invocation", "si_id": 0 } + ] + } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[1] { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "shortcode"); + assert_eq!(from.len(), 1); + assert!(matches!(from[0].role, AnchorRole::Invocation)); + assert!(matches!(&*from[0].source_info, SourceInfo::Original { .. 
})); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Code-4 with multiple anchors — invocation + value-source + an + /// extension-defined Other role. + #[test] + fn test_deserialize_code4_with_multiple_anchors() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, // Original 0 + { "r": [5, 10], "t": 0, "d": 0 }, // Original 1 + { "r": [10, 15], "t": 0, "d": 0 }, // Original 2 + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode", "data": { "name": "x" } }, + "from": [ + { "role": "invocation", "si_id": 0 }, + { "role": "value-source", "si_id": 1 }, + { "role": "other:ext/foo/bar", "si_id": 2 } + ] + } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[3] { + SourceInfo::Generated { from, .. } => { + assert_eq!(from.len(), 3); + assert!(matches!(from[0].role, AnchorRole::Invocation)); + assert!(matches!(from[1].role, AnchorRole::ValueSource)); + match &from[2].role { + AnchorRole::Other(name) => assert_eq!(name, "ext/foo/bar"), + other => panic!("Expected Other(ext/foo/bar), got {:?}", other), + } + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + /// Silently accept code-4 entries with `r != [0, 0]` (precedent: the + /// Concat arm parses `r` but doesn't use it). Future writers will + /// emit `r: [0, 0]`; older/divergent writers might not. + #[test] + fn test_deserialize_code4_nonzero_r_accepted() { + let pool_json = json!([ + { + "r": [42, 99], + "t": 4, + "d": { "by": { "kind": "sectionize" } } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + assert!(matches!(deserializer.pool[0], SourceInfo::Generated { .. })); + } + + /// Forward-compat: unknown `by.kind` decodes opaquely — the wire + /// format does not constrain `kind` to known values. 
+ #[test] + fn test_deserialize_code4_unknown_kind_is_forward_compat() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { + "by": { + "kind": "ext/future/foo", + "data": { "anything": [1, 2, 3] } + } + } + } + ]); + + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[0] { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "ext/future/foo"); + assert_eq!(by.data["anything"], json!([1, 2, 3])); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + } + + // --- Strict code-4 rejection tests ------------------------------ + + #[test] + fn test_deserialize_code4_missing_by_rejected() { + let pool_json = json!([ + { "r": [0, 0], "t": 4, "d": {} } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_missing_by_kind_rejected() { + let pool_json = json!([ + { "r": [0, 0], "t": 4, "d": { "by": {} } } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_by_not_object_rejected() { + let pool_json = json!([ + { "r": [0, 0], "t": 4, "d": { "by": "filter" } } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_from_not_array_rejected() { + let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "sectionize" }, + "from": "not-an-array" + } + } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_from_entry_not_object_rejected() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ "bad-entry" ] + } + } + ]); + 
assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_from_entry_missing_role_rejected() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ { "si_id": 0 } ] + } + } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_from_entry_missing_si_id_rejected() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ { "role": "invocation" } ] + } + } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_unknown_role_rejected() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ { "role": "bogus", "si_id": 0 } ] + } + } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_empty_other_role_rejected() { + let pool_json = json!([ + { "r": [0, 5], "t": 0, "d": 0 }, + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ { "role": "other:", "si_id": 0 } ] + } + } + ]); + assert!(matches!( + SourceInfoDeserializer::new(&pool_json), + Err(JsonReadError::MalformedSourceInfoPool) + )); + } + + #[test] + fn test_deserialize_code4_si_id_forward_reference_rejected() { + // si_id must be < current_index — Generated is at index 0 and + // points to index 5 (nonexistent and forward-referencing). 
+ let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { + "by": { "kind": "shortcode" }, + "from": [ { "role": "invocation", "si_id": 5 } ] + } + } + ]); + let result = SourceInfoDeserializer::new(&pool_json); + assert!(matches!( + result, + Err(JsonReadError::CircularSourceInfoReference(5)) + )); + } + + #[test] + fn test_deserialize_code4_by_data_omitted_is_null() { + // The serializer skips `data` when it's null; the reader must + // accept the omitted shape and produce `data: Value::Null`. + let pool_json = json!([ + { + "r": [0, 0], + "t": 4, + "d": { "by": { "kind": "sectionize" } } + } + ]); + let deserializer = SourceInfoDeserializer::new(&pool_json).unwrap(); + match &deserializer.pool[0] { + SourceInfo::Generated { by, .. } => { + assert!(by.data.is_null()); + } + _ => panic!("Expected Generated"), + } + } } diff --git a/crates/pampa/src/transforms/sectionize.rs b/crates/pampa/src/transforms/sectionize.rs index 492c7ab9b..7d4a2b97f 100644 --- a/crates/pampa/src/transforms/sectionize.rs +++ b/crates/pampa/src/transforms/sectionize.rs @@ -46,7 +46,8 @@ use crate::pandoc::block::{Block, Div, Header}; use hashlink::LinkedHashMap; use quarto_pandoc_types::attr::AttrSourceInfo; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::smallvec; /// Wrap headers in section Divs. 
/// @@ -93,7 +94,10 @@ pub fn sectionize_blocks(blocks: Vec) -> Vec { let section_div = Block::Div(Div { attr: section_attr, content: section_content, - source_info: SourceInfo::default(), + source_info: SourceInfo::Generated { + by: By::sectionize(), + from: smallvec![], + }, attr_source: AttrSourceInfo::empty(), }); // Add closed section to parent, or output if no parent @@ -145,7 +149,10 @@ pub fn sectionize_blocks(blocks: Vec) -> Vec { let section_div = Block::Div(Div { attr: section_attr, content: section_content, - source_info: SourceInfo::default(), + source_info: SourceInfo::Generated { + by: By::sectionize(), + from: smallvec![], + }, attr_source: AttrSourceInfo::empty(), }); if let Some((_, _, parent_content)) = section_stack.last_mut() { @@ -523,6 +530,53 @@ mod tests { assert!(matches!(&result[1], Block::Paragraph(_))); } + #[test] + fn test_sectionize_section_div_has_generated_provenance() { + // Plan 6: every synthesized Section Div carries + // Generated { by: sectionize(), from: [] }. The wrapped Header retains + // its original source_info. + let blocks = vec![ + make_header(2, "sec-a", vec![], "A"), + make_para("body"), + make_header(2, "sec-b", vec![], "B"), + ]; + let result = sectionize_blocks(blocks); + assert_eq!(result.len(), 2); + + // First section's outer Div is Generated. + let Block::Div(div) = &result[0] else { + panic!("Expected section Div"); + }; + match &div.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "sectionize"); + assert!( + from.is_empty(), + "sectionize synthesizers carry no source-side anchors" + ); + } + other => panic!("Expected Generated source_info, got {:?}", other), + } + + // Second section (end-of-input path) — same shape. 
+ let Block::Div(div2) = &result[1] else { + panic!("Expected section Div"); + }; + match &div2.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "sectionize"); + assert!(from.is_empty()); + } + other => panic!("Expected Generated source_info, got {:?}", other), + } + + // The wrapped Header inside the section retains its original + // (dummy) source_info — only the Div is synthesized. + let Block::Header(_) = &div.content[0] else { + panic!("Expected Header inside section"); + }; + } + #[test] fn test_sectionize_class_order() { // Classes should be: "section", "levelN", then user classes diff --git a/crates/pampa/src/utils/trim_source_location.rs b/crates/pampa/src/utils/trim_source_location.rs index 177a278c8..6238863ea 100644 --- a/crates/pampa/src/utils/trim_source_location.rs +++ b/crates/pampa/src/utils/trim_source_location.rs @@ -96,8 +96,8 @@ pub fn trim_whitespace( // For concat, just return as-is for now (edge case) source_info.clone() } - SourceInfo::FilterProvenance { .. } => { - // For filter provenance, just return as-is + SourceInfo::Generated { .. } => { + // No characteristic local-text range to trim against. source_info.clone() } }; @@ -128,8 +128,8 @@ pub fn trim_whitespace( // Proper handling would require splitting/adjusting pieces source_info.clone() } - SourceInfo::FilterProvenance { .. } => { - // For filter provenance, just return as-is + SourceInfo::Generated { .. } => { + // No characteristic local-text range to trim against. 
source_info.clone() } } diff --git a/crates/pampa/src/writers/incremental.rs b/crates/pampa/src/writers/incremental.rs index 69486e286..7fbd8dcb0 100644 --- a/crates/pampa/src/writers/incremental.rs +++ b/crates/pampa/src/writers/incremental.rs @@ -16,6 +16,8 @@ use quarto_ast_reconcile::types::{ }; use quarto_ast_reconcile::{structural_eq_blocks, structural_eq_inlines}; use quarto_pandoc_types::config_value::{ConfigMapEntry, ConfigValue, ConfigValueKind}; +use quarto_pandoc_types::is_atomic_custom_node; +use quarto_source_map::{FileId, SourceInfo}; use std::ops::Range; use super::qmd; @@ -33,20 +35,37 @@ pub struct TextEdit { pub replacement: String, } -/// An entry in the coarsened plan: either copy verbatim, rewrite, or inline-splice. +/// An entry in the coarsened plan. +/// +/// Plan 7 adds `Transparent` and `Omit` to the original three variants +/// (`Verbatim`, `Rewrite`, `InlineSplice`). #[derive(Debug)] enum CoarsenedEntry { /// Copy this byte range verbatim from original_qmd. /// The text includes the block content + trailing \n. Verbatim { byte_range: Range, - /// Index of this block in original_ast.blocks (for gap computation) - orig_idx: usize, + /// Index of this block in original_ast.blocks (for gap computation). + /// `None` for entries that came from a `Transparent` recursion — those + /// children aren't top-level blocks so they have no top-level index; + /// `compute_separator`'s original-gap optimization falls back to the + /// standard separator for them. + orig_idx: Option, }, /// Rewrite this block using the standard writer. + /// + /// `block_text` is pre-computed at coarsen time so the entry stays + /// self-contained regardless of nesting depth. (Earlier the variant + /// carried `new_idx: usize` and looked up the block at emit time, + /// but that indexed `new_ast.blocks` top-level — wrong for entries + /// produced inside a `Transparent` recursion, where the relevant + /// block lives in a child slice. 
See + /// `claude-notes/designs/incremental-writer-internals.md` for the + /// "every variant is self-contained" contract.) Rewrite { - /// Index into new_ast.blocks - new_idx: usize, + /// Pre-computed block text — same bytes `write_block_to_string` + /// would produce on the corresponding block. + block_text: String, }, /// Splice inlines within a block without rewriting the entire block. /// The block structure (prefix, suffix) is preserved from the original; @@ -54,9 +73,100 @@ enum CoarsenedEntry { InlineSplice { /// Pre-computed block text: original block with inline content replaced. block_text: String, - /// Index of this block in original_ast.blocks (for gap computation) - orig_idx: usize, + /// Index of this block in original_ast.blocks (for gap computation). + /// Same `Option` semantics as `Verbatim::orig_idx`. + orig_idx: Option, }, + /// Plan 7: a non-atomic `Generated` wrapper with empty anchors AND + /// source-bearing children. The wrapper contributes no bytes; its + /// children produce the output. Used for sectionize wrappers, + /// footnotes container, appendix container — synthesizers whose + /// container shell has no preimage but whose inner content does. + Transparent { child_entries: Vec }, + /// Plan 7: drop this node from output entirely. The next pipeline run + /// regenerates it from baseline content. Used for atomic-kind + /// `Generated` nodes with no Invocation anchor (filter constructions, + /// title-block synthesis, tree-sitter postprocess space) and for + /// no-preimage `Generated` containers replaced via React. + Omit, +} + +// ============================================================================= +// Editability gate (Plan 7) +// ============================================================================= + +/// Decide whether the *interior* of `block` is editable, with respect to the +/// active document `target_file_id`. 
+/// +/// "Editable inside" means: the user can type into this node's content and +/// have their edit round-trip back to source bytes. Three reasons content is +/// **not** editable inside: +/// +/// 1. The block is an atomic `CustomNode` (per +/// [`quarto_pandoc_types::is_atomic_custom_node`]). Atomic nodes are +/// replaceable wholesale via a React-side component menu but have no +/// editable text region. Today: `"CrossrefResolvedRef"`. +/// 2. The block carries `SourceInfo::Generated` with an atomic-kind `by` +/// (shortcode / filter / title-block / tree-sitter-postprocess). +/// Content is the resolved value of an invocation token; the user's +/// source-side knob is the token, not the resolved bytes. +/// 3. The block's source_info has no preimage in `target_file_id` +/// (synthesized-from-metadata containers, cross-file Original chains). +/// There are no bytes in the target file to map an inner edit back to. +/// +/// **Returns `true` for everything else.** Used by `coarsen`'s soft-drop +/// logic; the React-side hand-mirror lives at +/// `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` plus a +/// parallel `is_editable_inside` predicate to be added in a follow-up. +/// +/// See Plan 7 §"Unified editability predicate". +pub fn is_editable_inside_block(block: &Block, target_file_id: FileId) -> bool { + if let Block::Custom(cn) = block + && is_atomic_custom_node(&cn.type_name) + { + return false; + } + is_editable_inside_source_info(block.source_info(), target_file_id) +} + +/// Inline-side counterpart of [`is_editable_inside_block`]. +/// +/// Same three reasons content is not editable inside; for `Inline::Custom` +/// the atomic-CustomNode check applies (some atomic types live in the +/// inline arm — `CrossrefResolvedRef` is one). 
+pub fn is_editable_inside_inline(inline: &Inline, target_file_id: FileId) -> bool { + if let Inline::Custom(cn) = inline + && is_atomic_custom_node(&cn.type_name) + { + return false; + } + is_editable_inside_source_info(inline.source_info(), target_file_id) +} + +/// Shared editability rules driven by `SourceInfo` alone (the +/// atomic-CustomNode gate is applied by the block / inline callers above). +fn is_editable_inside_source_info(si: &SourceInfo, target_file_id: FileId) -> bool { + // Atomic-kind Generated (shortcode, filter, title-block, + // tree-sitter-postprocess): the content is pipeline-resolved; the + // user's source-side knob is the invocation token, not the bytes + // inside. + if let SourceInfo::Generated { by, .. } = si + && by.is_atomic_kind() + { + return false; + } + // Catch-all: editable iff the region has byte-traceable preimage in + // the target file. Covers: + // - Original in target → editable. ✓ + // - Substring chain resolving in target → editable. ✓ + // - Original/Substring rooted outside target → not editable. + // - Generated with empty anchors (sectionize, footnotes, + // appendix containers) → preimage_in returns None → not editable. + // - Generated with only ValueSource/Dispatch/Other anchors → not + // editable (preimage_in walks Invocation only). + // - Non-atomic Generated with Invocation anchor in target → + // editable. + si.preimage_in(target_file_id).is_some() } // ============================================================================= @@ -73,24 +183,36 @@ enum CoarsenedEntry { /// * `plan` - A reconciliation plan describing alignment between original_ast and new_ast /// /// # Returns -/// A new QMD string where: -/// - Unchanged blocks are preserved verbatim from `original_qmd` -/// - Changed blocks are rewritten using the standard writer -/// - The result round-trips: `read(result) ≡ new_ast` (structural equality) +/// +/// On success: `(new_qmd, warnings)`. 
The qmd preserves unchanged blocks +/// verbatim from `original_qmd`, rewrites changed blocks via the standard +/// writer, and soft-drops bad edits to non-editable regions (atomic +/// CustomNodes, atomic-kind Generated, no-preimage Generated containers). +/// Each soft-drop pushes a Q-3-42 / Q-3-43 warning into the returned vec; +/// the overall write still succeeds. +/// +/// On failure: `Err(fatal_errors)` — genuine structural failure (UTF-8 +/// error, inline-splice impossibility, etc.). Soft-drop substitutions +/// never reach this arm. pub fn incremental_write( original_qmd: &str, original_ast: &Pandoc, new_ast: &Pandoc, plan: &ReconciliationPlan, -) -> Result> { +) -> Result< + (String, Vec), + Vec, +> { // The QMD reader internally pads input with '\n' when it doesn't end with // one, producing source spans relative to the padded input. We must use the // same padded string so that block source spans are valid byte indices. let mut padded_storage = None; let (qmd, did_pad) = ensure_trailing_newline(original_qmd, &mut padded_storage); - // Step 1: Coarsen the reconciliation plan - let coarsened = coarsen(qmd, original_ast, new_ast, plan)?; + // Step 1: Coarsen the reconciliation plan. Soft-drop warnings collect + // into this sink; coarsen never returns Err for soft-drop cases. + let mut warnings: Vec = Vec::new(); + let coarsened = coarsen(qmd, original_ast, new_ast, plan, &mut warnings)?; // Step 2: Assemble the result string let mut result = assemble(qmd, original_ast, new_ast, &coarsened)?; @@ -101,24 +223,34 @@ pub fn incremental_write( result.pop(); } - Ok(result) + Ok((result, warnings)) } /// Compute minimal text edits to transform `original_qmd` into the incremental write result. /// /// Each TextEdit describes a byte range in `original_qmd` to replace and the replacement text. /// Edits are sorted by range.start and non-overlapping. 
+/// +/// Like [`incremental_write`], returns a tuple `(edits, warnings)` on +/// success; soft-drop warnings (Q-3-42 / Q-3-43) ride alongside. pub fn compute_incremental_edits( original_qmd: &str, original_ast: &Pandoc, new_ast: &Pandoc, plan: &ReconciliationPlan, -) -> Result, Vec> { +) -> Result< + ( + Vec, + Vec, + ), + Vec, +> { // Same trailing-newline normalization as incremental_write (see comment there). let mut padded_storage = None; let (qmd, did_pad) = ensure_trailing_newline(original_qmd, &mut padded_storage); - let coarsened = coarsen(qmd, original_ast, new_ast, plan)?; + let mut warnings: Vec = Vec::new(); + let coarsened = coarsen(qmd, original_ast, new_ast, plan, &mut warnings)?; let mut edits = compute_edits_from_coarsened(qmd, original_ast, new_ast, &coarsened)?; if did_pad { @@ -134,7 +266,7 @@ pub fn compute_incremental_edits( } } - Ok(edits) + Ok((edits, warnings)) } // ============================================================================= @@ -146,70 +278,267 @@ pub fn compute_incremental_edits( /// Phase 5 strategy: for RecurseIntoContainer blocks that are inline-content blocks /// (Paragraph, Plain, Header) with inline plans that pass the safety check, /// produce InlineSplice entries. All other RecurseIntoContainer become Rewrite. +/// +/// Plan 7: soft-drop warnings push into `warnings`. Bad-edit cases +/// (atomic-CustomNode interior edit, atomic-Generated edit, no-preimage +/// Generated edit) substitute a safe alignment AND record a Q-3-42 / +/// Q-3-43 warning; coarsen never returns `Err` for these cases. `Err` is +/// reserved for genuine structural failures (UTF-8 errors, inline-splice +/// impossibility from assemble_inline_splice). fn coarsen( original_qmd: &str, original_ast: &Pandoc, new_ast: &Pandoc, plan: &ReconciliationPlan, + warnings: &mut Vec, +) -> Result, Vec> { + // The "target file" for editability decisions is the file + // `original_qmd` was parsed from. 
Derived by descending past any + // synthesized first blocks (title-block, sectionize wrappers, + // footnotes / appendix containers) so we get the user's real + // qmd FileId rather than the FileId(0) fallback by accident. + // Closes Plan 7c Phase 8. + let target_file_id = derive_target_file_id(&original_ast.blocks); + + coarsen_blocks( + original_qmd, + &original_ast.blocks, + &new_ast.blocks, + plan, + target_file_id, + warnings, + ) +} + +/// Recurse into the children of a non-atomic Generated wrapper whose +/// own bytes are synthesized but whose children carry real source +/// preimage. Used by the RecurseIntoContainer arm of `coarsen_blocks` +/// when soft-dropping the wrapper would silently delete real user +/// content. The wrapper's index resolves the nested +/// `block_container_plans` entry; that plan describes alignment +/// between the wrapper's `orig` and `new` children. +/// +/// Per the `Verbatim`/`InlineSplice::orig_idx` contract (see the +/// `CoarsenedEntry` doc comments), child entries returned to a +/// `Transparent` wrapper must carry `orig_idx: None` — their indices +/// are children-relative, not top-level, so `compute_separator`'s +/// "consecutive in original" optimization can't use them. +fn coarsen_children( + original_qmd: &str, + orig_children: &[Block], + new_children: &[Block], + child_plan: &ReconciliationPlan, + target_file_id: quarto_source_map::FileId, + warnings: &mut Vec, +) -> Result, Vec> { + let mut entries = coarsen_blocks( + original_qmd, + orig_children, + new_children, + child_plan, + target_file_id, + warnings, + )?; + for entry in &mut entries { + clear_orig_idx_for_transparent_child(entry); + } + Ok(entries) +} + +/// Walk a `CoarsenedEntry` tree and set `orig_idx` to `None` on every +/// `Verbatim` / `InlineSplice`. Used when promoting entries into a +/// `Transparent` wrapper, where the indices no longer refer to +/// top-level positions. 
+fn clear_orig_idx_for_transparent_child(entry: &mut CoarsenedEntry) { + match entry { + CoarsenedEntry::Verbatim { orig_idx, .. } => *orig_idx = None, + CoarsenedEntry::InlineSplice { orig_idx, .. } => *orig_idx = None, + CoarsenedEntry::Transparent { child_entries } => { + for child in child_entries { + clear_orig_idx_for_transparent_child(child); + } + } + CoarsenedEntry::Rewrite { .. } | CoarsenedEntry::Omit => {} + } +} + +/// Coarsen a block-alignment plan against the given original/new +/// block slices. Extracted from `coarsen` so the RecurseIntoContainer +/// path can recurse into a non-atomic Generated wrapper's children +/// using the nested `block_container_plans` plan. +fn coarsen_blocks( + original_qmd: &str, + original_blocks: &[Block], + new_blocks: &[Block], + plan: &ReconciliationPlan, + target_file_id: quarto_source_map::FileId, + warnings: &mut Vec, ) -> Result, Vec> { let mut entries = Vec::with_capacity(plan.block_alignments.len()); for (result_idx, alignment) in plan.block_alignments.iter().enumerate() { let entry = match alignment { - BlockAlignment::KeepBefore(orig_idx) => { - let span = block_source_span(&original_ast.blocks[*orig_idx]); - CoarsenedEntry::Verbatim { - byte_range: span, - orig_idx: *orig_idx, + BlockAlignment::KeepBefore(orig_idx) => coarsen_keep_before_block( + &original_blocks[*orig_idx], + target_file_id, + Some(*orig_idx), + )?, + BlockAlignment::UseAfter(after_idx) => { + let new_block = &new_blocks[*after_idx]; + let new_si = new_block.source_info(); + + let is_atomic_cn = matches!(new_block, Block::Custom(cn) + if is_atomic_custom_node(&cn.type_name)); + let atomic_generated_preimage = match new_si { + SourceInfo::Generated { by, .. } if by.is_atomic_kind() => { + new_si.preimage_in(target_file_id) + } + _ => None, + }; + let no_preimage_generated = matches!(new_si, SourceInfo::Generated { .. 
}) + && new_si.preimage_in(target_file_id).is_none(); + + if let Some(range) = atomic_generated_preimage { + // User edited inside an atomic-kind Generated block + // (shortcode / filter / title-block / tree-sitter- + // postprocess). The reconciler split the edit into + // a deleted-original + new-block; the new block + // still carries the token's Invocation anchor, so + // its preimage IS the source-side knob. Emit the + // token bytes verbatim and soft-drop the edit; + // without this branch the let-user-win Rewrite + // below would write the resolved bytes (the edit + // applied to the generated content) back into qmd, + // poisoning the source. See + // `claude-notes/designs/incremental-writer-internals.md` + // for why this lives at the writer, not the gate. + warnings.push(diagnostic_q3_43_block(new_block)); + CoarsenedEntry::Verbatim { + byte_range: range, + // No original block paired with this entry — + // the UseAfter alignment implicitly deleted + // the original; compute_separator's + // consecutive-in-original optimization can't + // use a top-level orig_idx here. + orig_idx: None, + } + } else if !is_atomic_cn && no_preimage_generated { + // User replaced a synthesized-from-metadata container + // wholesale via React. No source position to anchor a + // Rewrite at; soft-drop with Q-3-43. + warnings.push(diagnostic_q3_43_block(new_block)); + CoarsenedEntry::Omit + } else { + // Let-user-win — including for atomic CustomNodes (the + // user replaced an include / CrossrefResolvedRef via a + // component menu; the qmd writer's CustomNode arm + // serializes the fresh plain_data). 
+ CoarsenedEntry::Rewrite { + block_text: write_block_to_string(new_block)?, + } } } - BlockAlignment::UseAfter(_after_idx) => CoarsenedEntry::Rewrite { - new_idx: result_idx, - }, BlockAlignment::RecurseIntoContainer { before_idx, after_idx, } => { - // Check if this block has an inline plan and is safe to splice + let orig_block = &original_blocks[*before_idx]; + + // Plan 7: if the original container is not editable inside, + // soft-drop the inner edit. Substitutions: + // - atomic CustomNode with preimage → Verbatim wrapper bytes + // - non-atomic Generated wrapper with source-bearing + // children → recurse Transparent into the children (the + // wrapper's bytes are synthesized but the children carry + // real preimage; mirrors the unchanged-wrapper Transparent + // path in `coarsen_keep_before_block` at line ~459) + // - everything else (no-preimage Generated container with + // no source-bearing children, etc.) → Omit + if !is_editable_inside_block(orig_block, target_file_id) { + // First: atomic CustomNode with preimage → keep the + // wrapper bytes verbatim (the user-side edit is lost, + // but the wrapper text survives). + if let Some(range) = orig_block.source_info().preimage_in(target_file_id) { + warnings.push(diagnostic_q3_43_block(orig_block)); + entries.push(CoarsenedEntry::Verbatim { + byte_range: range, + orig_idx: Some(*before_idx), + }); + continue; + } + + // Second: non-atomic Generated wrapper (sectionize, + // footnotes-container, appendix-container, ...). If it + // has source-bearing children AND the reconciler built a + // container plan for this index, recurse coarsen on the + // children. The user's edit is *inside* the wrapper — + // soft-dropping the wrapper would silently delete real + // user content. + if let SourceInfo::Generated { by, .. 
} = orig_block.source_info() + && !by.is_atomic_kind() + && let (Some(orig_children), Some(new_children)) = ( + block_block_children(orig_block), + block_block_children(&new_blocks[*after_idx]), + ) + && orig_children + .iter() + .any(|c| c.source_info().preimage_in(target_file_id).is_some()) + && let Some(child_plan) = plan.block_container_plans.get(&result_idx) + { + let child_entries = coarsen_children( + original_qmd, + orig_children, + new_children, + child_plan, + target_file_id, + warnings, + )?; + entries.push(CoarsenedEntry::Transparent { child_entries }); + continue; + } + + // Last resort: no preimage, no recursable children → + // soft-drop with Q-3-43. + warnings.push(diagnostic_q3_43_block(orig_block)); + entries.push(CoarsenedEntry::Omit); + continue; + } + + // Existing recurse logic: try inline-splice if the block has + // an inline plan and is safe to splice; else Rewrite. if let Some(inline_plan) = plan.inline_plans.get(&result_idx) { - let orig_block = &original_ast.blocks[*before_idx]; - let new_block = &new_ast.blocks[*after_idx]; + let new_block = &new_blocks[*after_idx]; if let (Some(orig_inlines), Some(new_inlines)) = (block_inlines(orig_block), block_inlines(new_block)) + && !orig_inlines.is_empty() + && is_inline_splice_safe(new_inlines, inline_plan) + && block_attrs_eq(orig_block, new_block) { - if !orig_inlines.is_empty() - && is_inline_splice_safe(new_inlines, inline_plan) - && block_attrs_eq(orig_block, new_block) - { - // Safe to splice — assemble the patched block text - let block_text = assemble_inline_splice( - original_qmd, - orig_block, - orig_inlines, - new_inlines, - inline_plan, - )?; - CoarsenedEntry::InlineSplice { - block_text, - orig_idx: *before_idx, - } - } else { - CoarsenedEntry::Rewrite { - new_idx: result_idx, - } + // Safe to splice — assemble the patched block text + let block_text = assemble_inline_splice( + original_qmd, + orig_block, + orig_inlines, + new_inlines, + inline_plan, + target_file_id, + warnings, + 
)?; + CoarsenedEntry::InlineSplice { + block_text, + orig_idx: Some(*before_idx), } } else { - // Not an inline-content block — fall back to Rewrite CoarsenedEntry::Rewrite { - new_idx: result_idx, + block_text: write_block_to_string(new_block)?, } } } else { - // No inline plan — this is a block container (Div, BlockQuote, etc.) - // Fall back to Rewrite + // No inline plan — this is a block container (Div, + // BlockQuote, etc.). Fall back to Rewrite. CoarsenedEntry::Rewrite { - new_idx: result_idx, + block_text: write_block_to_string(&new_blocks[*after_idx])?, } } } @@ -220,6 +549,282 @@ fn coarsen( Ok(entries) } +/// Classify a single `KeepBefore` block per Plan 7's cascade: +/// +/// 1. **Verbatim** if `preimage_in(target)` returns `Some(range)` — covers +/// `Original`/`Substring`/contiguous-`Concat`/`Generated`-via-Invocation. +/// The atomic-kind shortcode case lands here too (its Invocation anchor +/// resolves to the token bytes). +/// 2. **Omit** if the source_info is `Generated` with `is_atomic_kind()` +/// and no Invocation anchor — filter constructions, title-block +/// synthesis, tree-sitter-postprocess space. Belt-and-suspenders +/// `debug_assert!` against shortcode-with-empty-from (Plan 6 stamper +/// invariant: every shortcode resolution must carry an Invocation). +/// 3. **Transparent** if the source_info is a non-atomic `Generated` +/// wrapper with source-bearing children (sectionize wrapper, +/// footnotes-container, appendix-container). Recurses into the +/// children. +/// 4. **Rewrite** catch-all — re-serializes the unchanged block through +/// the qmd writer. Lossy at the byte level but preserves content. +/// Handles cross-file Original chains (no Plan-8 wrapper yet), +/// Substring rooted outside target, gappy Concat. 
+/// +/// `top_level_orig_idx` is `Some(idx)` for top-level blocks (used by +/// `compute_separator`'s original-gap optimization) and `None` for +/// children of a `Transparent` (whose indices don't reference +/// `original_ast.blocks` directly). +/// +/// KeepBefore implies the original block at this position and the new +/// block at the same position are structurally equivalent (that's what +/// the reconciler's KeepBefore alignment *means*). So when we fall +/// through to Rewrite, serializing the original `block` yields the +/// same bytes as serializing the new one — by referential transparency +/// of `write_block_to_string`. We pick the original to avoid threading +/// the new slice down here. +fn coarsen_keep_before_block( + block: &Block, + target_file_id: quarto_source_map::FileId, + top_level_orig_idx: Option, +) -> Result> { + let si = block.source_info(); + + if let Some(range) = si.preimage_in(target_file_id) { + return Ok(CoarsenedEntry::Verbatim { + byte_range: range, + orig_idx: top_level_orig_idx, + }); + } + + if let SourceInfo::Generated { by, .. } = si { + if by.is_atomic_kind() { + // Atomic-kind Generated with no Invocation anchor. + debug_assert!( + !by.is_kind("shortcode"), + "Generated {{ by: shortcode, from: [] }} reached the writer — \ + Plan 6's stamper must always attach an Invocation anchor for \ + shortcode resolutions. \ + Block: {:?}", + block, + ); + return Ok(CoarsenedEntry::Omit); + } + + // Non-atomic Generated wrapper. If it has source-bearing children, + // recurse Transparent. Else fall through to Rewrite. + if let Some(children) = block_block_children(block) + && children + .iter() + .any(|c| c.source_info().preimage_in(target_file_id).is_some()) + { + let child_entries = children + .iter() + .map(|child| { + // Children of a Transparent wrapper aren't top-level + // blocks — pass orig_idx=None so compute_separator + // doesn't try the original-gap optimization on them. 
+ coarsen_keep_before_block(child, target_file_id, None) + }) + .collect::, _>>()?; + return Ok(CoarsenedEntry::Transparent { child_entries }); + } + } + + // Catch-all: cross-file Original, Substring rooted outside target, + // gappy Concat, Generated wrapper without source-bearing children. + Ok(CoarsenedEntry::Rewrite { + block_text: write_block_to_string(block)?, + }) +} + +/// Return the inner block children of a block, if the block is a +/// recognized block container. +/// +/// Today's Plan-6 synthesizers produce `Div`-shaped wrappers (sectionize, +/// footnotes-container, appendix-container). Other block containers +/// (BlockQuote, Figure, NoteDefinitionFencedBlock) round out the set so +/// the Transparent cascade applies uniformly when those carry Generated +/// source_info. List-shaped containers (BulletList, OrderedList, +/// DefinitionList) return `None` — their `content` is `Vec` +/// (lists of lists), which isn't the Transparent shape. +fn block_block_children(block: &Block) -> Option<&[Block]> { + match block { + Block::Div(d) => Some(&d.content), + Block::BlockQuote(b) => Some(&b.content), + Block::Figure(f) => Some(&f.content), + Block::NoteDefinitionFencedBlock(n) => Some(&n.content), + _ => None, + } +} + +// ============================================================================= +// Transparent wrappers +// ============================================================================= +// +// A *transparent wrapper* is a block that's structurally part of the +// AST but has no source bytes of its own — the user's actual content +// lives in its children. Sectionize Divs, footnotes containers, and +// appendix containers (from `pampa::pandoc::sugar::SectionizeTransform` +// and friends) are the canonical examples. A Lua filter that wraps +// user content in a Div is another: the wrapper has `Generated` +// source_info, but the children preserve their original positions. +// +// Code that asks "where do the user's source bytes live?" 
must +// **descend through transparent wrappers** rather than reading +// `blocks[0]` directly. The `first_in_user_tree` walker below is the +// reference implementation; `derive_target_file_id` / +// `first_target_anchored_start_in` are thin specializations. +// +// See `claude-notes/designs/transparent-wrappers.md` for the full +// contract and the rationale (why the predicate is structural rather +// than opt-in: filter authors don't have to register anything — the +// AST shape they emit *is* the declaration). + +/// Walk `blocks` depth-first, applying `extract` to each block. On a +/// `Some` result, stop and return it. On `None`, descend through +/// `block_block_children` and try again. This is how we see through +/// transparent wrappers: a wrapper has no source position of its +/// own, so `extract` returns `None` for it; the walker then looks +/// inside. +/// +/// Used by `derive_target_file_id` and `first_target_anchored_start_in`, +/// and intended as the building block for any future code that needs +/// to find "the first user block matching X" without having to +/// re-derive the descent. +fn first_in_user_tree(blocks: &[Block], extract: &impl Fn(&Block) -> Option) -> Option { + for block in blocks { + if let Some(v) = extract(block) { + return Some(v); + } + if let Some(children) = block_block_children(block) + && let Some(v) = first_in_user_tree(children, extract) + { + return Some(v); + } + } + None +} + +/// Returns `true` iff `block` is a transparent wrapper with respect +/// to `target_file_id`: +/// +/// 1. its `SourceInfo` is `Generated` with no `Invocation` anchor (so +/// it has no source token of its own), AND +/// 2. it has block-children (`block_block_children` recognises it as +/// a container — Div, BlockQuote, Figure, NoteDefinitionFencedBlock), AND +/// 3. at least one descendant has real `preimage_in(target_file_id)` +/// (so there's actual user content under it). 
+/// +/// Condition (3) is what makes this *structural* rather than opt-in: +/// a Lua filter that wraps existing user content in a Div produces a +/// Generated wrapper whose children carry their original preimage — +/// transparent. A filter that constructs a fresh Div from metadata +/// has no source-bearing children — atomic. Authors don't have to +/// declare anything; the AST shape declares it for them. +/// +/// Available to callers that need to make an explicit decision +/// (e.g. a future Q-3-44 diagnostic that hints "your filter walked +/// into the sectionize wrapper"). Routine source-position lookups +/// should use `first_in_user_tree` directly. +#[allow(dead_code)] +fn is_transparent_wrapper(block: &Block, target_file_id: quarto_source_map::FileId) -> bool { + if !matches!(block.source_info(), SourceInfo::Generated { .. }) { + return false; + } + if block.source_info().invocation_anchor().is_some() { + return false; + } + let Some(children) = block_block_children(block) else { + return false; + }; + first_in_user_tree(children, &|b| b.source_info().preimage_in(target_file_id)).is_some() +} + +/// Derive the "target file" — the file that `original_qmd` was parsed +/// from, used for editability and preimage checks throughout the +/// writer. +/// +/// Descends through transparent wrappers via `first_in_user_tree`, +/// so a synthesized first block (title-block, sectionize wrapper, +/// footnotes / appendix container) is skipped and the user's real +/// qmd `FileId` is returned. Falls back to `FileId(0)` only for the +/// genuinely-empty document. +/// +/// Closes Plan 7c Phase 8. +fn derive_target_file_id(blocks: &[Block]) -> quarto_source_map::FileId { + first_in_user_tree(blocks, &|b| b.source_info().root_file_id()) + .unwrap_or(quarto_source_map::FileId(0)) +} + +/// Find the start offset (in `target` bytes) of the first block in +/// `blocks` whose `source_info().preimage_in(target)` is `Some`, +/// descending through transparent wrappers. 
Used by +/// `emit_metadata_prefix` to locate the boundary between the YAML +/// frontmatter region and the first source-anchored user block. +fn first_target_anchored_start_in( + blocks: &[Block], + target: quarto_source_map::FileId, +) -> Option { + first_in_user_tree(blocks, &|b| { + b.source_info().preimage_in(target).map(|r| r.start) + }) +} + +// ============================================================================= +// Soft-drop diagnostic builders (Plan 7) +// ============================================================================= + +/// Build a `Q-3-42` warning for an inline-level edit that targeted +/// atomic-Generated content (typically a shortcode resolution). The +/// source location is the inline's `Invocation` anchor when available +/// (the token bytes), falling back to the inline's own source_info. +fn diagnostic_q3_42_inline(inline: &Inline) -> quarto_error_reporting::DiagnosticMessage { + let location = inline + .source_info() + .invocation_anchor() + .map(|arc| arc.as_ref().clone()) + .unwrap_or_else(|| inline.source_info().clone()); + + quarto_error_reporting::DiagnosticMessageBuilder::warning("Shortcode edit dropped") + .with_code("Q-3-42") + .with_location(location) + .problem( + "An edit to shortcode-resolved (or other atomic-Generated) \ + content was reverted.", + ) + .add_hint( + "The resolved text is read-only; edit the invocation token \ + (e.g. `{{< meta foo >}}`) in source instead.", + ) + .build() +} + +/// Build a `Q-3-43` warning for a block-level edit dropped because the +/// container is not editable inside. +/// +/// Three emission paths share this builder (per Plan 7 +/// §"Diagnostic codes"): +/// - Block RecurseIntoContainer on an atomic CustomNode — wrapper's +/// source_info is `Original` pointing at the token bytes; +/// `with_location` highlights the include / crossref in Monaco. 
+/// - Block RecurseIntoContainer on a no-preimage Generated container — +/// the wrapper's source_info is `Generated` with no Invocation; the +/// diagnostic lands without a Monaco squiggle and surfaces via the +/// diagnostics banner. +/// - Block UseAfter on a no-preimage Generated container — same as +/// the previous case. +fn diagnostic_q3_43_block(block: &Block) -> quarto_error_reporting::DiagnosticMessage { + quarto_error_reporting::DiagnosticMessageBuilder::warning("Generated content edit dropped") + .with_code("Q-3-43") + .with_location(block.source_info().clone()) + .problem("An edit to pipeline-generated content was reverted.") + .add_hint( + "This content has no editable source position in this file; \ + edit its upstream definition (an include, a metadata key, \ + or other source) instead.", + ) + .build() +} + // ============================================================================= // Step 2: Assemble the Result String // ============================================================================= @@ -237,20 +842,75 @@ fn assemble( let _has_meta_prefix = emit_metadata_prefix(&mut result, original_qmd, original_ast, new_ast, coarsened)?; - // 2b. Walk coarsened entries and assemble blocks with separators + // 2b. Walk coarsened entries and assemble blocks with separators. + // Transparent entries recursively re-enter this loop on their children; + // Omit entries contribute nothing. let mut prev_entry: Option<&CoarsenedEntry> = None; let mut prev_block_text: Option = None; + emit_entries( + &mut result, + original_qmd, + original_ast, + new_ast, + coarsened, + &mut prev_entry, + &mut prev_block_text, + )?; + + Ok(result) +} - for entry in coarsened { - // 2c. Separator between blocks - // Note: we only add a separator when there's a previous block. +/// Recursive helper that walks a slice of `CoarsenedEntry` and emits each +/// one's bytes into `result`, threading `prev_entry` / `prev_block_text` +/// across siblings. 
+/// +/// `Transparent` re-enters this loop with its children, sharing the same +/// `prev_entry` / `prev_block_text` state so separators compose across the +/// wrapper boundary as if the wrapper weren't there. `Omit` is a no-op — +/// no bytes, no separator update; the next sibling's separator is computed +/// against the entry before the `Omit`. +fn emit_entries<'e>( + result: &mut String, + original_qmd: &str, + original_ast: &Pandoc, + new_ast: &Pandoc, + entries: &'e [CoarsenedEntry], + prev_entry: &mut Option<&'e CoarsenedEntry>, + prev_block_text: &mut Option, +) -> Result<(), Vec> { + for entry in entries { + match entry { + CoarsenedEntry::Omit => { + // Contributes nothing; leave prev_entry / prev_block_text alone + // so the next sibling's separator is computed against the + // entry before this Omit. + continue; + } + CoarsenedEntry::Transparent { child_entries } => { + // Recurse into children with shared prev_* state so separator + // semantics compose through the wrapper. + emit_entries( + result, + original_qmd, + original_ast, + new_ast, + child_entries, + prev_entry, + prev_block_text, + )?; + continue; + } + _ => {} + } + + // Separator between blocks (only if there's a previous emitting entry). // The metadata prefix already includes the gap to the first block, // so we must NOT add an extra separator after it. if prev_entry.is_some() { let separator = compute_separator( original_qmd, original_ast, - prev_entry, + *prev_entry, entry, prev_block_text.as_deref(), ); @@ -262,18 +922,18 @@ fn assemble( CoarsenedEntry::Verbatim { byte_range, .. } => { original_qmd[byte_range.clone()].to_string() } - CoarsenedEntry::Rewrite { new_idx } => { - write_block_to_string(&new_ast.blocks[*new_idx])? - } + CoarsenedEntry::Rewrite { block_text } => block_text.clone(), CoarsenedEntry::InlineSplice { block_text, .. } => block_text.clone(), + // Transparent + Omit were handled above; coarsen never emits + // any other variant. + CoarsenedEntry::Transparent { .. 
} | CoarsenedEntry::Omit => unreachable!(), }; result.push_str(&block_text); - prev_block_text = Some(block_text); - prev_entry = Some(entry); + *prev_block_text = Some(block_text); + *prev_entry = Some(entry); } - - Ok(result) + Ok(()) } /// Emit the metadata prefix (YAML front matter region). @@ -286,16 +946,23 @@ fn emit_metadata_prefix( new_ast: &Pandoc, _coarsened: &[CoarsenedEntry], ) -> Result> { - // Determine where the metadata region ends by looking at the first - // ORIGINAL block's start offset. We must NOT use the first coarsened - // entry's offset — when blocks are removed from the beginning, the - // first coarsened block may reference a later original block whose - // start > 0, falsely triggering the metadata prefix logic. - let first_block_start = if !original_ast.blocks.is_empty() { - Some(block_source_span(&original_ast.blocks[0]).start) - } else { - None - }; + // Determine where the metadata region ends by finding the start + // offset of the first source-anchored ORIGINAL block in the target + // file. We must NOT use the first coarsened entry's offset — when + // blocks are removed from the beginning, the first coarsened block + // may reference a later original block whose start > 0, falsely + // triggering the metadata prefix logic. + // + // We must also NOT use `original_ast.blocks[0]`'s offset directly: + // when the post-pipeline AST wraps user content in a synthesized + // top-level container (sectionize Div, title-block, footnotes / + // appendix wrappers), `blocks[0].start_offset()` is 0 (Generated, + // no preimage), which would falsely conclude "no metadata region" + // and silently delete the YAML frontmatter. Descend through + // `block_block_children` of any such wrapper to find the first + // block with real preimage in the target file. 
+ let target_file_id = derive_target_file_id(&original_ast.blocks); + let first_block_start = first_target_anchored_start_in(&original_ast.blocks, target_file_id); // Check if there's a metadata region before the first block if let Some(start) = first_block_start { @@ -358,24 +1025,26 @@ fn compute_separator<'a>( curr_entry: &CoarsenedEntry, prev_block_text: Option<&str>, ) -> &'a str { - // Try to use original gap for consecutive blocks that preserve original positions - let prev_orig_idx = match prev_entry { - Some(CoarsenedEntry::Verbatim { orig_idx, .. }) => Some(*orig_idx), - Some(CoarsenedEntry::InlineSplice { orig_idx, .. }) => Some(*orig_idx), + // Try to use original gap for consecutive blocks that preserve original + // positions. Transparent/Omit entries don't carry a top-level orig_idx — + // they fall through to the standard separator. + let prev_orig_idx: Option = match prev_entry { + Some(CoarsenedEntry::Verbatim { orig_idx, .. }) => *orig_idx, + Some(CoarsenedEntry::InlineSplice { orig_idx, .. }) => *orig_idx, _ => None, }; - let curr_orig_idx = match curr_entry { - CoarsenedEntry::Verbatim { orig_idx, .. } => Some(*orig_idx), - CoarsenedEntry::InlineSplice { orig_idx, .. } => Some(*orig_idx), + let curr_orig_idx: Option = match curr_entry { + CoarsenedEntry::Verbatim { orig_idx, .. } => *orig_idx, + CoarsenedEntry::InlineSplice { orig_idx, .. 
} => *orig_idx, _ => None, }; - if let (Some(prev_idx), Some(curr_idx)) = (prev_orig_idx, curr_orig_idx) { - if curr_idx == prev_idx + 1 { - // Consecutive in original — use original gap - let prev_span = block_source_span(&original_ast.blocks[prev_idx]); - let curr_span = block_source_span(&original_ast.blocks[curr_idx]); - return &original_qmd[prev_span.end..curr_span.start]; - } + if let (Some(prev_idx), Some(curr_idx)) = (prev_orig_idx, curr_orig_idx) + && curr_idx == prev_idx + 1 + { + // Consecutive in original — use original gap + let prev_span = block_source_span(&original_ast.blocks[prev_idx]); + let curr_span = block_source_span(&original_ast.blocks[curr_idx]); + return &original_qmd[prev_span.end..curr_span.start]; } // Standard separator — but check if previous block already ends with \n\n @@ -605,6 +1274,8 @@ fn assemble_inline_splice( orig_inlines: &[Inline], new_inlines: &[Inline], plan: &InlineReconciliationPlan, + target_file_id: quarto_source_map::FileId, + warnings: &mut Vec, ) -> Result> { let block_span = block_source_span(orig_block); @@ -618,7 +1289,14 @@ fn assemble_inline_splice( let suffix = &original_qmd[inline_end..block_span.end]; // Assemble the new inline content - let inline_content = assemble_inline_content(original_qmd, orig_inlines, new_inlines, plan)?; + let inline_content = assemble_inline_content( + original_qmd, + orig_inlines, + new_inlines, + plan, + target_file_id, + warnings, + )?; Ok(format!("{}{}{}", prefix, inline_content, suffix)) } @@ -629,19 +1307,127 @@ fn assemble_inline_splice( /// - KeepBefore: copying the original inline's bytes verbatim /// - UseAfter: writing the new inline to a string /// - RecurseIntoContainer: preserving delimiters, recursing into children +/// +/// Plan 7: inline-level soft-drop substitutes `KeepBefore` for `UseAfter` +/// / `RecurseIntoContainer` alignments that target a non-editable original +/// inline (atomic-CustomNode, atomic-kind Generated, no-preimage +/// Generated). 
Each substitution pushes a `Q-3-42` warning. The +/// substitution uses the *new-side* index as the positional proxy for the +/// "original inline at the same position" — exact for in-place retypings +/// (the common shortcode-edit case), approximate for arbitrary +/// insertions/deletions. +/// +/// Plan 7 also adds multi-inline dedupe: consecutive `KeepBefore` entries +/// whose original inlines' `Invocation` anchors are `PartialEq`-equal +/// emit a single combined byte range, so a multi-inline shortcode +/// resolution (`{{< meta footer >}}` → `[Strong[Str], Space, Str]`) +/// emits the shortcode token bytes once. fn assemble_inline_content( original_qmd: &str, orig_inlines: &[Inline], new_inlines: &[Inline], plan: &InlineReconciliationPlan, + target_file_id: quarto_source_map::FileId, + warnings: &mut Vec, ) -> Result> { - let mut result = String::new(); - + // Phase 1: apply soft-drop substitutions. Walk alignments and rewrite + // UseAfter/RecurseIntoContainer that target non-editable original + // inlines into KeepBefore(original-position). + let mut effective: Vec = Vec::with_capacity(plan.inline_alignments.len()); for (result_idx, alignment) in plan.inline_alignments.iter().enumerate() { match alignment { + InlineAlignment::UseAfter(_) => { + // Use result_idx (positional proxy) to find the + // corresponding original inline. + if let Some(orig) = orig_inlines.get(result_idx) + && !is_editable_inside_inline(orig, target_file_id) + { + warnings.push(diagnostic_q3_42_inline(orig)); + effective.push(InlineAlignment::KeepBefore(result_idx)); + continue; + } + effective.push(alignment.clone()); + } + InlineAlignment::RecurseIntoContainer { before_idx, .. 
} => { + let orig = &orig_inlines[*before_idx]; + if !is_editable_inside_inline(orig, target_file_id) { + warnings.push(diagnostic_q3_42_inline(orig)); + effective.push(InlineAlignment::KeepBefore(*before_idx)); + continue; + } + effective.push(alignment.clone()); + } + InlineAlignment::KeepBefore(_) => effective.push(alignment.clone()), + } + } + + // Phase 2: emit, with multi-inline dedupe for consecutive + // KeepBefore entries whose Invocation anchors are PartialEq-equal. + let mut result = String::new(); + let mut i = 0; + while i < effective.len() { + match &effective[i] { InlineAlignment::KeepBefore(orig_idx) => { - let span = inline_source_span(&orig_inlines[*orig_idx]); - result.push_str(&original_qmd[span]); + let first_si = orig_inlines[*orig_idx].source_info(); + let first_invocation = first_si.invocation_anchor().cloned(); + + // Try to extend the run: gather all consecutive KeepBefore + // entries whose invocation_anchor() is PartialEq-equal to + // first_invocation. Only consider runs of length >= 2 for + // dedupe; a single inline emits via the normal path. + let mut j = i + 1; + if first_invocation.is_some() { + while j < effective.len() { + let InlineAlignment::KeepBefore(next_orig_idx) = &effective[j] else { + break; + }; + let next_invocation = orig_inlines[*next_orig_idx] + .source_info() + .invocation_anchor() + .cloned(); + if next_invocation != first_invocation { + break; + } + j += 1; + } + } + + if j > i + 1 { + // Dedupe: the whole group shares one Invocation anchor. + // Emit the Invocation source's preimage bytes once, + // not the individual inlines' ranges. Use the anchor + // source_info's preimage in the target file when + // available; fall back to the first inline's range. + let anchor_arc = first_invocation.unwrap(); + if let Some(range) = anchor_arc.preimage_in(target_file_id) { + result.push_str(&original_qmd[range]); + } else { + // Fall back: emit each inline's bytes individually. 
+ // Shouldn't happen — KeepBefore implies preimage_in + // succeeded for the surrounding block. Keep + // structurally safe behavior just in case. + for k in i..j { + let InlineAlignment::KeepBefore(idx) = &effective[k] else { + unreachable!() + }; + let span = inline_source_span(&orig_inlines[*idx]); + result.push_str(&original_qmd[span]); + } + } + i = j; + continue; + } + + // Singleton KeepBefore — emit the inline's preimage in + // the target file when available (covers Generated inlines + // whose Invocation anchor resolves into target), falling + // back to the inline's literal source span for Original + // inlines (the common case; identical bytes either way). + let range = orig_inlines[*orig_idx] + .source_info() + .preimage_in(target_file_id) + .unwrap_or_else(|| inline_source_span(&orig_inlines[*orig_idx])); + result.push_str(&original_qmd[range]); } InlineAlignment::UseAfter(after_idx) => { let text = write_inline_to_string(&new_inlines[*after_idx])?; @@ -655,11 +1441,14 @@ fn assemble_inline_content( original_qmd, &orig_inlines[*before_idx], &new_inlines[*after_idx], - plan.inline_container_plans.get(&result_idx), + plan.inline_container_plans.get(&i), + target_file_id, + warnings, )?; result.push_str(&text); } } + i += 1; } Ok(result) @@ -674,6 +1463,8 @@ fn assemble_recursed_container( orig_inline: &Inline, new_inline: &Inline, nested_plan: Option<&InlineReconciliationPlan>, + target_file_id: quarto_source_map::FileId, + warnings: &mut Vec, ) -> Result> { let orig_span = inline_source_span(orig_inline); @@ -700,7 +1491,14 @@ fn assemble_recursed_container( let closing = &original_qmd[last_child_end..orig_span.end]; // Recursively assemble children - let children_text = assemble_inline_content(original_qmd, orig_children, new_children, plan)?; + let children_text = assemble_inline_content( + original_qmd, + orig_children, + new_children, + plan, + target_file_id, + warnings, + )?; Ok(format!("{}{}{}", opening, children_text, closing)) } @@ -830,3 
+1628,701 @@ pub fn write_inline_to_string( ); Ok(result) } + +// ============================================================================= +// Tests +// ============================================================================= + +#[cfg(test)] +mod editability_tests { + use super::*; + use quarto_pandoc_types::{Block, CustomNode, Inline, Paragraph, Plain, Str, attr::empty_attr}; + use quarto_source_map::source_info::{AnchorRole, By}; + use std::sync::Arc; + + const TARGET: FileId = FileId(0); + const OTHER: FileId = FileId(1); + + fn make_str(text: &str, si: SourceInfo) -> Inline { + Inline::Str(Str { + text: text.into(), + source_info: si, + }) + } + + // ------------------------------------------------------------------------- + // is_editable_inside_block + // ------------------------------------------------------------------------- + + #[test] + fn editable_block_with_original_in_target() { + let block = Block::Paragraph(Paragraph { + content: vec![make_str("hello", SourceInfo::original(TARGET, 0, 5))], + source_info: SourceInfo::original(TARGET, 0, 5), + }); + assert!(is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn not_editable_block_with_original_outside_target() { + // Original points at a different file (cross-file reference, no + // wrapper). preimage_in(TARGET) returns None. + let block = Block::Paragraph(Paragraph { + content: vec![make_str("hi", SourceInfo::original(OTHER, 0, 2))], + source_info: SourceInfo::original(OTHER, 0, 2), + }); + assert!(!is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn not_editable_atomic_custom_node_block() { + // CrossrefResolvedRef is in ATOMIC_CUSTOM_NODES even though its + // source_info Original is in the target file. 
+ let cn = CustomNode::new( + "CrossrefResolvedRef", + empty_attr(), + SourceInfo::original(TARGET, 0, 10), + ); + let block = Block::Custom(cn); + assert!(!is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn editable_non_atomic_custom_node_block() { + // Non-atomic CustomNode (e.g., Callout) with source_info in target + // → editable. + let cn = CustomNode::new("Callout", empty_attr(), SourceInfo::original(TARGET, 0, 20)); + let block = Block::Custom(cn); + assert!(is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn not_editable_atomic_kind_generated_block() { + // Shortcode-resolved Para: Generated{by: shortcode, from: [Invocation]}. + // Even though Invocation resolves to a token in TARGET (so + // preimage_in returns Some), is_atomic_kind() shortcode means the + // user can't edit the *resolved content* — only the token. + let token = SourceInfo::original(TARGET, 100, 120); + let mut gen_info = SourceInfo::generated(By::shortcode("meta")); + gen_info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + let block = Block::Paragraph(Paragraph { + content: vec![], + source_info: gen_info, + }); + assert!(!is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn not_editable_no_preimage_generated_block() { + // Synthesized-from-metadata container: Generated with empty + // anchors (sectionize / footnotes / appendix container shape). + // preimage_in returns None → not editable. + let block = Block::Paragraph(Paragraph { + content: vec![], + source_info: SourceInfo::generated(By::sectionize()), + }); + assert!(!is_editable_inside_block(&block, TARGET)); + } + + #[test] + fn not_editable_value_source_only_generated_block() { + // Plan 9 shape: Generated with only ValueSource anchor (no + // Invocation). The ValueSource points into the target file's + // YAML metadata range, but the writer must NOT treat the + // interior as editable — those bytes are YAML, not body. 
+ let meta_si = SourceInfo::original(TARGET, 10, 25); + let mut gen_info = SourceInfo::generated(By::appendix()); + gen_info.append_anchor(AnchorRole::ValueSource, Arc::new(meta_si)); + let block = Block::Paragraph(Paragraph { + content: vec![], + source_info: gen_info, + }); + assert!(!is_editable_inside_block(&block, TARGET)); + } + + // ------------------------------------------------------------------------- + // is_editable_inside_inline + // ------------------------------------------------------------------------- + + #[test] + fn editable_inline_with_original_in_target() { + let inline = make_str("hi", SourceInfo::original(TARGET, 0, 2)); + assert!(is_editable_inside_inline(&inline, TARGET)); + } + + #[test] + fn not_editable_atomic_custom_node_inline() { + let cn = CustomNode::new( + "CrossrefResolvedRef", + empty_attr(), + SourceInfo::original(TARGET, 0, 8), + ); + let inline = Inline::Custom(cn); + assert!(!is_editable_inside_inline(&inline, TARGET)); + } + + #[test] + fn not_editable_atomic_kind_generated_inline() { + let token = SourceInfo::original(TARGET, 100, 120); + let mut gen_info = SourceInfo::generated(By::shortcode("meta")); + gen_info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + let inline = make_str("resolved", gen_info); + assert!(!is_editable_inside_inline(&inline, TARGET)); + } + + #[test] + fn not_editable_inline_with_original_outside_target() { + let inline = make_str("hi", SourceInfo::original(OTHER, 0, 2)); + assert!(!is_editable_inside_inline(&inline, TARGET)); + } + + // ------------------------------------------------------------------------- + // Sanity: Plain (non-Para) block carries the same predicate behaviour. 
+ // ------------------------------------------------------------------------- + + #[test] + fn editable_plain_block_with_original_in_target() { + let block = Block::Plain(Plain { + content: vec![make_str("hi", SourceInfo::original(TARGET, 0, 2))], + source_info: SourceInfo::original(TARGET, 0, 2), + }); + assert!(is_editable_inside_block(&block, TARGET)); + } +} + +#[cfg(test)] +mod coarsen_plan7_tests { + //! Plan 7: coarsen behavior under the new soft-drop + cascade rules. + //! + //! These tests construct `Pandoc` + `ReconciliationPlan` fixtures by + //! hand to exercise the new code paths directly. The existing + //! `incremental_writer_tests.rs` integration tests cover the + //! end-to-end (parse → reconcile → write) flow; these tests pin + //! coarsen's specific classification + soft-drop behavior. + + use super::*; + use quarto_ast_reconcile::types::{ + BlockAlignment, InlineAlignment, InlineReconciliationPlan, ReconciliationPlan, + }; + use quarto_pandoc_types::{Block, CustomNode, Div, Inline, Paragraph, Str, attr::empty_attr}; + use quarto_source_map::source_info::{AnchorRole, By}; + use std::sync::Arc; + + const TARGET: FileId = FileId(0); + const OTHER: FileId = FileId(1); + + fn make_str(text: &str, si: SourceInfo) -> Inline { + Inline::Str(Str { + text: text.into(), + source_info: si, + }) + } + + fn para(content: Vec, si: SourceInfo) -> Block { + Block::Paragraph(Paragraph { + content, + source_info: si, + }) + } + + // ------------------------------------------------------------------------- + // KeepBefore cascade + // ------------------------------------------------------------------------- + + #[test] + fn keep_before_with_original_in_target_emits_verbatim() { + let block = para(vec![], SourceInfo::original(TARGET, 10, 25)); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + 
let mut warnings = Vec::new(); + let qmd = "0123456789012345678901234567890"; + let entries = coarsen(qmd, &ast, &ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + match &entries[0] { + CoarsenedEntry::Verbatim { byte_range, .. } => { + assert_eq!(byte_range, &(10..25)); + } + other => panic!("expected Verbatim, got {:?}", other), + } + assert!(warnings.is_empty()); + } + + #[test] + fn keep_before_with_atomic_kind_generated_no_anchor_emits_omit() { + // Filter construction: Generated { by: filter, from: [] }. + // Atomic-kind, no Invocation → Omit (next pipeline run + // regenerates the decoration). + let block = para(vec![], SourceInfo::generated(By::filter("upper.lua", 14))); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let entries = coarsen("", &ast, &ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + assert!(matches!(entries[0], CoarsenedEntry::Omit)); + // KeepBefore branch doesn't emit warnings. + assert!(warnings.is_empty()); + } + + #[test] + fn keep_before_with_atomic_kind_generated_with_invocation_emits_verbatim() { + // Shortcode resolution: atomic-kind, Invocation in target → Verbatim. 
+ let token = SourceInfo::original(TARGET, 100, 120); + let mut gen_info = SourceInfo::generated(By::shortcode("meta")); + gen_info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + let block = para(vec![], gen_info); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(200); + let entries = coarsen(&qmd, &ast, &ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + match &entries[0] { + CoarsenedEntry::Verbatim { byte_range, .. } => { + assert_eq!(byte_range, &(100..120)); + } + other => panic!("expected Verbatim, got {:?}", other), + } + } + + #[test] + fn keep_before_with_nonatomic_generated_wrapper_emits_transparent() { + // Sectionize wrapper: Div with Generated { by: sectionize, from: [] } + // and source-bearing children (one Para in target). 
+ let child = para( + vec![make_str("hi", SourceInfo::original(TARGET, 10, 12))], + SourceInfo::original(TARGET, 10, 12), + ); + let div = Block::Div(Div { + attr: empty_attr(), + content: vec![child], + source_info: SourceInfo::generated(By::sectionize()), + attr_source: quarto_pandoc_types::AttrSourceInfo::empty(), + }); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![div], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(30); + let entries = coarsen(&qmd, &ast, &ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + match &entries[0] { + CoarsenedEntry::Transparent { child_entries } => { + assert_eq!(child_entries.len(), 1); + match &child_entries[0] { + CoarsenedEntry::Verbatim { + byte_range, + orig_idx, + } => { + assert_eq!(byte_range, &(10..12)); + // Children of Transparent get None for orig_idx. + assert_eq!(orig_idx, &None); + } + other => panic!("expected Verbatim child, got {:?}", other), + } + } + other => panic!("expected Transparent, got {:?}", other), + } + } + + #[test] + fn keep_before_cross_file_original_falls_back_to_rewrite() { + // Block whose source_info points at a different file (no preimage + // in target) AND isn't Generated → Rewrite (catch-all). + let block = para(vec![], SourceInfo::original(OTHER, 0, 10)); + let ast = quarto_pandoc_types::Pandoc { + blocks: vec![block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + // Note: target_file_id is derived from the first block's + // root_file_id, which for this AST is OTHER (FileId 1) — so + // preimage_in(OTHER) succeeds. To exercise the catch-all path + // we need a block whose source_info doesn't resolve in *its + // own* root file_id. 
Use a separate AST whose first-block + // file-id sets target = TARGET, but this block points at OTHER. + let target_setter = para(vec![], SourceInfo::original(TARGET, 0, 5)); + let block_cross = para(vec![], SourceInfo::original(OTHER, 0, 10)); + let ast2 = quarto_pandoc_types::Pandoc { + blocks: vec![target_setter, block_cross], + meta: ConfigValue::default(), + }; + let plan2 = ReconciliationPlan { + block_alignments: vec![BlockAlignment::KeepBefore(0), BlockAlignment::KeepBefore(1)], + ..Default::default() + }; + let qmd = "0".repeat(30); + let entries = coarsen(&qmd, &ast2, &ast2, &plan2, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 2); + // First entry resolves in target via preimage_in. + assert!(matches!(entries[0], CoarsenedEntry::Verbatim { .. })); + // Second entry doesn't resolve in target → Rewrite catch-all. + assert!(matches!(entries[1], CoarsenedEntry::Rewrite { .. })); + assert!(warnings.is_empty()); + // Silence unused: plan was for the single-block AST scenario above. + let _ = (ast, plan); + } + + // ------------------------------------------------------------------------- + // UseAfter soft-drop / let-user-win + // ------------------------------------------------------------------------- + + #[test] + fn use_after_on_atomic_custom_node_is_let_user_win_rewrite() { + // User replaced a CrossrefResolvedRef wholesale via a component + // menu. The new-side block IS the atomic CustomNode; we let the + // user win and Rewrite (no warning). 
+ let new_cn = CustomNode::new( + "CrossrefResolvedRef", + empty_attr(), + SourceInfo::original(TARGET, 0, 10), + ); + let new_ast = quarto_pandoc_types::Pandoc { + blocks: vec![Block::Custom(new_cn)], + meta: ConfigValue::default(), + }; + let orig_block = para(vec![], SourceInfo::original(TARGET, 0, 0)); + let original_ast = quarto_pandoc_types::Pandoc { + blocks: vec![orig_block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::UseAfter(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(20); + let entries = coarsen(&qmd, &original_ast, &new_ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + assert!(matches!(entries[0], CoarsenedEntry::Rewrite { .. })); + assert!( + warnings.is_empty(), + "let-user-win on atomic CustomNode must not emit a warning" + ); + } + + #[test] + fn use_after_on_no_preimage_generated_soft_drops_to_omit() { + // User replaced a synthesized-from-metadata container wholesale. + // The new-side block is Generated with no Invocation anchor + // → no source position to anchor a Rewrite → Omit + Q-3-43. 
+ let new_block = Block::Div(Div { + attr: empty_attr(), + content: vec![], + source_info: SourceInfo::generated(By::appendix()), + attr_source: quarto_pandoc_types::AttrSourceInfo::empty(), + }); + let new_ast = quarto_pandoc_types::Pandoc { + blocks: vec![new_block], + meta: ConfigValue::default(), + }; + let orig_block = para(vec![], SourceInfo::original(TARGET, 0, 0)); + let original_ast = quarto_pandoc_types::Pandoc { + blocks: vec![orig_block], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::UseAfter(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(20); + let entries = coarsen(&qmd, &original_ast, &new_ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + assert!(matches!(entries[0], CoarsenedEntry::Omit)); + assert_eq!(warnings.len(), 1); + assert_eq!(warnings[0].code.as_deref(), Some("Q-3-43")); + } + + // ------------------------------------------------------------------------- + // RecurseIntoContainer soft-drop on non-editable original block + // ------------------------------------------------------------------------- + + #[test] + fn recurse_into_atomic_custom_node_soft_drops_to_verbatim() { + // User typed inside a CrossrefResolvedRef. Substitute Verbatim + // (wrapper's preimage bytes) + Q-3-43. 
+ let orig_cn = CustomNode::new( + "CrossrefResolvedRef", + empty_attr(), + SourceInfo::original(TARGET, 5, 25), + ); + let new_cn = CustomNode::new( + "CrossrefResolvedRef", + empty_attr(), + SourceInfo::original(TARGET, 5, 25), + ); + let original_ast = quarto_pandoc_types::Pandoc { + blocks: vec![Block::Custom(orig_cn)], + meta: ConfigValue::default(), + }; + let new_ast = quarto_pandoc_types::Pandoc { + blocks: vec![Block::Custom(new_cn)], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![BlockAlignment::RecurseIntoContainer { + before_idx: 0, + after_idx: 0, + }], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(30); + let entries = coarsen(&qmd, &original_ast, &new_ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 1); + match &entries[0] { + CoarsenedEntry::Verbatim { byte_range, .. } => { + assert_eq!(byte_range, &(5..25)); + } + other => panic!("expected Verbatim, got {:?}", other), + } + assert_eq!(warnings.len(), 1); + assert_eq!(warnings[0].code.as_deref(), Some("Q-3-43")); + } + + #[test] + fn recurse_into_no_preimage_generated_soft_drops_to_omit() { + // User typed inside a synthesized appendix container (Generated + // with no Invocation anchor, no preimage in target). + let orig_div = Block::Div(Div { + attr: empty_attr(), + content: vec![para(vec![], SourceInfo::original(TARGET, 0, 5))], + source_info: SourceInfo::generated(By::appendix()), + attr_source: quarto_pandoc_types::AttrSourceInfo::empty(), + }); + let new_div = Block::Div(Div { + attr: empty_attr(), + content: vec![para(vec![], SourceInfo::original(TARGET, 0, 5))], + source_info: SourceInfo::generated(By::appendix()), + attr_source: quarto_pandoc_types::AttrSourceInfo::empty(), + }); + // Force target_file_id to TARGET by giving the AST another block + // whose source_info is Original in TARGET. 
+ let target_setter = para(vec![], SourceInfo::original(TARGET, 0, 5)); + let original_ast = quarto_pandoc_types::Pandoc { + blocks: vec![target_setter.clone(), orig_div], + meta: ConfigValue::default(), + }; + let new_ast = quarto_pandoc_types::Pandoc { + blocks: vec![target_setter, new_div], + meta: ConfigValue::default(), + }; + let plan = ReconciliationPlan { + block_alignments: vec![ + BlockAlignment::KeepBefore(0), + BlockAlignment::RecurseIntoContainer { + before_idx: 1, + after_idx: 1, + }, + ], + ..Default::default() + }; + let mut warnings = Vec::new(); + let qmd = "0".repeat(30); + let entries = coarsen(&qmd, &original_ast, &new_ast, &plan, &mut warnings).unwrap(); + + assert_eq!(entries.len(), 2); + assert!(matches!(entries[0], CoarsenedEntry::Verbatim { .. })); + assert!(matches!(entries[1], CoarsenedEntry::Omit)); + assert_eq!(warnings.len(), 1); + assert_eq!(warnings[0].code.as_deref(), Some("Q-3-43")); + } + + // ------------------------------------------------------------------------- + // Inline-level multi-inline dedupe + soft-drop + // ------------------------------------------------------------------------- + + fn shortcode_inline(text: &str, token_si: SourceInfo) -> Inline { + let mut gen_info = SourceInfo::generated(By::shortcode("meta")); + gen_info.append_anchor(AnchorRole::Invocation, Arc::new(token_si)); + make_str(text, gen_info) + } + + #[test] + fn multi_inline_dedupe_emits_token_once_when_invocation_shared() { + // Three inlines sharing the same Invocation anchor (a multi-inline + // shortcode resolution). The original qmd has the shortcode token + // at bytes 0..19. Expected output: those 19 bytes once. 
+ let qmd = "{{< meta footer >}}"; + assert_eq!(qmd.len(), 19); + let token_si = SourceInfo::original(TARGET, 0, 19); + + let orig_inlines = vec![ + shortcode_inline("Hello", token_si.clone()), + shortcode_inline(" ", token_si.clone()), + shortcode_inline("World", token_si.clone()), + ]; + let new_inlines = orig_inlines.clone(); + let plan = InlineReconciliationPlan { + inline_alignments: vec![ + InlineAlignment::KeepBefore(0), + InlineAlignment::KeepBefore(1), + InlineAlignment::KeepBefore(2), + ], + ..Default::default() + }; + + let mut warnings = Vec::new(); + let out = assemble_inline_content( + qmd, + &orig_inlines, + &new_inlines, + &plan, + TARGET, + &mut warnings, + ) + .unwrap(); + + assert_eq!( + out, qmd, + "Three shared-Invocation inlines must emit the token bytes once" + ); + } + + #[test] + fn multi_inline_no_dedupe_when_invocations_differ() { + // Two inlines, each pointing at a *different* token range — no + // dedupe; each emits its own range. + let qmd = "AB"; + let orig_inlines = vec![ + shortcode_inline("A", SourceInfo::original(TARGET, 0, 1)), + shortcode_inline("B", SourceInfo::original(TARGET, 1, 2)), + ]; + let new_inlines = orig_inlines.clone(); + let plan = InlineReconciliationPlan { + inline_alignments: vec![ + InlineAlignment::KeepBefore(0), + InlineAlignment::KeepBefore(1), + ], + ..Default::default() + }; + let mut warnings = Vec::new(); + let out = assemble_inline_content( + qmd, + &orig_inlines, + &new_inlines, + &plan, + TARGET, + &mut warnings, + ) + .unwrap(); + + // No dedupe: each inline's bytes emit. + assert_eq!(out, "AB"); + } + + #[test] + fn multi_inline_dedupe_with_value_source_difference_still_dedupes() { + // Forward-compat with Plan 9: two inlines whose Invocation anchors + // are PartialEq-equal but whose ValueSource anchors differ — still + // dedupes (dedupe consults Invocation only). 
+ let qmd = "{{< meta foo >}}"; + let token_si = SourceInfo::original(TARGET, 0, qmd.len()); + + let mut si_a = SourceInfo::generated(By::shortcode("meta")); + si_a.append_anchor(AnchorRole::Invocation, Arc::new(token_si.clone())); + si_a.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(TARGET, 100, 110)), + ); + + let mut si_b = SourceInfo::generated(By::shortcode("meta")); + si_b.append_anchor(AnchorRole::Invocation, Arc::new(token_si)); + si_b.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(TARGET, 200, 215)), + ); + + let orig_inlines = vec![make_str("a", si_a), make_str("b", si_b)]; + let new_inlines = orig_inlines.clone(); + let plan = InlineReconciliationPlan { + inline_alignments: vec![ + InlineAlignment::KeepBefore(0), + InlineAlignment::KeepBefore(1), + ], + ..Default::default() + }; + let mut warnings = Vec::new(); + let out = assemble_inline_content( + qmd, + &orig_inlines, + &new_inlines, + &plan, + TARGET, + &mut warnings, + ) + .unwrap(); + + // Still dedupes — emit the token once. + assert_eq!(out, qmd); + } + + #[test] + fn inline_use_after_on_atomic_generated_soft_drops_to_keep_before_with_q3_42() { + // User retyped over a shortcode-resolved inline. UseAfter + // → KeepBefore(0) (the positional proxy) + Q-3-42. + let qmd = "{{< meta foo >}}"; + let token_si = SourceInfo::original(TARGET, 0, qmd.len()); + let mut gen_info = SourceInfo::generated(By::shortcode("meta")); + gen_info.append_anchor(AnchorRole::Invocation, Arc::new(token_si)); + + let orig_inlines = vec![make_str("Resolved", gen_info)]; + // New-side inline: a plain user edit (no Invocation anchor). 
+ let new_inlines = vec![make_str("Retyped", SourceInfo::default())]; + let plan = InlineReconciliationPlan { + inline_alignments: vec![InlineAlignment::UseAfter(0)], + ..Default::default() + }; + let mut warnings = Vec::new(); + let out = assemble_inline_content( + qmd, + &orig_inlines, + &new_inlines, + &plan, + TARGET, + &mut warnings, + ) + .unwrap(); + + // Soft-drop: emit the original inline's bytes (its preimage maps + // to the whole shortcode token). + assert_eq!(out, qmd); + assert_eq!(warnings.len(), 1); + assert_eq!(warnings[0].code.as_deref(), Some("Q-3-42")); + } +} diff --git a/crates/pampa/src/writers/json.rs b/crates/pampa/src/writers/json.rs index 0c7237844..74910cca0 100644 --- a/crates/pampa/src/writers/json.rs +++ b/crates/pampa/src/writers/json.rs @@ -11,7 +11,7 @@ use crate::pandoc::{ use hashlink::LinkedHashMap; use quarto_error_reporting::{DiagnosticMessage, DiagnosticMessageBuilder}; use quarto_pandoc_types::{ConfigValue, ConfigValueKind}; -use quarto_source_map::{FileId, SourceInfo}; +use quarto_source_map::{AnchorRole, By, FileId, SourceInfo}; use serde::Serialize; use serde_json::{Value, json}; use std::collections::HashMap; @@ -110,9 +110,15 @@ struct FileEntryJson { /// Fields ordered alphabetically: d, r, t #[derive(Serialize)] struct SourceInfoJson { - d: Value, // data (file_id, parent_id, pieces, or filter info) + d: Value, // data (file_id, parent_id, pieces, or Generated { by, from }) r: [usize; 2], // range [start, end] - t: u8, // type code (0=Original, 1=Substring, 2=Concat, 3=FilterProvenance) + // type code: + // 0 = Original + // 1 = Substring + // 2 = Concat + // 3 = Legacy (read-only — old Transformed + buggy FilterProvenance) + // 4 = Generated { by, from } + t: u8, } /// Generic node with type, optional content, and source info. 
@@ -177,8 +183,26 @@ impl SerializableSourceInfo { .collect(); (2, json!(piece_arrays)) } - SerializableSourceMapping::FilterProvenance { filter_path, line } => { - (3, json!((filter_path, line))) + SerializableSourceMapping::Generated { by, from } => { + let mut by_json = json!({ "kind": by.kind }); + if !by.data.is_null() { + by_json["data"] = by.data.clone(); + } + let mut d_obj = serde_json::Map::new(); + d_obj.insert("by".to_string(), by_json); + if !from.is_empty() { + let arr: Vec = from + .iter() + .map(|(role, si_id)| { + json!({ + "role": serialize_anchor_role(role), + "si_id": si_id, + }) + }) + .collect(); + d_obj.insert("from".to_string(), Value::Array(arr)); + } + (4, Value::Object(d_obj)) } }; SourceInfoJson { @@ -189,6 +213,19 @@ impl SerializableSourceInfo { } } +/// Serialize an [`AnchorRole`] to its wire-format string. +/// +/// Inverse of `parse_anchor_role` in `crates/pampa/src/readers/json.rs`. +/// The two must agree on the string forms — see also the TS mirror at +/// `ts-packages/preview-renderer/src/types/sourceInfo.ts`. +fn serialize_anchor_role(role: &AnchorRole) -> String { + match role { + AnchorRole::Invocation => "invocation".to_string(), + AnchorRole::ValueSource => "value-source".to_string(), + AnchorRole::Other(s) => format!("other:{}", s), + } +} + /// Serializable version of SourceMapping that uses parent_id instead of Rc. enum SerializableSourceMapping { Original { @@ -200,9 +237,15 @@ enum SerializableSourceMapping { Concat { pieces: Vec, }, - FilterProvenance { - filter_path: String, - line: usize, + /// Wire-code 4: a pipeline transform's output. + /// + /// `by` carries the producer identity (kebab-case `kind` + optional + /// JSON `data`). `from` is an ordered list of `(role, si_id)` + /// pairs — each `si_id` points to another pool entry that already + /// exists (interned strictly before this entry). 
+ Generated { + by: By, + from: Vec<(AnchorRole, usize)>, }, } @@ -311,14 +354,42 @@ impl<'a> SourceInfoSerializer<'a> { }, ) } - SourceInfo::FilterProvenance { filter_path, line } => ( - 0, - 0, - SerializableSourceMapping::FilterProvenance { - filter_path: filter_path.clone(), - line: *line, - }, - ), + SourceInfo::Generated { by, from } => { + // Anchors are interned *before* this Generated entry so that + // every si_id is strictly less than the resulting pool index + // — the reader's `si_id < current_index` guard depends on it. + // + // Dedup keyed by `Arc::as_ptr(&anchor.source_info)`, sharing + // the same `arc_parent_ids` cache used for `Substring.parent`. + // Multi-inline shortcode resolutions whose anchors point at a + // shared `Arc` collapse to a single pool entry on the write + // side; deserialization rebuilds each anchor with a fresh + // Arc, so this is a write-time optimization only (see Plan 5 + // §"Risk areas" → anchor-dedup-invariant). + let from_ids: Vec<(AnchorRole, usize)> = from + .iter() + .map(|anchor| { + let arc_ptr = std::sync::Arc::as_ptr(&anchor.source_info); + let id = if let Some(&id) = self.arc_parent_ids.get(&arc_ptr) { + self.stat_arc_parent_hits += 1; + id + } else { + let id = self.intern(&anchor.source_info); + self.arc_parent_ids.insert(arc_ptr, id); + id + }; + (anchor.role.clone(), id) + }) + .collect(); + ( + 0, + 0, + SerializableSourceMapping::Generated { + by: by.clone(), + from: from_ids, + }, + ) + } }; let id = self.pool.len(); @@ -555,7 +626,7 @@ fn node_with_source( // to map offsets to row/column positions. Commenting out for now. 
// fn write_location(source_info: &quarto_source_map::SourceInfo, ctx: &SourceContext) -> Value { // // Extract filename index by walking to the Original mapping -// let filename_index = crate::pandoc::location::extract_filename_index(source_info); +// let filename_index = source_info.root_file_id().map(|fid| fid.0); // // // Map start and end offsets to locations with row/column // let start_mapped = source_info.map_offset(0, ctx).unwrap(); @@ -3496,11 +3567,35 @@ fn stream_write_source_info_pool( } w.end_array()?; } - SerializableSourceMapping::FilterProvenance { filter_path, line } => { - w.begin_array()?; - w.str_value(filter_path)?; - w.u64_value(*line as u64)?; - w.end_array()?; + SerializableSourceMapping::Generated { by, from } => { + // Mirror SerializableSourceInfo::to_json byte-for-byte. + // Object shape: { "by": { "kind": ..., "data": ... }, + // "from": [ { "role": ..., "si_id": N }, ... ] } + // `data` is skipped when null; `from` is skipped when empty. + w.begin_object()?; + w.key("by")?; + w.begin_object()?; + w.key("kind")?; + w.str_value(&by.kind)?; + if !by.data.is_null() { + w.key("data")?; + stream_write_json_value(w, &by.data)?; + } + w.end_object()?; + if !from.is_empty() { + w.key("from")?; + w.begin_array()?; + for (role, si_id) in from { + w.begin_object()?; + w.key("role")?; + w.str_value(&serialize_anchor_role(role))?; + w.key("si_id")?; + w.u64_value(*si_id as u64)?; + w.end_object()?; + } + w.end_array()?; + } + w.end_object()?; } } w.key("r")?; @@ -3513,7 +3608,7 @@ fn stream_write_source_info_pool( SerializableSourceMapping::Original { .. } => 0, SerializableSourceMapping::Substring { .. } => 1, SerializableSourceMapping::Concat { .. } => 2, - SerializableSourceMapping::FilterProvenance { .. } => 3, + SerializableSourceMapping::Generated { .. } => 4, })?; w.end_object()?; } @@ -3521,6 +3616,45 @@ fn stream_write_source_info_pool( Ok(()) } +/// Recursively stream-write an arbitrary `serde_json::Value` via the +/// `JsonStreamWriter`. 
Used to emit the `By.data` payload inside a +/// `Generated` pool entry without materializing a serialized buffer. +fn stream_write_json_value(w: &mut JsonStreamWriter, v: &Value) -> io::Result<()> { + match v { + Value::Null => w.null_value(), + Value::Bool(b) => w.bool_value(*b), + Value::Number(n) => { + if let Some(u) = n.as_u64() { + w.u64_value(u) + } else if let Some(i) = n.as_i64() { + w.i64_value(i) + } else if let Some(f) = n.as_f64() { + w.f64_value(f) + } else { + // Unreachable: serde_json::Number always converts to one of + // the three numeric forms above. Emit null defensively. + w.null_value() + } + } + Value::String(s) => w.str_value(s), + Value::Array(arr) => { + w.begin_array()?; + for item in arr { + stream_write_json_value(w, item)?; + } + w.end_array() + } + Value::Object(obj) => { + w.begin_object()?; + for (k, val) in obj { + w.key(k)?; + stream_write_json_value(w, val)?; + } + w.end_object() + } + } +} + /// Emit the whole document. Streaming order: /// `{blocks, meta, pandoc-api-version, astContext}` — alphabetical-friendly /// except astContext last (it carries `sourceInfoPool` which is only complete @@ -3673,7 +3807,8 @@ fn stream_write_pandoc( #[cfg(test)] mod tests { use super::*; - use quarto_source_map::{FileId, SourceInfo}; + use quarto_source_map::{Anchor, AnchorRole, By, FileId, SourceInfo}; + use smallvec::SmallVec; use std::sync::Arc; fn make_test_context() -> ASTContext { @@ -4205,4 +4340,273 @@ mod tests { _ => panic!("Expected Custom block"), } } + + // ---------------------------------------------------------------- + // Plan 5 Phase 3+4 — writer-side Generated emission + // ---------------------------------------------------------------- + + /// `Generated { by, from: [] }` interns as a single code-4 pool entry + /// with `r = (0, 0)` and the right `by` shape. 
+ #[test] + fn test_source_info_pool_generated_no_anchors() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let gen_info = SourceInfo::Generated { + by: By::sectionize(), + from: SmallVec::new(), + }; + let id = serializer.intern(&gen_info); + + assert_eq!(id, 0); + assert_eq!(serializer.pool.len(), 1); + let entry = &serializer.pool[0]; + assert_eq!(entry.start_offset, 0); + assert_eq!(entry.end_offset, 0); + match &entry.mapping { + SerializableSourceMapping::Generated { by, from } => { + assert_eq!(by.kind, "sectionize"); + assert!(by.data.is_null()); + assert!(from.is_empty()); + } + _ => panic!("Expected Generated mapping"), + } + } + + /// `Generated { by: filter, from: [] }` carries `by.data` through. + #[test] + fn test_source_info_pool_generated_filter_with_data() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let gen_info = SourceInfo::generated(By::filter("/x.lua", 42)); + let id = serializer.intern(&gen_info); + + let entry = &serializer.pool[id]; + match &entry.mapping { + SerializableSourceMapping::Generated { by, .. } => { + assert_eq!(by.kind, "filter"); + assert_eq!(by.as_filter(), Some(("/x.lua", 42))); + } + _ => panic!("Expected Generated mapping"), + } + } + + /// Anchors must be interned strictly *before* their owning Generated + /// entry — the reader's `si_id < current_index` guard requires it. 
+ #[test] + fn test_source_info_pool_generated_with_invocation_anchor() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let target = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 5, + end_offset: 12, + }); + let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(Arc::clone(&target))); + let gen_info = SourceInfo::Generated { + by: By::shortcode("meta"), + from, + }; + + let id = serializer.intern(&gen_info); + // Anchor target interned first (ID 0), Generated second (ID 1). + assert_eq!(id, 1); + assert!(matches!( + serializer.pool[0].mapping, + SerializableSourceMapping::Original { .. } + )); + match &serializer.pool[1].mapping { + SerializableSourceMapping::Generated { by, from } => { + assert_eq!(by.kind, "shortcode"); + assert_eq!(from.len(), 1); + assert!(matches!(from[0].0, AnchorRole::Invocation)); + assert_eq!(from[0].1, 0); // si_id points to the target + } + _ => panic!("Expected Generated mapping"), + } + } + + /// Multi-inline shortcode resolution: N Generated nodes sharing one + /// `Arc` anchor target collapse to a single pool entry on + /// the write side. The dedup is keyed by `Arc::as_ptr`. + #[test] + fn test_source_info_pool_generated_anchor_dedup() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let shared = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 0, + end_offset: 10, + }); + + // Three sibling Generated entries each pointing at `shared`. 
+ let make = || { + let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(Arc::clone(&shared))); + SourceInfo::Generated { + by: By::shortcode("meta"), + from, + } + }; + let id1 = serializer.intern(&make()); + let id2 = serializer.intern(&make()); + let id3 = serializer.intern(&make()); + + // Pool: shared(0), gen1(1), gen2(2), gen3(3) — shared interned once. + assert_eq!(serializer.pool.len(), 4); + let original_count = serializer + .pool + .iter() + .filter(|e| matches!(e.mapping, SerializableSourceMapping::Original { .. })) + .count(); + assert_eq!(original_count, 1, "shared target must intern exactly once"); + + for id in [id1, id2, id3] { + match &serializer.pool[id].mapping { + SerializableSourceMapping::Generated { from, .. } => { + assert_eq!(from.len(), 1); + assert_eq!(from[0].1, 0); // all reference the same si_id + } + _ => panic!("Expected Generated"), + } + } + } + + /// `Concat { pieces: [Generated, ...] }` round-trips: each piece's + /// Generated source_info interns through the new code-4 path; the + /// outer Concat references those IDs. + #[test] + fn test_source_info_pool_concat_of_generated() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let g1 = SourceInfo::generated(By::filter("/a.lua", 1)); + let g2 = SourceInfo::generated(By::filter("/b.lua", 2)); + let concat = SourceInfo::concat(vec![(g1, 5), (g2, 7)]); + + let id = serializer.intern(&concat); + // Two Generated entries (0, 1) + Concat (2). + assert_eq!(id, 2); + assert!(matches!( + serializer.pool[0].mapping, + SerializableSourceMapping::Generated { .. } + )); + assert!(matches!( + serializer.pool[1].mapping, + SerializableSourceMapping::Generated { .. 
} + )); + match &serializer.pool[2].mapping { + SerializableSourceMapping::Concat { pieces } => { + assert_eq!(pieces.len(), 2); + assert_eq!(pieces[0].source_info_id, 0); + assert_eq!(pieces[1].source_info_id, 1); + } + _ => panic!("Expected Concat"), + } + } + + /// `Substring { parent: Arc, ... }` interns the Generated + /// parent first; the Substring references it by ID. + #[test] + fn test_source_info_pool_substring_of_generated() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let parent = Arc::new(SourceInfo::generated(By::filter("/x.lua", 1))); + let child = SourceInfo::Substring { + parent: Arc::clone(&parent), + start_offset: 0, + end_offset: 4, + }; + let id = serializer.intern(&child); + + assert_eq!(id, 1); + assert!(matches!( + serializer.pool[0].mapping, + SerializableSourceMapping::Generated { .. } + )); + match &serializer.pool[1].mapping { + SerializableSourceMapping::Substring { parent_id } => { + assert_eq!(*parent_id, 0); + } + _ => panic!("Expected Substring"), + } + } + + /// `to_json` emits the Generated entry as `{"t":4, "r":[0,0], "d": ...}` + /// with the expected `by`/`from` shape. 
+ #[test] + fn test_to_json_generated_emits_code_4() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let target = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 5, + end_offset: 12, + }); + let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(Arc::clone(&target))); + let gen_info = SourceInfo::Generated { + by: By::shortcode("meta"), + from, + }; + let _ = serializer.intern(&gen_info); + + let gen_entry_json = serializer.pool[1].to_json(); + assert_eq!(gen_entry_json.t, 4); + assert_eq!(gen_entry_json.r, [0, 0]); + + // Expected wire shape: + // { "by": { "kind": "shortcode", "data": { "name": "meta" } }, + // "from": [ { "role": "invocation", "si_id": 0 } ] } + let expected = json!({ + "by": { "kind": "shortcode", "data": { "name": "meta" } }, + "from": [ { "role": "invocation", "si_id": 0 } ] + }); + assert_eq!(gen_entry_json.d, expected); + } + + /// `to_json` skips `"data"` when `by.data` is null and skips `"from"` + /// when the anchor list is empty. + #[test] + fn test_to_json_generated_skips_null_data_and_empty_from() { + let context = make_test_context(); + let config = make_test_config(); + let mut serializer = SourceInfoSerializer::new(&context, &config); + + let gen_info = SourceInfo::generated(By::sectionize()); + let _ = serializer.intern(&gen_info); + let entry_json = serializer.pool[0].to_json(); + assert_eq!(entry_json.t, 4); + // Exactly: { "by": { "kind": "sectionize" } } — no data, no from. + let expected = json!({ "by": { "kind": "sectionize" } }); + assert_eq!(entry_json.d, expected); + } + + /// Wire strings emitted by the writer's `serialize_anchor_role` — + /// every known role plus an extension-defined `Other` maps to its expected form. 
+ #[test] + fn test_serialize_anchor_role_all_roles() { + assert_eq!(serialize_anchor_role(&AnchorRole::Invocation), "invocation"); + assert_eq!( + serialize_anchor_role(&AnchorRole::ValueSource), + "value-source" + ); + assert_eq!( + serialize_anchor_role(&AnchorRole::Other("ext/foo/bar".to_string())), + "other:ext/foo/bar" + ); + } } diff --git a/crates/pampa/tests/incremental_writer_tests.rs b/crates/pampa/tests/incremental_writer_tests.rs index d8b782299..c1437eb4d 100644 --- a/crates/pampa/tests/incremental_writer_tests.rs +++ b/crates/pampa/tests/incremental_writer_tests.rs @@ -53,8 +53,10 @@ fn read_json(json: &str) -> Pandoc { .0 } -/// Simulate the WASM incremental_write_qmd path: -/// 1. Parse original_qmd to get original_ast with accurate source spans +/// Simulate the WASM incremental_write_qmd path (Plan 7 contract): +/// 1. Parse original_qmd to get the baseline AST with accurate source spans +/// (in the real bridge the caller supplies this; here we synthesize it +/// from the qmd to keep the helper self-contained) /// 2. JSON round-trip the new_ast (simulates client serialization/deserialization) /// 3. Compute reconciliation plan and run incremental_write fn incremental_write_via_json_roundtrip(original_qmd: &str, new_ast: &Pandoc) -> String { @@ -64,6 +66,7 @@ fn incremental_write_via_json_roundtrip(original_qmd: &str, new_ast: &Pandoc) -> let plan = compute_reconciliation(&original_ast, &new_ast_from_json); writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast_from_json, &plan) .expect("incremental_write failed") + .0 } // ============================================================================= @@ -89,7 +92,8 @@ fn assert_idempotent(input: &str) { } let result = writers::incremental::incremental_write(input, &ast, &ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert_eq!( result, input, @@ -293,6 +297,568 @@ A paragraph. 
); } +// ============================================================================= +// Sectionize wrapper soft-drop (incremental.rs RecurseIntoContainer regression) +// ============================================================================= +// +// The post-q2-preview-pipeline AST wraps all user content in a single +// top-level `Block::Div` with `SourceInfo::Generated { by: sectionize }` +// (no Invocation anchor). When the React side mutates a child Para and +// posts the new AST, the reconciler aligns "1 Div : 1 Div" as a +// `RecurseIntoContainer`. The Plan 7 soft-drop guard in coarsen +// (`incremental.rs:342`) trips because `is_editable_inside_block` on a +// no-preimage Generated wrapper returns false — and since the *whole* +// document is the wrapper, the resulting `CoarsenedEntry::Omit` +// produces an empty document. +// +// The correct behavior: recurse Transparent into the wrapper's +// source-bearing children using `block_container_plans[result_idx]`, +// the same way `coarsen_keep_before_block` handles unchanged +// non-atomic Generated wrappers (`incremental.rs:459-479`). + +/// Construct a `Pandoc` whose first (and only) top-level block is a +/// `Generated { by: sectionize }` Div wrapping the parsed AST of the +/// supplied qmd. The inner blocks retain their original Source positions. 
+fn wrap_in_sectionize_div(parsed: pampa::pandoc::Pandoc) -> pampa::pandoc::Pandoc { + use pampa::pandoc::Block; + let wrapper_si = quarto_source_map::SourceInfo::generated(quarto_source_map::By::sectionize()); + let wrapper = Block::Div(pampa::pandoc::Div { + attr: ( + String::new(), + vec!["section".to_string()], + hashlink::LinkedHashMap::new(), + ), + content: parsed.blocks, + source_info: wrapper_si, + attr_source: pampa::pandoc::attr::AttrSourceInfo::empty(), + }); + pampa::pandoc::Pandoc { + blocks: vec![wrapper], + ..parsed + } +} + +#[test] +fn sectionize_wrapper_with_inner_para_edit_produces_nonempty_output() { + // Original qmd: a header followed by a paragraph. + let original_qmd = "# Heading\n\nA paragraph that the user will edit.\n"; + + // Baseline AST mirrors the post-pipeline shape: the whole document + // wrapped in a sectionize Div. + let baseline_ast = wrap_in_sectionize_div(parse_qmd(original_qmd)); + + // New AST: copy baseline, dive into the Div's content, append a + // reaction Span to the inner Paragraph (mirrors comment.tsx's + // addReaction path). 
+ let mut new_ast = baseline_ast.clone(); + { + let pampa::pandoc::Block::Div(ref mut div) = new_ast.blocks[0] else { + panic!("expected wrapper Div at blocks[0]"); + }; + let last_idx = div + .content + .iter() + .rposition(|b| matches!(b, pampa::pandoc::Block::Paragraph(_))) + .expect("paragraph inside wrapper"); + if let pampa::pandoc::Block::Paragraph(ref mut p) = div.content[last_idx] { + let attr = ( + String::new(), + vec!["quarto-edit-comment".to_string()], + hashlink::LinkedHashMap::new(), + ); + p.content + .push(pampa::pandoc::Inline::Span(pampa::pandoc::Span { + attr, + content: vec![pampa::pandoc::Inline::Str(pampa::pandoc::Str { + text: "🎉".to_string(), + source_info: quarto_source_map::SourceInfo::default(), + })], + source_info: quarto_source_map::SourceInfo::default(), + attr_source: pampa::pandoc::attr::AttrSourceInfo::empty(), + })); + } + } + + let plan = compute_reconciliation(&baseline_ast, &new_ast); + let (result_qmd, warnings) = + writers::incremental::incremental_write(original_qmd, &baseline_ast, &new_ast, &plan) + .expect("incremental_write Ok arm"); + + assert!( + !result_qmd.is_empty(), + "sectionize-wrapper with inner Para edit yielded empty qmd \ + (warnings: {})", + warnings.len() + ); + + // The user's appended reaction should land in the inner Para; the + // wrapper itself should not re-emit any synthetic bytes. + assert!( + result_qmd.contains("[>> 🎉]"), + "expected reaction span [>> 🎉] in result; got:\n{}", + result_qmd + ); + // Unchanged Header (the orig blocks[0] inside the wrapper) should + // also be preserved. 
+ assert!( + result_qmd.contains("# Heading"), + "expected '# Heading' (unchanged sibling inside wrapper) in result; got:\n{:?}", + result_qmd + ); +} + +#[test] +fn sectionize_wrapper_preserves_frontmatter_after_inner_edit() { + // Reproduce the second-order bug: when the post-pipeline AST wraps + // the user content in a top-level sectionize Div, the writer's + // `emit_metadata_prefix` reads `blocks[0].start_offset()` to decide + // where the metadata region ends. The wrapper's start_offset is 0 + // (Generated, no preimage), so the function concludes "no metadata" + // and deletes the YAML frontmatter from the output. + let original_qmd = "\ +--- +format: q2-preview +render-components: + - comment.tsx +--- + +# Heading + +A paragraph that the user will edit. +"; + + let baseline_ast = wrap_in_sectionize_div(parse_qmd(original_qmd)); + + let mut new_ast = baseline_ast.clone(); + { + let pampa::pandoc::Block::Div(ref mut div) = new_ast.blocks[0] else { + panic!("expected wrapper Div at blocks[0]"); + }; + let para_idx = div + .content + .iter() + .rposition(|b| matches!(b, pampa::pandoc::Block::Paragraph(_))) + .expect("paragraph inside wrapper"); + if let pampa::pandoc::Block::Paragraph(ref mut p) = div.content[para_idx] { + let attr = ( + String::new(), + vec!["quarto-edit-comment".to_string()], + hashlink::LinkedHashMap::new(), + ); + p.content + .push(pampa::pandoc::Inline::Span(pampa::pandoc::Span { + attr, + content: vec![pampa::pandoc::Inline::Str(pampa::pandoc::Str { + text: "🎉".to_string(), + source_info: quarto_source_map::SourceInfo::default(), + })], + source_info: quarto_source_map::SourceInfo::default(), + attr_source: pampa::pandoc::attr::AttrSourceInfo::empty(), + })); + } + } + + let plan = compute_reconciliation(&baseline_ast, &new_ast); + let (result_qmd, _warnings) = + writers::incremental::incremental_write(original_qmd, &baseline_ast, &new_ast, &plan) + .expect("incremental_write Ok arm"); + + assert!( + result_qmd + 
.starts_with("---\nformat: q2-preview\nrender-components:\n - comment.tsx\n---\n"), + "frontmatter deleted from output. result:\n{}", + result_qmd, + ); + // And the edit still lands inside the wrapper's child. + assert!( + result_qmd.contains("[>> 🎉]"), + "expected reaction span in result; got:\n{}", + result_qmd + ); +} + +#[test] +fn sectionize_wrapper_with_shortcode_child_edit_does_not_panic() { + // Discovered 2026-05-25 during the TS-gate-bypass UX experiment. + // When the framework's atomic-aware NOOP gate is disabled, + // edits to shortcode-resolved content (e.g. inside + // `{{< lipsum 3 >}}`) reach the writer. The writer's + // RecurseIntoContainer arm for the top-level sectionize wrapper + // descends via the Transparent recursion (commit bdcfdc53), + // which calls coarsen_blocks on the wrapper's children with a + // CHILD-RELATIVE plan. Inside that recursion, the existing + // `coarsen_keep_before_block` catch-all (~line 484) emits + // `Rewrite { new_idx: result_idx }` — but result_idx is the + // child-relative index, not the top-level index. `emit_entries` + // later does `new_ast.blocks[*new_idx]` (top-level) and panics + // with "index out of bounds". + // + // The doc-comment on coarsen_keep_before_block explicitly notes + // this is "not exercised by today's synthesizers" — true before + // the Transparent recursion was added, no longer true now. + // + // This test pins the panic so the architectural fix (carry the + // text on the Rewrite entry instead of an index, mirroring + // InlineSplice's pattern) has a regression target. + use pampa::pandoc::{Block, Header, Inline, Pandoc, Paragraph, Span, Str}; + use quarto_pandoc_types::{AttrSourceInfo, ConfigValue}; + use quarto_source_map::{AnchorRole, By, FileId, SourceInfo}; + use std::sync::Arc; + + const TARGET: FileId = FileId(0); + // Original qmd byte ranges are illustrative; the source text is + // long enough to contain all the byte ranges referenced below. 
+ let original_qmd = "# Heading\n\n{{< lipsum 3 >}}\n\nMore text.\n"; + + // Build the lipsum shortcode token's anchor (Original in target). + let token_si = SourceInfo::original(TARGET, 11, 27); // "{{< lipsum 3 >}}" + + // Construct a Generated{shortcode} Para representing one of + // lipsum's resolved paragraphs. + let mut lipsum_si = SourceInfo::generated(By::shortcode("lipsum")); + lipsum_si.append_anchor(AnchorRole::Invocation, Arc::new(token_si.clone())); + + // Also construct a child Para that has NEITHER preimage in + // target NOR a recognized Generated kind: an Original Para from + // a DIFFERENT file. This is the cross-file-Original case that + // coarsen_keep_before_block's catch-all falls through to. + // (Pre-Plan-8 the AST didn't carry these; the panic the user + // observed must hit a different shape — but the structural + // failure is the same: a Rewrite emitted inside a Transparent + // wrapper.) + let other_file_para_si = SourceInfo::original(FileId(1), 0, 10); + + fn make_header(level: usize, text: &str, si: SourceInfo) -> Block { + Block::Header(Header { + level, + attr: (String::new(), Vec::new(), hashlink::LinkedHashMap::new()), + content: vec![Inline::Str(Str { + text: text.to_string(), + source_info: SourceInfo::default(), + })], + source_info: si, + attr_source: AttrSourceInfo::empty(), + }) + } + fn make_para(text: &str, si: SourceInfo) -> Block { + Block::Paragraph(Paragraph { + content: vec![Inline::Str(Str { + text: text.to_string(), + source_info: SourceInfo::default(), + })], + source_info: si, + }) + } + + // Wrapper children: Header + cross-file Para + lipsum Para. 
+ let header = make_header(1, "Heading", SourceInfo::original(TARGET, 0, 9)); + let other_file_para = make_para("Cross", other_file_para_si); + let lipsum_para = make_para("Lorem ipsum…", lipsum_si.clone()); + let original = wrap_in_sectionize_div(Pandoc { + blocks: vec![header.clone(), other_file_para.clone(), lipsum_para], + meta: ConfigValue::default(), + }); + + // User clicks +react on the lipsum Para — append a Span to its + // inlines. The cross-file Para and Header are unchanged. + let mut lipsum_para_new = make_para("Lorem ipsum…", lipsum_si); + if let Block::Paragraph(ref mut p) = lipsum_para_new { + p.content.push(Inline::Span(Span { + attr: ( + String::new(), + vec!["quarto-edit-comment".to_string()], + hashlink::LinkedHashMap::new(), + ), + content: vec![Inline::Str(Str { + text: "🎉".to_string(), + source_info: SourceInfo::default(), + })], + source_info: SourceInfo::default(), + attr_source: AttrSourceInfo::empty(), + })); + } + let new = wrap_in_sectionize_div(Pandoc { + blocks: vec![header, other_file_para, lipsum_para_new], + meta: ConfigValue::default(), + }); + + let plan = compute_reconciliation(&original, &new); + + // Before the architectural fix: panics with + // "index out of bounds: the len is 1 but the index is N". + // After the fix: returns Ok. (This test does NOT assert on + // output bytes — see `sectionize_wrapper_shortcode_child_edit_soft_drops` + // for the byte-level expectation.) + let result = writers::incremental::incremental_write(original_qmd, &original, &new, &plan); + assert!( + result.is_ok(), + "incremental_write should not panic on a sectionize wrapper containing \ + a cross-file child + a shortcode child + an inline edit; got {:?}", + result.err() + ); +} + +#[test] +fn sectionize_wrapper_shortcode_child_edit_soft_drops() { + // The user clicks +react on a paragraph inside `{{< lipsum 3 >}}` + // with the framework's atomic-aware NOOP gate bypassed. 
The + // shortcode resolution is atomic-kind Generated; the inline edit + // has no source-side knob (the user's source is the token, not + // the resolved bytes). The writer must: + // + // (a) preserve the `{{< lipsum 3 >}}` token bytes in the qmd + // (b) NOT emit the resolved bytes / the reactji + // (c) surface a Q-3-42 or Q-3-43 warning so the UI can show + // a Monaco squiggle on the token line + // + // Two alignment shapes can reach the lipsum Para at child level + // of a Transparent (sectionize) recursion: + // + // 1. `RecurseIntoContainer { lipsum_idx, lipsum_idx }` — + // reconciler matches the original and the new structurally. + // Hits the existing soft-drop cascade priority 1 + // (preimage_in → Verbatim of token bytes). Works today. + // + // 2. `UseAfter(lipsum_idx)` (paired with a KeepBefore on the + // previous original) — reconciler can't pair the original + // and the new and treats it as a wholesale replacement. + // Falls through to let-user-win Rewrite (the writer emits + // the new block's resolved bytes verbatim). That's wrong + // for atomic-Generated with preimage. + // + // This test targets shape #2 by giving the new Para entirely + // different inline text plus a reaction Span while keeping the + // shortcode source_info on the block (see the construction + // below), then asserts the soft-drop outcome. Pre-fix: the + // resolved bytes leak into the qmd. Post-fix: the token is + // preserved and Q-3-42/43 fires.
+ use pampa::pandoc::{Block, Header, Inline, Pandoc, Paragraph, Span, Str}; + use quarto_pandoc_types::{AttrSourceInfo, ConfigValue}; + use quarto_source_map::{AnchorRole, By, FileId, SourceInfo}; + use std::sync::Arc; + + const TARGET: FileId = FileId(0); + let original_qmd = "# Heading\n\n{{< lipsum 3 >}}\n"; + + let token_si = SourceInfo::original(TARGET, 11, 27); + let mut lipsum_si = SourceInfo::generated(By::shortcode("lipsum")); + lipsum_si.append_anchor(AnchorRole::Invocation, Arc::new(token_si)); + + fn make_header(level: usize, text: &str, si: SourceInfo) -> Block { + Block::Header(Header { + level, + attr: (String::new(), Vec::new(), hashlink::LinkedHashMap::new()), + content: vec![Inline::Str(Str { + text: text.to_string(), + source_info: SourceInfo::default(), + })], + source_info: si, + attr_source: AttrSourceInfo::empty(), + }) + } + fn make_para_with_text(text: &str, si: SourceInfo) -> Block { + Block::Paragraph(Paragraph { + content: vec![Inline::Str(Str { + text: text.to_string(), + source_info: SourceInfo::default(), + })], + source_info: si, + }) + } + + let header = make_header(1, "Heading", SourceInfo::original(TARGET, 0, 9)); + + // Original lipsum paragraph carries the shortcode anchor. + let lipsum_orig = make_para_with_text( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + lipsum_si.clone(), + ); + let original = wrap_in_sectionize_div(Pandoc { + blocks: vec![header.clone(), lipsum_orig], + meta: ConfigValue::default(), + }); + + // New lipsum paragraph: different inline content + reactji Span, + // but source_info IS preserved (matches what the React framework + // does when constructing the post-edit AST — block source_info + // is inherited from the original). 
+ let mut lipsum_new = make_para_with_text("Etiam maximus accumsan gravida.", lipsum_si.clone()); + if let Block::Paragraph(ref mut p) = lipsum_new { + p.content.push(Inline::Span(Span { + attr: ( + String::new(), + vec!["quarto-edit-comment".to_string()], + hashlink::LinkedHashMap::new(), + ), + content: vec![Inline::Str(Str { + text: "🎉".to_string(), + source_info: SourceInfo::default(), + })], + source_info: SourceInfo::default(), + attr_source: AttrSourceInfo::empty(), + })); + } + let new = wrap_in_sectionize_div(Pandoc { + blocks: vec![header, lipsum_new], + meta: ConfigValue::default(), + }); + + let plan = compute_reconciliation(&original, &new); + eprintln!("plan = {:#?}", plan); + + let (qmd, warnings) = + writers::incremental::incremental_write(original_qmd, &original, &new, &plan) + .expect("write should succeed"); + eprintln!("--- qmd ---\n{}\n--- end ---", qmd); + eprintln!("--- warnings ({}) ---", warnings.len()); + for w in &warnings { + eprintln!(" code={:?} title={:?}", w.code, w.title); + } + + // (a) token bytes preserved. + assert!( + qmd.contains("{{< lipsum 3 >}}"), + "qmd should preserve the lipsum token bytes; got: {:?}", + qmd + ); + // (b) reactji NOT emitted. + assert!( + !qmd.contains("🎉"), + "qmd should NOT contain the user's reactji; got: {:?}", + qmd + ); + // (b cont.) resolved bytes (the new Para's text) NOT emitted. + assert!( + !qmd.contains("Etiam maximus accumsan"), + "qmd should NOT contain the new Para's resolved-shortcode bytes; \ + got: {:?}", + qmd + ); + // (c) Q-3-42 or Q-3-43 warning fired. 
+ let saw_soft_drop = warnings + .iter() + .any(|w| matches!(w.code.as_deref(), Some("Q-3-42") | Some("Q-3-43"))); + assert!( + saw_soft_drop, + "expected a Q-3-42 or Q-3-43 soft-drop warning; got: {:?}", + warnings.iter().map(|w| &w.code).collect::<Vec<_>>() + ); +} + +// --- target_file_id derivation skips no-root_file_id first blocks --- +// +// Plan 7c Phase 8 — `coarsen`'s `target_file_id` is derived from the +// first block whose `root_file_id()` resolves to `Some`. A synthesized +// title-block (or sectionize wrapper) at `blocks[0]` with no +// `Invocation` anchor returns `None`, so the writer needs to skip past +// it and look at later blocks. Pre-fix, the fallback to `FileId(0)` +// would make `preimage_in(target)` return `None` for every real block +// at `FileId(N != 0)` — i.e. all editability checks fail and edits +// silently soft-drop. + +#[test] +fn target_file_id_skips_synthesized_first_block() { + use pampa::pandoc::{Block, Header, Pandoc, Paragraph, Str}; + use quarto_pandoc_types::{AttrSourceInfo, ConfigValue}; + use quarto_source_map::{By, FileId, SourceInfo}; + + // blocks[0] = synthesized title-block Header (Generated, no + // Invocation). blocks[1] = real Paragraph at FileId(7). + const REAL_FILE: FileId = FileId(7); + let title_block = Block::Header(Header { + level: 1, + attr: (String::new(), Vec::new(), hashlink::LinkedHashMap::new()), + content: vec![pampa::pandoc::Inline::Str(Str { + text: "Synthesized title".to_string(), + source_info: SourceInfo::default(), + })], + source_info: SourceInfo::generated(By::title_block()), + attr_source: AttrSourceInfo::empty(), + }); + // Real Para holds two Strs, both at FileId(7). The user edit + // mutates the second Str so the reconciler emits a + // RecurseIntoContainer with an inline plan.
That path checks + // `is_editable_inside_block` on the orig Para, which in turn + // calls `preimage_in(target_file_id)` — and that's where a wrong + // `target_file_id` (FileId(0) fallback) makes the editability + // check return false and the writer soft-drops with Q-3-43. + let original_qmd = "Real text"; + let real_para_orig = Block::Paragraph(Paragraph { + content: vec![ + pampa::pandoc::Inline::Str(Str { + text: "Real".to_string(), + source_info: SourceInfo::original(REAL_FILE, 0, 4), + }), + pampa::pandoc::Inline::Space(pampa::pandoc::Space { + source_info: SourceInfo::original(REAL_FILE, 4, 5), + }), + pampa::pandoc::Inline::Str(Str { + text: "text".to_string(), + source_info: SourceInfo::original(REAL_FILE, 5, 9), + }), + ], + source_info: SourceInfo::original(REAL_FILE, 0, 9), + }); + // Mutated Para: replace the second Str with a new (no-source) Str. + let real_para_mut = Block::Paragraph(Paragraph { + content: vec![ + pampa::pandoc::Inline::Str(Str { + text: "Real".to_string(), + source_info: SourceInfo::original(REAL_FILE, 0, 4), + }), + pampa::pandoc::Inline::Space(pampa::pandoc::Space { + source_info: SourceInfo::original(REAL_FILE, 4, 5), + }), + pampa::pandoc::Inline::Str(Str { + text: "edited".to_string(), + source_info: SourceInfo::default(), + }), + ], + source_info: SourceInfo::original(REAL_FILE, 0, 9), + }); + let orig = Pandoc { + blocks: vec![title_block.clone(), real_para_orig], + meta: ConfigValue::default(), + }; + let new = Pandoc { + blocks: vec![title_block, real_para_mut], + meta: ConfigValue::default(), + }; + + let plan = compute_reconciliation(&orig, &new); + let (_qmd, warnings) = + writers::incremental::incremental_write(original_qmd, &orig, &new, &plan) + .expect("incremental_write Ok arm"); + + // Pre-fix target_file_id falls back to FileId(0); preimage_in(0) + // on REAL_FILE-Original Para returns None; coarsen's + // RecurseIntoContainer arm soft-drops with Q-3-43 ("Generated + // content edit dropped"). 
Post-fix target_file_id resolves to + // REAL_FILE and the inline edit proceeds without a warning. + assert!( + warnings.is_empty(), + "expected no soft-drop warnings; got: {:?}", + warnings.iter().map(|w| &w.title).collect::<Vec<_>>() + ); +} + +#[test] +fn target_file_id_defaults_to_zero_for_empty_document() { + // Empty `blocks` — the fallback to `FileId(0)` should fire. + // Driving an identity reconcile on an empty AST should produce a + // no-op write without warnings or panics. + use pampa::pandoc::Pandoc; + use quarto_pandoc_types::ConfigValue; + let ast = Pandoc { + blocks: vec![], + meta: ConfigValue::default(), + }; + let plan = compute_reconciliation(&ast, &ast); + let (result, warnings) = writers::incremental::incremental_write("", &ast, &ast, &plan) + .expect("incremental_write Ok arm on empty document"); + assert_eq!(result, ""); + assert!(warnings.is_empty()); +} + // --- Mixed documents --- #[test] @@ -358,7 +924,8 @@ fn assert_roundtrip(original_qmd: &str, new_qmd: &str) { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // Verify the result round-trips: read(result) should match new_ast structurally let result_ast = parse_qmd(&result); @@ -574,7 +1141,8 @@ fn roundtrip_auto_id_change_no_explicit_id_in_output() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // Should NOT contain an explicit ID attribute — auto-generated IDs stay implicit assert!( @@ -599,7 +1167,8 @@ fn verbatim_preservation_unchanged_blocks() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) -
.expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // The first and third paragraphs should be byte-for-byte identical assert!( @@ -977,7 +1546,8 @@ fn assert_equivalent_to_full_writer(original_qmd: &str, new_qmd: &str) { let plan = compute_reconciliation(&original_ast, &new_ast); let incremental_result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; let full_result = write_qmd(&new_ast); @@ -1045,7 +1615,8 @@ fn assert_verbatim_preservation(blocks: &[String], mutate_idx: usize, new_block: let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(&original, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // For each unchanged block, verify its text appears verbatim in the result. // We check by finding the original block text in the result string. @@ -1092,7 +1663,7 @@ fn assert_edits_monotonic(original_qmd: &str, new_qmd: &str) { let new_ast = parse_qmd(new_qmd); let plan = compute_reconciliation(&original_ast, &new_ast); - let edits = writers::incremental::compute_incremental_edits( + let (edits, _warnings) = writers::incremental::compute_incremental_edits( original_qmd, &original_ast, &new_ast, @@ -1144,7 +1715,7 @@ proptest! 
{ // Identity case: should produce zero edits let ast = parse_qmd(&qmd); let plan = compute_reconciliation(&ast, &ast); - let edits = + let (edits, _warnings) = writers::incremental::compute_incremental_edits(&qmd, &ast, &ast, &plan) .expect("compute_incremental_edits failed"); prop_assert!( @@ -1327,7 +1898,8 @@ fn comment_preserved_when_adjacent_block_changes() { let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), @@ -1363,7 +1935,8 @@ fn comment_preserved_when_containing_paragraph_rewritten() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), @@ -1384,7 +1957,8 @@ fn comment_inside_blockquote_preserved_on_rewrite() { let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), @@ -1405,7 +1979,8 @@ fn comment_block_preserved_when_blocks_added() { let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), @@ -1478,7 +2053,8 @@ fn multiline_comment_preserved_on_rewrite() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), @@ -1498,7 +2074,8 @@ fn multiline_block_comment_preserved_on_adjacent_change() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = 
writers::incremental::incremental_write(original, &original_ast, &new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; assert!( result.contains(""), diff --git a/crates/pampa/tests/inline_splice_integration_tests.rs b/crates/pampa/tests/inline_splice_integration_tests.rs index 30e6cb370..fd3d0746f 100644 --- a/crates/pampa/tests/inline_splice_integration_tests.rs +++ b/crates/pampa/tests/inline_splice_integration_tests.rs @@ -102,7 +102,8 @@ fn assert_incremental_write_correct(original_qmd: &str, new_ast: &Pandoc) { let result = writers::incremental::incremental_write(original_qmd, &original_ast, new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // Verify round-trip: parsing the result should produce an AST structurally // equivalent to new_ast @@ -137,7 +138,8 @@ fn splice_str_change_in_paragraph() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!(result, "Goodbye world.\n"); } @@ -154,7 +156,8 @@ fn splice_str_change_preserves_surrounding_text() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!(result, "The slow brown fox.\n"); } @@ -179,7 +182,8 @@ fn splice_str_change_in_header() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; // The header prefix "## " should be preserved assert_eq!(result, "## Goodbye World\n"); } @@ -202,7 +206,8 @@ fn splice_str_change_in_multiline_paragraph() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, 
&new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!(result, "Goodbye\nworld\n"); } @@ -242,7 +247,8 @@ fn splice_str_change_in_multiline_blockquote() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!(result, "> Goodbye\n> world\n"); } @@ -280,7 +286,8 @@ fn splice_str_change_in_multiline_bulletlist() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; // The list continuation indent should be preserved assert_eq!(result, "* Goodbye\n world\n"); } @@ -302,7 +309,8 @@ fn splice_preserves_other_blocks() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!( result, "First paragraph.\n\nModified paragraph.\n\nThird paragraph.\n" @@ -367,7 +375,8 @@ fn splice_str_change_inside_emphasis() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; // The emphasis delimiters should be preserved from original source assert_eq!(result, "*Goodbye* world.\n"); } @@ -392,7 +401,8 @@ fn splice_str_change_inside_strong() { let plan = compute_reconciliation(&original_ast, &new_ast); let result = writers::incremental::incremental_write(original_qmd, &original_ast, &new_ast, &plan) - .unwrap(); + .unwrap() + .0; assert_eq!(result, "**Goodbye** world.\n"); } @@ -405,7 +415,9 @@ fn splice_idempotent_simple_paragraph() { let original_qmd = "Hello world.\n"; let ast = parse_qmd(original_qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(original_qmd, &ast, 
&ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(original_qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, original_qmd); } @@ -414,6 +426,8 @@ fn splice_idempotent_blockquote_multiline() { let original_qmd = "> Hello\n> world\n"; let ast = parse_qmd(original_qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(original_qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(original_qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, original_qmd); } diff --git a/crates/pampa/tests/inline_splice_property_tests.rs b/crates/pampa/tests/inline_splice_property_tests.rs index 6219b8c4d..c63c774e8 100644 --- a/crates/pampa/tests/inline_splice_property_tests.rs +++ b/crates/pampa/tests/inline_splice_property_tests.rs @@ -169,7 +169,8 @@ fn assert_inline_roundtrip(original_qmd: &str, new_ast: &Pandoc) { let result = writers::incremental::incremental_write(original_qmd, &original_ast, new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // Round-trip: parse result, write both to QMD, compare let result_ast = parse_qmd(&result); @@ -192,7 +193,8 @@ fn assert_splice_equivalent_to_full_writer(original_qmd: &str, new_ast: &Pandoc) let incremental_result = writers::incremental::incremental_write(original_qmd, &original_ast, new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; let full_result = write_qmd(new_ast); @@ -235,7 +237,8 @@ fn assert_inline_locality(original_qmd: &str, new_ast: &Pandoc, changed_block_id let result = writers::incremental::incremental_write(original_qmd, &original_ast, new_ast, &plan) - .expect("incremental_write failed"); + .expect("incremental_write failed") + .0; // For each unchanged block, verify its text appears in the result. 
for (i, block) in original_ast.blocks.iter().enumerate() { @@ -546,7 +549,9 @@ fn prop7_idempotent_paragraph_with_emphasis() { let qmd = "*Hello* world.\n"; let ast = parse_qmd(qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, qmd); } @@ -555,7 +560,9 @@ fn prop7_idempotent_paragraph_with_strong() { let qmd = "**Hello** world.\n"; let ast = parse_qmd(qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, qmd); } @@ -564,7 +571,9 @@ fn prop7_idempotent_paragraph_with_code() { let qmd = "Use `code` here.\n"; let ast = parse_qmd(qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, qmd); } @@ -573,7 +582,9 @@ fn prop7_idempotent_mixed_inline_formatting() { let qmd = "Normal *emph* **strong** `code` end.\n"; let ast = parse_qmd(qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan) + .unwrap() + .0; assert_eq!(result, qmd); } @@ -582,7 +593,9 @@ fn prop7_idempotent_multiline_blockquote_with_emphasis() { let qmd = "> *Hello*\n> world.\n"; let ast = parse_qmd(qmd); let plan = compute_reconciliation(&ast, &ast); - let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &ast, &ast, &plan) + .unwrap() + .0; 
assert_eq!(result, qmd); } @@ -592,7 +605,7 @@ proptest! { let ast = parse_qmd(&qmd); let plan = compute_reconciliation(&ast, &ast); let result = - writers::incremental::incremental_write(&qmd, &ast, &ast, &plan).unwrap(); + writers::incremental::incremental_write(&qmd, &ast, &ast, &plan).unwrap().0; prop_assert_eq!(result, qmd); } } @@ -688,8 +701,9 @@ fn prop9_no_newlines_in_splice_simple() { ); // Verify the incremental write result - let result = - writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan) + .unwrap() + .0; // The result should be correct assert_eq!(result, "Goodbye world.\n"); @@ -711,8 +725,9 @@ fn prop9_no_newlines_in_blockquote_splice() { let original_ast = parse_qmd(qmd); let plan = compute_reconciliation(&original_ast, &new_ast); - let result = - writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan) + .unwrap() + .0; // Verify the result parses correctly (critical for indentation contexts) assert_inline_roundtrip(qmd, &new_ast); @@ -736,8 +751,9 @@ fn prop9_no_newlines_in_multiline_blockquote_splice() { let original_ast = parse_qmd(qmd); let plan = compute_reconciliation(&original_ast, &new_ast); - let result = - writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan).unwrap(); + let result = writers::incremental::incremental_write(qmd, &original_ast, &new_ast, &plan) + .unwrap() + .0; // The > prefix after the SoftBreak must be preserved assert_eq!(result, "> Goodbye\n> world.\n"); @@ -896,7 +912,7 @@ fn stress_many_blocks_single_change() { // Verify the edits are small (Property 8 / locality) let original_ast = parse_qmd(&qmd); let plan = compute_reconciliation(&original_ast, &new_ast); - let edits = + let (edits, _warnings) = 
writers::incremental::compute_incremental_edits(&qmd, &original_ast, &new_ast, &plan) .unwrap(); @@ -997,7 +1013,7 @@ proptest! { let original_ast = parse_qmd(&qmd); let plan = compute_reconciliation(&original_ast, &new_ast); - let edits = writers::incremental::compute_incremental_edits( + let (edits, _warnings) = writers::incremental::compute_incremental_edits( &qmd, &original_ast, &new_ast, diff --git a/crates/pampa/tests/json_reader_smoke_tests.rs b/crates/pampa/tests/json_reader_smoke_tests.rs index 1aa48514a..d5416b647 100644 --- a/crates/pampa/tests/json_reader_smoke_tests.rs +++ b/crates/pampa/tests/json_reader_smoke_tests.rs @@ -1,6 +1,12 @@ +use pampa::pandoc::{Block, Inline, Pandoc, Plain, Str}; use pampa::readers::json; +use pampa::writers::json as json_writer; +use quarto_source_map::{Anchor, AnchorRole, By, FileId, SourceInfo}; +use smallvec::SmallVec; use std::fs; +use std::io::Cursor; use std::path::PathBuf; +use std::sync::Arc; #[test] fn test_read_all_json_files_in_tests_readers() { @@ -79,3 +85,179 @@ fn test_manybullets_json_specifically() { _ => panic!("Expected OrderedList block"), } } + +// ---------------------------------------------------------------- +// Plan 5 — End-to-end round-trip through the streaming writer +// and the public reader API. +// +// These tests exercise the *production* JSON path: +// `pampa::writers::json::write` → bytes → `pampa::readers::json::read`. +// The writer's streaming arm (`stream_write_source_info_pool`) is what +// the orchestrator uses, so a regression here is exactly what bd-3odjm +// surfaced. The hand-constructed reader/writer unit tests live next to +// their respective modules; these tests guard the wire. +// ---------------------------------------------------------------- + +/// Round-trip a single `Pandoc` through the streaming writer and the +/// reader. Returns the recovered `source_info` of the inner `Str`. 
+fn roundtrip_str_source_info(str_source_info: SourceInfo) -> SourceInfo { + let mut pandoc = Pandoc::default(); + let inner = Inline::Str(Str { + text: "hi".to_string(), + source_info: str_source_info, + }); + let plain = Plain { + content: vec![inner], + source_info: SourceInfo::default(), + }; + pandoc.blocks.push(Block::Plain(plain)); + + let context = pampa::pandoc::ASTContext::anonymous(); + let mut buf = Vec::new(); + json_writer::write(&pandoc, &context, &mut buf).expect("write_pandoc"); + + let mut cursor = Cursor::new(&buf); + let (round, _ctx) = json::read(&mut cursor).expect("read_pandoc"); + + let Block::Plain(plain) = &round.blocks[0] else { + panic!("Expected Plain block") + }; + let Inline::Str(str_node) = &plain.content[0] else { + panic!("Expected Str inline") + }; + str_node.source_info.clone() +} + +#[test] +fn roundtrip_generated_no_anchors_via_public_api() { + let original = SourceInfo::generated(By::sectionize()); + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_generated_filter_with_data_via_public_api() { + let original = SourceInfo::generated(By::filter("/x.lua", 42)); + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_generated_with_invocation_anchor_via_public_api() { + let target = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 5, + end_offset: 12, + }); + let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(Arc::clone(&target))); + let original = SourceInfo::Generated { + by: By::shortcode("meta"), + from, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_generated_with_all_anchor_roles_via_public_api() { + let mk_target = |start: usize, end: usize| { + Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: start, + end_offset: end, + }) + }; + 
let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(mk_target(0, 5))); + from.push(Anchor::value_source(mk_target(10, 20))); + from.push(Anchor { + role: AnchorRole::Other("ext/foo/bar".to_string()), + source_info: mk_target(30, 35), + }); + let original = SourceInfo::Generated { + by: By::shortcode("meta"), + from, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_concat_of_generated_via_public_api() { + let g1 = SourceInfo::generated(By::filter("/a.lua", 1)); + let g2 = SourceInfo::generated(By::filter("/b.lua", 2)); + let original = SourceInfo::concat(vec![(g1, 5), (g2, 7)]); + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_substring_of_generated_via_public_api() { + let parent = Arc::new(SourceInfo::generated(By::filter("/x.lua", 1))); + let original = SourceInfo::Substring { + parent: Arc::clone(&parent), + start_offset: 0, + end_offset: 4, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_original_via_public_api() { + let original = SourceInfo::Original { + file_id: FileId(0), + start_offset: 7, + end_offset: 12, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +#[test] +fn roundtrip_substring_via_public_api() { + let parent = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 0, + end_offset: 100, + }); + let original = SourceInfo::Substring { + parent: Arc::clone(&parent), + start_offset: 10, + end_offset: 20, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} + +/// Streaming-writer parity: the streaming writer emits a code-4 entry +/// whose payload reads back as the same `Generated` value the writer +/// was given. 
Specifically guards `stream_write_source_info_pool`'s +/// match arms, which are independent from `to_json`'s. +#[test] +fn streaming_writer_generated_round_trip_preserves_by_data() { + let target = Arc::new(SourceInfo::Original { + file_id: FileId(0), + start_offset: 0, + end_offset: 5, + }); + let mut from = SmallVec::<[Anchor; 2]>::new(); + from.push(Anchor::invocation(Arc::clone(&target))); + let original = SourceInfo::Generated { + by: By::raw( + "ext/example/foo", + serde_json::json!({ + "nested": { + "n": 7, + "flag": true, + "items": [1, 2, "three"], + "empty": null + } + }), + ), + from, + }; + let recovered = roundtrip_str_source_info(original.clone()); + assert_eq!(original, recovered); +} diff --git a/crates/pampa/tests/test_metadata_source_tracking.rs b/crates/pampa/tests/test_metadata_source_tracking.rs index 252621a8b..bc7e2fed9 100644 --- a/crates/pampa/tests/test_metadata_source_tracking.rs +++ b/crates/pampa/tests/test_metadata_source_tracking.rs @@ -23,8 +23,8 @@ fn resolve_source_offset(source: &quarto_source_map::SourceInfo) -> usize { // For concat, use the start offset of the first piece pieces.first().map_or(0, |p| p.offset_in_concat) } - quarto_source_map::SourceInfo::FilterProvenance { .. } => { - // Filter provenance doesn't have a traditional offset + quarto_source_map::SourceInfo::Generated { .. } => { + // Generated nodes have no offset-within-current-text. 
0 } } diff --git a/crates/quarto-ast-reconcile/Cargo.toml b/crates/quarto-ast-reconcile/Cargo.toml index 8b322fb78..57caa34cc 100644 --- a/crates/quarto-ast-reconcile/Cargo.toml +++ b/crates/quarto-ast-reconcile/Cargo.toml @@ -21,6 +21,7 @@ rustc-hash = "2.1" [dev-dependencies] proptest = "1.10" +yaml-rust2 = { workspace = true } [lints] workspace = true diff --git a/crates/quarto-ast-reconcile/src/hash.rs b/crates/quarto-ast-reconcile/src/hash.rs index f425dc22a..e734dd530 100644 --- a/crates/quarto-ast-reconcile/src/hash.rs +++ b/crates/quarto-ast-reconcile/src/hash.rs @@ -11,7 +11,7 @@ */ use quarto_pandoc_types::custom::{CustomNode, Slot}; -use quarto_pandoc_types::{Attr, Block, Inline}; +use quarto_pandoc_types::{Attr, Block, ConfigMapEntry, ConfigValue, ConfigValueKind, Inline}; use rustc_hash::FxHashMap; use std::hash::{Hash, Hasher}; use std::marker::PhantomData; @@ -488,6 +488,318 @@ fn hash_slot(slot: &Slot, cache: &mut HashCache<'_>, hasher: &mut impl Hasher) { } } +// ============================================================================= +// Meta (ConfigValue) Hashing +// ============================================================================= +// +// Idempotence checks (Plan 3) need a structural hash of the document +// `meta` field that: +// +// - excludes `source_info` and `key_source` so Plan-4 source-info +// churn doesn't affect the contract; +// - hashes `Map` entries in *insertion order* with no sort, so a +// transform that stuffs a `HashMap` into meta is *detectable* (a +// sort would silently mask that class of non-determinism — exactly +// the bug an idempotence test is meant to catch); +// - includes `merge_op` so a transform that flips merge semantics +// non-deterministically shows up; +// - recurses into `PandocInlines` / `PandocBlocks` via the existing +// inline/block hashers (which already exclude source_info). + +/// Compute a structural hash of a `ConfigValue` tree. 
+/// +/// Source-info-agnostic: skips `ConfigValue::source_info` and +/// `ConfigMapEntry::key_source`. See module-level note above for the +/// design rationale (insertion-order maps, `merge_op` participates). +pub fn compute_meta_hash_fresh(meta: &ConfigValue) -> u64 { + let mut cache = HashCache::new(); + let mut hasher = rustc_hash::FxHasher::default(); + hash_config_value(meta, &mut cache, &mut hasher); + hasher.finish() +} + +/// Compute a structural hash of a `ConfigValue` tree, excluding the +/// top-level `rendered` map entry. +/// +/// Used by the q2-preview idempotence gate: chrome transforms +/// (navbar / sidebar / footer / page-nav), `IncludeResolveStage`, the +/// favicon transform, and the Bootstrap/clipboard injection stages +/// populate `meta.rendered.*` with HTML-string side outputs. Two +/// runs may produce HTML strings whose *bytes* differ but whose +/// rendered shape is equivalent (attribute order, whitespace); that +/// case belongs to an HTML-canonicalization concern, not to the +/// pipeline-determinism contract this hash defends. +/// +/// The exclusion only applies at the document root. A `rendered` +/// key nested deeper in the tree is hashed normally — meta is +/// structured as a single top-level Map in practice, so a nested +/// `rendered` would be intentional content. +pub fn compute_meta_hash_fresh_excluding_rendered(meta: &ConfigValue) -> u64 { + let mut cache = HashCache::new(); + let mut hasher = rustc_hash::FxHasher::default(); + hash_config_value_excluding(meta, &["rendered"], &mut cache, &mut hasher); + hasher.finish() +} + +fn hash_config_value(value: &ConfigValue, cache: &mut HashCache<'_>, hasher: &mut impl Hasher) { + hash_config_value_excluding(value, &[], cache, hasher); +} + +/// Hash a `ConfigValue`, optionally skipping certain top-level map +/// keys. `top_skip` is only consulted for the `Map` variant at this +/// call's root and is not propagated into recursion: nested values +/// see an empty skip list. 
+fn hash_config_value_excluding( + value: &ConfigValue, + top_skip: &[&str], + cache: &mut HashCache<'_>, + hasher: &mut impl Hasher, +) { + // `merge_op` participates. The enum doesn't derive Hash, so + // route through its discriminant + the byte tag. + std::mem::discriminant(&value.merge_op).hash(hasher); + + hash_config_value_kind(&value.value, top_skip, cache, hasher); +} + +fn hash_config_value_kind( + kind: &ConfigValueKind, + top_skip: &[&str], + cache: &mut HashCache<'_>, + hasher: &mut impl Hasher, +) { + std::mem::discriminant(kind).hash(hasher); + + match kind { + ConfigValueKind::Scalar(yaml) => { + yaml.hash(hasher); + } + ConfigValueKind::PandocInlines(inlines) => { + hash_inlines(inlines, cache, hasher); + } + ConfigValueKind::PandocBlocks(blocks) => { + hash_blocks(blocks, cache, hasher); + } + ConfigValueKind::Path(s) | ConfigValueKind::Glob(s) | ConfigValueKind::Expr(s) => { + s.hash(hasher); + } + ConfigValueKind::Array(items) => { + items.len().hash(hasher); + for item in items { + hash_config_value(item, cache, hasher); + } + } + ConfigValueKind::Map(entries) => { + // Insertion-order, filtered by `top_skip`. Skip set is + // intentionally NOT propagated into recursion. + let kept_len = entries + .iter() + .filter(|e| !top_skip.contains(&e.key.as_str())) + .count(); + kept_len.hash(hasher); + for entry in entries { + if top_skip.contains(&entry.key.as_str()) { + continue; + } + hash_config_map_entry(entry, cache, hasher); + } + } + } +} + +fn hash_config_map_entry( + entry: &ConfigMapEntry, + cache: &mut HashCache<'_>, + hasher: &mut impl Hasher, +) { + entry.key.hash(hasher); + // `key_source` deliberately not hashed. + hash_config_value(&entry.value, cache, hasher); +} + +// ============================================================================= +// Divergence Localization +// ============================================================================= + +/// First place two documents' structural hashes diverge. 
+/// +/// Returned by [`find_first_divergence`] to make idempotence failures +/// debuggable: the test driver embeds this in its panic message so +/// the sub-agent investigation prompt arrives with "block index 7" +/// or "meta.listings.foo" instead of just "hash mismatch." +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum DivergencePoint { + /// Blocks at the same index hash differently. `path` is intentionally + /// flat: we don't dig into block subtrees because the per-block hash + /// already provides enough localization for triage. + Block { + index: usize, + hash_a: u64, + hash_b: u64, + }, + /// A meta key path hashes differently. The path walks insertion + /// order through nested Maps; the last element is the leaf key + /// whose recursive hash diverges. + MetaKey { + path: Vec, + hash_a: u64, + hash_b: u64, + }, + /// The two documents' top-level hashes agree on both blocks and + /// meta. The caller should never see this if it was reached via + /// "hashes differ, find me a divergence" — it would indicate a + /// hasher bug. Returned for completeness. + None, +} + +/// Find the first structural divergence between two documents. +/// +/// `blocks` are compared in order by per-block fresh hash; the first +/// index whose hashes disagree yields a `Block` variant. If the +/// blocks all match, `meta` is walked in insertion order with the +/// same `rendered.*` exclusion the +/// [`compute_meta_hash_fresh_excluding_rendered`] hash uses; the +/// first map key whose recursive hash diverges yields a `MetaKey` +/// variant. +/// +/// This lives in `quarto-ast-reconcile` next to the hashers so the +/// localization logic shares the source-info-exclusion contract by +/// construction. The caller (Plan 3's `idempotence.rs` test driver) +/// supplies `&[Block]` + `&ConfigValue` rather than passing the +/// crate's `DocumentAst` type, which is owned by `quarto-core`. 
+pub fn find_first_divergence( + blocks_a: &[Block], + meta_a: &ConfigValue, + blocks_b: &[Block], + meta_b: &ConfigValue, +) -> DivergencePoint { + // Block walk: linear scan with the existing per-block hasher. + // If block counts differ we still report the first mismatching + // index (or the boundary index for the longer side). + let common = blocks_a.len().min(blocks_b.len()); + for index in 0..common { + let hash_a = compute_block_hash_fresh(&blocks_a[index]); + let hash_b = compute_block_hash_fresh(&blocks_b[index]); + if hash_a != hash_b { + return DivergencePoint::Block { + index, + hash_a, + hash_b, + }; + } + } + if blocks_a.len() != blocks_b.len() { + // Report the first "missing" position as a divergence at + // index `common`. We synthesize a hash for the empty side as + // 0 — it just needs to be observably different from the + // present side's hash. + let (hash_a, hash_b) = if blocks_a.len() > blocks_b.len() { + (compute_block_hash_fresh(&blocks_a[common]), 0) + } else { + (0, compute_block_hash_fresh(&blocks_b[common])) + }; + return DivergencePoint::Block { + index: common, + hash_a, + hash_b, + }; + } + + // Meta walk: recursive insertion-order traversal that excludes + // top-level `rendered`. Matches the excluding-variant hash so a + // failure reported here is reproducible from the hash itself. + if let Some(point) = find_meta_divergence(meta_a, meta_b, &["rendered"], &mut Vec::new()) { + return point; + } + + DivergencePoint::None +} + +fn find_meta_divergence( + a: &ConfigValue, + b: &ConfigValue, + top_skip: &[&str], + path: &mut Vec, +) -> Option { + // Fast path: equal recursive hashes -> no divergence in this + // subtree. + let hash_a = meta_subtree_hash(a, top_skip); + let hash_b = meta_subtree_hash(b, top_skip); + if hash_a == hash_b { + return None; + } + + // Different. Drill down through Maps in insertion order; report + // the deepest meaningful path. 
+ match (&a.value, &b.value) { + (ConfigValueKind::Map(entries_a), ConfigValueKind::Map(entries_b)) => { + for entry_a in entries_a { + if top_skip.contains(&entry_a.key.as_str()) { + continue; + } + match entries_b.iter().find(|e| e.key == entry_a.key) { + Some(entry_b) => { + path.push(entry_a.key.clone()); + if let Some(point) = + find_meta_divergence(&entry_a.value, &entry_b.value, &[], path) + { + return Some(point); + } + path.pop(); + } + None => { + // Key present in `a`, missing in `b`. Report + // as a leaf divergence at this path. + let mut full = path.clone(); + full.push(entry_a.key.clone()); + return Some(DivergencePoint::MetaKey { + path: full, + hash_a: meta_subtree_hash(&entry_a.value, &[]), + hash_b: 0, + }); + } + } + } + // Any keys in `b` not in `a`? + for entry_b in entries_b { + if top_skip.contains(&entry_b.key.as_str()) { + continue; + } + if !entries_a.iter().any(|e| e.key == entry_b.key) { + let mut full = path.clone(); + full.push(entry_b.key.clone()); + return Some(DivergencePoint::MetaKey { + path: full, + hash_a: 0, + hash_b: meta_subtree_hash(&entry_b.value, &[]), + }); + } + } + // Hashes differed but no key-level divergence found + // (e.g. value of a present key changed but the recursion + // bottomed out without finding a Map to descend into): + // report at the current path. 
+ Some(DivergencePoint::MetaKey { + path: path.clone(), + hash_a, + hash_b, + }) + } + _ => Some(DivergencePoint::MetaKey { + path: path.clone(), + hash_a, + hash_b, + }), + } +} + +fn meta_subtree_hash(value: &ConfigValue, top_skip: &[&str]) -> u64 { + let mut cache = HashCache::new(); + let mut hasher = rustc_hash::FxHasher::default(); + hash_config_value_excluding(value, top_skip, &mut cache, &mut hasher); + hasher.finish() +} + // ============================================================================= // Structural Equality (for hash collision verification) // ============================================================================= @@ -1999,6 +2311,162 @@ mod tests { assert!(structural_eq_slot(&slot1, &slot2)); } + // ==================== Plan 7 — Generated source_info blindness ==================== + // + // The reconciler must compare nodes for structural equality WITHOUT + // consulting their source_info. This is the foundation invariant the + // writer relies on: KeepBefore decisions are made off these functions, + // and a leak of source_info into the comparison would degenerate + // round-trips to whole-document Rewrite. + + fn generated_with_by(by: quarto_source_map::source_info::By) -> SourceInfo { + SourceInfo::generated(by) + } + + #[test] + fn test_structural_eq_blocks_generated_different_by_payloads() { + // Two paragraphs with identical content but Generated source_info + // carrying *different* By payloads (sectionize vs shortcode). + // Reconciler must still see them as equal. 
+ let blocks1 = vec![Block::Paragraph(Paragraph { + content: vec![make_str("a")], + source_info: generated_with_by(quarto_source_map::source_info::By::sectionize()), + })]; + let blocks2 = vec![Block::Paragraph(Paragraph { + content: vec![make_str("a")], + source_info: generated_with_by(quarto_source_map::source_info::By::shortcode("meta")), + })]; + + assert!(structural_eq_blocks(&blocks1, &blocks2)); + } + + #[test] + fn test_structural_eq_blocks_generated_different_anchor_lists() { + // Two paragraphs with identical content. Both Generated with + // matching By, but with different anchor lists (one empty, one + // with an Invocation anchor pointing into file 0). + use quarto_source_map::source_info::{AnchorRole, By}; + use std::sync::Arc; + + let mut si_with_anchor = SourceInfo::generated(By::shortcode("meta")); + si_with_anchor.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(0), 10, 25)), + ); + + let blocks1 = vec![Block::Paragraph(Paragraph { + content: vec![make_str("a")], + source_info: si_with_anchor, + })]; + let blocks2 = vec![Block::Paragraph(Paragraph { + content: vec![make_str("a")], + source_info: SourceInfo::generated(By::shortcode("meta")), + })]; + + assert!(structural_eq_blocks(&blocks1, &blocks2)); + } + + #[test] + fn test_structural_eq_inlines_generated_different_by_and_anchors() { + // Inline-level analogue of the above two tests bundled. 
+ use quarto_source_map::source_info::{AnchorRole, By}; + use std::sync::Arc; + + let mut si_a = SourceInfo::generated(By::shortcode("meta")); + si_a.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(0), 10, 25)), + ); + + let mut si_b = SourceInfo::generated(By::shortcode("var")); + si_b.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(1), 200, 215)), + ); + + let inlines1 = vec![Inline::Str(Str { + text: "x".into(), + source_info: si_a, + })]; + let inlines2 = vec![Inline::Str(Str { + text: "x".into(), + source_info: si_b, + })]; + + assert!(structural_eq_inlines(&inlines1, &inlines2)); + } + + #[test] + fn test_structural_eq_custom_node_generated_source_info_blind() { + // CustomNode whose wrapper source_info is Generated (the + // Plan-6-stamped shape) vs an Original — equal iff structure matches. + let cn_generated = CustomNode { + type_name: "Callout".to_string(), + attr: empty_attr(), + plain_data: serde_json::json!({"type": "note"}), + slots: LinkedHashMap::new(), + source_info: generated_with_by(quarto_source_map::source_info::By::sectionize()), + }; + let cn_original = CustomNode { + type_name: "Callout".to_string(), + attr: empty_attr(), + plain_data: serde_json::json!({"type": "note"}), + slots: LinkedHashMap::new(), + source_info: dummy_source(), + }; + + assert!(structural_eq_custom_node(&cn_generated, &cn_original)); + } + + #[test] + fn test_structural_eq_custom_node_slot_child_source_info_blind() { + // CustomNode with slot children whose own source_infos differ + // (Generated with anchors vs Original). Same structural content + // → must be equal. 
+ use quarto_source_map::source_info::{AnchorRole, By}; + use std::sync::Arc; + + let mut child_si = SourceInfo::generated(By::shortcode("meta")); + child_si.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(0), 0, 5)), + ); + + let mut slots_a = LinkedHashMap::new(); + slots_a.insert( + "body".into(), + Slot::Blocks(vec![Block::Paragraph(Paragraph { + content: vec![make_str("hi")], + source_info: child_si, + })]), + ); + let mut slots_b = LinkedHashMap::new(); + slots_b.insert( + "body".into(), + Slot::Blocks(vec![Block::Paragraph(Paragraph { + content: vec![make_str("hi")], + source_info: other_source(), + })]), + ); + + let cn_a = CustomNode { + type_name: "Callout".to_string(), + attr: empty_attr(), + plain_data: serde_json::Value::Null, + slots: slots_a, + source_info: dummy_source(), + }; + let cn_b = CustomNode { + type_name: "Callout".to_string(), + attr: empty_attr(), + plain_data: serde_json::Value::Null, + slots: slots_b, + source_info: dummy_source(), + }; + + assert!(structural_eq_custom_node(&cn_a, &cn_b)); + } + // ==================== NodePtr Tests ==================== #[test] @@ -2013,4 +2481,224 @@ mod tests { assert_eq!(ptr1, ptr2); } + + // ==================== Meta Hash Tests ==================== + + use quarto_pandoc_types::MergeOp; + use yaml_rust2::Yaml; + + fn scalar_str(s: &str) -> ConfigValue { + ConfigValue { + value: ConfigValueKind::Scalar(Yaml::String(s.to_string())), + source_info: dummy_source(), + merge_op: MergeOp::default(), + } + } + + fn scalar_int(i: i64) -> ConfigValue { + ConfigValue { + value: ConfigValueKind::Scalar(Yaml::Integer(i)), + source_info: dummy_source(), + merge_op: MergeOp::default(), + } + } + + fn map_of(entries: Vec<(&str, ConfigValue)>) -> ConfigValue { + map_of_with_source(entries, dummy_source()) + } + + fn map_of_with_source(entries: Vec<(&str, ConfigValue)>, src: SourceInfo) -> ConfigValue { + let entries = entries + .into_iter() + .map(|(k, v)| ConfigMapEntry { + key: 
k.to_string(), + key_source: src.clone(), + value: v, + }) + .collect(); + ConfigValue { + value: ConfigValueKind::Map(entries), + source_info: src, + merge_op: MergeOp::default(), + } + } + + #[test] + fn meta_hash_same_content_same_hash() { + let a = map_of(vec![("title", scalar_str("hello")), ("toc", scalar_int(3))]); + let b = map_of(vec![("title", scalar_str("hello")), ("toc", scalar_int(3))]); + assert_eq!(compute_meta_hash_fresh(&a), compute_meta_hash_fresh(&b)); + } + + #[test] + fn meta_hash_different_content_different_hash() { + let a = map_of(vec![("title", scalar_str("hello"))]); + let b = map_of(vec![("title", scalar_str("world"))]); + assert_ne!(compute_meta_hash_fresh(&a), compute_meta_hash_fresh(&b)); + } + + #[test] + fn meta_hash_excludes_source_info_and_key_source() { + // Same content, different SourceInfo on values and on keys. + let a = map_of_with_source(vec![("title", scalar_str("hello"))], dummy_source()); + let b = map_of_with_source(vec![("title", scalar_str("hello"))], other_source()); + // Also flip the inner scalar's source_info. 
+ let mut b = b; + if let ConfigValueKind::Map(entries) = &mut b.value { + entries[0].value.source_info = other_source(); + } + assert_eq!(compute_meta_hash_fresh(&a), compute_meta_hash_fresh(&b)); + } + + #[test] + fn meta_hash_excluding_rendered_ignores_top_level_rendered() { + let a = map_of(vec![ + ("title", scalar_str("hello")), + ( + "rendered", + map_of(vec![("navbar", scalar_str(""))]), + ), + ]); + let b = map_of(vec![ + ("title", scalar_str("hello")), + ( + "rendered", + map_of(vec![("navbar", scalar_str(""))]), + ), + ]); + assert_ne!( + compute_meta_hash_fresh(&a), + compute_meta_hash_fresh(&b), + "the non-excluding hash must observe the difference", + ); + assert_eq!( + compute_meta_hash_fresh_excluding_rendered(&a), + compute_meta_hash_fresh_excluding_rendered(&b), + "the excluding-rendered hash must ignore top-level rendered.* divergence", + ); + } + + #[test] + fn meta_hash_excluding_rendered_does_not_propagate_to_nested_rendered() { + // A nested `rendered` key is part of the content and must + // still participate in the hash. + let a = map_of(vec![( + "listings", + map_of(vec![("rendered", scalar_str("a"))]), + )]); + let b = map_of(vec![( + "listings", + map_of(vec![("rendered", scalar_str("b"))]), + )]); + assert_ne!( + compute_meta_hash_fresh_excluding_rendered(&a), + compute_meta_hash_fresh_excluding_rendered(&b), + ); + } + + #[test] + fn meta_hash_map_insertion_order_matters() { + // Regression guard for the no-sort choice: a transform that + // stuffs a HashMap into meta would produce different + // insertion orders across runs; the hash must catch that. 
+ let a = map_of(vec![("a", scalar_int(1)), ("b", scalar_int(2))]); + let b = map_of(vec![("b", scalar_int(2)), ("a", scalar_int(1))]); + assert_ne!( + compute_meta_hash_fresh(&a), + compute_meta_hash_fresh(&b), + "different Map insertion order must produce different hashes", + ); + } + + #[test] + fn meta_hash_merge_op_participates() { + let a = ConfigValue { + value: ConfigValueKind::Scalar(Yaml::String("x".into())), + source_info: dummy_source(), + merge_op: MergeOp::Concat, + }; + let b = ConfigValue { + value: ConfigValueKind::Scalar(Yaml::String("x".into())), + source_info: dummy_source(), + merge_op: MergeOp::Prefer, + }; + assert_ne!(compute_meta_hash_fresh(&a), compute_meta_hash_fresh(&b)); + } + + // ==================== Divergence Localization Tests ==================== + + fn para(text: &str) -> Block { + Block::Paragraph(Paragraph { + content: vec![make_str(text)], + source_info: dummy_source(), + }) + } + + #[test] + fn divergence_identical_docs_returns_none() { + let blocks = vec![para("alpha"), para("beta")]; + let meta = map_of(vec![("title", scalar_str("t"))]); + let point = find_first_divergence(&blocks, &meta, &blocks, &meta); + assert_eq!(point, DivergencePoint::None); + } + + #[test] + fn divergence_reports_first_block_mismatch() { + let a = vec![para("alpha"), para("beta"), para("gamma")]; + let b = vec![para("alpha"), para("DIFFERENT"), para("gamma")]; + let meta = map_of(vec![]); + let point = find_first_divergence(&a, &meta, &b, &meta); + match point { + DivergencePoint::Block { + index, + hash_a, + hash_b, + } => { + assert_eq!(index, 1); + assert_ne!(hash_a, hash_b); + } + other => panic!("expected Block divergence, got {:?}", other), + } + } + + #[test] + fn divergence_reports_meta_key_path() { + let meta_a = map_of(vec![( + "listings", + map_of(vec![("foo", map_of(vec![("title", scalar_str("a"))]))]), + )]); + let meta_b = map_of(vec![( + "listings", + map_of(vec![("foo", map_of(vec![("title", scalar_str("b"))]))]), + )]); + let 
blocks: Vec = vec![]; + let point = find_first_divergence(&blocks, &meta_a, &blocks, &meta_b); + match point { + DivergencePoint::MetaKey { + path, + hash_a, + hash_b, + } => { + assert_eq!(path, vec!["listings", "foo", "title"]); + assert_ne!(hash_a, hash_b); + } + other => panic!("expected MetaKey divergence, got {:?}", other), + } + } + + #[test] + fn divergence_skips_rendered_top_level() { + // Only `rendered.*` differs at the top level -> no divergence. + let meta_a = map_of(vec![ + ("title", scalar_str("hello")), + ("rendered", map_of(vec![("navbar", scalar_str("a"))])), + ]); + let meta_b = map_of(vec![ + ("title", scalar_str("hello")), + ("rendered", map_of(vec![("navbar", scalar_str("b"))])), + ]); + let blocks: Vec = vec![]; + let point = find_first_divergence(&blocks, &meta_a, &blocks, &meta_b); + assert_eq!(point, DivergencePoint::None); + } } diff --git a/crates/quarto-ast-reconcile/src/lib.rs b/crates/quarto-ast-reconcile/src/lib.rs index 3d6e33a4f..98abb9083 100644 --- a/crates/quarto-ast-reconcile/src/lib.rs +++ b/crates/quarto-ast-reconcile/src/lib.rs @@ -27,8 +27,10 @@ pub mod types; pub use apply::apply_reconciliation; pub use compute::{compute_reconciliation, compute_reconciliation_for_blocks}; pub use hash::{ - HashCache, compute_block_hash_fresh, compute_blocks_hash_fresh, compute_inline_hash_fresh, - structural_eq_block, structural_eq_blocks, structural_eq_inline, structural_eq_inlines, + DivergencePoint, HashCache, compute_block_hash_fresh, compute_blocks_hash_fresh, + compute_inline_hash_fresh, compute_meta_hash_fresh, compute_meta_hash_fresh_excluding_rendered, + find_first_divergence, structural_eq_block, structural_eq_blocks, structural_eq_inline, + structural_eq_inlines, }; pub use remap::remap_file_ids; pub use types::{ diff --git a/crates/quarto-core/Cargo.toml b/crates/quarto-core/Cargo.toml index 8c4f8978f..0a4f3d0a2 100644 --- a/crates/quarto-core/Cargo.toml +++ b/crates/quarto-core/Cargo.toml @@ -18,6 +18,7 @@ tokio-util.workspace 
= true pollster.workspace = true serde_json.workspace = true yaml-rust2.workspace = true +smallvec.workspace = true hashlink = "0.11" pathdiff = "0.2" sha2 = "0.11" diff --git a/crates/quarto-core/src/crossref/mod.rs b/crates/quarto-core/src/crossref/mod.rs index e4d66d9d2..8c4b8e8d5 100644 --- a/crates/quarto-core/src/crossref/mod.rs +++ b/crates/quarto-core/src/crossref/mod.rs @@ -89,4 +89,29 @@ pub const EQUATION: &str = "Equation"; /// Produced by `CrossrefResolveTransform` when it rewrites a `Cite` whose id /// classifies as a crossref (per [`RefTypeRegistry`]). Back-end renderers /// convert this into a format-specific link or reference. +/// +/// Kept in lockstep with +/// [`quarto_pandoc_types::ATOMIC_CUSTOM_NODES`] — the q2-preview incremental +/// writer treats this type_name as atomic. A cross-check test below pins +/// the two literals together. pub const CROSSREF_RESOLVED_REF: &str = "CrossrefResolvedRef"; + +#[cfg(test)] +mod atomic_lockstep_tests { + use super::CROSSREF_RESOLVED_REF; + + /// Pin that the `CROSSREF_RESOLVED_REF` literal here matches the entry + /// in `quarto_pandoc_types::ATOMIC_CUSTOM_NODES`. If either string + /// changes, the writer's atomicity gate silently mis-fires; this test + /// fails noisily. + #[test] + fn crossref_resolved_ref_is_in_atomic_registry() { + assert!( + quarto_pandoc_types::ATOMIC_CUSTOM_NODES.contains(&CROSSREF_RESOLVED_REF), + "CROSSREF_RESOLVED_REF (`{}`) must appear in \ + quarto_pandoc_types::ATOMIC_CUSTOM_NODES; the q2-preview \ + writer relies on the lockstep.", + CROSSREF_RESOLVED_REF + ); + } +} diff --git a/crates/quarto-core/src/project/pass2_renderer.rs b/crates/quarto-core/src/project/pass2_renderer.rs index 04f88afc5..f8a4b58eb 100644 --- a/crates/quarto-core/src/project/pass2_renderer.rs +++ b/crates/quarto-core/src/project/pass2_renderer.rs @@ -371,6 +371,14 @@ pub struct RenderToHtmlRenderer { /// will be constructed with this root. 
vfs_root: std::path::PathBuf, + /// bd-rz2we: when set, the per-page resolver is built with + /// [`ResourceResolverContext::vfs_root_with_url_root`] using + /// this string as the URL prefix while `vfs_root` keeps acting + /// as the disk-write root. `None` keeps today's behavior + /// (URL root derived from `vfs_root`). Used by native test + /// helpers so rendered URLs don't capture the host's tempdir. + vfs_url_root: Option, + /// Optional user-grammar provider attached by the caller. Shared /// across every page the renderer touches (one /// `RenderToHtmlRenderer` may produce many pages in `ActivePage` @@ -386,6 +394,7 @@ impl RenderToHtmlRenderer { pub fn new(vfs_root: impl Into) -> Self { Self { vfs_root: vfs_root.into(), + vfs_url_root: None, user_grammars: None, } } @@ -401,6 +410,25 @@ impl RenderToHtmlRenderer { self.user_grammars = Some(provider); self } + + /// bd-rz2we: override the URL prefix used for resolved-asset + /// links/srcs. Disk writes still go through `vfs_root` (a real + /// tempdir in native test runs); only the URL strings embedded + /// in HTML change. Used by native test helpers so rendered + /// output doesn't leak the host's tempdir. + pub fn with_url_root(mut self, url_root: impl Into) -> Self { + self.vfs_url_root = Some(url_root.into()); + self + } + + fn build_resolver(&self) -> ResourceResolverContext { + match &self.vfs_url_root { + Some(url) => { + ResourceResolverContext::vfs_root_with_url_root(self.vfs_root.clone(), url.clone()) + } + None => ResourceResolverContext::vfs_root(self.vfs_root.clone()), + } + } } #[async_trait(?Send)] @@ -434,7 +462,9 @@ impl Pass2Renderer for RenderToHtmlRenderer { // URLs land under `/.quarto/project-artifacts/...` (the // post-processor reads from VFS at the matching path); see // Phase 5 sub-plan §"`ResourceResolverContext::vfs_root`". 
- let resolver = ResourceResolverContext::vfs_root(self.vfs_root.clone()); + // bd-rz2we: native test helpers can override the URL prefix + // via `with_url_root` to keep rendered URLs path-independent. + let resolver = self.build_resolver(); let binaries = BinaryDependencies::new(); let options = RenderOptions { @@ -549,7 +579,7 @@ impl Pass2Renderer for RenderToHtmlRenderer { // already embeds in HTML. `lib_dir` is intentionally // ignored — the post-processor just needs to find the // bytes at the URL's path. - ResourceResolverContext::vfs_root(self.vfs_root.clone()) + self.build_resolver() } } @@ -573,6 +603,14 @@ pub struct RenderToPreviewAstRenderer { /// Synthetic VFS root under which every artifact lives in WASM. /// Same semantics as [`RenderToHtmlRenderer::new`]. vfs_root: std::path::PathBuf, + /// bd-rz2we: when set, the per-page resolver is built with + /// [`ResourceResolverContext::vfs_root_with_url_root`] using + /// this string as the URL prefix while `vfs_root` keeps acting + /// as the disk-write root. `None` keeps today's behavior + /// (URL root derived from `vfs_root`). Used by native test + /// helpers (idempotence harness) so rendered URLs don't + /// capture the host's tempdir. + vfs_url_root: Option, /// bd-lucp: optional engine-execution capture used to splice /// recorded engine output into the AST at preview time. Plumbed /// through to [`crate::pipeline::render_qmd_to_preview_ast`] on @@ -597,6 +635,7 @@ impl RenderToPreviewAstRenderer { pub fn new(vfs_root: impl Into) -> Self { Self { vfs_root: vfs_root.into(), + vfs_url_root: None, attribution_json: None, capture: None, } @@ -629,6 +668,26 @@ impl RenderToPreviewAstRenderer { self.attribution_json = Some(json); self } + + /// bd-rz2we: override the URL prefix used for resolved-asset + /// links/srcs and cross-page links. Disk writes still go + /// through `vfs_root` (a real tempdir in native test runs); + /// only the URL strings embedded in the rendered AST change. 
+ /// Used by native test helpers so rendered AST is + /// path-independent across runs. + pub fn with_url_root(mut self, url_root: impl Into) -> Self { + self.vfs_url_root = Some(url_root.into()); + self + } + + fn build_resolver(&self) -> ResourceResolverContext { + match &self.vfs_url_root { + Some(url) => { + ResourceResolverContext::vfs_root_with_url_root(self.vfs_root.clone(), url.clone()) + } + None => ResourceResolverContext::vfs_root(self.vfs_root.clone()), + } + } } #[async_trait(?Send)] @@ -658,7 +717,10 @@ impl Pass2Renderer for RenderToPreviewAstRenderer { )) })?; - let resolver = ResourceResolverContext::vfs_root(self.vfs_root.clone()); + // bd-rz2we: native test helpers can override the URL prefix + // via `with_url_root` so rendered AST link/asset URLs stay + // path-independent across runs in different tempdirs. + let resolver = self.build_resolver(); let binaries = BinaryDependencies::new(); let options = RenderOptions { @@ -795,6 +857,6 @@ impl Pass2Renderer for RenderToPreviewAstRenderer { // (which runs in the q2-preview pipeline) embeds image URLs // using this resolver, so the iframe sees URLs that resolve // to the matching VFS path. - ResourceResolverContext::vfs_root(self.vfs_root.clone()) + self.build_resolver() } } diff --git a/crates/quarto-core/src/resource_resolver.rs b/crates/quarto-core/src/resource_resolver.rs index 04d654dec..1f82f898f 100644 --- a/crates/quarto-core/src/resource_resolver.rs +++ b/crates/quarto-core/src/resource_resolver.rs @@ -35,6 +35,37 @@ use std::path::{Path, PathBuf}; use crate::artifact::ArtifactScope; +/// VFS-root resolver state. Splits the two roles a single +/// `PathBuf` used to play (bd-rz2we): the **disk-write root** +/// (where `runtime.file_write` and `OutputSink::allowed_roots` +/// land) and the **URL root** (what gets embedded in HTML +/// link/asset URLs). 
+/// +/// Production WASM constructs this via [`ResourceResolverContext::vfs_root`] +/// with the two fields populated from one path — they're +/// intentionally identical, since the WASM runtime serves the +/// synthetic VFS path from memory. Native test helpers construct +/// it via [`ResourceResolverContext::vfs_root_with_url_root`] +/// with a real tempdir for `write_root` and the synthetic +/// `/.quarto/project-artifacts` string for `url_root`, so that +/// `runtime.file_write` actually succeeds while rendered AST/HTML +/// stays path-independent (idempotent across runs in different +/// tempdirs). +#[derive(Debug, Clone)] +struct VfsRootMode { + /// Absolute disk path. `runtime.file_write` and + /// `OutputSink::allowed_roots` use this. In WASM this is a + /// synthetic VFS path (the runtime serves it from memory); in + /// native tests it's a real tempdir subdirectory. + write_root: PathBuf, + /// URL prefix embedded in HTML links / asset srcs. In WASM + /// this matches `write_root` by construction. In native tests + /// it's a fixed synthetic string (e.g. + /// `/.quarto/project-artifacts`) so URLs don't capture the + /// host machine's tempdir. + url_root: String, +} + /// Per-page context for resolving artifact paths and URLs. /// /// All paths are absolute and pre-normalized; the resolver does @@ -56,12 +87,14 @@ pub struct ResourceResolverContext { lib_dir: String, /// Per-page resource directory name (e.g. `"api_files"`). page_files_dir: String, - /// When `Some(root)`, the resolver is in **VFS-root mode**: - /// every artifact resolves to `{root}/{artifact_path}` for - /// both the on-disk path and the HTML URL, regardless of - /// scope. Used by the WASM hub-client where the runtime - /// serves files from a synthetic absolute path. 
- vfs_root_mode: Option, + /// When `Some(_)`, the resolver is in **VFS-root mode**: every + /// artifact resolves to `{write_root}/{artifact_path}` on disk + /// and `{url_root}/{artifact_path}` in HTML, regardless of + /// scope. Used by the WASM hub-client (write_root == url_root) + /// and by native test helpers (write_root is a tempdir, + /// url_root is a synthetic string for idempotence). See + /// [`VfsRootMode`]. + vfs_root_mode: Option, } impl ResourceResolverContext { @@ -132,18 +165,54 @@ impl ResourceResolverContext { /// The browser fetches the URL absolute, the runtime serves /// it from VFS at the matching synthetic path. No relative- /// path computation needed because the URLs are absolute. + /// + /// Single-arg form: `write_root == url_root`. Preserves the + /// pinned contract that VFS-mode URLs and on-disk paths are + /// byte-identical (see + /// `website_post_render::vfs_root_resolver_url_matches_on_disk_path`). pub fn vfs_root(vfs_root: impl Into) -> Self { - let root = vfs_root.into(); + let root: PathBuf = vfs_root.into(); + let url_root = root.to_string_lossy().replace('\\', "/"); + Self::vfs_root_with_url_root(root, url_root) + } + + /// Two-arg VFS-root constructor (bd-rz2we): decouple the + /// disk-write root from the URL prefix. + /// + /// - `write_root` is the absolute on-disk path + /// `runtime.file_write` and `OutputSink::allowed_roots` use. + /// In native test runs this is a real tempdir subdirectory. + /// - `url_root` is the URL prefix embedded in HTML links and + /// asset srcs. In native test runs this is a synthetic + /// string (e.g. `"/.quarto/project-artifacts"`) so rendered + /// AST/HTML is independent of the host's tempdir layout. + /// + /// Production WASM doesn't call this directly — it calls + /// [`Self::vfs_root`] with one path that's used for both + /// roles. 
The two-arg form exists for in-process native + /// callers of the q2-preview / WASM-style renderers + /// (`RenderToPreviewAstRenderer::with_url_root`, + /// `RenderToHtmlRenderer::with_url_root`) so their integration + /// tests get byte-identical AST output across runs. + pub fn vfs_root_with_url_root( + write_root: impl Into, + url_root: impl Into, + ) -> Self { + let write_root: PathBuf = write_root.into(); + let url_root: String = url_root.into(); Self { - page_output: root.join("__page__.html"), - site_root: root.clone(), + page_output: write_root.join("__page__.html"), + site_root: write_root.clone(), // Empty lib_dir on its own would route Project to // page_files_dir; we override scope_root to ignore // both fields when the resolver is in vfs-root mode - // (see the `vfs_root_mode` flag below). + // (see the `vfs_root_mode` field below). lib_dir: String::new(), page_files_dir: String::new(), - vfs_root_mode: Some(root), + vfs_root_mode: Some(VfsRootMode { + write_root, + url_root, + }), } } @@ -182,8 +251,8 @@ impl ResourceResolverContext { /// - An absolute URL of the form `/{vfs_root}/{artifact_path}` /// (VFS-root mode — used by the WASM hub-client). pub fn html_url_for(&self, scope: ArtifactScope, artifact_path: &Path) -> String { - if let Some(root) = &self.vfs_root_mode { - return rel_to_url(&root.join(artifact_path)); + if let Some(mode) = &self.vfs_root_mode { + return join_url_root(&mode.url_root, artifact_path); } let target = self.on_disk_path_for(scope, artifact_path); let page_dir = self.page_output.parent().unwrap_or_else(|| Path::new(".")); @@ -208,8 +277,8 @@ impl ResourceResolverContext { /// `{site_root}/{target_output_href}`. For single-doc renders /// this collapses to the input (since `site_root == page_dir`). 
pub fn page_url_for(&self, target_output_href: &str) -> String { - if let Some(root) = &self.vfs_root_mode { - return rel_to_url(&root.join(target_output_href)); + if let Some(mode) = &self.vfs_root_mode { + return join_url_root(&mode.url_root, Path::new(target_output_href)); } let target_abs = self.site_root.join(target_output_href); let page_dir = self.page_output.parent().unwrap_or_else(|| Path::new(".")); @@ -248,8 +317,8 @@ impl ResourceResolverContext { /// the resolver-side half of bd-cfl67) is then refused by the /// sink rather than written. pub fn allowed_output_roots(&self) -> Vec { - if let Some(root) = &self.vfs_root_mode { - return vec![root.clone()]; + if let Some(mode) = &self.vfs_root_mode { + return vec![mode.write_root.clone()]; } vec![self.site_root.clone()] } @@ -306,8 +375,8 @@ impl ResourceResolverContext { "artifact path must be relative (got {}); root-prefixed paths bypass scope_root and risk overwriting source files (bd-cfl67)", artifact_path.display(), ); - if let Some(root) = &self.vfs_root_mode { - return root.join(artifact_path); + if let Some(mode) = &self.vfs_root_mode { + return mode.write_root.join(artifact_path); } let scope_root = self.scope_root(scope); scope_root.join(artifact_path) @@ -336,6 +405,24 @@ impl ResourceResolverContext { } } +/// Build a `{url_root}/{artifact_path}` URL string in VFS-root +/// mode. `url_root` is taken verbatim (no path manipulation — +/// the WASM contract is that it stays byte-identical to the +/// disk path; native tests pass a synthetic string). The +/// artifact path is rendered with forward-slash separators +/// regardless of host OS. 
+fn join_url_root(url_root: &str, artifact_path: &Path) -> String { + let suffix = artifact_path.to_string_lossy().replace('\\', "/"); + if suffix.is_empty() { + return url_root.to_string(); + } + if url_root.ends_with('/') || suffix.starts_with('/') { + format!("{}{}", url_root, suffix) + } else { + format!("{}/{}", url_root, suffix) + } +} + /// Render a relative path as a forward-slash URL string. On /// Windows, `pathdiff` may yield backslash separators; HTML /// always wants forward slashes. @@ -697,4 +784,29 @@ mod tests { let url = r.html_url_for(ArtifactScope::Project, Path::new("styles.css")); assert_eq!(url, on_disk.to_string_lossy().replace('\\', "/")); } + + /// bd-rz2we: the two-arg VFS-root constructor decouples the + /// disk-write root (where the runtime actually puts bytes) from + /// the URL prefix embedded in HTML. Native test helpers pass a + /// real tempdir for the write root and a synthetic string for + /// the URL root, so rendered AST/HTML is path-independent + /// (idempotent across runs in different tempdirs) while + /// `runtime.file_write` still succeeds against a real disk path. + #[test] + fn resolver_vfs_root_with_url_root_splits_write_and_url() { + let r = ResourceResolverContext::vfs_root_with_url_root( + "/tmp/abc", + "/.quarto/project-artifacts", + ); + // URL side uses url_root. + let url = r.html_url_for(ArtifactScope::Project, Path::new("styles.css")); + assert_eq!(url, "/.quarto/project-artifacts/styles.css"); + let page_url = r.page_url_for("about.html"); + assert_eq!(page_url, "/.quarto/project-artifacts/about.html"); + // Disk side uses write_root. + let on_disk = r.on_disk_path_for(ArtifactScope::Project, Path::new("styles.css")); + assert_eq!(on_disk, PathBuf::from("/tmp/abc/styles.css")); + // allowed_output_roots tracks the write side. 
+ assert_eq!(r.allowed_output_roots(), vec![PathBuf::from("/tmp/abc")]); + } } diff --git a/crates/quarto-core/src/stage/stages/apply_template.rs b/crates/quarto-core/src/stage/stages/apply_template.rs index 32e322c02..cfc2f6ab1 100644 --- a/crates/quarto-core/src/stage/stages/apply_template.rs +++ b/crates/quarto-core/src/stage/stages/apply_template.rs @@ -817,19 +817,9 @@ mod tests { .or(diag.location.as_ref()) .expect("diagnostic should carry a SourceInfo location"); - fn root_file_id(info: &quarto_source_map::SourceInfo) -> Option { - match info { - quarto_source_map::SourceInfo::Original { file_id, .. } => Some(*file_id), - quarto_source_map::SourceInfo::Substring { parent, .. } => root_file_id(parent), - quarto_source_map::SourceInfo::Concat { pieces } => { - pieces.first().and_then(|p| root_file_id(&p.source_info)) - } - quarto_source_map::SourceInfo::FilterProvenance { .. } => None, - } - } - - let file_id = - root_file_id(location).expect("diagnostic location should have a resolvable FileId"); + let file_id = location + .root_file_id() + .expect("diagnostic location should have a resolvable FileId"); let file = result .source_context .get_file(file_id) diff --git a/crates/quarto-core/src/stage/stages/engine_execution.rs b/crates/quarto-core/src/stage/stages/engine_execution.rs index a93274222..f1460ebfc 100644 --- a/crates/quarto-core/src/stage/stages/engine_execution.rs +++ b/crates/quarto-core/src/stage/stages/engine_execution.rs @@ -814,39 +814,25 @@ mod tests { pandoc: &quarto_pandoc_types::pandoc::Pandoc, ) -> std::collections::HashSet { use quarto_pandoc_types::{Block, Inline}; - use quarto_source_map::{FileId, SourceInfo}; + use quarto_source_map::FileId; - fn walk_source_info(si: &SourceInfo, out: &mut std::collections::HashSet) { - match si { - SourceInfo::Original { file_id, .. } => { - out.insert(*file_id); - } - SourceInfo::Substring { parent, .. 
} => walk_source_info(parent, out), - SourceInfo::Concat { pieces } => { - for p in pieces { - walk_source_info(&p.source_info, out); - } - } - SourceInfo::FilterProvenance { .. } => {} - } - } fn walk_inline(i: &Inline, out: &mut std::collections::HashSet) { match i { - Inline::Str(x) => walk_source_info(&x.source_info, out), + Inline::Str(x) => x.source_info.collect_file_ids(out), Inline::Emph(x) => { for c in &x.content { walk_inline(c, out); } - walk_source_info(&x.source_info, out); + x.source_info.collect_file_ids(out); } Inline::Strong(x) => { for c in &x.content { walk_inline(c, out); } - walk_source_info(&x.source_info, out); + x.source_info.collect_file_ids(out); } - Inline::Space(x) => walk_source_info(&x.source_info, out), - Inline::SoftBreak(x) => walk_source_info(&x.source_info, out), + Inline::Space(x) => x.source_info.collect_file_ids(out), + Inline::SoftBreak(x) => x.source_info.collect_file_ids(out), _ => { // Other variants not needed for this test. Add as needed. } @@ -858,19 +844,19 @@ mod tests { for i in &p.content { walk_inline(i, out); } - walk_source_info(&p.source_info, out); + p.source_info.collect_file_ids(out); } Block::Header(h) => { for i in &h.content { walk_inline(i, out); } - walk_source_info(&h.source_info, out); + h.source_info.collect_file_ids(out); } Block::Div(d) => { for b in &d.content { walk_block(b, out); } - walk_source_info(&d.source_info, out); + d.source_info.collect_file_ids(out); } _ => { // Other block types not needed for this test. 
diff --git a/crates/quarto-core/src/transforms/appendix.rs b/crates/quarto-core/src/transforms/appendix.rs index 9ac5379f1..b374e48d2 100644 --- a/crates/quarto-core/src/transforms/appendix.rs +++ b/crates/quarto-core/src/transforms/appendix.rs @@ -49,7 +49,8 @@ use quarto_pandoc_types::attr::AttrSourceInfo; use quarto_pandoc_types::block::{Block, Div, Header, Paragraph}; use quarto_pandoc_types::inline::{Inline, Link, Str}; use quarto_pandoc_types::pandoc::Pandoc; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::smallvec; use quarto_pandoc_types::ConfigValue; @@ -227,7 +228,10 @@ fn extract_footnotes(blocks: &mut Vec) -> Option { /// Wrap bibliography in a section with appropriate attributes. fn wrap_bibliography(bibliography: Block) -> Block { - let source_info = SourceInfo::default(); + let source_info = SourceInfo::Generated { + by: By::appendix(), + from: smallvec![], + }; // Create header for the bibliography section let header = Block::Header(Header { @@ -262,7 +266,10 @@ fn create_appendix_container(sections: Blocks, style_class: &str) -> Block { LinkedHashMap::new(), ), content: sections, - source_info: SourceInfo::default(), + source_info: SourceInfo::Generated { + by: By::appendix(), + from: smallvec![], + }, attr_source: AttrSourceInfo::empty(), }) } @@ -283,7 +290,10 @@ fn create_license_section(meta: &ConfigValue) -> Option { .map(|s| s.to_string())? }; - let source_info = SourceInfo::default(); + let source_info = SourceInfo::Generated { + by: By::appendix(), + from: smallvec![], + }; let header = Block::Header(Header { level: 2, @@ -332,7 +342,10 @@ fn create_copyright_section(meta: &ConfigValue) -> Option { .map(|s| s.to_string())? 
}; - let source_info = SourceInfo::default(); + let source_info = SourceInfo::Generated { + by: By::appendix(), + from: smallvec![], + }; let header = Block::Header(Header { level: 2, @@ -373,7 +386,10 @@ fn create_citation_section(meta: &ConfigValue) -> Option { // It can have various formats - for now, look for a "url" or create a simple reference let citation_url = citation.get("url").and_then(|v| v.as_str()); - let source_info = SourceInfo::default(); + let source_info = SourceInfo::Generated { + by: By::appendix(), + from: smallvec![], + }; let header = Block::Header(Header { level: 2, @@ -878,4 +894,21 @@ mod tests { panic!("Expected appendix Div"); } } + + #[test] + fn test_create_appendix_container_has_generated_provenance() { + // Plan 6: the synthesized appendix container Div carries + // Generated { by: appendix(), from: [] }. + let block = create_appendix_container(vec![], "default"); + let Block::Div(div) = &block else { + panic!("Expected Div"); + }; + match &div.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "appendix"); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + } } diff --git a/crates/quarto-core/src/transforms/code_block_generate.rs b/crates/quarto-core/src/transforms/code_block_generate.rs index 06a4f0cf9..71d69050b 100644 --- a/crates/quarto-core/src/transforms/code_block_generate.rs +++ b/crates/quarto-core/src/transforms/code_block_generate.rs @@ -783,10 +783,7 @@ mod tests { let concat = SourceInfo::Concat { pieces: vec![] }; assert!(CodeBlockDecorationKey::from_source_info(&concat).is_none()); - let filter = SourceInfo::FilterProvenance { - filter_path: "fixture.lua".into(), - line: 1, - }; + let filter = SourceInfo::generated(quarto_source_map::By::filter("fixture.lua", 1)); assert!(CodeBlockDecorationKey::from_source_info(&filter).is_none()); } diff --git a/crates/quarto-core/src/transforms/footnotes.rs b/crates/quarto-core/src/transforms/footnotes.rs index 
024f5bebb..f84572281 100644 --- a/crates/quarto-core/src/transforms/footnotes.rs +++ b/crates/quarto-core/src/transforms/footnotes.rs @@ -50,7 +50,8 @@ use quarto_pandoc_types::block::{Block, Div, OrderedList, Paragraph}; use quarto_pandoc_types::inline::{Inline, Link, Span, Str, Superscript}; use quarto_pandoc_types::pandoc::Pandoc; use quarto_pandoc_types::{Blocks, Inlines, ListNumberDelim, ListNumberStyle}; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::smallvec; use quarto_pandoc_types::ConfigValue; @@ -492,7 +493,14 @@ fn create_footnote_ref(number: usize, source_info: &SourceInfo, is_margin: bool) /// /// ``` fn create_footnotes_section(footnotes: &[CollectedFootnote]) -> Block { - let source_info = SourceInfo::default(); + // The synthesized container chrome (section Div, embedded
, and the + // OrderedList wrapping the footnote items) is pure synthesis: it + // corresponds to no source bytes. The footnote content inside (created + // by `create_footnote_item`) retains the original Note's source_info. + let source_info = SourceInfo::Generated { + by: By::footnotes(), + from: smallvec![], + }; // Create list items for each footnote let list_items: Vec = footnotes @@ -1061,4 +1069,31 @@ mod tests { // Check footnotes section exists assert!(matches!(ast.blocks[1], Block::Div(_))); } + + #[test] + fn test_create_footnotes_section_has_generated_provenance() { + // Plan 6: the synthesized footnotes container Div (and its embedded + // chrome — HorizontalRule, OrderedList) carry + // Generated { by: footnotes(), from: [] }. The footnote *items* + // inside retain the original Note's source_info via + // create_footnote_item. + let block = create_footnotes_section(&[]); + let Block::Div(div) = &block else { + panic!("Expected Div"); + }; + match &div.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "footnotes"); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + // The embedded HorizontalRule chrome carries the same shape. + let Block::HorizontalRule(hr) = &div.content[0] else { + panic!("Expected HorizontalRule"); + }; + assert!( + matches!(&hr.source_info, SourceInfo::Generated { by, .. } if by.kind == "footnotes") + ); + } } diff --git a/crates/quarto-core/src/transforms/proof.rs b/crates/quarto-core/src/transforms/proof.rs index 137b328f9..a81d4c65d 100644 --- a/crates/quarto-core/src/transforms/proof.rs +++ b/crates/quarto-core/src/transforms/proof.rs @@ -132,8 +132,8 @@ fn empty_attr() -> Attr { fn convert_div(mut div: Div) -> CustomNode { // Extract title: `name=` attribute, then first Header. Same rule as // theorem sugar. 
- let title: Option = - extract_name_attr(&mut div.attr).or_else(|| extract_first_header_title(&mut div.content)); + let title: Option = extract_name_attr(&mut div.attr, &div.attr_source) + .or_else(|| extract_first_header_title(&mut div.content)); // Strip the `.proof` class so a later "match div.proof" filter // doesn't double-apply (same pattern as theorem sugar). @@ -155,8 +155,30 @@ fn convert_div(mut div: Div) -> CustomNode { node } -fn extract_name_attr(attr: &mut Attr) -> Option { +/// Read and remove the `name` attribute from `attr`. See +/// `crate::transforms::theorem::extract_name_attr` for the +/// positional-alignment rationale (this is the parallel implementation +/// for `.proof` Divs). +fn extract_name_attr(attr: &mut Attr, attr_source: &AttrSourceInfo) -> Option { let (_id, _classes, kvs) = attr; + + let name_idx = kvs.keys().position(|k| k == "name")?; + + // See `theorem::extract_name_attr` — empty attr_source signals + // "no provenance available" (common in tests); only assert on + // populated-but-misaligned input. 
+ debug_assert!( + attr_source.attributes.is_empty() || kvs.len() == attr_source.attributes.len(), + "AttrSourceInfo.attributes is out of sync with Attr.2 (bd-3aolj / bd-1e6a5): kvs={}, attr_source={}", + kvs.len(), + attr_source.attributes.len(), + ); + let value_source = if kvs.len() == attr_source.attributes.len() { + attr_source.attributes[name_idx].1.clone() + } else { + None + }; + let name = kvs.remove("name")?; if name.is_empty() { return None; @@ -164,7 +186,7 @@ fn extract_name_attr(attr: &mut Attr) -> Option { Some(vec![quarto_pandoc_types::inline::Inline::Str( quarto_pandoc_types::inline::Str { text: name, - source_info: quarto_source_map::SourceInfo::default(), + source_info: value_source.unwrap_or_default(), }, )]) } diff --git a/crates/quarto-core/src/transforms/shortcode_resolve.rs b/crates/quarto-core/src/transforms/shortcode_resolve.rs index cc9d5e8f5..58f076fbd 100644 --- a/crates/quarto-core/src/transforms/shortcode_resolve.rs +++ b/crates/quarto-core/src/transforms/shortcode_resolve.rs @@ -41,7 +41,8 @@ use quarto_pandoc_types::inline::{ use quarto_pandoc_types::pandoc::Pandoc; use quarto_pandoc_types::shortcode::{Shortcode, ShortcodeArg}; use quarto_pandoc_types::table::Table; -use quarto_source_map::SourceInfo; +use quarto_source_map::{Anchor, By, SourceInfo}; +use smallvec::smallvec; use std::future::Future; use std::path::PathBuf; @@ -303,12 +304,38 @@ impl ShortcodeResolveTransform { /// Resolve a shortcode using the appropriate handler. /// /// Priority: built-in Rust handlers > loaded Lua handlers > extension name lookup. + /// + /// All `ShortcodeResult::Inlines`/`Blocks` outcomes flow through this single + /// funnel and are post-walked by `stamp_shortcode_anchors`, which stamps each + /// returned node with `Generated { by: shortcode(name), from: [Invocation -> ctx.source_info] }` + /// (and enriches any Lua filter-attached source_info). 
`Preserve` and `Error` + /// outcomes do not need stamping — `Preserve` becomes a literal Str via + /// `shortcode_to_literal` and `Error` becomes a visible error via + /// `make_error_inline`; both sites carry the token's `Original` source_info + /// directly. async fn resolve_shortcode( &self, shortcode: &Shortcode, ctx: &ShortcodeContext<'_>, resolution_ctx: ResolutionContext, lua_engine: &mut Option, + ) -> ShortcodeResult { + let mut result = self + .dispatch_shortcode(shortcode, ctx, resolution_ctx, lua_engine) + .await; + stamp_shortcode_anchors(&mut result, &shortcode.name, ctx.source_info); + result + } + + /// Inner dispatch — picks the handler and returns the raw result. Wrapped by + /// [`resolve_shortcode`], which post-walks the result to stamp Invocation + /// anchors. + async fn dispatch_shortcode( + &self, + shortcode: &Shortcode, + ctx: &ShortcodeContext<'_>, + resolution_ctx: ResolutionContext, + lua_engine: &mut Option, ) -> ShortcodeResult { // Handle escaped shortcodes - preserve as literal text if shortcode.is_escaped { @@ -483,6 +510,292 @@ fn lua_result_to_shortcode_result( } } +/// After every shortcode handler dispatch, stamp Invocation provenance on the +/// returned nodes. Recurses into nested AST so every block and inline gets the +/// anchor. +/// +/// Enrichment rules (per Plan 6 §"Lua-shortcode enrichment"): +/// - If the existing source_info is `Generated { by: filter, ... }` (Lua's +/// `filter_source_info` auto-attach), promote `by.kind` to `"shortcode"` and +/// move the `filter_path`/`line` data fields into `lua_path`/`lua_line`, +/// then append the Invocation anchor. +/// - Otherwise, replace with a fresh `Generated { by: shortcode(name), +/// from: [Invocation] }`. 
+fn stamp_shortcode_anchors( + result: &mut ShortcodeResult, + shortcode_name: &str, + token_si: &SourceInfo, +) { + let token_arc = Arc::new(token_si.clone()); + match result { + ShortcodeResult::Inlines(inlines) => { + for inline in inlines.iter_mut() { + stamp_inline(inline, shortcode_name, &token_arc); + } + } + ShortcodeResult::Blocks(blocks) => { + for block in blocks.iter_mut() { + stamp_block(block, shortcode_name, &token_arc); + } + } + ShortcodeResult::Preserve | ShortcodeResult::Error(_) => {} + } +} + +/// Stamp the Invocation anchor on a single inline and recurse into its children. +fn stamp_inline(inline: &mut Inline, name: &str, token_arc: &Arc) { + let new_si = enrich_or_create(inline.source_info(), name, token_arc); + *inline.source_info_mut() = new_si; + match inline { + Inline::Emph(Emph { content, .. }) + | Inline::Underline(Underline { content, .. }) + | Inline::Strong(Strong { content, .. }) + | Inline::Strikeout(Strikeout { content, .. }) + | Inline::Superscript(Superscript { content, .. }) + | Inline::Subscript(Subscript { content, .. }) + | Inline::SmallCaps(SmallCaps { content, .. }) + | Inline::Insert(Insert { content, .. }) + | Inline::Delete(Delete { content, .. }) + | Inline::Highlight(Highlight { content, .. }) + | Inline::Quoted(Quoted { content, .. }) + | Inline::Cite(Cite { content, .. }) + | Inline::Link(Link { content, .. }) + | Inline::Image(Image { content, .. }) + | Inline::Span(Span { content, .. }) + | Inline::EditComment(EditComment { content, .. }) => { + for child in content.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + Inline::Note(Note { content, .. 
}) => { + for child in content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + Inline::Custom(custom) => { + for slot in custom.slots.values_mut() { + match slot { + quarto_pandoc_types::custom::Slot::Inline(i) => { + stamp_inline(i, name, token_arc); + } + quarto_pandoc_types::custom::Slot::Inlines(is) => { + for child in is.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + quarto_pandoc_types::custom::Slot::Block(b) => { + stamp_block(b, name, token_arc); + } + quarto_pandoc_types::custom::Slot::Blocks(bs) => { + for child in bs.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + } + } + // Leaves — no nested AST to walk. + Inline::Str(_) + | Inline::Code(_) + | Inline::Space(_) + | Inline::SoftBreak(_) + | Inline::LineBreak(_) + | Inline::Math(_) + | Inline::RawInline(_) + | Inline::Shortcode(_) + | Inline::NoteReference(_) + | Inline::Attr(_) => {} + } +} + +/// Stamp the Invocation anchor on a single block and recurse into its children. +fn stamp_block(block: &mut Block, name: &str, token_arc: &Arc) { + let new_si = enrich_or_create(block.source_info(), name, token_arc); + *block.source_info_mut() = new_si; + match block { + Block::Plain(Plain { content, .. }) | Block::Paragraph(Paragraph { content, .. }) => { + for child in content.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + Block::LineBlock(LineBlock { content, .. }) => { + for line in content.iter_mut() { + for child in line.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + } + Block::Header(Header { content, .. }) => { + for child in content.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + Block::BlockQuote(BlockQuote { content, .. }) => { + for child in content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + Block::OrderedList(OrderedList { content, .. }) + | Block::BulletList(BulletList { content, .. 
}) => { + for item in content.iter_mut() { + for child in item.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + Block::DefinitionList(DefinitionList { content, .. }) => { + for (term, defs) in content.iter_mut() { + for child in term.iter_mut() { + stamp_inline(child, name, token_arc); + } + for def in defs.iter_mut() { + for child in def.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + } + Block::Figure(Figure { + content, caption, .. + }) => { + for child in content.iter_mut() { + stamp_block(child, name, token_arc); + } + if let Some(short) = caption.short.as_mut() { + for child in short.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + if let Some(long) = caption.long.as_mut() { + for child in long.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + Block::Div(Div { content, .. }) => { + for child in content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + Block::Table(Table { + caption, + head, + bodies, + foot, + .. 
+ }) => { + if let Some(short) = caption.short.as_mut() { + for child in short.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + if let Some(long) = caption.long.as_mut() { + for child in long.iter_mut() { + stamp_block(child, name, token_arc); + } + } + for row in head.rows.iter_mut() { + for cell in row.cells.iter_mut() { + for child in cell.content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + for body in bodies.iter_mut() { + for row in body.body.iter_mut() { + for cell in row.cells.iter_mut() { + for child in cell.content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + } + for row in foot.rows.iter_mut() { + for cell in row.cells.iter_mut() { + for child in cell.content.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + } + Block::Custom(custom) => { + for slot in custom.slots.values_mut() { + match slot { + quarto_pandoc_types::custom::Slot::Inline(i) => { + stamp_inline(i, name, token_arc); + } + quarto_pandoc_types::custom::Slot::Inlines(is) => { + for child in is.iter_mut() { + stamp_inline(child, name, token_arc); + } + } + quarto_pandoc_types::custom::Slot::Block(b) => { + stamp_block(b, name, token_arc); + } + quarto_pandoc_types::custom::Slot::Blocks(bs) => { + for child in bs.iter_mut() { + stamp_block(child, name, token_arc); + } + } + } + } + } + // Leaves — no nested AST to walk. + Block::CodeBlock(_) + | Block::RawBlock(_) + | Block::HorizontalRule(_) + | Block::BlockMetadata(_) + | Block::NoteDefinitionPara(_) + | Block::NoteDefinitionFencedBlock(_) + | Block::CaptionBlock(_) => {} + } +} + +/// Build the `SourceInfo` for a freshly-resolved shortcode node. +/// +/// If the existing source_info is `Generated { by: filter, ... }` (a Lua +/// auto-attach from `filter_source_info`), promote the kind to `"shortcode"` +/// and migrate the `filter_path`/`line` data fields into `lua_path`/`lua_line`, +/// preserving the Lua-side dispatch precision alongside the new shortcode +/// context. 
Otherwise, mint a fresh `Generated { by: shortcode(name), ... }`. +/// +/// In both branches, append an Invocation anchor pointing at the shortcode +/// token's source range (`token_arc`). +/// +/// NOTE: the `filter_path`/`line` reads below are temporary. When +/// **bd-36fr9** (Lua-file registration in `SourceContext`) lands, those +/// fields move out of `by.data` and into a typed `Dispatch` anchor inside +/// `from`. This branch will then read the existing Dispatch anchor and copy +/// it alongside the Invocation. +/// +/// NOTE: **bd-129m3** (ValueSource anchor stamping for `meta` / `var` +/// shortcodes) is the integration point for appending a second anchor +/// when the metadata loader threads per-key source-info through. +fn enrich_or_create(existing: &SourceInfo, name: &str, token_arc: &Arc) -> SourceInfo { + let by = match existing { + SourceInfo::Generated { by, .. } if by.kind == "filter" => { + let lua_path = by.data.get("filter_path").cloned(); + let lua_line = by.data.get("line").cloned(); + let mut data = serde_json::json!({ "name": name }); + if let Some(p) = lua_path { + data["lua_path"] = p; + } + if let Some(l) = lua_line { + data["lua_line"] = l; + } + By { + kind: "shortcode".to_string(), + data, + } + } + _ => By::shortcode(name), + }; + SourceInfo::Generated { + by, + from: smallvec![Anchor::invocation(Arc::clone(token_arc))], + } +} + /// Extract shortcode paths from merged metadata. 
/// /// After metadata merge, `meta["shortcodes"]` contains an array of paths @@ -656,7 +969,8 @@ fn resolve_blocks<'a>( } ShortcodeResult::Error(error) => { diagnostics.push(error.diagnostic); - let error_inline = make_error_inline(&error.key); + let error_inline = + make_error_inline(&error.key, &shortcode_owned.source_info); replace_shortcode_in_block(&mut blocks[i], vec![error_inline]); i += 1; continue; @@ -911,7 +1225,8 @@ fn resolve_inlines<'a>( // Emit diagnostic diagnostics.push(error.diagnostic); // Replace with visible error (TS Quarto style) - let error_inline = make_error_inline(&error.key); + let error_inline = + make_error_inline(&error.key, &shortcode_owned.source_info); inlines[i] = error_inline; i += 1; } @@ -1027,19 +1342,29 @@ fn recurse_inline<'a>( } /// Create visible error inline: Strong("?key") -fn make_error_inline(key: &str) -> Inline { +/// +/// Both the inner Str and outer Strong carry the shortcode token's original +/// `source_info` (not `Generated`). The error region is treated as normal +/// editable user-source content — Plan 7's `is_atomic_kind()` does not fire on +/// Original, so the incremental writer Verbatim-copies the original token +/// bytes on round-trip. The Strong-wraps-Str overlap is structurally parallel +/// to the footnote `` case (Plan 7 §footnotes). +fn make_error_inline(key: &str, token_source_info: &SourceInfo) -> Inline { Inline::Strong(Strong { content: vec![Inline::Str(Str { text: format!("?{}", key), - source_info: SourceInfo::default(), + source_info: token_source_info.clone(), })], - source_info: SourceInfo::default(), + source_info: token_source_info.clone(), }) } /// Convert an escaped shortcode to literal text. /// -/// For `{{{< meta title >}}}`, this produces `{{< meta title >}}` +/// For `{{{< meta title >}}}`, this produces `{{< meta title >}}`. 
+/// The resulting `Str` carries the shortcode token's original `source_info`
+/// (an Original), so Plan 7's `is_atomic_kind()` does not fire — round-trip
+/// through the incremental writer verbatim-copies the source bytes.
 fn shortcode_to_literal(shortcode: &Shortcode) -> Inline {
     let mut text = String::from("{{< ");
     text.push_str(&shortcode.name);
@@ -1106,7 +1431,7 @@ fn shortcode_to_literal(shortcode: &Shortcode) -> Inline {
 
     Inline::Str(Str {
         text,
-        source_info: SourceInfo::default(),
+        source_info: shortcode.source_info.clone(),
     })
 }
 
@@ -1329,12 +1654,16 @@ mod tests {
 
     #[test]
     fn test_make_error_inline() {
-        let inline = make_error_inline("meta:title");
+        let token_si = dummy_source_info();
+        let inline = make_error_inline("meta:title", &token_si);
         match inline {
             Inline::Strong(strong) => {
                 assert_eq!(strong.content.len(), 1);
+                // Both layers carry the token's source_info (not Default, not Generated).
+                assert_eq!(&strong.source_info, &token_si);
                 if let Inline::Str(s) = &strong.content[0] {
                     assert_eq!(s.text, "?meta:title");
+                    assert_eq!(&s.source_info, &token_si);
                 } else {
                     panic!("Expected Str inline");
                 }
@@ -1987,5 +2316,457 @@ mod tests {
             }
             assert!(ctx.diagnostics.is_empty());
         }
+
+        /// Plan 6 §"Lua-shortcode enrichment": when a Lua handler returns a
+        /// *typed* Inline (e.g. `pandoc.Str(...)`), the filter_source_info
+        /// auto-attach gives it `Generated { by: filter, data: { filter_path,
+        /// line } }`. The resolver's post-walk should then promote this to
+        /// `Generated { by: shortcode, data: { name, lua_path, lua_line },
+        /// from: [Invocation] }` — kind promoted, fields renamed, anchor
+        /// appended.
+        #[tokio::test]
+        async fn lua_shortcode_typed_return_enriched_to_shortcode_kind() {
+            let tmp = TempDir::new().unwrap();
+            // Note: pandoc.Str(...) returns a typed Lua userdata that the
+            // Lua engine's filter_source_info auto-attach picks up.
+            let script_path = write_lua_script(
+                tmp.path(),
+                "typed.lua",
+                r#"return { typed = function(args) return pandoc.Str("Hello typed") end }"#,
+            );
+
+            let runtime = make_runtime();
+            let transform = ShortcodeResolveTransform::with_lua_support(
+                vec![script_path.clone()],
+                Vec::new(),
+                runtime,
+                "html".to_string(),
+            );
+
+            let tok = token_si();
+            let mut ast = Pandoc {
+                meta: ConfigValue::default(),
+                blocks: vec![Block::Paragraph(Paragraph {
+                    content: vec![Inline::Shortcode(make_shortcode_with_si(
+                        "typed",
+                        vec![],
+                        tok.clone(),
+                    ))],
+                    source_info: dummy_source_info(),
+                })],
+            };
+
+            let project = make_test_project();
+            let doc = DocumentInfo::from_path("/project/doc.qmd");
+            let format = Format::html();
+            let binaries = BinaryDependencies::new();
+            let mut ctx = RenderContext::new(&project, &doc, &format, &binaries);
+            transform.transform(&mut ast, &mut ctx).await.unwrap();
+
+            let Block::Paragraph(para) = &ast.blocks[0] else {
+                panic!("Expected Paragraph");
+            };
+            let Inline::Str(s) = &para.content[0] else {
+                panic!("Expected resolved Str, got {:?}", &para.content[0]);
+            };
+            assert_eq!(s.text, "Hello typed");
+            match &s.source_info {
+                SourceInfo::Generated { by, from } => {
+                    // Kind promoted to "shortcode", NOT "filter".
+                    assert_eq!(
+                        by.kind, "shortcode",
+                        "kind should be promoted from filter to shortcode"
+                    );
+                    // Name is the shortcode name.
+                    assert_eq!(by.data.get("name").and_then(|v| v.as_str()), Some("typed"));
+                    // filter_path → lua_path
+                    let lua_path = by
+                        .data
+                        .get("lua_path")
+                        .and_then(|v| v.as_str())
+                        .expect("lua_path should be preserved from filter_path");
+                    assert!(
+                        lua_path.contains("typed.lua"),
+                        "lua_path {:?} should reference the script",
+                        lua_path
+                    );
+                    // line → lua_line
+                    let lua_line = by
+                        .data
+                        .get("lua_line")
+                        .and_then(|v| v.as_u64())
+                        .expect("lua_line should be preserved from line");
+                    assert!(lua_line >= 1, "lua_line should be positive");
+                    // Invocation anchor points at the token.
+                    assert_eq!(from.len(), 1);
+                    assert_eq!(from[0].role, quarto_source_map::AnchorRole::Invocation);
+                    assert_eq!(&*from[0].source_info, &tok);
+                }
+                other => panic!("Expected Generated, got {:?}", other),
+            }
+        }
+    }
+
+    // === Plan 6: shortcode-resolution provenance shape tests ===
+
+    /// A test handler that returns a Strong wrapping a Str — exercises
+    /// the multi-inline / nested-container stamping path.
+    struct MultiInlineTestHandler;
+    impl ShortcodeHandler for MultiInlineTestHandler {
+        fn name(&self) -> &str {
+            "multi"
+        }
+        fn resolve(
+            &self,
+            _shortcode: &Shortcode,
+            _ctx: &ShortcodeContext,
+            _resolution_ctx: ResolutionContext,
+        ) -> ShortcodeResult {
+            ShortcodeResult::Inlines(vec![
+                Inline::Strong(Strong {
+                    content: vec![Inline::Str(Str {
+                        text: "Bold".into(),
+                        source_info: SourceInfo::default(),
+                    })],
+                    source_info: SourceInfo::default(),
+                }),
+                Inline::Space(quarto_pandoc_types::inline::Space {
+                    source_info: SourceInfo::default(),
+                }),
+                Inline::Str(Str {
+                    text: "Title".into(),
+                    source_info: SourceInfo::default(),
+                }),
+            ])
+        }
+    }
+
+    /// Distinct token source_info so we can check Invocation anchors
+    /// point at the *shortcode token*, not at the default.
+    fn token_si() -> SourceInfo {
+        SourceInfo::original(FileId(0), 100, 130)
+    }
+
+    fn make_shortcode_with_si(name: &str, args: Vec<&str>, si: SourceInfo) -> Shortcode {
+        Shortcode {
+            is_escaped: false,
+            name: name.to_string(),
+            positional_args: args
+                .into_iter()
+                .map(|s| ShortcodeArg::String(s.to_string()))
+                .collect(),
+            keyword_args: hashlink::LinkedHashMap::new(),
+            source_info: si,
+        }
+    }
+
+    fn make_escaped_shortcode_with_si(name: &str, si: SourceInfo) -> Shortcode {
+        Shortcode {
+            is_escaped: true,
+            name: name.to_string(),
+            positional_args: vec![],
+            keyword_args: hashlink::LinkedHashMap::new(),
+            source_info: si,
+        }
+    }
+
+    /// Resolved Str from a meta shortcode carries
+    /// Generated { by: shortcode("meta"), from: [Invocation -> token_si] }.
+    #[tokio::test]
+    async fn shortcode_resolution_has_generated_with_invocation_anchor() {
+        let transform = ShortcodeResolveTransform::new();
+        let tok = token_si();
+        let mut ast = Pandoc {
+            meta: ConfigValue::new_map(
+                vec![make_map_entry(
+                    "title",
+                    ConfigValue::new_string("Test Title", dummy_source_info()),
+                )],
+                dummy_source_info(),
+            ),
+            blocks: vec![Block::Paragraph(Paragraph {
+                content: vec![Inline::Shortcode(make_shortcode_with_si(
+                    "meta",
+                    vec!["title"],
+                    tok.clone(),
+                ))],
+                source_info: dummy_source_info(),
+            })],
+        };
+
+        let project = make_test_project();
+        let doc = DocumentInfo::from_path("/project/doc.qmd");
+        let format = Format::html();
+        let binaries = BinaryDependencies::new();
+        let mut ctx = RenderContext::new(&project, &doc, &format, &binaries);
+        transform.transform(&mut ast, &mut ctx).await.unwrap();
+
+        let Block::Paragraph(para) = &ast.blocks[0] else {
+            panic!("Expected Paragraph");
+        };
+        let Inline::Str(s) = &para.content[0] else {
+            panic!("Expected resolved Str");
+        };
+        assert_eq!(s.text, "Test Title");
+        match &s.source_info {
+            SourceInfo::Generated { by, from } => {
+                assert_eq!(by.kind, "shortcode");
+                assert_eq!(by.data.get("name").and_then(|v| v.as_str()), Some("meta"));
+                assert_eq!(from.len(), 1);
+                assert_eq!(from[0].role, quarto_source_map::AnchorRole::Invocation);
+                assert_eq!(&*from[0].source_info, &tok);
+            }
+            other => panic!("Expected Generated, got {:?}", other),
+        }
+    }
+
+    /// Multi-inline resolution (Strong[Str], Space, Str) — every node gets
+    /// stamped with the same Invocation anchor source_info.
+    #[tokio::test]
+    async fn multi_inline_shortcode_resolution_shares_invocation_source() {
+        let mut transform = ShortcodeResolveTransform::new();
+        transform.handlers.push(Box::new(MultiInlineTestHandler));
+        let tok = token_si();
+        let mut ast = Pandoc {
+            meta: ConfigValue::default(),
+            blocks: vec![Block::Paragraph(Paragraph {
+                content: vec![Inline::Shortcode(make_shortcode_with_si(
+                    "multi",
+                    vec![],
+                    tok.clone(),
+                ))],
+                source_info: dummy_source_info(),
+            })],
+        };
+
+        let project = make_test_project();
+        let doc = DocumentInfo::from_path("/project/doc.qmd");
+        let format = Format::html();
+        let binaries = BinaryDependencies::new();
+        let mut ctx = RenderContext::new(&project, &doc, &format, &binaries);
+        transform.transform(&mut ast, &mut ctx).await.unwrap();
+
+        let Block::Paragraph(para) = &ast.blocks[0] else {
+            panic!("Expected Paragraph");
+        };
+        assert_eq!(para.content.len(), 3);
+
+        // Helper: extract the Invocation source_info from an inline.
+        fn invocation_si(inline: &Inline) -> &SourceInfo {
+            match inline.source_info() {
+                SourceInfo::Generated { by, from } => {
+                    assert_eq!(by.kind, "shortcode", "Got by.kind = {:?}", by.kind);
+                    assert_eq!(from.len(), 1);
+                    assert_eq!(from[0].role, quarto_source_map::AnchorRole::Invocation);
+                    &from[0].source_info
+                }
+                other => panic!("Expected Generated, got {:?}", other),
+            }
+        }
+
+        let strong_si = invocation_si(&para.content[0]);
+        let space_si = invocation_si(&para.content[1]);
+        let str_si = invocation_si(&para.content[2]);
+        assert_eq!(strong_si, &tok);
+        assert_eq!(space_si, &tok);
+        assert_eq!(str_si, &tok);
+        // The Strong's inner Str must also be stamped.
+        let Inline::Strong(strong) = &para.content[0] else {
+            panic!("Expected Strong");
+        };
+        let inner_si = invocation_si(&strong.content[0]);
+        assert_eq!(inner_si, &tok);
+    }
+
+    /// Escaped shortcode resolves to a literal Str whose source_info is
+    /// the token's Original (NOT Generated) — Plan 7's is_atomic_kind()
+    /// does not fire on round-trip.
+    #[tokio::test]
+    async fn escaped_shortcode_keeps_original_source_info() {
+        let transform = ShortcodeResolveTransform::new();
+        let tok = token_si();
+        let mut ast = Pandoc {
+            meta: ConfigValue::default(),
+            blocks: vec![Block::Paragraph(Paragraph {
+                content: vec![Inline::Shortcode(make_escaped_shortcode_with_si(
+                    "meta",
+                    tok.clone(),
+                ))],
+                source_info: dummy_source_info(),
+            })],
+        };
+
+        let project = make_test_project();
+        let doc = DocumentInfo::from_path("/project/doc.qmd");
+        let format = Format::html();
+        let binaries = BinaryDependencies::new();
+        let mut ctx = RenderContext::new(&project, &doc, &format, &binaries);
+        transform.transform(&mut ast, &mut ctx).await.unwrap();
+
+        let Block::Paragraph(para) = &ast.blocks[0] else {
+            panic!("Expected Paragraph");
+        };
+        let Inline::Str(s) = &para.content[0] else {
+            panic!("Expected literal Str");
+        };
+        // Source_info is Original (the token's bytes), not Generated.
+        match &s.source_info {
+            SourceInfo::Original { .. } => {}
+            other => panic!("Expected Original, got {:?}", other),
+        }
+        assert_eq!(&s.source_info, &tok);
+    }
+
+    /// Unknown shortcode resolves to Strong[Str("?name")] with both
+    /// layers carrying the token's Original source_info (NOT Generated,
+    /// NOT Default).
+    #[tokio::test]
+    async fn unknown_shortcode_error_uses_token_source_info() {
+        let transform = ShortcodeResolveTransform::new();
+        let tok = token_si();
+        let mut ast = Pandoc {
+            meta: ConfigValue::default(),
+            blocks: vec![Block::Paragraph(Paragraph {
+                content: vec![Inline::Shortcode(make_shortcode_with_si(
+                    "bogus",
+                    vec![],
+                    tok.clone(),
+                ))],
+                source_info: dummy_source_info(),
+            })],
+        };
+
+        let project = make_test_project();
+        let doc = DocumentInfo::from_path("/project/doc.qmd");
+        let format = Format::html();
+        let binaries = BinaryDependencies::new();
+        let mut ctx = RenderContext::new(&project, &doc, &format, &binaries);
+        transform.transform(&mut ast, &mut ctx).await.unwrap();
+
+        let Block::Paragraph(para) = &ast.blocks[0] else {
+            panic!("Expected Paragraph");
+        };
+        let Inline::Strong(strong) = &para.content[0] else {
+            panic!("Expected Strong");
+        };
+        assert!(matches!(strong.source_info, SourceInfo::Original { .. }));
+        assert_eq!(&strong.source_info, &tok);
+        let Inline::Str(inner) = &strong.content[0] else {
+            panic!("Expected inner Str");
+        };
+        assert!(matches!(inner.source_info, SourceInfo::Original { .. }));
+        assert_eq!(&inner.source_info, &tok);
+        assert_eq!(inner.text, "?bogus");
+    }
+
+    /// Plan 6 source_info-determinism: running the transform twice on
+    /// the same input produces structurally-identical ASTs (every
+    /// Generated.by, every Generated.from[], and every Original
+    /// SourceInfo is ==-equal across runs).
+ #[tokio::test] + async fn shortcode_resolution_is_deterministic() { + async fn run_once() -> Pandoc { + let mut transform = ShortcodeResolveTransform::new(); + transform.handlers.push(Box::new(MultiInlineTestHandler)); + let tok = token_si(); + let mut ast = Pandoc { + meta: ConfigValue::new_map( + vec![make_map_entry( + "title", + ConfigValue::new_string("Title", dummy_source_info()), + )], + dummy_source_info(), + ), + blocks: vec![Block::Paragraph(Paragraph { + content: vec![ + Inline::Shortcode(make_shortcode_with_si( + "meta", + vec!["title"], + tok.clone(), + )), + Inline::Shortcode(make_shortcode_with_si("multi", vec![], tok)), + ], + source_info: dummy_source_info(), + })], + }; + let project = make_test_project(); + let doc = DocumentInfo::from_path("/project/doc.qmd"); + let format = Format::html(); + let binaries = BinaryDependencies::new(); + let mut ctx = RenderContext::new(&project, &doc, &format, &binaries); + transform.transform(&mut ast, &mut ctx).await.unwrap(); + ast + } + + let a = run_once().await; + let b = run_once().await; + // Pandoc, Block, Inline, and SourceInfo all derive PartialEq — + // == compares structurally, including every Generated.by / + // Generated.from[] and every Original byte range. + assert_eq!(a, b, "Plan-6 stamper must be deterministic across runs"); + } + + /// Audit-completion test: after Plan 6's stamping pass, the AST + /// should contain no `Generated { by: shortcode, from: [] }` nodes + /// (the required-anchor invariant: every shortcode-resolved node + /// carries an Invocation anchor). 
+ #[tokio::test] + async fn shortcode_resolution_required_anchor_invariant() { + let mut transform = ShortcodeResolveTransform::new(); + transform.handlers.push(Box::new(MultiInlineTestHandler)); + let tok = token_si(); + let mut ast = Pandoc { + meta: ConfigValue::new_map( + vec![make_map_entry( + "title", + ConfigValue::new_string("Title", dummy_source_info()), + )], + dummy_source_info(), + ), + blocks: vec![Block::Paragraph(Paragraph { + content: vec![ + Inline::Shortcode(make_shortcode_with_si("meta", vec!["title"], tok.clone())), + Inline::Shortcode(make_shortcode_with_si("multi", vec![], tok.clone())), + ], + source_info: dummy_source_info(), + })], + }; + let project = make_test_project(); + let doc = DocumentInfo::from_path("/project/doc.qmd"); + let format = Format::html(); + let binaries = BinaryDependencies::new(); + let mut ctx = RenderContext::new(&project, &doc, &format, &binaries); + transform.transform(&mut ast, &mut ctx).await.unwrap(); + + // Walk every inline in the AST and assert: any + // Generated{by.kind=="shortcode"} carries at least one Invocation. + fn check_inline(inline: &Inline) { + if let SourceInfo::Generated { by, from } = inline.source_info() { + if by.kind == "shortcode" { + assert!( + from.iter() + .any(|a| a.role == quarto_source_map::AnchorRole::Invocation), + "Generated{{by:shortcode}} missing Invocation anchor" + ); + } + } + // Recurse into children for the common containers exercised here. 
+ match inline { + Inline::Strong(s) => { + for c in &s.content { + check_inline(c); + } + } + _ => {} + } + } + + for block in &ast.blocks { + if let Block::Paragraph(p) = block { + for inline in &p.content { + check_inline(inline); + } + } + } } } diff --git a/crates/quarto-core/src/transforms/theorem.rs b/crates/quarto-core/src/transforms/theorem.rs index 9db924bc5..b63d8b1e1 100644 --- a/crates/quarto-core/src/transforms/theorem.rs +++ b/crates/quarto-core/src/transforms/theorem.rs @@ -268,8 +268,8 @@ fn convert_div(mut div: Div, ref_type: &str, kind: &str) -> CustomNode { // Extract title: // 1. `name=` attribute on the Div (Q1 convention). // 2. First Header child, if present. - let title: Option = - extract_name_attr(&mut div.attr).or_else(|| extract_first_header_title(&mut div.content)); + let title: Option = extract_name_attr(&mut div.attr, &div.attr_source) + .or_else(|| extract_first_header_title(&mut div.content)); // Strip the theorem class so downstream transforms don't re-match. div.attr @@ -301,8 +301,41 @@ fn convert_div(mut div: Div, ref_type: &str, kind: &str) -> CustomNode { /// `vec![Str("Pythagoras")]`. Inline markup inside the title (bold, /// italic, etc.) isn't supported today because attribute values are /// bare strings in Pandoc's data model — matching Q1's behavior. -fn extract_name_attr(attr: &mut Attr) -> Option { +/// +/// The returned `Str` carries the attribute value's parser-recorded +/// source range (an `Original` covering the bytes between the `=` and +/// the matching quote / whitespace) so attribution and the incremental +/// writer can resolve the title back to user-editable bytes. +/// +/// Uses `AttrSourceInfo`'s positional-alignment invariant (see +/// `crates/quarto-pandoc-types/src/attr.rs`) to find the value's +/// `SourceInfo`; falls back to `SourceInfo::default()` if alignment +/// fails (bd-3aolj / bd-1e6a5) so production never panics. 
+fn extract_name_attr(attr: &mut Attr, attr_source: &AttrSourceInfo) -> Option { let (_id, _classes, kvs) = attr; + + // Find the positional index of "name" before removing it so we can + // index into attr_source.attributes (which is parallel to kvs in + // insertion order). + let name_idx = kvs.keys().position(|k| k == "name")?; + + // Validate the positional-alignment invariant. An empty `attr_source` + // signals "no provenance available" (common pattern in tests that + // construct theorem divs by hand) — that case isn't a bug, so don't + // assert. Only assert when `attr_source.attributes` is populated but + // misaligned with `kvs` (the bd-3aolj / bd-1e6a5 parser bugs). + debug_assert!( + attr_source.attributes.is_empty() || kvs.len() == attr_source.attributes.len(), + "AttrSourceInfo.attributes is out of sync with Attr.2 (bd-3aolj / bd-1e6a5): kvs={}, attr_source={}", + kvs.len(), + attr_source.attributes.len(), + ); + let value_source = if kvs.len() == attr_source.attributes.len() { + attr_source.attributes[name_idx].1.clone() + } else { + None + }; + let name = kvs.remove("name")?; if name.is_empty() { return None; @@ -310,7 +343,7 @@ fn extract_name_attr(attr: &mut Attr) -> Option { Some(vec![quarto_pandoc_types::inline::Inline::Str( quarto_pandoc_types::inline::Str { text: name, - source_info: quarto_source_map::SourceInfo::default(), + source_info: value_source.unwrap_or_default(), }, )]) } diff --git a/crates/quarto-core/src/transforms/title_block.rs b/crates/quarto-core/src/transforms/title_block.rs index 240c18d44..1def2021b 100644 --- a/crates/quarto-core/src/transforms/title_block.rs +++ b/crates/quarto-core/src/transforms/title_block.rs @@ -32,7 +32,8 @@ use quarto_pandoc_types::block::{Block, Header}; use quarto_pandoc_types::inline::{Inline, Str}; use quarto_pandoc_types::pandoc::Pandoc; use quarto_pandoc_types::{ConfigValue, ConfigValueKind}; -use quarto_source_map::SourceInfo; +use quarto_source_map::{By, SourceInfo}; +use smallvec::smallvec; 
use crate::Result; use crate::format::is_minimal_html; @@ -174,15 +175,24 @@ fn blocks_to_plain_text(blocks: &[Block]) -> String { } /// Create a level-1 header block with the given title. +/// +/// The synthesized Header (and its inner Str) carry +/// `Generated { by: title_block(), from: [] }` provenance. Both nodes are +/// atomic per Plan 4's `is_atomic_kind` set — the writer treats them as a +/// single non-editable unit on round-trip. fn create_title_header(title: &str) -> Block { + let source_info = SourceInfo::Generated { + by: By::title_block(), + from: smallvec![], + }; Block::Header(Header { level: 1, attr: empty_attr(), content: vec![Inline::Str(Str { text: title.to_string(), - source_info: SourceInfo::default(), + source_info: source_info.clone(), })], - source_info: SourceInfo::default(), + source_info, attr_source: AttrSourceInfo::empty(), }) } @@ -496,4 +506,32 @@ mod tests { let transform = TitleBlockTransform::new(); assert_eq!(transform.name(), "title-block"); } + + #[test] + fn test_create_title_header_has_generated_provenance() { + // Plan 6: the synthesized h1 + inner Str both carry + // Generated { by: title_block(), from: [] }. + let block = create_title_header("My Title"); + let Block::Header(header) = &block else { + panic!("Expected Header"); + }; + match &header.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "title-block"); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + // Inner Str carries the same shape. 
+ let Inline::Str(s) = &header.content[0] else { + panic!("Expected Str inside header"); + }; + match &s.source_info { + SourceInfo::Generated { by, from } => { + assert_eq!(by.kind, "title-block"); + assert!(from.is_empty()); + } + other => panic!("Expected Generated, got {:?}", other), + } + } } diff --git a/crates/quarto-core/tests/fixtures/idempotence/README.md b/crates/quarto-core/tests/fixtures/idempotence/README.md new file mode 100644 index 000000000..41e8fdac8 --- /dev/null +++ b/crates/quarto-core/tests/fixtures/idempotence/README.md @@ -0,0 +1,51 @@ +# Plan 3 — idempotence fixtures + +Holds the per-fixture project directories the q2-preview idempotence +gate at `crates/quarto-core/tests/idempotence.rs` drives through the +pipeline twice and hashes for equality. + +For the contract a transform / filter / stage author must meet to +land here without breaking the gate, read +`claude-notes/instructions/idempotence-contract.md`. The full plan +that introduced the gate lives at +`claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md`. +The rules below are the ones that bite at fixture-authoring time. + +## Fixture-format rules + +1. **No executable engine cells.** Use only fenced code blocks + (`` ```python ``, `` ```r ``, etc.) — these are AST nodes, not + executed. Do NOT use `{python}` / `{r}` / `{julia}` style cells; CI + has no kernels, the `engine-execution` stage either fails or falls + through to the markdown passthrough, and the resulting two runs + are not reliably comparable. + +2. **No absolute process paths in fixture content.** Use only paths + that resolve relative to the fixture root (`./local.png`, not + `/private/var/.../local.png`). Resource-collector, include-resolve, + built-in-extension lookup, and similar transforms record paths into + meta; the built-in extensions resource bundle extracts to a + process-specific `temp_dir()`. 
Stable within a process — fine for + Plan 3's two-runs-compare contract today, but a latent issue for + any future stored-snapshot variant. + +3. **Per-fixture mode mapping.** Document-only fixtures (plain text, + callouts, theorems, code blocks, …) run in both `SingleFile` and + `ProjectOrchestrator` modes. Website-chrome fixtures (navbar, + sidebar, listings, page-nav, footer) are **orchestrator-only** + because the chrome transforms require a populated `ProjectIndex`; + driving them through `SingleFile` mode would test a partial pipeline + that doesn't exist in production. + +## What lives here + +Subdirectories named for each non-trivial fixture (typically the +website / multi-file cases that need a `_quarto.yml` plus several +sibling pages). Trivial single-page fixtures live as in-source +literals in `idempotence.rs` — the fixture's `setup` closure writes +them into a `TempDir` at run time. + +Pattern matches `tests/fixtures/websites/hub-smoke/` and +`tests/fixtures/phase5-website-baseline/`; use `copy_fixture(...)` from +`render_page_in_project.rs:616` as the lift point if a fixture grows +big enough to want a pre-built directory tree. diff --git a/crates/quarto-core/tests/idempotence.rs b/crates/quarto-core/tests/idempotence.rs new file mode 100644 index 000000000..d770048bb --- /dev/null +++ b/crates/quarto-core/tests/idempotence.rs @@ -0,0 +1,858 @@ +/* + * tests/idempotence.rs + * Copyright (c) 2026 Posit, PBC + * + * Plan 3 — q2-preview pipeline idempotence gate. + * + * Each fixture is driven through the q2-preview pipeline twice in + * each drive mode (`SingleFile` and `ProjectOrchestrator`) and the + * resulting `blocks` and `meta` (excluding `rendered.*`) hashes must + * compare equal across the two runs. 
+ * + * See: + * claude-notes/plans/2026-05-04-q2-preview-plan-3-builtin-filter-idempotence.md + * + * The plan documents the long-lived-integration-branch policy: a + * fixture that surfaces real non-determinism stays failing here, and + * a beads issue (filled in from the panic message's + * `DivergencePoint`) is filed against the offending transform/stage. + * Do not `#[ignore]` a failing fixture without explicit user approval. + */ + +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use tempfile::TempDir; + +use pampa::pandoc::ASTContext; +use quarto_ast_reconcile::{ + compute_blocks_hash_fresh, compute_meta_hash_fresh_excluding_rendered, find_first_divergence, +}; +use quarto_core::format::Format; +use quarto_core::pipeline::{build_q2_preview_pipeline_stages, run_pipeline}; +use quarto_core::project::ProjectContext; +use quarto_core::project::orchestrator::{ProjectPipeline, RenderMode, project_type_for}; +use quarto_core::project::pass2_renderer::{RenderToPreviewAstRenderer, WasmPassTwoOutput}; +use quarto_core::render::{BinaryDependencies, RenderContext}; +use quarto_core::stage::DocumentAst; +use quarto_pandoc_types::Pandoc; +use quarto_source_map::SourceContext; +use quarto_system_runtime::{NativeRuntime, SystemRuntime}; + +// ─── Helpers (copied verbatim from render_page_in_project.rs) ───── +// +// Each `tests/*.rs` file is its own test binary, so sharing helpers +// between integration tests requires a `tests/common/` module that +// every test then explicitly imports. The plan rules dedup of that +// shape out of scope for Plan 3, so for now we copy these tiny +// utilities. If/when a second consumer wants them, this pair plus +// the `render_active_page_preview` body below is the natural +// extraction point. 
+
+fn write(path: &Path, contents: &str) {
+    if let Some(parent) = path.parent() {
+        std::fs::create_dir_all(parent).unwrap();
+    }
+    std::fs::write(path, contents).unwrap();
+}
+
+fn write_bytes(path: &Path, contents: &[u8]) {
+    if let Some(parent) = path.parent() {
+        std::fs::create_dir_all(parent).unwrap();
+    }
+    std::fs::write(path, contents).unwrap();
+}
+
+fn canonical(path: &Path) -> PathBuf {
+    path.canonicalize().unwrap_or_else(|_| path.to_path_buf())
+}
+
+// ─── Drive modes ──────────────────────────────────────────────────
+
+/// How a fixture is driven through the pipeline. Every fixture runs
+/// once per mode; the two runs within a mode must hash equal.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum DriveMode {
+    /// `run_pipeline` directly with `build_q2_preview_pipeline_stages`.
+    /// Mirrors `render_qmd_to_preview_ast` — the lowest-level entry
+    /// point used by the WASM preview.
+    SingleFile,
+    /// Drives `ProjectPipeline` with
+    /// `RenderMode::ActivePage(active)`. Reuses the same orchestrator
+    /// path real `q2 preview` / hub-client takes.
+    ProjectOrchestrator,
+}
+
+const BOTH_MODES: &[DriveMode] = &[DriveMode::SingleFile, DriveMode::ProjectOrchestrator];
+#[allow(dead_code)] // Used by website / orchestrator-only fixtures in Phase 4.
+const ORCHESTRATOR_ONLY: &[DriveMode] = &[DriveMode::ProjectOrchestrator];
+
+// ─── Fixture struct ───────────────────────────────────────────────
+
+/// A single Plan-3 fixture. Each fixture owns its own `TempDir` per
+/// run; `setup` writes the project contents into that root.
+struct Fixture {
+    name: &'static str,
+    /// Idempotent setup callback. Receives the freshly-created
+    /// project root (a canonicalized `TempDir` path) and writes the
+    /// page contents — at minimum `<root>/<active>`, plus any
+    /// `_quarto.yml` or sibling files the fixture needs.
+    setup: Box<dyn Fn(&Path)>,
+    /// The active page, relative to the project root.
+    active: PathBuf,
+    /// Which drive modes this fixture is meaningful in. Document-only
+    /// fixtures run in both modes; website-chrome fixtures are
+    /// orchestrator-only (chrome transforms need a populated
+    /// ProjectIndex).
+    modes: &'static [DriveMode],
+    /// Optional transport-shape attribution JSON. When set, both
+    /// `DriveMode`s install a
+    /// `PreBuiltAttributionProvider(json.to_string())` on the
+    /// `RenderContext` (single-file) or pass it to the renderer
+    /// via `with_attribution` (orchestrator). `None` = no provider.
+    attribution_json: Option<&'static str>,
+}
+
+impl Fixture {
+    fn run_in_each_mode(&self) {
+        for &mode in self.modes {
+            run_fixture(self, mode);
+        }
+    }
+}
+
+// ─── Test driver ──────────────────────────────────────────────────
+
+fn run_fixture(fixture: &Fixture, mode: DriveMode) {
+    let doc_1 = run_q2_preview(fixture, mode);
+    let doc_2 = run_q2_preview(fixture, mode);
+
+    let blocks_a = compute_blocks_hash_fresh(&doc_1.ast.blocks);
+    let blocks_b = compute_blocks_hash_fresh(&doc_2.ast.blocks);
+    let meta_a = compute_meta_hash_fresh_excluding_rendered(&doc_1.ast.meta);
+    let meta_b = compute_meta_hash_fresh_excluding_rendered(&doc_2.ast.meta);
+
+    if blocks_a != blocks_b || meta_a != meta_b {
+        let point = find_first_divergence(
+            &doc_1.ast.blocks,
+            &doc_1.ast.meta,
+            &doc_2.ast.blocks,
+            &doc_2.ast.meta,
+        );
+        panic!(
+            "fixture {} ({:?}): non-idempotent\n \
+             blocks: {:016x} vs {:016x}\n \
+             meta: {:016x} vs {:016x}\n \
+             first divergence: {:?}",
+            fixture.name, mode, blocks_a, blocks_b, meta_a, meta_b, point,
+        );
+    }
+}
+
+fn run_q2_preview(fixture: &Fixture, mode: DriveMode) -> DocumentAst {
+    let temp = TempDir::new().unwrap();
+    let project_dir = canonical(temp.path());
+    (fixture.setup)(&project_dir);
+    let active = canonical(&project_dir.join(&fixture.active));
+
+    let doc = match mode {
+        DriveMode::SingleFile => run_single_file(&project_dir, &active, fixture.attribution_json),
+        DriveMode::ProjectOrchestrator => {
+            run_orchestrator(&project_dir, &active, fixture.attribution_json)
+        }
+    };
+    drop(temp);
+    doc
+}
+
+// ─── SingleFile mode ──────────────────────────────────────────────
+
+fn run_single_file(
+    _project_dir: &Path,
+    active: &Path,
+    attribution_json: Option<&'static str>,
+) -> DocumentAst {
+    pollster::block_on(async {
+        let runtime: Arc<dyn SystemRuntime> = Arc::new(NativeRuntime::new());
+
+        // Mirror `render_active_page_preview`'s discovery dance so
+        // a fixture that writes a `_quarto.yml` ends up with a
+        // populated `project.files` rather than a single-file
+        // synthetic project.
+        let mut project = ProjectContext::discover(active, runtime.as_ref()).unwrap();
+        if !project.is_single_file {
+            project = ProjectContext::discover(&project.dir, runtime.as_ref()).unwrap();
+        }
+
+        let doc_info = project
+            .files
+            .iter()
+            .find(|d| d.input == active)
+            .expect("active file present in discovered project")
+            .clone();
+
+        let format = Format::from_format_string("q2-preview")
+            .expect("q2-preview is a recognized pseudo-format");
+        let binaries = BinaryDependencies::new();
+        let mut ctx = RenderContext::new(&project, &doc_info, &format, &binaries);
+        if let Some(json) = attribution_json {
+            ctx.attribution_provider = Some(Arc::new(
+                quarto_core::attribution::PreBuiltAttributionProvider::new(json.to_string()),
+            ));
+        }
+
+        let content = std::fs::read(active).unwrap();
+        let stages = build_q2_preview_pipeline_stages(None, None);
+        let (output, _diagnostics) = run_pipeline(
+            &content,
+            &active.to_string_lossy(),
+            &mut ctx,
+            runtime,
+            stages,
+        )
+        .await
+        .expect("q2-preview pipeline run (SingleFile mode)");
+
+        output
+            .into_document_ast()
+            .expect("q2-preview pipeline produces DocumentAst at its tail")
+    })
+}
+
+// ─── ProjectOrchestrator mode ─────────────────────────────────────
+
+fn run_orchestrator(
+    _project_dir: &Path,
+    active: &Path,
+    attribution_json: Option<&'static str>,
+) -> DocumentAst {
+    let output = render_active_page_preview(active, attribution_json);
+    let ast_json = output
+        .payload
+        .as_ast_json()
+        .expect("orchestrator must emit Pass2Payload::AstJson");
+    let mut bytes = ast_json.as_bytes();
+    let (pandoc, ast_context) =
+        pampa::readers::json::read(&mut bytes).expect("re-parse AST JSON from orchestrator");
+    pandoc_to_document_ast(pandoc, ast_context, active.to_path_buf())
+}
+
+/// Lifted from `crates/quarto-core/tests/render_page_in_project.rs:660`.
+/// Each `tests/*.rs` is its own binary, so the helper has to be
+/// duplicated rather than imported. The plan flags this as
+/// acceptable for now.
+fn render_active_page_preview(
+    active: &Path,
+    attribution_json: Option<&'static str>,
+) -> WasmPassTwoOutput {
+    let runtime: Arc<dyn SystemRuntime> = Arc::new(NativeRuntime::new());
+    let mut project = ProjectContext::discover(active, runtime.as_ref()).unwrap();
+    if !project.is_single_file {
+        project = ProjectContext::discover(&project.dir, runtime.as_ref()).unwrap();
+    }
+
+    let project_type = project_type_for(&project);
+    let vfs_root = project.dir.join(".quarto/project-artifacts");
+    // bd-rz2we: override the URL root so rendered AST link/asset
+    // URLs use the synthetic VFS prefix instead of the host's
+    // tempdir path. Disk writes still land under `vfs_root` (the
+    // real tempdir) via `runtime.file_write`, but the URLs
+    // embedded in the AST stay byte-identical across runs in
+    // different tempdirs — which is what this idempotence gate
+    // is asserting.
+    let mut renderer =
+        RenderToPreviewAstRenderer::new(&vfs_root).with_url_root("/.quarto/project-artifacts");
+    if let Some(json) = attribution_json {
+        renderer = renderer.with_attribution(json.to_string());
+    }
+
+    let format =
+        Format::from_format_string("q2-preview").expect("q2-preview is a recognized pseudo-format");
+
+    let mut pipeline = ProjectPipeline::with_renderer(
+        &mut project,
+        project_type,
+        format,
+        "q2-preview",
+        runtime.clone(),
+        renderer,
+    )
+    .with_mode(RenderMode::ActivePage(active.to_path_buf()));
+
+    let summary = pollster::block_on(pipeline.run()).expect("q2-preview pipeline run");
+    assert!(
+        summary.pass1_failures.is_empty(),
+        "unexpected pass-1 failures: {:?}",
+        summary.pass1_failures,
+    );
+    assert!(
+        summary.pass2_failures.is_empty(),
+        "unexpected pass-2 failures: {:?}",
+        summary.pass2_failures,
+    );
+    assert_eq!(
+        summary.outputs.len(),
+        1,
+        "ActivePage mode should produce exactly one output",
+    );
+    summary.outputs.into_iter().next().unwrap()
+}
+
+/// Shuffle a re-parsed `Pandoc` + `ASTContext` into the `DocumentAst`
+/// shape the hashing helpers expect. The hash only reads
+/// `ast.blocks` and `ast.meta`; the other `DocumentAst` fields are
+/// defaulted because they're outside the contract this gate defends.
+fn pandoc_to_document_ast(pandoc: Pandoc, ast_context: ASTContext, path: PathBuf) -> DocumentAst {
+    DocumentAst {
+        path,
+        ast: pandoc,
+        ast_context,
+        source_context: SourceContext::new(),
+        warnings: Vec::new(),
+        recorded_includes: Vec::new(),
+    }
+}
+
+// ─── Convenience constructors ─────────────────────────────────────
+
+/// Single-file fixture: writes `content` to `<root>/index.qmd`,
+/// runs in both `SingleFile` and `ProjectOrchestrator` modes.
+fn doc_fixture(name: &'static str, content: &'static str) -> Fixture { + Fixture { + name, + setup: Box::new(move |root: &Path| { + write(&root.join("index.qmd"), content); + }), + active: PathBuf::from("index.qmd"), + modes: BOTH_MODES, + attribution_json: None, + } +} + +// ===================================================================== +// Phase-2 smoke fixture +// ===================================================================== +// +// One minimal fixture proves the harness works end-to-end before +// Phases 3-4 (existing-fixture carry-forward, gap-closure fixtures) +// land. The fixture body is intentionally trivial — a single +// paragraph — so any failure points unambiguously at the harness, +// not at a transform. + +#[test] +fn smoke_plain_paragraph() { + doc_fixture("smoke-plain-paragraph", "hello\n").run_in_each_mode(); +} + +// ===================================================================== +// Phase 3 — carry-forward fixtures (one per transform / feature) +// ===================================================================== +// +// Each `#[test]` calls `run_in_each_mode`, which loops through +// `SingleFile` and `ProjectOrchestrator`. Failures are *expected* on +// first run for some of these — that's the whole point of the gate. +// Per Phase 5 / §"CI failure policy", leave failing fixtures failing +// and file a beads issue using the sub-agent investigation prompt +// the panic message fills in. Do NOT `#[ignore]` without explicit +// user approval. 
// ─── shortcode-resolve, metadata-normalize ────────────────────────

#[test]
fn meta_single() {
    // Plain metadata value echoed back through the `meta` shortcode.
    let source = "---\nfoo: hello\n---\n\n{{< meta foo >}}\n";
    doc_fixture("meta-single", source).run_in_each_mode();
}

#[test]
fn meta_markdown() {
    // Same shape as `meta_single`, but the value carries inline markdown.
    let source = "---\nfoo: '**Bold** title'\n---\n\n{{< meta foo >}}\n";
    doc_fixture("meta-markdown", source).run_in_each_mode();
}

// ─── include-expansion + shortcode-resolve ────────────────────────

#[test]
fn include_trivial() {
    // Two files: the active page pulls in its sibling via `include`.
    fn setup(root: &Path) {
        write(&root.join("child.qmd"), "Child content\n");
        write(&root.join("index.qmd"), "{{< include child.qmd >}}\n");
    }

    Fixture {
        name: "include-trivial",
        setup: Box::new(setup),
        active: PathBuf::from("index.qmd"),
        modes: BOTH_MODES,
        attribution_json: None,
    }
    .run_in_each_mode();
}

// ─── callout (callout-resolve is excluded from q2-preview) ────────

#[test]
fn callout_warning() {
    let source = "::: {.callout-warning}\nBody of the callout.\n:::\n";
    doc_fixture("callout-warning", source).run_in_each_mode();
}

// ─── theorem-sugar ────────────────────────────────────────────────

#[test]
fn theorem() {
    let source = "::: {#thm-foo .theorem}\nThere is a theorem here.\n:::\n";
    doc_fixture("theorem", source).run_in_each_mode();
}

// ─── float-ref-target-sugar ───────────────────────────────────────

#[test]
fn figure_ref_target() {
    // `img.png` is never written: AST transforms do not open the
    // image, so a missing file is fine for AST-level hashing. Should
    // a downstream transform ever grow a path-resolution side effect,
    // add a tiny stub file here.
    let source = ":::: {#fig-foo}\n![cap](img.png)\n::::\n";
    doc_fixture("figure-ref-target", source).run_in_each_mode();
}

// ─── crossref-index + crossref-resolve ────────────────────────────

#[test]
fn crossref_to_theorem() {
    // A theorem div followed by a paragraph that cites it.
    let source =
        "::: {#thm-foo .theorem}\nThere is a theorem here.\n:::\n\nSee @thm-foo for the proof.\n";
    doc_fixture("crossref-to-theorem", source).run_in_each_mode();
}

// ─── sectionize ───────────────────────────────────────────────────

#[test]
fn sectionize_multi() {
    // Level-2, nested level-3, then another level-2 heading.
    let source = "## A\n\nBody A.\n\n### B\n\nBody B.\n\n## C\n\nBody C.\n";
    doc_fixture("sectionize-multi", source).run_in_each_mode();
}

// ─── footnotes ────────────────────────────────────────────────────

#[test]
fn footnotes_mixed() {
    // One inline footnote and one reference-style footnote.
    let source =
        "Text with inline^[an inline footnote] and reference[^foo].\n\n[^foo]: footnote body\n";
    doc_fixture("footnotes-mixed", source).run_in_each_mode();
}

// ─── appendix-structure (with license meta + footnotes interaction)

#[test]
fn appendix_license() {
    let source = "---\nlicense: CC BY\ncopyright: 2026 ACME\n---\n\nBody paragraph.\n\n::: {.appendix}\nAppendix content.\n:::\n\nReference[^a]\n\n[^a]: footnote\n";
    doc_fixture("appendix-license", source).run_in_each_mode();
}

// ─── combined: sectionize + callouts + shortcodes ────────────────

#[test]
fn combined_stress() {
    let source = "---\ntitle: '**Bold** title'\n---\n\n## A\n\n::: {.callout-warning}\nWarning: {{< meta title >}}\n:::\n\n### B\n\nMore body text.\n";
    doc_fixture("combined-stress", source).run_in_each_mode();
}

// =====================================================================
// Phase 4a — gap-closure fixtures (single-file, no extra scaffolding)
// =====================================================================

// ─── code-block-generate, code-block-render, code-highlight ───────

#[test]
fn code_block_fenced() {
    let source = "Some text.\n\n```python\nprint(\"hello\")\n```\n";
    doc_fixture("code-block-fenced", source).run_in_each_mode();
}

// ─── shortcode-resolve via Lua-loaded handler ─────────────────────
//
// `{{< version >}}` returns `quarto.version` joined by dots. Lua
// state is constructed fresh per pipeline run (see plan §"Design
// decisions"), so two runs over the same input must agree.

#[test]
fn lua_shortcode_version() {
    let source = "version: {{< version >}}\n";
    doc_fixture("lua-shortcode-version", source).run_in_each_mode();
}

// `{{< lipsum 3 >}}` (no `random=` kwarg) — `math.randomseed` runs
// at module load but `math.random` is never reached on this code
// path, so the output is deterministically the first 3 paragraphs
// of the canned text. The `random=true` variant is intentionally
// non-deterministic and out of scope (plan §"Noted, not actively
// tested").

#[test]
fn lua_shortcode_lipsum_fixed() {
    // The comment in-document survives as part of the markdown
    // (it's an HTML comment in the parsed AST), so the seed
    // observation is recorded in the fixture itself rather than
    // only in this Rust source.
    let source = "\n\n{{< lipsum 3 >}}\n";
    doc_fixture("lua-shortcode-lipsum-fixed", source).run_in_each_mode();
}

// ─── proof-sugar ──────────────────────────────────────────────────

#[test]
fn proof() {
    let source = "::: {.proof}\nThe proof is left as an exercise.\n:::\n";
    doc_fixture("proof", source).run_in_each_mode();
}

// ─── equation-label + crossref-resolve (equation branch) ──────────

#[test]
fn equation_labeled() {
    // Labeled display equation plus a paragraph that references it.
    let source = "$$ E = mc^2 $$ {#eq-mass}\n\nSee @eq-mass for the relation.\n";
    doc_fixture("equation-labeled", source).run_in_each_mode();
}

// ─── toc-generate, toc-render ─────────────────────────────────────

#[test]
fn toc_on() {
    let source =
        "---\ntoc: true\n---\n\n## A\n\nBody A.\n\n## B\n\nBody B.\n\n## C\n\nBody C.\n";
    doc_fixture("toc-on", source).run_in_each_mode();
}

// ─── built-in Lua filter (video) ──────────────────────────────────
//
// `resources/extensions/quarto/video/` is embedded at compile time
// via `include_dir!` (see `crates/quarto-core/src/extension/mod.rs`)
// and auto-discovered for every `StageContext::new` call, so the
// fixture needs no scaffolding beyond declaring the filter.

#[test]
fn video_filter_header() {
    let source = "---\nfilters:\n - video\n---\n\n# Title {background-video=\"https://www.youtube.com/embed/abc\"}\n";
    doc_fixture("video-filter-header", source).run_in_each_mode();
}

// ─── table-bootstrap-class ────────────────────────────────────────

#[test]
fn table_bootstrap_class() {
    // Minimal one-column pipe table.
    let source = "| col |\n| --- |\n| val |\n";
    doc_fixture("table-bootstrap-class", source).run_in_each_mode();
}

// ─── compile-theme-css stage ──────────────────────────────────────
//
// Default theme. The `theme:` key isn't required to opt the stage
// in; `compile-theme-css` runs in the q2-preview stage list for
// HTML-shaped formats unconditionally. Hash excludes `rendered.*`,
// so the compiled CSS (which lands under `meta.rendered.*` and may
// vary in trivial whitespace) doesn't participate.
//
// NOTE(review): the fixture that follows pins `theme: cosmo` in its
// front matter rather than omitting the key — confirm "default
// theme" is still the intended description.
#[test]
fn theme_bootstrap() {
    // Front matter names a theme explicitly; the body is one paragraph.
    let source = "---\ntheme: cosmo\n---\n\nBody.\n";
    doc_fixture("theme-bootstrap", source).run_in_each_mode();
}

// =====================================================================
// Phase 4b — gap-closure fixtures (multi-file)
// =====================================================================

// ─── include-resolve stage ────────────────────────────────────────

#[test]
fn include_in_header() {
    // Writes the header fragment first, then the page whose front
    // matter points at it via `include-in-header`.
    fn setup(root: &Path) {
        write(&root.join("header.html"), "\n");
        write(
            &root.join("index.qmd"),
            "---\ninclude-in-header: header.html\n---\n\nBody paragraph.\n",
        );
    }

    Fixture {
        name: "include-in-header",
        setup: Box::new(setup),
        active: PathBuf::from("index.qmd"),
        modes: BOTH_MODES,
        attribution_json: None,
    }
    .run_in_each_mode();
}

// ─── resource-collector ───────────────────────────────────────────
//
// 67-byte stub PNG, intended to be a minimal valid 1×1 transparent
// pixel. The AST-level transforms only record the path, not the
// bytes, but providing a real file means resource-collector can
// resolve relative to the fixture root rather than warning about a
// missing local resource. Per the fixtures README, paths must
// resolve relative to the project root (no absolute process paths).
// Laid out one PNG chunk per group so the length fields can be
// checked by eye. Each chunk is: 4-byte big-endian length (counting
// the data bytes only), 4-byte type, data, 4-byte CRC.
const TINY_PNG: &[u8] = &[
    // Signature (8 bytes).
    0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
    // IHDR — length 13: width 1, height 1, bit depth 8, colour type 6.
    0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52,
    0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x08, 0x06, 0x00, 0x00, 0x00,
    0x1f, 0x15, 0xc4, 0x89,
    // IDAT — length 10. This byte used to read 0x0d (13), which does
    // not match the 10 data bytes that precede the CRC and made a
    // reader run past the CRC and miss the IEND chunk entirely.
    0x00, 0x00, 0x00, 0x0a, 0x49, 0x44, 0x41, 0x54,
    0x78, 0x9c, 0x63, 0x00, 0x01, 0x00, 0x00, 0x05, 0x00, 0x01,
    0x0d, 0x0a, 0x2d, 0xb4,
    // IEND — length 0.
    0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x4e, 0x44,
    0xae, 0x42, 0x60, 0x82,
];

/// Page that embeds a local image which really exists on disk
/// (`local.png`, written from `TINY_PNG`), referenced by a relative
/// path from `index.qmd`.
#[test]
fn resource_image() {
    let fixture = Fixture {
        name: "resource-image",
        setup: Box::new(|root: &Path| {
            write_bytes(&root.join("local.png"), TINY_PNG);
            write(&root.join("index.qmd"), "![alt text](./local.png)\n");
        }),
        active: PathBuf::from("index.qmd"),
        modes: BOTH_MODES,
        attribution_json: None,
    };
    fixture.run_in_each_mode();
}

// =====================================================================
// Phase 4c — website-project fixtures (orchestrator-only)
// =====================================================================
//
// Chrome transforms (navbar / sidebar / page-nav / footer / favicon /
// bootstrap-icons / canonical-url) require a populated `ProjectIndex`,
// which the orchestrator pass-1 builds and pass-2 consumes. Driving
// a website fixture through `SingleFile` mode would test a partial
// pipeline that doesn't exist in production — so these run in
// `ProjectOrchestrator` only.
+ +// ─── website-chrome: navbar, sidebar, page-nav, footer, favicon, +// bootstrap-icons, canonical-url, title-prefix ──────────────────── + +#[test] +fn website_chrome() { + let fixture = Fixture { + name: "website-chrome", + setup: Box::new(|root: &Path| { + write( + &root.join("_quarto.yml"), + concat!( + "project:\n", + " type: website\n", + "\n", + "website:\n", + " title: Idempotence Chrome\n", + " site-url: https://example.com/\n", + " favicon: favicon.ico\n", + " navbar:\n", + " title: Idempotence Chrome\n", + " background: primary\n", + " left:\n", + " - index.qmd\n", + " - other.qmd\n", + " sidebar:\n", + " contents:\n", + " - index.qmd\n", + " - other.qmd\n", + " page-navigation: true\n", + " page-footer: \"Copyright 2026\"\n", + ), + ); + write( + &root.join("index.qmd"), + "---\ntitle: Home\n---\n\nHome body.\n", + ); + write( + &root.join("other.qmd"), + "---\ntitle: Other\n---\n\nOther body.\n", + ); + // favicon.ico — a tiny stub so any path-resolution side + // effect resolves. Per the fixtures README rule. 
+ write_bytes(&root.join("favicon.ico"), &[0u8; 4]); + }), + active: PathBuf::from("index.qmd"), + modes: ORCHESTRATOR_ONLY, + attribution_json: None, + }; + fixture.run_in_each_mode(); +} + +// ─── website-links: cross-page .qmd body links → link-rewrite, +// link-resolution stage ─────────────────────────────────────────── + +#[test] +fn website_links() { + let fixture = Fixture { + name: "website-links", + setup: Box::new(|root: &Path| { + write( + &root.join("_quarto.yml"), + concat!( + "project:\n", + " type: website\n", + "\n", + "website:\n", + " title: Idempotence Links\n", + ), + ); + write( + &root.join("index.qmd"), + "---\ntitle: Home\n---\n\nSee [the other page](other.qmd) for more.\n", + ); + write( + &root.join("other.qmd"), + "---\ntitle: Other\n---\n\nReturn to [home](index.qmd).\n", + ); + }), + active: PathBuf::from("index.qmd"), + modes: ORCHESTRATOR_ONLY, + attribution_json: None, + }; + fixture.run_in_each_mode(); +} + +// ─── website-listing: listing-generate, listing-render, +// categories-sidebar, listing-feed-link, listing-feed-stage, +// listing-item-info stage ───────────────────────────────────────── + +#[test] +fn website_listing() { + let fixture = Fixture { + name: "website-listing", + setup: Box::new(|root: &Path| { + write( + &root.join("_quarto.yml"), + concat!( + "project:\n", + " type: website\n", + "\n", + "website:\n", + " title: Idempotence Listing\n", + " site-url: https://example.com/\n", + ), + ); + write( + &root.join("index.qmd"), + concat!( + "---\n", + "title: Posts\n", + "listing:\n", + " contents: posts\n", + " type: default\n", + " categories: true\n", + " feed: true\n", + "---\n", + "\n", + "Listing index.\n", + ), + ); + write( + &root.join("posts/first.qmd"), + concat!( + "---\n", + "title: First Post\n", + "categories: [alpha, beta]\n", + "date: 2026-05-01\n", + "---\n", + "\n", + "First body.\n", + ), + ); + write( + &root.join("posts/second.qmd"), + concat!( + "---\n", + "title: Second Post\n", + "categories: 
[beta]\n", + "date: 2026-05-15\n", + "---\n", + "\n", + "Second body.\n", + ), + ); + }), + active: PathBuf::from("index.qmd"), + modes: ORCHESTRATOR_ONLY, + attribution_json: None, + }; + fixture.run_in_each_mode(); +} + +// ===================================================================== +// Phase 4d — attribution fixture +// ===================================================================== +// +// Deterministic stub. `PreBuiltAttributionProvider` reads a static +// JSON payload; the writer-side machinery (`AttributionGenerateStage` +// + `AttributionRenderTransform`) then populates `format_options.json` +// and writes per-node attribution records into the AST. Using +// `GitBlameProvider` here would be flaky — depends on actual git +// history; the prebuilt path is the same shape the WASM client +// drives in production. + +/// Tiny transport-shape attribution JSON: one actor, one run +/// covering bytes 0..1024 so it overlaps anything the fixture +/// document might contain. +const STUB_ATTRIBUTION_JSON: &str = concat!( + "{", + "\"runs\":[{\"start\":0,\"end\":1024,\"actor\":\"alice\",\"time\":1700000000}],", + "\"identities\":{\"alice\":{\"name\":\"Alice\",\"color\":\"#ff0000\"}}", + "}" +); + +#[test] +fn attribution_basic() { + let fixture = Fixture { + name: "attribution-basic", + setup: Box::new(|root: &Path| { + // Plenty of bytes for the attribution run to overlap. 
+ write( + &root.join("index.qmd"), + "---\ntitle: Attributed\n---\n\n## Section\n\nA paragraph attributed to alice for the whole byte range.\n", + ); + }), + active: PathBuf::from("index.qmd"), + modes: BOTH_MODES, + attribution_json: Some(STUB_ATTRIBUTION_JSON), + }; + fixture.run_in_each_mode(); +} diff --git a/crates/quarto-core/tests/render_page_in_project.rs b/crates/quarto-core/tests/render_page_in_project.rs index bc1d17bc0..b168484d9 100644 --- a/crates/quarto-core/tests/render_page_in_project.rs +++ b/crates/quarto-core/tests/render_page_in_project.rs @@ -78,7 +78,11 @@ fn render_active_page(project_dir: &Path, active: &Path) -> WasmPassTwoOutput { let project_type = project_type_for(&project); let vfs_root = project.dir.join(".quarto/project-artifacts"); - let renderer = RenderToHtmlRenderer::new(&vfs_root); + // bd-rz2we: keep rendered HTML URLs path-independent. Disk + // writes still go to the tempdir at `vfs_root`; only the URLs + // embedded in the HTML use the synthetic VFS prefix. See the + // matching helper in `tests/idempotence.rs`. + let renderer = RenderToHtmlRenderer::new(&vfs_root).with_url_root("/.quarto/project-artifacts"); let mut pipeline = ProjectPipeline::with_renderer( &mut project, @@ -561,35 +565,42 @@ fn default_project_theme_artifact_lands_in_vfs() { let output = render_active_page(&project_dir, &active); // The HTML should embed a `` to a quarto theme CSS file - // under the synthetic vfs root. - let vfs_root = project_dir.join(".quarto/project-artifacts"); - let vfs_root_str = vfs_root.to_string_lossy().to_string(); - let needle_prefix = format!("{}/quarto/quarto-theme-", vfs_root_str); + // under the synthetic vfs URL root. bd-rz2we: native test + // helpers pass `with_url_root("/.quarto/project-artifacts")`, + // so URLs use that synthetic prefix regardless of where the + // bytes actually land on disk. 
+ let url_root = "/.quarto/project-artifacts"; + let url_needle_prefix = format!("{}/quarto/quarto-theme-", url_root); let theme_link = output .html() .lines() - .filter(|line| line.contains(&needle_prefix) && line.contains(".css")) - .next() + .find(|line| line.contains(&url_needle_prefix) && line.contains(".css")) .unwrap_or_else(|| { panic!( - "expected a theme under {}quarto/quarto-theme-…; html: {}", - vfs_root_str, + "expected a theme under {}/quarto/quarto-theme-…; html: {}", + url_root, snippet(&output.html()), ) }); - // Extract the actual CSS path from the href attribute and - // confirm the bytes landed at that path. + // Extract the URL fragment from the href attribute and translate + // it back to the on-disk path. bd-rz2we: the URL embeds the + // synthetic prefix; bytes land under the tempdir `vfs_root`. let href_start = theme_link - .find(&needle_prefix) + .find(&url_needle_prefix) .expect("needle present (filter just confirmed it)"); let after_prefix = &theme_link[href_start..]; let css_end = after_prefix .find(".css") .map(|i| href_start + i + ".css".len()) .expect("href ends with .css"); - let css_path_str = &theme_link[href_start..css_end]; - let css_path = PathBuf::from(css_path_str); + let url_str = &theme_link[href_start..css_end]; + let suffix = url_str + .strip_prefix(url_root) + .expect("URL starts with the synthetic prefix") + .trim_start_matches('/'); + let vfs_root = project_dir.join(".quarto/project-artifacts"); + let css_path = vfs_root.join(suffix); let runtime = NativeRuntime::new(); let bytes = runtime.file_read(&css_path).unwrap_or_else(|e| { diff --git a/crates/quarto-error-reporting/error_catalog.json b/crates/quarto-error-reporting/error_catalog.json index 202e9e2f2..7092e882b 100644 --- a/crates/quarto-error-reporting/error_catalog.json +++ b/crates/quarto-error-reporting/error_catalog.json @@ -524,6 +524,20 @@ "docs_url": "https://quarto.org/docs/errors/writer/Q-3-40", "since_version": "99.9.9" }, + "Q-3-42": { + "subsystem": 
"writer", + "title": "Shortcode edit dropped", + "message_template": "An edit to shortcode-resolved (or other atomic-Generated) content was reverted. The resolved text is read-only; edit the invocation token in source instead.", + "docs_url": "https://quarto.org/docs/errors/Q-3-42", + "since_version": "99.9.9" + }, + "Q-3-43": { + "subsystem": "writer", + "title": "Generated content edit dropped", + "message_template": "An edit to pipeline-generated content was reverted. The content has no editable source position in this file; edit its upstream definition (an include, metadata key, or other source) instead.", + "docs_url": "https://quarto.org/docs/errors/Q-3-43", + "since_version": "99.9.9" + }, "Q-3-50": { "subsystem": "writer", "title": "LineBlock Not Supported in ANSI Format", diff --git a/crates/quarto-error-reporting/src/diagnostic.rs b/crates/quarto-error-reporting/src/diagnostic.rs index 8ffb35a1b..d86e503dd 100644 --- a/crates/quarto-error-reporting/src/diagnostic.rs +++ b/crates/quarto-error-reporting/src/diagnostic.rs @@ -552,28 +552,6 @@ impl DiagnosticMessage { obj } - /// Extract the original file_id from a SourceInfo by traversing the mapping chain - fn extract_file_id( - source_info: &quarto_source_map::SourceInfo, - ) -> Option { - match source_info { - quarto_source_map::SourceInfo::Original { file_id, .. } => Some(*file_id), - quarto_source_map::SourceInfo::Substring { parent, .. } => { - Self::extract_file_id(parent) - } - quarto_source_map::SourceInfo::Concat { pieces } => { - // For concatenated sources, use the first piece's file_id - pieces - .first() - .and_then(|p| Self::extract_file_id(&p.source_info)) - } - quarto_source_map::SourceInfo::FilterProvenance { .. } => { - // Filter provenance doesn't have a traditional file_id - None - } - } - } - /// Wrap a file path with OSC 8 ANSI hyperlink codes for clickable terminal links. 
/// /// OSC 8 is a terminal escape sequence that creates clickable hyperlinks: @@ -671,18 +649,21 @@ impl DiagnosticMessage { const ARIADNE_UNIMPORTANT_COLOR: Color = Color::Fixed(249); // Extract file_id from the source mapping by traversing the chain - let file_id = Self::extract_file_id(main_location)?; + let file_id = main_location.root_file_id()?; let file = ctx.get_file(file_id)?; - // Get file content: use stored content for ephemeral files, or read from disk + // Get file content: use stored content for ephemeral files, or read from disk. + // In WASM (and any host with no real filesystem) the disk read fails with + // "operation not supported on this platform"; the only graceful response is + // to drop the source-context snippet. The diagnostic's code, message, and + // hints still surface — only the Ariadne visual is unavailable. let content = match &file.content { - Some(c) => c.clone(), // Ephemeral file: use stored content - None => { - // Disk-backed file: read from disk - std::fs::read_to_string(&file.path) - .unwrap_or_else(|e| panic!("Failed to read file '{}': {}", file.path, e)) - } + Some(c) => c.clone(), + None => match std::fs::read_to_string(&file.path) { + Ok(s) => s, + Err(_) => return None, + }, }; // Map the location offsets back to original file positions @@ -770,7 +751,7 @@ impl DiagnosticMessage { for detail in &self.details { if let Some(detail_loc) = &detail.location { // Extract file_id from detail location - let detail_file_id = match Self::extract_file_id(detail_loc) { + let detail_file_id = match detail_loc.root_file_id() { Some(fid) => fid, None => continue, // Skip if we can't extract file_id }; diff --git a/crates/quarto-pandoc-types/src/atomic_custom_nodes.rs b/crates/quarto-pandoc-types/src/atomic_custom_nodes.rs new file mode 100644 index 000000000..63327c8b4 --- /dev/null +++ b/crates/quarto-pandoc-types/src/atomic_custom_nodes.rs @@ -0,0 +1,63 @@ +//! 
Registry of `CustomNode` type names that q2-preview's incremental writer +//! treats as **atomic**. +//! +//! An atomic CustomNode is a single replaceable unit. Users can swap or +//! delete one wholesale via a React-side component menu, but they cannot +//! type *inside* it — there is no editable text region the writer can map +//! back to source bytes. The writer treats edits *into* an atomic +//! CustomNode as a soft-drop (Q-3-43); UseAfter on an atomic CustomNode +//! is let-user-win (the qmd writer's CustomNode arm serializes the fresh +//! `plain_data`). +//! +//! See Plan 7 §"`is_atomic_custom_node` registry" for the design and the +//! `is_editable_inside` consumer in `pampa::writers::incremental`. +//! +//! Lives in `quarto-pandoc-types` (not `quarto-core` as Plan 7 originally +//! suggested) because `pampa` consumes it and `pampa` sits below +//! `quarto-core` in the dependency graph. +//! +//! The TypeScript hand-mirror lives at +//! `ts-packages/preview-renderer/src/utils/atomicCustomNodes.ts` and must +//! be kept in lockstep with this list. + +/// `CustomNode` type names that q2-preview treats as atomic. +/// +/// Today: just `"CrossrefResolvedRef"` (kept in lockstep with +/// `quarto_core::crossref::CROSSREF_RESOLVED_REF` — see the cross-check +/// test in `quarto-core::crossref`). Plan 8 will add `"IncludeExpansion"`. +/// +/// Extension-contributed atomic types are out of scope for this const; +/// a future plan adds a runtime registry sourced from `_extension.yml`. +pub const ATOMIC_CUSTOM_NODES: &[&str] = &["CrossrefResolvedRef"]; + +/// Return `true` iff `type_name` names a CustomNode the incremental +/// writer must treat as atomic. +/// +/// See [`ATOMIC_CUSTOM_NODES`] for the list and the module doc-comment +/// for what atomicity means in this context. 
+pub fn is_atomic_custom_node(type_name: &str) -> bool { + ATOMIC_CUSTOM_NODES.contains(&type_name) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn crossref_resolved_ref_is_atomic() { + assert!(is_atomic_custom_node("CrossrefResolvedRef")); + } + + #[test] + fn unknown_type_name_is_not_atomic() { + assert!(!is_atomic_custom_node("FloatRefTarget")); + assert!(!is_atomic_custom_node("Theorem")); + assert!(!is_atomic_custom_node("Callout")); + assert!(!is_atomic_custom_node("")); + } + + #[test] + fn registry_contains_crossref_resolved_ref() { + assert!(ATOMIC_CUSTOM_NODES.contains(&"CrossrefResolvedRef")); + } +} diff --git a/crates/quarto-pandoc-types/src/attr.rs b/crates/quarto-pandoc-types/src/attr.rs index 191dd409c..502b46c35 100644 --- a/crates/quarto-pandoc-types/src/attr.rs +++ b/crates/quarto-pandoc-types/src/attr.rs @@ -24,6 +24,27 @@ pub fn is_empty_attr(attr: &Attr) -> bool { /// - id: Source location of the id string (None if id is empty "") /// - classes: Source locations for each class string /// - attributes: Source locations for each key-value pair (both key and value) +/// +/// **Positional-alignment invariant** (added 2026-05-22, Plan 6): +/// `attributes[i]` is the `(key_src, val_src)` for the i-th entry in +/// `Attr.2` (`LinkedHashMap`) in **insertion order**. +/// Consumers that index into `attributes` by key position (e.g. to +/// recover the source range of a value before `kvs.remove(key)`) rely +/// on this lockstep. +/// +/// This invariant holds in the parser's main path but is **broken by +/// two preexisting code paths** (tracked separately): +/// - **bd-3aolj** — `commonmark_attribute.rs:41-49` (duplicate-key +/// handling: `LinkedHashMap::insert` updates in place while +/// `attr_source.attributes.push` always appends). +/// - **bd-1e6a5** — caption-attr-into-table merge in `section.rs` and +/// `postprocess.rs` (same root cause when caption + table keys +/// overlap). 
+/// +/// Until those fix-ups land, indexing consumers should guard with a +/// runtime length check (`kvs.len() == attr_source.attributes.len()`) +/// plus a `debug_assert_eq!` and fall back to `SourceInfo::default()` +/// on mismatch so production never panics on misaligned input. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct AttrSourceInfo { pub id: Option, diff --git a/crates/quarto-pandoc-types/src/block.rs b/crates/quarto-pandoc-types/src/block.rs index 7fde1317d..5ec384a46 100644 --- a/crates/quarto-pandoc-types/src/block.rs +++ b/crates/quarto-pandoc-types/src/block.rs @@ -64,6 +64,33 @@ impl Block { Block::Custom(b) => &b.source_info, } } + + /// Mutable counterpart to [`source_info`]. Mechanical mirror of the read + /// accessor; lets Plan-6 stamping rewrite the per-node `source_info` field + /// through the enum without holding a typed variant reference. + pub fn source_info_mut(&mut self) -> &mut quarto_source_map::SourceInfo { + match self { + Block::Plain(b) => &mut b.source_info, + Block::Paragraph(b) => &mut b.source_info, + Block::LineBlock(b) => &mut b.source_info, + Block::CodeBlock(b) => &mut b.source_info, + Block::RawBlock(b) => &mut b.source_info, + Block::BlockQuote(b) => &mut b.source_info, + Block::OrderedList(b) => &mut b.source_info, + Block::BulletList(b) => &mut b.source_info, + Block::DefinitionList(b) => &mut b.source_info, + Block::Header(b) => &mut b.source_info, + Block::HorizontalRule(b) => &mut b.source_info, + Block::Table(b) => &mut b.source_info, + Block::Figure(b) => &mut b.source_info, + Block::Div(b) => &mut b.source_info, + Block::BlockMetadata(b) => &mut b.source_info, + Block::NoteDefinitionPara(b) => &mut b.source_info, + Block::NoteDefinitionFencedBlock(b) => &mut b.source_info, + Block::CaptionBlock(b) => &mut b.source_info, + Block::Custom(b) => &mut b.source_info, + } + } } pub type Blocks = Vec; @@ -257,4 +284,17 @@ mod tests { }); assert_eq!(block.source_info(), &si); } + + #[test] + fn 
source_info_mut_round_trip_paragraph() { + let original = test_si(0, 0, 10); + let updated = test_si(9, 200, 220); + let mut block = Block::Paragraph(Paragraph { + content: vec![], + source_info: original.clone(), + }); + assert_eq!(block.source_info(), &original); + *block.source_info_mut() = updated.clone(); + assert_eq!(block.source_info(), &updated); + } } diff --git a/crates/quarto-pandoc-types/src/inline.rs b/crates/quarto-pandoc-types/src/inline.rs index 788a936d5..8b55c40c5 100644 --- a/crates/quarto-pandoc-types/src/inline.rs +++ b/crates/quarto-pandoc-types/src/inline.rs @@ -86,6 +86,42 @@ impl Inline { Inline::Custom(c) => &c.source_info, } } + + /// Mutable counterpart to [`source_info`]. Mechanical mirror of the read + /// accessor; lets Plan-6 stamping rewrite the per-node `source_info` field + /// through the enum without holding a typed variant reference. + pub fn source_info_mut(&mut self) -> &mut quarto_source_map::SourceInfo { + match self { + Inline::Str(s) => &mut s.source_info, + Inline::Emph(e) => &mut e.source_info, + Inline::Underline(u) => &mut u.source_info, + Inline::Strong(s) => &mut s.source_info, + Inline::Strikeout(s) => &mut s.source_info, + Inline::Superscript(s) => &mut s.source_info, + Inline::Subscript(s) => &mut s.source_info, + Inline::SmallCaps(s) => &mut s.source_info, + Inline::Quoted(q) => &mut q.source_info, + Inline::Cite(c) => &mut c.source_info, + Inline::Code(c) => &mut c.source_info, + Inline::Space(s) => &mut s.source_info, + Inline::SoftBreak(s) => &mut s.source_info, + Inline::LineBreak(l) => &mut l.source_info, + Inline::Math(m) => &mut m.source_info, + Inline::RawInline(r) => &mut r.source_info, + Inline::Link(l) => &mut l.source_info, + Inline::Image(i) => &mut i.source_info, + Inline::Note(n) => &mut n.source_info, + Inline::Span(s) => &mut s.source_info, + Inline::Shortcode(s) => &mut s.source_info, + Inline::NoteReference(n) => &mut n.source_info, + Inline::Attr(a) => &mut a.source_info, + Inline::Insert(i) 
=> &mut i.source_info, + Inline::Delete(d) => &mut d.source_info, + Inline::Highlight(h) => &mut h.source_info, + Inline::EditComment(e) => &mut e.source_info, + Inline::Custom(c) => &mut c.source_info, + } + } } pub type Inlines = Vec; @@ -1478,4 +1514,17 @@ mod tests { }); assert_eq!(inline.source_info(), &si); } + + #[test] + fn source_info_mut_round_trip_str() { + let original = test_si(0, 0, 5); + let updated = test_si(7, 100, 110); + let mut inline = Inline::Str(Str { + text: "hello".into(), + source_info: original.clone(), + }); + assert_eq!(inline.source_info(), &original); + *inline.source_info_mut() = updated.clone(); + assert_eq!(inline.source_info(), &updated); + } } diff --git a/crates/quarto-pandoc-types/src/lib.rs b/crates/quarto-pandoc-types/src/lib.rs index aa764ddfc..91131b37f 100644 --- a/crates/quarto-pandoc-types/src/lib.rs +++ b/crates/quarto-pandoc-types/src/lib.rs @@ -10,6 +10,7 @@ * by any crate that needs to work with Pandoc AST structures. */ +pub mod atomic_custom_nodes; pub mod attr; pub mod block; pub mod caption; @@ -23,6 +24,7 @@ pub mod shortcode; pub mod table; // Re-export commonly used types at the crate root +pub use atomic_custom_nodes::{ATOMIC_CUSTOM_NODES, is_atomic_custom_node}; pub use attr::{Attr, AttrSourceInfo, TargetSourceInfo, empty_attr, is_empty_attr}; pub use block::{ Block, BlockQuote, Blocks, BulletList, CaptionBlock, CodeBlock, DefinitionList, Div, Figure, diff --git a/crates/quarto-source-map/Cargo.toml b/crates/quarto-source-map/Cargo.toml index 5b688804c..e231a7f95 100644 --- a/crates/quarto-source-map/Cargo.toml +++ b/crates/quarto-source-map/Cargo.toml @@ -8,6 +8,8 @@ repository.workspace = true [dependencies] serde = { workspace = true, features = ["derive", "rc"] } +serde_json.workspace = true +smallvec.workspace = true [dev-dependencies] serde_json.workspace = true diff --git a/crates/quarto-source-map/src/lib.rs b/crates/quarto-source-map/src/lib.rs index ae8afa1bd..e09f26d91 100644 --- 
a/crates/quarto-source-map/src/lib.rs +++ b/crates/quarto-source-map/src/lib.rs @@ -41,6 +41,6 @@ pub mod utils; pub use context::{FileMetadata, SourceContext, SourceFile}; pub use file_info::FileInformation; pub use mapping::MappedLocation; -pub use source_info::{SourceInfo, SourcePiece}; +pub use source_info::{Anchor, AnchorRole, By, SourceInfo, SourcePiece}; pub use types::{FileId, Location, Range}; pub use utils::{line_col_to_offset, offset_to_location, range_from_offsets}; diff --git a/crates/quarto-source-map/src/mapping.rs b/crates/quarto-source-map/src/mapping.rs index c8bc1f499..c3269840c 100644 --- a/crates/quarto-source-map/src/mapping.rs +++ b/crates/quarto-source-map/src/mapping.rs @@ -65,9 +65,9 @@ impl SourceInfo { } None // Offset not found in any piece } - SourceInfo::FilterProvenance { .. } => { - // FilterProvenance doesn't have traditional byte offsets - // The location information is stored directly in the variant + SourceInfo::Generated { .. } => { + // Generated nodes have no offset-within-current-text; + // callers wanting source coordinates use resolve_byte_range. None } } diff --git a/crates/quarto-source-map/src/source_info.rs b/crates/quarto-source-map/src/source_info.rs index 91f5800af..64cafdbb8 100644 --- a/crates/quarto-source-map/src/source_info.rs +++ b/crates/quarto-source-map/src/source_info.rs @@ -2,6 +2,7 @@ use crate::types::{FileId, Range}; use serde::{Deserialize, Serialize}; +use smallvec::SmallVec; use std::sync::Arc; /// Source information tracking a location and its transformation history @@ -13,7 +14,9 @@ use std::sync::Arc; /// - Original: Points directly to a file with byte offsets /// - Substring: Points to a range within a parent SourceInfo (offsets are relative to parent) /// - Concat: Combines multiple SourceInfo pieces (preserves provenance when coalescing text) -/// - FilterProvenance: Tracks elements created by Lua filters for diagnostics +/// - Generated: Produced by a pipeline transform. 
`by` records the producer; `from` +/// records source-side anchors (empty for pure synthesis, `Invocation` for +/// shortcode-style resolutions). /// /// The Transformed variant was removed because it's not used in production code. /// Text transformations (smart quotes, em-dashes) use Original SourceInfo pointing @@ -42,18 +45,86 @@ pub enum SourceInfo { /// Used when coalescing adjacent text nodes while preserving /// the fact that they came from different source locations. Concat { pieces: Vec }, - /// Provenance from a Lua filter + /// Node produced by a pipeline transform /// - /// Used to track elements created by Lua filters for diagnostic messages. - /// Contains the filter file path and line number where the element was created. - FilterProvenance { - /// Path to the Lua filter file (from debug.getinfo source) - filter_path: String, - /// Line number in the filter where the element was created - line: usize, + /// `by` records the producer ("which transform made me"); `from` is a + /// list of typed, role-labeled source-info pointers ("which source + /// bytes contributed to me"). Empty `from` means pure synthesis + /// (sectionize wrappers, filter constructions, title-block h1). + /// An `Invocation` anchor present means there is a source-side + /// preimage (every shortcode resolution). + Generated { + by: By, + #[serde(default, skip_serializing_if = "SmallVec::is_empty")] + from: SmallVec<[Anchor; 2]>, }, } +/// Producer identity for a [`SourceInfo::Generated`] node. +/// +/// `kind` is a short, kebab-case identifier describing which transform +/// produced the node ("filter", "shortcode", "sectionize", ...). Third +/// parties should namespace as `ext//`. +/// +/// `data` is per-kind configuration that is **not** a source-info pointer. +/// Source-side anchors live in the parent `Generated.from` list, not here. +/// `Null` for kinds that don't carry per-instance data. 
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct By { + /// Short kind tag, kebab-case. Examples: "filter", "shortcode", + /// "sectionize", "user-edit", "title-block". + /// Third-party kinds should namespace: "ext/my-extension/foo". + pub kind: String, + + /// Per-kind configuration that is NOT a source-info pointer. + /// Anchors live in `Generated.from`, not here. + /// `Null` for kinds that don't carry per-instance data. + #[serde(default, skip_serializing_if = "serde_json::Value::is_null")] + pub data: serde_json::Value, +} + +/// Role describing what kind of source-side contribution an anchor records. +/// +/// The known roles are load-bearing — `Invocation` is what the writer's +/// preimage walk and attribution consult; `ValueSource` is diagnostic-only. +/// `Other(String)` is an open escape hatch for extension-defined roles. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum AnchorRole { + /// The user-written construct that triggered this node's creation + /// (e.g. the `{{< meta foo >}}` token in the active document). + /// Load-bearing: the writer's `preimage_in` and attribution's + /// `resolve_byte_range` consult the first anchor with this role. + /// At most one per node by convention. + Invocation, + + /// Where the VALUE this node carries was defined, when distinct + /// from the invocation site (e.g. `footer:` in `_metadata.yml` for + /// a `{{< meta footer >}}` resolution). Diagnostic-only — does not + /// affect the writer or attribution decisions in v1. + ValueSource, + + /// Extension-defined or future role we haven't enumerated. + /// String is kebab-case, namespaced (`ext//`). + /// + /// **`preimage_in` does not walk this role.** Future anchor roles + /// default to non-walked unless explicitly added to + /// [`SourceInfo::preimage_in`]'s `Generated` arm. 
Extensions adding + /// `Other("…")` should treat this as a feature: attribution data + /// attached via `Other` is not accidentally consulted by the writer's + /// byte-copying path. If a role *does* contribute to body-text + /// preimage in `target`, it must be explicitly enumerated in + /// `preimage_in`. + Other(String), +} + +/// A single typed, role-labeled source-info pointer attached to a +/// [`SourceInfo::Generated`] node. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Anchor { + pub role: AnchorRole, + pub source_info: Arc, +} + /// A piece of a concatenated source #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct SourcePiece { @@ -133,15 +204,80 @@ impl SourceInfo { } } - /// Create source info for a filter-created element + /// Create a [`SourceInfo::Generated`] with an empty anchor list. + /// + /// Use [`SourceInfo::append_anchor`] to add anchors after construction. + /// For Generated nodes that need to carry anchors at construction + /// time, build the variant directly: `SourceInfo::Generated { by, from }`. + pub fn generated(by: By) -> Self { + SourceInfo::Generated { + by, + from: SmallVec::new(), + } + } + + /// If this is a [`SourceInfo::Generated`], return the first anchor whose + /// role is [`AnchorRole::Invocation`]. + /// + /// Returns `None` otherwise (including for non-`Generated` variants). + /// By convention there is at most one `Invocation` anchor per node. + pub fn invocation_anchor(&self) -> Option<&Arc> { + match self { + SourceInfo::Generated { from, .. } => from + .iter() + .find(|a| matches!(a.role, AnchorRole::Invocation)) + .map(|a| &a.source_info), + _ => None, + } + } + + /// If this is a [`SourceInfo::Generated`], return the first anchor whose + /// role is [`AnchorRole::ValueSource`]. + /// + /// Returns `None` otherwise. By convention there is at most one + /// `ValueSource` anchor per node. 
+ pub fn value_source_anchor(&self) -> Option<&Arc> { + match self { + SourceInfo::Generated { from, .. } => from + .iter() + .find(|a| matches!(a.role, AnchorRole::ValueSource)) + .map(|a| &a.source_info), + _ => None, + } + } + + /// Iterate over every anchor in this [`SourceInfo::Generated`] whose role + /// equals `role`. + /// + /// Returns an empty iterator for non-`Generated` variants. Iteration order + /// is the append order. + pub fn anchors_with_role<'a>( + &'a self, + role: &'a AnchorRole, + ) -> Box> + 'a> { + match self { + SourceInfo::Generated { from, .. } => Box::new( + from.iter() + .filter(move |a| &a.role == role) + .map(|a| &a.source_info), + ), + _ => Box::new(std::iter::empty()), + } + } + + /// Append `(role, source_info)` to this [`SourceInfo::Generated`]'s + /// anchor list. /// - /// Used to track the provenance of elements created by Lua filters. - /// The filter_path should be the path to the filter file (from debug.getinfo source). - /// The line should be the line number where the element was created. - pub fn filter_provenance(filter_path: impl Into, line: usize) -> Self { - SourceInfo::FilterProvenance { - filter_path: filter_path.into(), - line, + /// Panics if `self` is not [`SourceInfo::Generated`]. By convention there + /// is at most one anchor per known role; appending a second anchor with + /// the same role does not replace the first — accessors that find by + /// role return the earliest match. + pub fn append_anchor(&mut self, role: AnchorRole, source_info: Arc) { + match self { + SourceInfo::Generated { from, .. } => { + from.push(Anchor { role, source_info }); + } + _ => panic!("append_anchor called on non-Generated SourceInfo"), } } @@ -173,7 +309,7 @@ impl SourceInfo { .. } => end_offset - start_offset, SourceInfo::Concat { pieces } => pieces.iter().map(|p| p.length).sum(), - SourceInfo::FilterProvenance { .. } => 0, + SourceInfo::Generated { .. 
} => 0, } } @@ -181,13 +317,13 @@ impl SourceInfo { /// /// For Original and Substring, returns the start_offset field. /// For Concat, returns 0 (the concat represents a new text starting at 0). - /// For FilterProvenance, returns 0. + /// For Generated, returns 0. pub fn start_offset(&self) -> usize { match self { SourceInfo::Original { start_offset, .. } => *start_offset, SourceInfo::Substring { start_offset, .. } => *start_offset, SourceInfo::Concat { .. } => 0, - SourceInfo::FilterProvenance { .. } => 0, + SourceInfo::Generated { .. } => 0, } } @@ -195,24 +331,26 @@ impl SourceInfo { /// /// For Original and Substring, returns the end_offset field. /// For Concat, returns the total length. - /// For FilterProvenance, returns 0. + /// For Generated, returns 0. pub fn end_offset(&self) -> usize { match self { SourceInfo::Original { end_offset, .. } => *end_offset, SourceInfo::Substring { end_offset, .. } => *end_offset, SourceInfo::Concat { .. } => self.length(), - SourceInfo::FilterProvenance { .. } => 0, + SourceInfo::Generated { .. } => 0, } } /// Chain-resolve to `(file_id, start_offset, end_offset)` in the /// root source file. /// - /// Returns `None` for `Concat` and `FilterProvenance` — these - /// don't map cleanly to a single contiguous byte range. The - /// attribution v1 sidecar relies on this contract; project-scoped - /// (v2) features that need the full chain resolver should use - /// `map_offset` against a `SourceContext` instead. + /// Returns `None` for `Concat` — Concat doesn't map cleanly to a + /// single contiguous byte range. For `Generated`, delegates to the + /// first `Invocation` anchor and recurses (`None` when no + /// `Invocation` anchor is present). The attribution v1 sidecar + /// relies on this contract; project-scoped (v2) features that need + /// the full chain resolver should use `map_offset` against a + /// `SourceContext` instead. 
pub fn resolve_byte_range(&self) -> Option<(usize, usize, usize)> { match self { SourceInfo::Original { @@ -228,7 +366,78 @@ impl SourceInfo { let (fid, parent_start, _) = parent.resolve_byte_range()?; Some((fid, parent_start + start_offset, parent_start + end_offset)) } - SourceInfo::Concat { .. } | SourceInfo::FilterProvenance { .. } => None, + SourceInfo::Concat { .. } => None, + SourceInfo::Generated { .. } => self + .invocation_anchor() + .and_then(|si| si.resolve_byte_range()), + } + } + + /// Byte range in `target` that this `SourceInfo`'s preimage covers, if any. + /// + /// This is the writer's "can I Verbatim-copy bytes from `target` for the + /// node carrying this source_info?" check. + /// + /// Semantics by variant: + /// - `Original` → `Some(start..end)` iff the file matches `target`, else `None`. + /// - `Substring` → recurse the parent; offsets compose additively. + /// - `Concat` → every piece must resolve into `target` AND the resolved + /// ranges must be byte-contiguous (no gaps, no overlaps). A gappy Concat + /// returns `None` — the writer can't Verbatim-copy a non-contiguous span. + /// - `Generated` → walk the `Invocation` anchor only via + /// [`invocation_anchor`](Self::invocation_anchor). **No other anchor + /// role is consulted** — not `ValueSource` (Plan 9), not future + /// `Dispatch` (Plan 10), not `AnchorRole::Other`. See the + /// role-asymmetry section below. + /// + /// # Role asymmetry + /// + /// `preimage_in` only walks `AnchorRole::Invocation`. This is load-bearing: + /// copying bytes from a `ValueSource` source range would emit raw YAML + /// metadata (or whatever the value lived in) into the body — a hard + /// correctness bug. The same applies to `Dispatch` (which points at Lua + /// source) and to any extension-defined `Other` role. 
+ /// + /// **Future anchor roles default to non-walked.** Extensions introducing + /// `AnchorRole::Other("…")` should treat this as a feature: their + /// attribution metadata is not accidentally consulted by the writer's + /// byte-copying path. If a role *does* contribute to body-text preimage, + /// it must be explicitly added to this function's `Generated` arm. + pub fn preimage_in(&self, target: FileId) -> Option> { + match self { + SourceInfo::Original { + file_id, + start_offset, + end_offset, + } if *file_id == target => Some(*start_offset..*end_offset), + SourceInfo::Original { .. } => None, + SourceInfo::Substring { + parent, + start_offset, + end_offset, + } => { + let parent_range = parent.preimage_in(target)?; + Some(parent_range.start + start_offset..parent_range.start + end_offset) + } + SourceInfo::Concat { pieces } => { + let ranges: Vec> = pieces + .iter() + .map(|p| p.source_info.preimage_in(target)) + .collect::>>()?; + if ranges.is_empty() { + return None; + } + if ranges.windows(2).all(|w| w[0].end == w[1].start) { + let first = ranges.first().unwrap().start; + let last = ranges.last().unwrap().end; + Some(first..last) + } else { + None + } + } + SourceInfo::Generated { .. } => self + .invocation_anchor() + .and_then(|si| si.preimage_in(target)), } } @@ -257,13 +466,229 @@ impl SourceInfo { piece.source_info.remap_file_ids(map); } } - SourceInfo::FilterProvenance { .. } => { - // No FileId inside — the filter_path is a separate string. + SourceInfo::Generated { from, .. } => { + for anchor in from { + // Arc::make_mut clones if there are other references. + let inner = Arc::make_mut(&mut anchor.source_info); + inner.remap_file_ids(map); + } + } + } + } + + /// First `FileId` reachable from this `SourceInfo`'s root. + /// + /// - `Original` → `Some(file_id)`. + /// - `Substring` → recurse parent. + /// - `Concat` → `pieces.iter().find_map(|p| p.source_info.root_file_id())` + /// (`find_map` semantics — skips Generated holes and empty pieces). 
+ /// - `Generated` → `invocation_anchor().and_then(|si| si.root_file_id())`; + /// `None` when no `Invocation` anchor is present. + pub fn root_file_id(&self) -> Option { + match self { + SourceInfo::Original { file_id, .. } => Some(*file_id), + SourceInfo::Substring { parent, .. } => parent.root_file_id(), + SourceInfo::Concat { pieces } => { + pieces.iter().find_map(|p| p.source_info.root_file_id()) + } + SourceInfo::Generated { .. } => { + self.invocation_anchor().and_then(|si| si.root_file_id()) + } + } + } + + /// Insert every `FileId` reachable from this `SourceInfo` into `out`. + /// + /// Walks every `Original`, every `Substring` parent, every `Concat` + /// piece, and every `Generated` anchor (all roles — `Invocation`, + /// `ValueSource`, `Other`). + pub fn collect_file_ids(&self, out: &mut std::collections::HashSet) { + match self { + SourceInfo::Original { file_id, .. } => { + out.insert(*file_id); + } + SourceInfo::Substring { parent, .. } => parent.collect_file_ids(out), + SourceInfo::Concat { pieces } => { + for piece in pieces { + piece.source_info.collect_file_ids(out); + } + } + SourceInfo::Generated { from, .. } => { + for anchor in from { + anchor.source_info.collect_file_ids(out); + } } } } } +impl By { + /// Producer kind for a node constructed by a Lua filter + /// (e.g. `pandoc.Str("decoration")` inside a filter callback). + /// + /// `filter_path` is the path the Lua engine reported via + /// `debug.getinfo(...).source` (with the leading "@" stripped); + /// `line` is the line number inside that file where the constructor + /// ran. Until Lua-file-registration lands (bd-36fr9), `(filter_path, + /// line)` lives in `by.data`; afterwards it migrates to a `Dispatch` + /// anchor and `by.data` shrinks to `{}`. 
+ pub fn filter(filter_path: impl Into, line: usize) -> Self { + Self { + kind: "filter".to_string(), + data: serde_json::json!({ + "filter_path": filter_path.into(), + "line": line, + }), + } + } + + /// Producer kind for the `SectionizeTransform`'s synthesized section + /// Divs. Children remain editable; the wrapper itself is structural. + pub fn sectionize() -> Self { + Self { + kind: "sectionize".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for React-constructed (user-typed) content reaching + /// the AST through the q2-preview client. + pub fn user_edit() -> Self { + Self { + kind: "user-edit".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for shortcode resolutions. + /// + /// **Invariant.** Every `Generated { by: shortcode(...), .. }` must + /// carry at least one `Invocation` anchor in `from` pointing at the + /// source token's byte range. Use only inside a `Generated` whose + /// anchor list is populated; constructing the bare shape with empty + /// `from` is rejected by Plan 6's audit-completion test and trips + /// Plan 7's writer `debug_assert!`. + pub fn shortcode(name: impl Into) -> Self { + Self { + kind: "shortcode".to_string(), + data: serde_json::json!({ "name": name.into() }), + } + } + + /// Producer kind for `IncludeStage`'s expansion wrapper. Note that + /// most include-related synthesized content keeps its `Original` + /// `source_info` (inherited from the include-line Paragraph) — this + /// kind is only used where a `Generated` is explicitly required. + pub fn include() -> Self { + Self { + kind: "include".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for the title-block stage's synthesized title `h1`. + pub fn title_block() -> Self { + Self { + kind: "title-block".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for the footnotes stage's container Div. 
+ pub fn footnotes() -> Self { + Self { + kind: "footnotes".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for the appendix-structure stage's wrapper Div. + pub fn appendix() -> Self { + Self { + kind: "appendix".to_string(), + data: serde_json::Value::Null, + } + } + + /// Producer kind for parser-side synthetic Spaces inserted by the + /// tree-sitter post-processing pass. + pub fn tree_sitter_postprocess() -> Self { + Self { + kind: "tree-sitter-postprocess".to_string(), + data: serde_json::Value::Null, + } + } + + /// Escape-hatch constructor for any `kind` string — including built-in + /// names and extension-defined kinds (`ext//`). + /// + /// Forgery (an extension calling `By::raw("shortcode", …)` without the + /// required `Invocation` anchor) is caught downstream by Plan 6's + /// audit-completion test and Plan 7's `debug_assert!`. The convention + /// for third-party kinds is `ext//`. + pub fn raw(kind: impl Into, data: serde_json::Value) -> Self { + Self { + kind: kind.into(), + data, + } + } + + /// True if a `Generated { by: , .. }` node should be treated + /// as atomic by the incremental writer. + /// + /// Atomic nodes are produced by the pipeline and represent content + /// the user shouldn't edit through React (filter constructions, + /// shortcode resolutions, synthesized title h1, tree-sitter-inserted + /// spaces). Atomicity is determined by `kind` alone — orthogonal to + /// anchor-presence. + /// + /// Extensions that contribute new `by.kind` values are not atomic by + /// default in v1. + pub fn is_atomic_kind(&self) -> bool { + matches!( + self.kind.as_str(), + "filter" | "shortcode" | "title-block" | "tree-sitter-postprocess" + ) + } + + /// True if this `By`'s `kind` equals `kind`. + pub fn is_kind(&self, kind: &str) -> bool { + self.kind == kind + } + + /// If `self.kind == "filter"`, return `(filter_path, line)`. 
+ /// + /// Returns `None` for any other kind, or when the data payload is + /// malformed (missing or non-string `filter_path`, missing or + /// non-integer `line`). + pub fn as_filter(&self) -> Option<(&str, usize)> { + if self.kind != "filter" { + return None; + } + let path = self.data.get("filter_path")?.as_str()?; + let line = self.data.get("line")?.as_u64()? as usize; + Some((path, line)) + } +} + +impl Anchor { + /// Construct an [`AnchorRole::Invocation`] anchor. + pub fn invocation(source_info: Arc) -> Self { + Self { + role: AnchorRole::Invocation, + source_info, + } + } + + /// Construct an [`AnchorRole::ValueSource`] anchor. + pub fn value_source(source_info: Arc) -> Self { + Self { + role: AnchorRole::ValueSource, + source_info, + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -346,18 +771,401 @@ mod tests { } #[test] - fn test_remap_file_ids_filter_provenance_is_noop() { - let mut info = SourceInfo::filter_provenance("foo.lua", 42); + fn test_remap_file_ids_generated_empty_from_is_noop() { + let mut info = SourceInfo::generated(By::filter("foo.lua", 42)); info.remap_file_ids(&|_| FileId(99)); match info { - SourceInfo::FilterProvenance { filter_path, line } => { - assert_eq!(filter_path, "foo.lua"); + SourceInfo::Generated { by, from } => { + assert!(from.is_empty()); + let (path, line) = by.as_filter().unwrap(); + assert_eq!(path, "foo.lua"); assert_eq!(line, 42); } - _ => panic!("Expected FilterProvenance"), + _ => panic!("Expected Generated"), + } + } + + // ------------------------------------------------------------------------- + // Plan 4 — By / Anchor / Generated coverage + // ------------------------------------------------------------------------- + + #[test] + fn test_by_filter_builder() { + let by = By::filter("a.lua", 7); + assert_eq!(by.kind, "filter"); + assert_eq!(by.as_filter(), Some(("a.lua", 7))); + } + + #[test] + fn test_by_sectionize_builder() { + let by = By::sectionize(); + assert_eq!(by.kind, "sectionize"); + 
assert!(by.data.is_null()); + } + + #[test] + fn test_by_user_edit_builder() { + assert_eq!(By::user_edit().kind, "user-edit"); + } + + #[test] + fn test_by_shortcode_builder_records_name() { + let by = By::shortcode("meta"); + assert_eq!(by.kind, "shortcode"); + assert_eq!(by.data.get("name").and_then(|v| v.as_str()), Some("meta")); + } + + #[test] + fn test_by_include_title_footnotes_appendix_tree_sitter_builders() { + assert_eq!(By::include().kind, "include"); + assert_eq!(By::title_block().kind, "title-block"); + assert_eq!(By::footnotes().kind, "footnotes"); + assert_eq!(By::appendix().kind, "appendix"); + assert_eq!( + By::tree_sitter_postprocess().kind, + "tree-sitter-postprocess" + ); + } + + #[test] + fn test_by_raw_builder_accepts_any_kind() { + let by = By::raw("ext/my-plugin/foo", serde_json::json!({"k": 1})); + assert_eq!(by.kind, "ext/my-plugin/foo"); + assert_eq!(by.data.get("k").and_then(|v| v.as_u64()), Some(1)); + } + + #[test] + fn test_by_is_atomic_kind() { + assert!(By::filter("x.lua", 1).is_atomic_kind()); + assert!(By::shortcode("meta").is_atomic_kind()); + assert!(By::title_block().is_atomic_kind()); + assert!(By::tree_sitter_postprocess().is_atomic_kind()); + + assert!(!By::sectionize().is_atomic_kind()); + assert!(!By::user_edit().is_atomic_kind()); + assert!(!By::include().is_atomic_kind()); + assert!(!By::footnotes().is_atomic_kind()); + assert!(!By::appendix().is_atomic_kind()); + assert!(!By::raw("ext/anywhere/foo", serde_json::Value::Null).is_atomic_kind()); + } + + #[test] + fn test_by_is_kind() { + let by = By::shortcode("meta"); + assert!(by.is_kind("shortcode")); + assert!(!by.is_kind("filter")); + } + + #[test] + fn test_by_as_filter_rejects_non_filter() { + assert!(By::sectionize().as_filter().is_none()); + // Malformed filter (missing line) → None. 
+ let by = By { + kind: "filter".to_string(), + data: serde_json::json!({ "filter_path": "x.lua" }), + }; + assert!(by.as_filter().is_none()); + } + + #[test] + fn test_anchor_invocation_value_source_constructors() { + let original = Arc::new(SourceInfo::original(FileId(1), 0, 5)); + let inv = Anchor::invocation(Arc::clone(&original)); + let vs = Anchor::value_source(Arc::clone(&original)); + assert!(matches!(inv.role, AnchorRole::Invocation)); + assert!(matches!(vs.role, AnchorRole::ValueSource)); + } + + #[test] + fn test_by_json_round_trip() { + let by = By::shortcode("meta"); + let json = serde_json::to_string(&by).unwrap(); + let back: By = serde_json::from_str(&json).unwrap(); + assert_eq!(by, back); + } + + #[test] + fn test_anchor_json_round_trip() { + let anchor = Anchor::invocation(Arc::new(SourceInfo::original(FileId(2), 10, 20))); + let json = serde_json::to_string(&anchor).unwrap(); + let back: Anchor = serde_json::from_str(&json).unwrap(); + assert_eq!(anchor, back); + } + + #[test] + fn test_generated_json_round_trip_empty_from() { + let info = SourceInfo::generated(By::sectionize()); + let json = serde_json::to_string(&info).unwrap(); + let back: SourceInfo = serde_json::from_str(&json).unwrap(); + assert_eq!(info, back); + } + + #[test] + fn test_generated_json_round_trip_with_invocation_anchor() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(5), 100, 110)), + ); + let json = serde_json::to_string(&info).unwrap(); + let back: SourceInfo = serde_json::from_str(&json).unwrap(); + assert_eq!(info, back); + } + + #[test] + fn test_generated_json_round_trip_multi_anchor() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(5), 100, 110)), + ); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(7), 200, 220)), + ); + 
let json = serde_json::to_string(&info).unwrap(); + let back: SourceInfo = serde_json::from_str(&json).unwrap(); + assert_eq!(info, back); + } + + #[test] + fn test_generated_length_start_end_are_zero() { + let info = SourceInfo::generated(By::sectionize()); + assert_eq!(info.length(), 0); + assert_eq!(info.start_offset(), 0); + assert_eq!(info.end_offset(), 0); + } + + #[test] + fn test_generated_resolve_byte_range_recurses_through_substring() { + let parent = SourceInfo::original(FileId(42), 100, 200); + let sub = SourceInfo::substring(parent, 10, 20); + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor(AnchorRole::Invocation, Arc::new(sub)); + assert_eq!(info.resolve_byte_range(), Some((42, 110, 120))); + } + + #[test] + fn test_generated_resolve_byte_range_empty_returns_none() { + let info = SourceInfo::generated(By::sectionize()); + assert!(info.resolve_byte_range().is_none()); + } + + #[test] + fn test_generated_resolve_byte_range_value_source_only_returns_none() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(5), 100, 110)), + ); + assert!(info.resolve_byte_range().is_none()); + } + + #[test] + fn test_generated_remap_file_ids_walks_anchors() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(0), 0, 5)), + ); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(3), 10, 20)), + ); + info.remap_file_ids(&|id| FileId(id.0 + 10)); + match &info { + SourceInfo::Generated { from, .. } => { + assert_eq!(from.len(), 2); + match from[0].source_info.as_ref() { + SourceInfo::Original { file_id, .. } => assert_eq!(*file_id, FileId(10)), + _ => panic!("Expected Original anchor 0"), + } + match from[1].source_info.as_ref() { + SourceInfo::Original { file_id, .. 
} => assert_eq!(*file_id, FileId(13)), + _ => panic!("Expected Original anchor 1"), + } + } + _ => panic!("Expected Generated"), + } + } + + #[test] + fn test_root_file_id_per_variant() { + // Original + let original = SourceInfo::original(FileId(7), 0, 5); + assert_eq!(original.root_file_id(), Some(FileId(7))); + + // Substring → recurse parent + let sub = SourceInfo::substring(original.clone(), 0, 5); + assert_eq!(sub.root_file_id(), Some(FileId(7))); + + // Concat find_map skips Generated holes + let empty_gen = SourceInfo::generated(By::sectionize()); + let real = SourceInfo::original(FileId(42), 0, 5); + let concat = SourceInfo::concat(vec![(empty_gen, 0), (real, 5)]); + assert_eq!(concat.root_file_id(), Some(FileId(42))); + + // Generated with Invocation + let mut g = SourceInfo::generated(By::shortcode("meta")); + g.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(9), 0, 1)), + ); + assert_eq!(g.root_file_id(), Some(FileId(9))); + + // Generated with no Invocation + let mut g2 = SourceInfo::generated(By::shortcode("meta")); + g2.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(9), 0, 1)), + ); + assert_eq!(g2.root_file_id(), None); + + // Generated empty + let g3 = SourceInfo::generated(By::sectionize()); + assert_eq!(g3.root_file_id(), None); + } + + #[test] + fn test_collect_file_ids_walks_every_anchor_role() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(1), 0, 1)), + ); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(2), 0, 1)), + ); + info.append_anchor( + AnchorRole::Other("dispatch".to_string()), + Arc::new(SourceInfo::original(FileId(3), 0, 1)), + ); + let mut out = std::collections::HashSet::new(); + info.collect_file_ids(&mut out); + assert!(out.contains(&FileId(1))); + assert!(out.contains(&FileId(2))); + assert!(out.contains(&FileId(3))); + 
assert_eq!(out.len(), 3); + } + + #[test] + fn test_collect_file_ids_walks_concat_and_substring() { + let inner = SourceInfo::original(FileId(5), 0, 100); + let sub = SourceInfo::substring(inner, 10, 20); + let other = SourceInfo::original(FileId(11), 0, 5); + let concat = SourceInfo::concat(vec![(sub, 10), (other, 5)]); + let mut out = std::collections::HashSet::new(); + concat.collect_file_ids(&mut out); + assert!(out.contains(&FileId(5))); + assert!(out.contains(&FileId(11))); + assert_eq!(out.len(), 2); + } + + #[test] + fn test_invocation_anchor_accessor() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + assert!(info.invocation_anchor().is_none()); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(2), 0, 1)), + ); + assert!(info.invocation_anchor().is_none()); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(1), 0, 1)), + ); + assert!(info.invocation_anchor().is_some()); + // Non-Generated returns None. 
+ assert!( + SourceInfo::original(FileId(0), 0, 0) + .invocation_anchor() + .is_none() + ); + } + + #[test] + fn test_value_source_anchor_accessor() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + assert!(info.value_source_anchor().is_none()); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(1), 0, 1)), + ); + assert!(info.value_source_anchor().is_none()); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(2), 0, 1)), + ); + assert!(info.value_source_anchor().is_some()); + } + + #[test] + fn test_anchors_with_role() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(1), 0, 1)), + ); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(2), 0, 1)), + ); + info.append_anchor( + AnchorRole::Other("ext/foo".to_string()), + Arc::new(SourceInfo::original(FileId(3), 0, 1)), + ); + assert_eq!(info.anchors_with_role(&AnchorRole::Invocation).count(), 1); + assert_eq!(info.anchors_with_role(&AnchorRole::ValueSource).count(), 1); + assert_eq!( + info.anchors_with_role(&AnchorRole::Other("ext/foo".to_string())) + .count(), + 1 + ); + assert_eq!( + info.anchors_with_role(&AnchorRole::Other("missing".to_string())) + .count(), + 0 + ); + } + + #[test] + fn test_append_anchor_preserves_order() { + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor( + AnchorRole::Invocation, + Arc::new(SourceInfo::original(FileId(1), 0, 1)), + ); + info.append_anchor( + AnchorRole::ValueSource, + Arc::new(SourceInfo::original(FileId(2), 0, 1)), + ); + match info { + SourceInfo::Generated { from, .. 
} => { + assert_eq!(from.len(), 2); + assert!(matches!(from[0].role, AnchorRole::Invocation)); + assert!(matches!(from[1].role, AnchorRole::ValueSource)); + } + _ => panic!("Expected Generated"), } } + #[test] + fn test_combine_with_generated_is_zero_length_piece() { + let original = SourceInfo::original(FileId(0), 10, 20); + let generated = SourceInfo::generated(By::sectionize()); + let combined = original.combine(&generated); + match &combined { + SourceInfo::Concat { pieces } => { + assert_eq!(pieces.len(), 2); + assert_eq!(pieces[1].length, 0); + } + _ => panic!("Expected Concat"), + } + // Length of the combined value equals only the Original side. + assert_eq!(combined.length(), 10); + } + #[test] fn test_source_info_serialization() { let file_id = FileId(0); @@ -801,4 +1609,166 @@ mod tests { let deserialized: SourceInfo = serde_json::from_value(json).unwrap(); assert_eq!(combined, deserialized); } + + // ------------------------------------------------------------------------- + // Plan 7 — preimage_in accessor + // ------------------------------------------------------------------------- + + #[test] + fn test_preimage_in_original_same_file() { + let info = SourceInfo::original(FileId(0), 10, 25); + assert_eq!(info.preimage_in(FileId(0)), Some(10..25)); + } + + #[test] + fn test_preimage_in_original_different_file_returns_none() { + let info = SourceInfo::original(FileId(0), 10, 25); + assert_eq!(info.preimage_in(FileId(1)), None); + } + + #[test] + fn test_preimage_in_substring_composes_offsets() { + // Parent points at bytes 100..200 in file 0. + // Substring takes bytes 5..15 *relative to parent*. + // Preimage in file 0 should be 105..115. 
+ let parent = SourceInfo::original(FileId(0), 100, 200); + let info = SourceInfo::substring(parent, 5, 15); + assert_eq!(info.preimage_in(FileId(0)), Some(105..115)); + } + + #[test] + fn test_preimage_in_substring_different_file_returns_none() { + let parent = SourceInfo::original(FileId(0), 100, 200); + let info = SourceInfo::substring(parent, 5, 15); + assert_eq!(info.preimage_in(FileId(7)), None); + } + + #[test] + fn test_preimage_in_substring_chain() { + // Original 1000..2000 in file 0; Substring 100..500 relative; Substring 10..50 relative. + // Expected preimage in file 0: 1100 + 10 .. 1100 + 50 = 1110..1150. + let root = SourceInfo::original(FileId(0), 1000, 2000); + let mid = SourceInfo::substring(root, 100, 500); + let leaf = SourceInfo::substring(mid, 10, 50); + assert_eq!(leaf.preimage_in(FileId(0)), Some(1110..1150)); + } + + #[test] + fn test_preimage_in_concat_contiguous() { + // Two adjacent pieces of file 0: 10..15 and 15..25 → contiguous → 10..25. + let a = SourceInfo::original(FileId(0), 10, 15); + let b = SourceInfo::original(FileId(0), 15, 25); + let info = SourceInfo::concat(vec![(a, 5), (b, 10)]); + assert_eq!(info.preimage_in(FileId(0)), Some(10..25)); + } + + #[test] + fn test_preimage_in_concat_gappy_returns_none() { + // 10..15 then 20..25 → gap between 15 and 20 → None. + let a = SourceInfo::original(FileId(0), 10, 15); + let b = SourceInfo::original(FileId(0), 20, 25); + let info = SourceInfo::concat(vec![(a, 5), (b, 5)]); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_concat_overlapping_returns_none() { + // 10..20 then 15..25 → overlap → not byte-contiguous → None. 
+ let a = SourceInfo::original(FileId(0), 10, 20); + let b = SourceInfo::original(FileId(0), 15, 25); + let info = SourceInfo::concat(vec![(a, 10), (b, 10)]); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_concat_mixed_files_returns_none() { + // One piece in file 0, another in file 1 → resolving in file 0 fails + // because the file-1 piece can't be resolved. + let a = SourceInfo::original(FileId(0), 10, 15); + let b = SourceInfo::original(FileId(1), 15, 25); + let info = SourceInfo::concat(vec![(a, 5), (b, 10)]); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_generated_no_anchors_returns_none() { + // Sectionize-style wrapper, footnotes-container, etc.: Generated with + // empty `from`. No Invocation anchor → no preimage. + let info = SourceInfo::generated(By::sectionize()); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_generated_with_invocation_in_target() { + // Shortcode resolution: Generated with an Invocation anchor pointing + // at the {{< meta foo >}} token bytes. + let token = SourceInfo::original(FileId(0), 50, 70); + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + assert_eq!(info.preimage_in(FileId(0)), Some(50..70)); + } + + #[test] + fn test_preimage_in_generated_with_invocation_outside_target() { + // Invocation anchor points at file 0; query asks about file 1 → None. + let token = SourceInfo::original(FileId(0), 50, 70); + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + assert_eq!(info.preimage_in(FileId(1)), None); + } + + #[test] + fn test_preimage_in_generated_walks_through_substring_in_invocation() { + // Invocation anchor is itself a Substring chain. preimage_in must + // walk through it correctly. 
+ let root = SourceInfo::original(FileId(0), 100, 200); + let token = SourceInfo::substring(root, 10, 30); + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + assert_eq!(info.preimage_in(FileId(0)), Some(110..130)); + } + + // ------------------------------------------------------------------------- + // Plan 7 — preimage_in role-asymmetry: only Invocation is walked. + // ------------------------------------------------------------------------- + + #[test] + fn test_preimage_in_generated_value_source_only_returns_none() { + // Plan 9-shape: Generated whose only anchor is ValueSource (points at + // YAML metadata bytes). The writer must NOT copy those bytes into the + // body — preimage_in returns None. + let meta_si = SourceInfo::original(FileId(0), 10, 25); + let mut info = SourceInfo::generated(By::appendix()); + info.append_anchor(AnchorRole::ValueSource, Arc::new(meta_si)); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_generated_other_only_returns_none() { + // Extension-defined Other role. preimage_in must not walk it. + let lua_si = SourceInfo::original(FileId(0), 10, 25); + let mut info = SourceInfo::generated(By::filter("upper.lua", 14)); + info.append_anchor( + AnchorRole::Other("ext/my-ext/dispatch".to_string()), + Arc::new(lua_si), + ); + assert_eq!(info.preimage_in(FileId(0)), None); + } + + #[test] + fn test_preimage_in_generated_invocation_plus_value_source_walks_invocation_only() { + // Plan 2/Plan 9 mixed shape: Invocation in file 0 + ValueSource in + // file 1. Query file 0 → Invocation resolves → Some(token range). + // Query file 1 → Invocation resolves to file 0 (not 1) → None. + // (The writer must not see the value-source range when asked about + // any file, even the file the ValueSource points into.) 
+ let token = SourceInfo::original(FileId(0), 50, 70); + let value = SourceInfo::original(FileId(1), 200, 215); + let mut info = SourceInfo::generated(By::shortcode("meta")); + info.append_anchor(AnchorRole::Invocation, Arc::new(token)); + info.append_anchor(AnchorRole::ValueSource, Arc::new(value)); + + assert_eq!(info.preimage_in(FileId(0)), Some(50..70)); + assert_eq!(info.preimage_in(FileId(1)), None); + } } diff --git a/crates/quarto/tests/smoke-all/q2-preview/render-components-write/_quarto.yml b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/_quarto.yml new file mode 100644 index 000000000..815860fe9 --- /dev/null +++ b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/_quarto.yml @@ -0,0 +1,2 @@ +project: + title: Render-components write smoke diff --git a/crates/quarto/tests/smoke-all/q2-preview/render-components-write/comment.tsx b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/comment.tsx new file mode 100644 index 000000000..cd9788cfc --- /dev/null +++ b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/comment.tsx @@ -0,0 +1,348 @@ +const React = window.React; +const { + Block: B +} = window.__Q2_PREVIEW_RENDERER__; + +function isComment(inline: InlineNode): boolean { + if (inline.t === 'Span' && 'c' in inline) { + const attrs = (inline as SpanInline).c[0]; + const classes = attrs[1]; + return classes.includes('quarto-edit-comment'); + } + return false; +} + +// export const Block = B +// BlockWithComments component +const splitEmoji = (string: string) => [...new Intl.Segmenter().segment(string)].map(x => x.segment) +export const Block = (args: NodeArgs) => { + const { node: block, onNavigateToDocument, setLocalAst } = args + // Gather comments from inline children if block has them + let comments: InlineNode[] = []; + let newBlock = block + if ('c' in block && block.c) { + // For Para, Plain: c is Inline[] + if ((block.t === 'Para' || block.t === 'Plain') && Array.isArray(block.c)) { + 
comments = block.c.filter(isComment); + newBlock = structuredClone(block) + newBlock.c = block.c.filter((n: any) => !isComment(n)); + } + // For Header: c is [number, [string, string[], [string, string][]], Inline[]] + else if (block.t === 'Header' && Array.isArray(block.c) && Array.isArray(block.c[2])) { + comments = block.c[2].filter(isComment); + newBlock = structuredClone(block) + //@ts-ignore + newBlock.c[2] = block.c[2].filter((n: any) => !isComment(n)); + } + } + + const commentContents = comments.map((c) => (c as SpanInline).c[1].map((o: InlineNode) => { + if (o.t === 'Str') return (o as StrInline).c; + if (o.t === 'Space') return ' '; + return ''; + }).join('')) + const reactions = commentContents.filter(c => splitEmoji(c).length === 1) + const reactionCounts = reactions.reduce((acc, emoji) => + acc.set(emoji, (acc.get(emoji) || 0) + 1), + new Map() + ); + comments = comments.filter((_, i) => splitEmoji(commentContents[i]).length !== 1) + + // Skip CommentWrapper for BulletList and OrderedList + if (block.t === 'BulletList' || block.t === 'OrderedList') { + return ; + } + + return + + ; +}; + +/** + * CommentWrapper renders children in a box and displays gathered comments + */ +const CommentWrapper = ({ children, comments, reactionCounts, setLocalAst, block }: { children: React.ReactNode, reactionCounts: Map, comments: InlineNode[], setLocalAst: (newBlock: BlockNode) => void, block: BlockNode }) => { + const [commentText, setCommentText] = React.useState(''); + const [showEmojiPicker, setShowEmojiPicker] = React.useState(false); + const [showCommentsList, setShowCommentsList] = React.useState(false); + const [isHovered, setIsHovered] = React.useState(false); + const emojiPickerRef = React.useRef(null); + const commentsListRef = React.useRef(null); + const commentInputRef = React.useRef(null); + + // Close emoji picker when clicking outside + React.useEffect(() => { + if (!showEmojiPicker) return; + + const handleClickOutside = (event: MouseEvent) => { + if 
(emojiPickerRef.current && !emojiPickerRef.current.contains(event.target as Node)) { + setShowEmojiPicker(false); + } + }; + + document.addEventListener('mousedown', handleClickOutside); + return () => { + document.removeEventListener('mousedown', handleClickOutside); + }; + }, [showEmojiPicker]); + + // Close comments list when clicking outside + React.useEffect(() => { + if (!showCommentsList) return; + + const handleClickOutside = (event: MouseEvent) => { + if (commentsListRef.current && !commentsListRef.current.contains(event.target as Node)) { + setShowCommentsList(false); + } + }; + + document.addEventListener('mousedown', handleClickOutside); + return () => { + document.removeEventListener('mousedown', handleClickOutside); + }; + }, [showCommentsList]); + + // Focus the input when comments list opens + React.useEffect(() => { + if (showCommentsList && commentInputRef.current) { + commentInputRef.current.focus(); + } + }, [showCommentsList]); + + const addComment = () => { + const newComment: SpanInline = { + t: 'Span', + c: [['', ['quarto-edit-comment'], []], [{ t: 'Str', c: commentText }]] + }; + + const newBlock = structuredClone(block); + if (newBlock.t === 'Para' || newBlock.t === 'Plain') { + (newBlock as ParaBlock | PlainBlock).c.push(newComment); + } else if (newBlock.t === 'Header') { + (newBlock as HeaderBlock).c[2].push(newComment); + } + setLocalAst(newBlock); + setCommentText('') + }; + + const addReaction = (emoji: string) => { + const newReaction: SpanInline = { + t: 'Span', + c: [['', ['quarto-edit-comment'], []], [{ t: 'Str', c: emoji }]] + }; + + const newBlock: BlockNode = structuredClone(block) as BlockNode; + if (newBlock.t === 'Para' || newBlock.t === 'Plain') { + (newBlock as ParaBlock | PlainBlock).c.push(newReaction); + } else if (newBlock.t === 'Header') { + (newBlock as HeaderBlock).c[2].push(newReaction); + } + setLocalAst(newBlock); + setShowEmojiPicker(false); + }; + + const commonEmojis = ['👍', '❤️', '😂', '🎉', '🤔', '👀', '🔥', 
'✅']; + const reactionEntries = Array.from(reactionCounts.entries()); + const hasContent = reactionEntries.length > 0 || comments.length > 0; + + return ( +
+ {children} + + {/* Container for all bubbles */} +
setIsHovered(true)} + onMouseLeave={() => setIsHovered(false)} + > + {/* Reaction count bubbles */} + {reactionEntries.map(([emoji, count]) => ( +
addReaction(emoji as string)} + onMouseEnter={(e) => e.currentTarget.style.backgroundColor = '#ededed'} + onMouseLeave={(e) => e.currentTarget.style.backgroundColor = '#dbdbdb'} + title={`Add ${emoji}`} + > + {emoji} + {count} +
+ ))} + + {/* Add reaction bubble */} +
+
setShowEmojiPicker(!showEmojiPicker)} + onMouseEnter={(e) => e.currentTarget.style.backgroundColor = '#e0f0ff'} + onMouseLeave={(e) => e.currentTarget.style.backgroundColor = showEmojiPicker ? '#e0f0ff' : '#b3d9ff'} + title="Add reaction" + > + + 🙂 +
+ + {/* Simple emoji picker */} + {showEmojiPicker && ( +
+ {commonEmojis.map(emoji => ( + addReaction(emoji)} + onMouseEnter={(e) => e.currentTarget.style.backgroundColor = '#f0f0f0'} + onMouseLeave={(e) => e.currentTarget.style.backgroundColor = 'transparent'} + > + {emoji} + + ))} +
+ )} +
+ + {/* Comments count bubble */} + {( +
+
setShowCommentsList(!showCommentsList)} + onMouseEnter={(e) => e.currentTarget.style.backgroundColor = '#e0f0ff'} + onMouseLeave={(e) => e.currentTarget.style.backgroundColor = showCommentsList ? '#e0f0ff' : '#b3d9ff'} + title={`${comments.length} comment${comments.length !== 1 ? 's' : ''}`} + > + 💬 {comments.length} +
+ + {/* Comments list popup */} + {showCommentsList && ( +
+ {comments.map((comment, i) => { + const commentContent = (comment as SpanInline).c[1] + .map((inline: InlineNode) => { + if (inline.t === 'Str') return (inline as StrInline).c; + if (inline.t === 'Space') return ' '; + return ''; + }) + .join(''); + + return ( +
+ {commentContent} +
+ ); + })} +
+ setCommentText(e.target.value)} + onKeyDown={(e) => e.key === 'Enter' && commentText && addComment()} + placeholder="Add comment" + style={{ flex: 1, padding: '4px', fontFamily: 'monospace', fontSize: '0.75rem', backgroundColor: '#f0f0f0', color: 'black', border: '1px solid #ccc', borderRadius: '4px' }} + /> + +
+
+ )} +
+ )} +
+
+ ); +}; \ No newline at end of file diff --git a/crates/quarto/tests/smoke-all/q2-preview/render-components-write/drag.tsx b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/drag.tsx new file mode 100644 index 000000000..8f3d257a4 --- /dev/null +++ b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/drag.tsx @@ -0,0 +1,93 @@ +const React = window.React; +const { + renderChildren, + blockStyle +} = window.__Q2_PREVIEW_RENDERER__; + +export const Div = (args) => { + const attrs = new Map(args.node.c[0][2]) + const initialX = Number(attrs.get('x') ?? 0) + const initialY = Number(attrs.get('y') ?? 0) + + const [x, setX] = React.useState(initialX) + const [y, setY] = React.useState(initialY) + const dragStartRef = React.useRef(null) + const divRef = React.useRef(null) + + // Sync x and y when attrs change externally (not during drag) + React.useEffect(() => { + if (!dragStartRef.current) { + setX(initialX) + setY(initialY) + } + }, [initialX, initialY]) + + const getScale = (el) => { + let scale = 1 + let current = el + while (current && current !== document.body) { + const style = window.getComputedStyle(current) + const transform = style.transform + if (transform && transform !== 'none') { + const matrix = new DOMMatrix(transform) + scale *= matrix.a + } + current = current.parentElement + } + return scale + } + + React.useEffect(() => { + const handleMouseMove = (e) => { + if (dragStartRef.current) { + const scale = getScale(divRef.current) + const dx = (e.clientX - dragStartRef.current.mouseX) / scale + const dy = (e.clientY - dragStartRef.current.mouseY) / scale + setX(dragStartRef.current.startX + dx) + setY(dragStartRef.current.startY + dy) + } + } + + const handleMouseUp = () => { + if (dragStartRef.current) { + args.node.c[0][2] = [['x', x + ''], ['y', y + '']] + args.setLocalAst(args.node) + dragStartRef.current = null + } + } + + window.addEventListener('mousemove', handleMouseMove) + window.addEventListener('mouseup', 
handleMouseUp) + return () => { + window.removeEventListener('mousemove', handleMouseMove) + window.removeEventListener('mouseup', handleMouseUp) + } + }, [x, y]) + + const t = `translate(${x}px, ${y}px)` + + return
+
{ + dragStartRef.current = { + mouseX: e.clientX, + mouseY: e.clientY, + startX: x, + startY: y + } + }} + /> + {renderChildren(args)} +
+} \ No newline at end of file diff --git a/crates/quarto/tests/smoke-all/q2-preview/render-components-write/index.qmd b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/index.qmd new file mode 100644 index 000000000..097374aa6 --- /dev/null +++ b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/index.qmd @@ -0,0 +1,91 @@ +--- +format: q2-preview +render-components: + - comment.tsx + - kanban.tsx +source-location: full +--- + +# Gordon's render-components demo + +This page renders under **q2-preview**, the new format that ships +real-HTML built-in components for every Pandoc base type and the +Quarto custom-node taxonomy (callouts, theorems, proofs, figures, +equations, cross-references). The TSX files loaded above only need +to declare the components that go *beyond* the built-ins. + +## What's built-in + +After q2-preview Plan 2B + 2C, the iframe ships native renderers for +every Pandoc Block (Para, Header, Code, BlockQuote, Div, Figure, +LineBlock, DefinitionList, Table, ...) and Inline (Str, Emph, Strong, +Code, Link, Image, Math, Span, Cite, Note, ...) plus all six core +Quarto custom-nodes (Callout, Theorem, Proof, FloatRefTarget, +Equation, CrossrefResolvedRef). You no longer need to fork +`html.tsx` to get a real-HTML render. + +Examples that just work without an override file: + +::: {.callout-note} +A built-in callout — no TSX needed. +::: + +::: {#thm-pythagoras .theorem} +For a right triangle with legs of length $a$ and $b$ and hypotenuse $c$, +$a^2 + b^2 = c^2$. +::: + +@thm-pythagoras states the Pythagorean identity. + +$$e = mc^2$$ {#eq-einstein} + +## What's worth overriding + +Override TSX files declare components that aren't covered by the +built-ins — domain-specific UIs that take advantage of the AST +shape Quarto's parser emits: + +* **comment.tsx** — Slack-like commenting UI keyed off + `[>> body]{.quarto-edit-comment}` inline syntax. 
+* **kanban.tsx** — drag-and-droppable columns keyed off + `::: {.kanban}` divs (uses `drag.tsx` as a helper). + +These layer onto the built-in registry via the `render-components:` +frontmatter list above. User exports of the same name as a built-in +shadow it (e.g. an export named `Callout` would replace the +built-in callout component). + +::: {.kanban} + +## todo + +* Pull comments + reactions through Automerge so they sync between + collaborators. +* Add a slide-deck override demo (q2-preview + Reveal). + +## doing + +* Render-components demo polish. + +## done + +* Fork from elliot/ and rebase for q2-preview. + +::: + +### Stable html and math + +```{=html} + +``` + +[>> 😸] + +$$ +y = mx + b +\newline +G(a_n;x)=\sum_{n=0}^\infty a_n x^{n+1}. +$$[>> 👀] diff --git a/crates/quarto/tests/smoke-all/q2-preview/render-components-write/kanban.tsx b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/kanban.tsx new file mode 100644 index 000000000..86abe6965 --- /dev/null +++ b/crates/quarto/tests/smoke-all/q2-preview/render-components-write/kanban.tsx @@ -0,0 +1,152 @@ +const React = window.React; +const { renderChildren } = window.__Q2_PREVIEW_RENDERER__; + +export const Div = (args) => { + const { node: div, setLocalAst } = args; + + // Check if this is a kanban div + const [[id, classes, attrs]] = div.c; + + if (!classes.includes('kanban')) { + return
{renderChildren(args)}
; + } + + // Parse kanban structure + const blocks = div.c[1]; + const columns = []; + let currentColumn = null; + + for (const block of blocks) { + if (block.t === 'Header' && block.c[0] === 2) { + // New column header + const title = block.c[2].map(inline => { + if (inline.t === 'Str') return inline.c; + if (inline.t === 'Space') return ' '; + return ''; + }).join(''); + + currentColumn = { title, items: [] }; + columns.push(currentColumn); + } else if (block.t === 'BulletList' && currentColumn) { + // Items for current column + const items = block.c.map(listItem => { + // Each listItem is [Block] - an array of blocks + return listItem.map(b => { + if (b.t === 'Plain' || b.t === 'Para') { + return b.c.map(inline => { + if (inline.t === 'Str') return inline.c; + if (inline.t === 'Space') return ' '; + return ''; + }).join(''); + } + return ''; + }).join(''); + }); + currentColumn.items.push(...items); + } + } + + return ; +}; + +const KanbanBoard = ({ columns, div, setLocalAst }) => { + const [draggedItem, setDraggedItem] = React.useState(null); + + const handleDragStart = (colIndex, itemIndex) => { + setDraggedItem({ colIndex, itemIndex }); + }; + + const handleDrop = (targetColIndex) => { + if (!draggedItem) return; + + const { colIndex: srcColIndex, itemIndex: srcItemIndex } = draggedItem; + if (srcColIndex === targetColIndex) { + setDraggedItem(null); + return; + } + + // Build new AST + const newColumns = columns.map((col) => ({ + ...col, + items: [...col.items] + })); + + const [movedItem] = newColumns[srcColIndex].items.splice(srcItemIndex, 1); + newColumns[targetColIndex].items.push(movedItem); + + // Reconstruct div blocks + const newBlocks = []; + for (const col of newColumns) { + // Add header + newBlocks.push({ + t: 'Header', + c: [2, ['', [], []], col.title.split(' ').flatMap(word => [{ t: 'Str', c: word },{t: 'Space'}])] + }); + + // Add bullet list if items exist + if (col.items.length > 0) { + newBlocks.push({ + t: 'BulletList', + c: 
col.items.map(itemText => [{ + t: 'Plain', + c: [{ t: 'Str', c: itemText }] + }]) + }); + } + } + + const newDiv = structuredClone(div); + newDiv.c[1] = newBlocks; + setLocalAst(newDiv); + setDraggedItem(null); + }; + + return ( +
+ {columns.map((col, colIndex) => ( +
e.preventDefault()} + onDrop={() => handleDrop(colIndex)} + style={{ + minWidth: '150px', + backgroundColor: '#fff', + borderRadius: '8px', + padding: '12px', + boxShadow: '0 2px 4px rgba(0,0,0,0.1)' + }} + > +

+ {col.title} +

+
+ {col.items.map((item, itemIndex) => ( +
handleDragStart(colIndex, itemIndex)} + style={{ + padding: '8px', + backgroundColor: '#fafafa', + border: '1px solid #e0e0e0', + borderRadius: '4px', + cursor: 'move', + fontSize: '0.875rem' + }} + > + {item} +
+ ))} +
+
+ ))} +
+ ); +}; diff --git a/crates/wasm-quarto-hub-client/Cargo.lock b/crates/wasm-quarto-hub-client/Cargo.lock index 745a3d97c..41edc6441 100644 --- a/crates/wasm-quarto-hub-client/Cargo.lock +++ b/crates/wasm-quarto-hub-client/Cargo.lock @@ -2284,6 +2284,7 @@ dependencies = [ "serde", "serde_json", "sha1", + "smallvec", "tokio", "tree-sitter", "tree-sitter-qmd", @@ -2650,6 +2651,7 @@ dependencies = [ "serde_json", "serde_yaml", "sha2 0.11.0", + "smallvec", "tempfile", "thiserror 2.0.18", "time", @@ -2832,6 +2834,8 @@ name = "quarto-source-map" version = "0.1.0" dependencies = [ "serde", + "serde_json", + "smallvec", ] [[package]] diff --git a/crates/wasm-quarto-hub-client/src/lib.rs b/crates/wasm-quarto-hub-client/src/lib.rs index 7caed5fce..26631dfdd 100644 --- a/crates/wasm-quarto-hub-client/src/lib.rs +++ b/crates/wasm-quarto-hub-client/src/lib.rs @@ -2741,34 +2741,49 @@ pub fn ast_to_qmd(ast_json: &str) -> String { /// Incrementally write a modified AST back to QMD, preserving unchanged /// portions of the original source text verbatim. /// -/// Re-parses `original_qmd` internally to obtain an AST with accurate source -/// spans, then computes a reconciliation plan against the new AST and applies -/// the incremental writer. +/// Deserializes the caller-supplied **baseline** AST (the AST whose +/// source spans line up byte-for-byte with `original_qmd`) and computes +/// a reconciliation plan against the new AST. Plan 7 removed the +/// internal re-parse: previously the bridge re-parsed `original_qmd` +/// to recover spans, which lost any provenance the host had already +/// attached to the baseline (e.g. `preimage_in` after a prior +/// incremental edit). Now the caller is responsible for the +/// baseline-tier contract. 
/// /// # Arguments /// * `original_qmd` - The original QMD source text -/// * `new_ast_json` - JSON-serialized Pandoc AST representing the modified document +/// * `baseline_ast_json` - JSON-serialized Pandoc AST whose source +/// spans correspond to `original_qmd`. **Must be the same tier as +/// `new_ast_json`** (e.g. both `parse`-tier or both +/// `parse+sugar`-tier). Mixing tiers will mis-anchor reconciliation +/// and corrupt the write. +/// * `new_ast_json` - JSON-serialized Pandoc AST representing the +/// modified document, in the same tier as `baseline_ast_json`. /// /// # Returns /// JSON: `{ "success": true, "qmd": "" }` /// or `{ "success": false, "error": "...", "diagnostics": [...] }` #[wasm_bindgen] -pub fn incremental_write_qmd(original_qmd: &str, new_ast_json: &str) -> String { +pub fn incremental_write_qmd( + original_qmd: &str, + baseline_ast_json: &str, + new_ast_json: &str, +) -> String { use pampa::readers::json::read as json_read; - use pampa::wasm_entry_points::qmd_to_pandoc; use pampa::writers::incremental::incremental_write; use quarto_ast_reconcile::compute_reconciliation; - // Step 1: Parse original QMD to get AST with accurate source spans - let (original_ast, _original_context) = match qmd_to_pandoc(original_qmd.as_bytes()) { + // Step 1: Deserialize baseline AST from JSON (carries source spans + // anchored to `original_qmd` and any host-side provenance). 
+ let mut baseline_cursor = std::io::Cursor::new(baseline_ast_json.as_bytes()); + let (baseline_ast, baseline_context) = match json_read(&mut baseline_cursor) { Ok(result) => result, - Err(error_strings) => { - let error_msg = error_strings.join("\n"); + Err(e) => { return serde_json::to_string(&AstResponse { success: false, ast: None, qmd: None, - error: Some(format!("Failed to parse original QMD: {}", error_msg)), + error: Some(format!("Failed to parse baseline AST JSON: {}", e)), diagnostics: None, warnings: None, }) @@ -2777,8 +2792,8 @@ pub fn incremental_write_qmd(original_qmd: &str, new_ast_json: &str) -> String { }; // Step 2: Deserialize new AST from JSON - let mut cursor = std::io::Cursor::new(new_ast_json.as_bytes()); - let (new_ast, _new_context) = match json_read(&mut cursor) { + let mut new_cursor = std::io::Cursor::new(new_ast_json.as_bytes()); + let (new_ast, _new_context) = match json_read(&mut new_cursor) { Ok(result) => result, Err(e) => { return serde_json::to_string(&AstResponse { @@ -2794,19 +2809,32 @@ pub fn incremental_write_qmd(original_qmd: &str, new_ast_json: &str) -> String { }; // Step 3: Compute reconciliation plan - let plan = compute_reconciliation(&original_ast, &new_ast); + let plan = compute_reconciliation(&baseline_ast, &new_ast); // Step 4: Incremental write - match incremental_write(original_qmd, &original_ast, &new_ast, &plan) { - Ok(result_qmd) => serde_json::to_string(&AstResponse { - success: true, - ast: None, - qmd: Some(result_qmd), - error: None, - diagnostics: None, - warnings: None, - }) - .unwrap(), + match incremental_write(original_qmd, &baseline_ast, &new_ast, &plan) { + Ok((result_qmd, warnings)) => { + // Plan 7: soft-drop warnings (Q-3-42 / Q-3-43) ride alongside + // a successful write. The TS wrapper surfaces them via the + // existing `warnings` channel on `AstResponse`. 
+ let warnings_json = if warnings.is_empty() { + None + } else { + Some(diagnostics_to_json( + &warnings, + &baseline_context.source_context, + )) + }; + serde_json::to_string(&AstResponse { + success: true, + ast: None, + qmd: Some(result_qmd), + error: None, + diagnostics: None, + warnings: warnings_json, + }) + .unwrap() + } Err(diags) => { let error_msg = diags .iter() diff --git a/hub-client/changelog.md b/hub-client/changelog.md index 97e19f2dc..3c52bacb6 100644 --- a/hub-client/changelog.md +++ b/hub-client/changelog.md @@ -15,6 +15,15 @@ be in reverse chronological order (latest first). --> +### 2026-05-25 + +- [`5f2bbab0`](https://github.com/quarto-dev/q2/commits/5f2bbab0): Soft-drop warnings (Q-3-42, Q-3-43) now surface in the diagnostic panel even when the rewrite produces byte-identical output. Before, clicking +react inside a shortcode-resolved region (e.g. `{{< lipsum 3 >}}`) silently declined the edit with no visible feedback; the warning was queued for the next render but no re-render fired because nothing changed. +- [`bdcfdc53`](https://github.com/quarto-dev/q2/commits/bdcfdc53): Fix q2-preview edits silently failing with "Incremental write failed: undefined" on documents where the render pipeline produced a single top-level sectionize wrapper around the user content. The writer now recurses into non-atomic Generated wrappers (sectionize, footnotes-container, appendix-container) instead of soft-dropping the whole document. + +### 2026-05-24 + +- [`a0a4c7c8`](https://github.com/quarto-dev/q2/commits/a0a4c7c8): q2-preview edits now write back to the document. The read-only guard is gone; component-driven edits (kanban drag, future comment buttons) flow through the incremental writer using the live preview AST as the baseline, and soft-drop warnings (Q-3-42 / Q-3-43) surface in the existing diagnostics panel when an edit hits an atomic region. 
+ ### 2026-05-21 - [`6c84696d`](https://github.com/quarto-dev/q2/commits/6c84696d): Login screen and post-logout view now respect the saved `colorScheme` preference (and system `prefers-color-scheme`) instead of always rendering light on first visit and inheriting the previous session's class after logout. diff --git a/hub-client/e2e/q2-preview-render-components-write.spec.ts b/hub-client/e2e/q2-preview-render-components-write.spec.ts new file mode 100644 index 000000000..5c1fca0f3 --- /dev/null +++ b/hub-client/e2e/q2-preview-render-components-write.spec.ts @@ -0,0 +1,177 @@ +/** + * E2E repro for "Incremental write failed: undefined" on q2-preview. + * + * Sister to `q2-debug-render-components.spec.ts`, but the click here + * triggers `setLocalAst` (not local React state). That threads through + * the renderer dispatch into `ReactPreview.handleSetAst` → + * `incrementalWriteQmd` (`ts-packages/preview-runtime/src/wasmRenderer.ts`), + * which is the path the user hit while clicking a reactji button in the + * `render-components` demo. + * + * The fixture's `write-reaction.tsx` mirrors the addReaction code in + * `~/docs/demo-playground/gordon/render-components/comment.tsx`: append a + * fresh `Span.quarto-edit-comment` to the clicked Para's inline children + * and `setLocalAst(newBlock)`. The dispatch wraps that into a full AST + * (one block replaced) and feeds it as the new-AST to the WASM bridge. + * + * Expected after the bug is fixed: the write succeeds and no "Incremental + * write failed" console error fires. + * + * Current behaviour (the bug we're chasing): the bridge returns + * `{success: true, qmd: '', warnings: [Q-3-43]}` — empty document with a + * "Generated content edit dropped" warning. The wasmRenderer.ts:758 + * throw site (instrumented to distinguish this empty-qmd path) logs + * `incrementalWriteQmd failed; raw response: ...` and throws. 
+ */ + +import { readFileSync } from 'node:fs'; +import { resolve } from 'node:path'; +import { test, expect, type ConsoleMessage } from '@playwright/test'; +import { + bootstrapProjectSet, + createProjectOnServer, + seedProjectInBrowser, + getServerUrl, +} from './helpers/projectFactory'; + +const FIXTURE_DIR = resolve( + import.meta.dirname, + '../../crates/quarto/tests/smoke-all/q2-preview/render-components-write', +); + +const qmdContent = readFileSync(resolve(FIXTURE_DIR, 'index.qmd'), 'utf-8'); +const commentTsxContent = readFileSync( + resolve(FIXTURE_DIR, 'comment.tsx'), + 'utf-8', +); +const kanbanTsxContent = readFileSync( + resolve(FIXTURE_DIR, 'kanban.tsx'), + 'utf-8', +); +const dragTsxContent = readFileSync( + resolve(FIXTURE_DIR, 'drag.tsx'), + 'utf-8', +); +const quartoYmlContent = readFileSync( + resolve(FIXTURE_DIR, '_quarto.yml'), + 'utf-8', +); + +test.describe('q2-preview render-components write', () => { + test('clicking +react triggers setLocalAst → incremental_write_qmd without empty-qmd error', async ({ + page, + }) => { + const serverUrl = getServerUrl(); + + // Collect every console.error from the page (and its iframes). The + // instrumentation in `wasmRenderer.ts:758` emits + // `incrementalWriteQmd failed; raw response: { ... }` + // when the WASM bridge returns Ok with an empty qmd string. We assert + // no such message lands during the click → write round-trip. + const consoleErrors: string[] = []; + const consoleAll: string[] = []; + page.on('console', (msg: ConsoleMessage) => { + const loc = msg.location(); + const tag = `[${msg.type()}] ${msg.text()} @ ${loc.url}:${loc.lineNumber}`; + consoleAll.push(tag); + if (msg.type() === 'error') { + consoleErrors.push(msg.text()); + } + }); + // Surface page errors too so a thrown JS error doesn't look like a + // silent pass. + const pageErrors: string[] = []; + page.on('pageerror', (err) => { + pageErrors.push(`${err.message}\n${err.stack ?? 
''}`); + }); + + const indexDocId = await createProjectOnServer(serverUrl, [ + { + path: '_quarto.yml', + content: quartoYmlContent, + contentType: 'text', + }, + { + path: 'comment.tsx', + content: commentTsxContent, + contentType: 'text', + }, + { + path: 'kanban.tsx', + content: kanbanTsxContent, + contentType: 'text', + }, + { + path: 'drag.tsx', + content: dragTsxContent, + contentType: 'text', + }, + { + path: 'index.qmd', + content: qmdContent, + contentType: 'text', + }, + ]); + + await bootstrapProjectSet(page, serverUrl); + const localId = await seedProjectInBrowser(page, indexDocId, serverUrl); + + await page.goto( + `/#/p/${localId}/file/${encodeURIComponent('index.qmd')}`, + ); + + // The q2-preview iframe is `q2-preview.html`, distinct from the + // q2-debug iframe used by the sister spec. The user's CommentWrapper + // renders a "+ 🙂" button (title="Add reaction") next to every Para; + // clicking it opens an emoji picker, clicking an emoji calls + // addReaction → setLocalAst. + const iframe = page.frameLocator('iframe[src*="q2-preview.html"]'); + + // Wait for the iframe to render the first paragraph's CommentWrapper + // chrome — the "+ 🙂" emoji-picker open button. + const openPicker = iframe.locator('[title="Add reaction"]').first(); + try { + await expect(openPicker).toBeVisible({ timeout: 30_000 }); + } catch (e) { + console.error('--- console messages so far ---'); + for (const line of consoleAll) console.error(line); + console.error('--- page errors ---'); + for (const err of pageErrors) console.error(err); + throw e; + } + + // Open the picker, then click the 😂 emoji. Picker emoji spans + // carry no test id — locate by text. There's a 😂 in + // CommentWrapper's `commonEmojis` list (`'👍', '❤️', '😂', ...`). + await openPicker.click(); + await iframe.locator('text="😂"').first().click(); + + // Give the WASM call time to run and emit its console.error if it + // hits the failure path. 
+ await page.waitForTimeout(1500); + + const writeFailures = consoleErrors.filter((line) => + line.includes('incrementalWriteQmd failed'), + ); + + if (writeFailures.length > 0) { + console.error('--- Full console log on failure ---'); + for (const line of consoleAll) console.error(line); + } + + expect( + writeFailures, + 'Incremental write should not fail when appending a reaction to the first paragraph. ' + + 'Raw console errors:\n' + + consoleErrors.join('\n'), + ).toEqual([]); + + // Filter out unrelated Monaco loader internal errors (Monaco runs + // inside the markup-view panel; its load can throw without + // affecting the preview). + const relevantPageErrors = pageErrors.filter( + (e) => !e.includes('monaco-editor'), + ); + expect(relevantPageErrors, 'Page should not throw').toEqual([]); + }); +}); diff --git a/hub-client/src/components/render/ReactPreview.tsx b/hub-client/src/components/render/ReactPreview.tsx index fa34e36ca..c8aa19369 100644 --- a/hub-client/src/components/render/ReactPreview.tsx +++ b/hub-client/src/components/render/ReactPreview.tsx @@ -9,7 +9,7 @@ import { isWasmReady, incrementalWriteQmd, } from '@quarto/preview-runtime'; -import { pipelineKindForFormat } from '../../utils/pipelineKind'; +import { pipelineKindForFormat } from '@quarto/preview-runtime'; import { useAttribution } from '../../hooks/useAttribution'; import { stripAnsi } from '@quarto/preview-renderer/utils/stripAnsi'; import { PreviewErrorOverlay } from '@quarto/preview-renderer/overlays/PreviewErrorOverlay'; @@ -314,6 +314,19 @@ export default function ReactPreview({ const renderTimeoutRef = useRef(null); const lastContentRef = useRef(''); + // Plan 7: soft-drop warnings from the most recent incremental write, + // pending injection into the next render's diagnostics. Drained when + // the content-driven re-render fires. + const pendingWriteWarningsRef = useRef([]); + + // Tracks the most recent set of render-side diagnostics we sent + // upward. 
`handleSetAst` reads this when surfacing soft-drop + // warnings *immediately* (without waiting for the next render): + // the immediate push must include the current render's + // diagnostics so it doesn't accidentally clear them. Updated on + // every `onDiagnosticsChange` call. + const lastRenderDiagnosticsRef = useRef([]); + // Handler for cross-document navigation const handleNavigateToDocument = useCallback( (targetPath: string, anchor: string | null) => { @@ -350,11 +363,24 @@ export default function ReactPreview({ }); if (qmdContent !== lastContentRef.current) return; - // Update diagnostics - onDiagnosticsChange(result.diagnostics); + // Update diagnostics. Plan 7: drain any soft-drop warnings from + // the most recent incremental write into this push so they reach + // the diagnostics surface alongside render-side diagnostics. + const pendingWriteWarnings = pendingWriteWarningsRef.current; + pendingWriteWarningsRef.current = []; + const mergedDiagnostics = pendingWriteWarnings.length > 0 + ? [...result.diagnostics, ...pendingWriteWarnings] + : result.diagnostics; + // Remember just the render-side portion so a follow-up immediate + // push from `handleSetAst` (when the writer returns warnings + // alongside byte-identical output and no re-render fires) can + // merge new warnings *with* the current render diagnostics rather + // than clobbering them. + lastRenderDiagnosticsRef.current = result.diagnostics; + onDiagnosticsChange(mergedDiagnostics); setCurrentError(result.success ? null : { message: result.error!, - diagnostics: result.diagnostics, + diagnostics: mergedDiagnostics, }); if (result.success) { @@ -415,29 +441,57 @@ export default function ReactPreview({ setCurrentError(null); }, [currentFile?.path]); - // Handler for AST modifications - converts AST back to QMD and updates content. + // Handler for AST modifications — converts AST back to QMD and + // updates content. + // + // Plan 7 lifted the v1 read-only guard. 
The bridge now takes the + // displayed AST as the **baseline** (its source spans line up with + // `content`) and the new edited AST. + // + // Soft-drop warnings (Q-3-42 / Q-3-43) reach the diagnostic surface + // via two paths, both load-bearing: + // + // 1. **Immediate push.** If the writer returns warnings, surface + // them right away by calling `onDiagnosticsChange` here. This + // is the path that matters when the rewrite produces + // byte-identical output (the common soft-drop case — the + // writer faithfully preserves the original bytes when the + // edit was rejected). With identical bytes, no Monaco edit + // fires, no automerge update, no re-render — so without an + // immediate push the warnings would never surface. // - // q2-preview is **read-only in v1** (Plan 1 §"Multi-plan contract: - // read-only mode lifts at Plan 7"). The post-pipeline AST diverges - // from source enough that a naive incrementalWriteQmd would - // corrupt the qmd; Plan 7 lifts this guard once the writer's - // round-trip machinery understands q2-preview's transform shapes - // (Synthetic / Derived / atomic CustomNodes). Component-driven - // edits (kanban drag, comment buttons in Plan 2) call this and - // silently no-op with a console.warn — that is the accepted - // post-Plan-2 UX gap until Plan 7 ships. + // 2. **Ride-along on next render.** If the rewrite *did* change + // content, the re-render fires and `doRenderWithStateManagement` + // drains `pendingWriteWarningsRef` into its merged diagnostics + // push. This keeps the warning temporally associated with the + // render of the *edited* document, which is the cleanest UX + // for the "edit applied + warning fired" case (rare today — + // most warnings imply soft-drop, i.e. no content change — but + // kept as a safety net). + // + // The immediate push merges with `lastRenderDiagnosticsRef` so we + // don't accidentally clear the current render-side diagnostics + // when we add write warnings to them. 
const handleSetAst = useCallback((newAst: any) => { - if (pipelineKindForFormat(format) === 'preview') { - console.warn('q2-preview is read-only in v1; AST edit dropped (Plan 7 lifts this guard)'); - return; - } try { - const newQmd = incrementalWriteQmd(content, newAst); + const baseline = ast ? JSON.parse(ast) : null; + if (!baseline) { + console.warn('Cannot write AST: no baseline render available yet'); + return; + } + const { qmd: newQmd, warnings } = incrementalWriteQmd(content, baseline, newAst); + if (warnings && warnings.length > 0) { + // (1) Immediate push for the byte-identical (no re-render) case. + onDiagnosticsChange([...lastRenderDiagnosticsRef.current, ...warnings]); + // (2) Queue for ride-along on the next render (no-op when no + // re-render fires, which is the typical soft-drop path). + pendingWriteWarningsRef.current = warnings; + } onContentRewrite(newQmd); } catch (err) { console.error('Failed to write AST back to QMD:', err); } - }, [content, onContentRewrite, format]); + }, [ast, content, onContentRewrite, onDiagnosticsChange]); return (
diff --git a/hub-client/src/services/incrementalWrite.wasm.test.ts b/hub-client/src/services/incrementalWrite.wasm.test.ts new file mode 100644 index 000000000..6a73666d1 --- /dev/null +++ b/hub-client/src/services/incrementalWrite.wasm.test.ts @@ -0,0 +1,156 @@ +/** + * WASM End-to-End Tests for `incremental_write_qmd` (Plan 7). + * + * Verifies the new 3-arg signature + * (`original_qmd, baseline_ast_json, new_ast_json`) at the JS/WASM + * boundary. The Rust-side correctness of soft-drop substitutions + * (Q-3-42 / Q-3-43) is covered by `crates/pampa/src/writers/incremental.rs` + * unit tests; these tests pin the wrapper contract: + * + * - Identity round-trip is byte-equal (baseline === new ⇒ original qmd). + * - The returned shape is `{ qmd, warnings? }`; `warnings` is absent + * when nothing was soft-dropped. + * - A simple paragraph-text edit reaches the result qmd; the + * surrounding structure (headings, other paragraphs) is preserved + * verbatim from the original. + * + * The exhaustive scenario matrix (sectionized docs, multi-inline + * shortcode dedupe, Q-3-42 byte-equal-no-op, Q-3-43 footnotes + * regeneration) lives in the Rust-side coarsen tests + Plan 8 + * Playwright e2e (deferred to follow-up beads). 
+ * + * Run with: npm run test:wasm + */ + +import { describe, it, expect, beforeAll } from 'vitest'; +import { readFile } from 'fs/promises'; +import { dirname, join } from 'path'; +import { fileURLToPath } from 'url'; + +interface WasmModule { + default: (input?: BufferSource) => Promise; + parse_qmd_content: (content: string) => string; + incremental_write_qmd: ( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ) => string; +} + +interface AstResponse { + success: boolean; + ast?: string; + qmd?: string; + error?: string; + warnings?: unknown[]; +} + +let wasm: WasmModule; + +beforeAll(async () => { + const __dirname = dirname(fileURLToPath(import.meta.url)); + const wasmDir = join(__dirname, '../../wasm-quarto-hub-client'); + const wasmPath = join(wasmDir, 'wasm_quarto_hub_client_bg.wasm'); + const wasmBytes = await readFile(wasmPath); + + wasm = (await import('wasm-quarto-hub-client')) as unknown as WasmModule; + await wasm.default(wasmBytes); +}); + +/** Parse `qmd` and return the resulting AST as a plain object. */ +function parseAst(qmd: string): unknown { + const resp: AstResponse = JSON.parse(wasm.parse_qmd_content(qmd)); + expect(resp.success, `parse_qmd_content failed: ${resp.error}`).toBe(true); + expect(resp.ast).toBeTruthy(); + return JSON.parse(resp.ast!); +} + +/** Run the incremental writer and return its parsed AstResponse. */ +function write( + originalQmd: string, + baselineAst: unknown, + newAst: unknown, +): AstResponse { + return JSON.parse( + wasm.incremental_write_qmd( + originalQmd, + JSON.stringify(baselineAst), + JSON.stringify(newAst), + ), + ); +} + +/** + * Walk a Pandoc AST and mutate the first `Str` whose `c` matches + * `find`, replacing its content with `replace`. Returns true if a + * match was found. Used to synthesize a "user edited a word" + * scenario without going through the qmd reader. 
+ */ +function mutateFirstStr(ast: unknown, find: string, replace: string): boolean { + let done = false; + const walk = (node: unknown): void => { + if (done) return; + if (Array.isArray(node)) { + for (const child of node) walk(child); + return; + } + if (node && typeof node === 'object') { + const obj = node as Record; + if (obj.t === 'Str' && obj.c === find) { + obj.c = replace; + done = true; + return; + } + for (const v of Object.values(obj)) walk(v); + } + }; + walk(ast); + return done; +} + +describe('incremental_write_qmd wrapper contract', () => { + it('identity round-trip is byte-equal and emits no warnings', () => { + const original = '# Heading\n\nA paragraph.\n'; + const baseline = parseAst(original); + const resp = write(original, baseline, baseline); + + expect(resp.success, `write failed: ${resp.error}`).toBe(true); + expect(resp.qmd).toBe(original); + // No warnings field when nothing was soft-dropped. + expect(resp.warnings).toBeUndefined(); + }); + + it('paragraph-text edit reaches the output; surrounding structure preserved', () => { + const original = + '# Heading\n\nFirst paragraph here.\n\n## Sub\n\nSecond paragraph here.\n'; + const baseline = parseAst(original); + // Deep-clone via JSON round-trip so the mutation doesn't alias + // the baseline. The wrapper stringifies both, but defensive + // cloning makes the test's intent obvious. + const next = JSON.parse(JSON.stringify(baseline)); + const mutated = mutateFirstStr(next, 'First', 'Updated'); + expect(mutated, 'expected to find a Str("First") to mutate').toBe(true); + + const resp = write(original, baseline, next); + expect(resp.success, `write failed: ${resp.error}`).toBe(true); + expect(resp.qmd).toMatch(/Updated paragraph here\./); + // Untouched surroundings are preserved verbatim from the + // original — this is the whole point of the incremental writer. 
+ expect(resp.qmd).toContain('# Heading'); + expect(resp.qmd).toContain('## Sub'); + expect(resp.qmd).toContain('Second paragraph here.'); + }); + + it('reports a structured error when the baseline AST JSON is malformed', () => { + const original = '# x\n'; + const baseline = parseAst(original); + const respJson = wasm.incremental_write_qmd( + original, + '{not valid json', + JSON.stringify(baseline), + ); + const resp: AstResponse = JSON.parse(respJson); + expect(resp.success).toBe(false); + expect(resp.error).toMatch(/baseline AST JSON/i); + }); +}); diff --git a/hub-client/src/types/wasm-quarto-hub-client.d.ts b/hub-client/src/types/wasm-quarto-hub-client.d.ts index 514ca4aa0..b92e13f76 100644 --- a/hub-client/src/types/wasm-quarto-hub-client.d.ts +++ b/hub-client/src/types/wasm-quarto-hub-client.d.ts @@ -65,8 +65,20 @@ declare module 'wasm-quarto-hub-client' { // QMD parsing and AST conversion functions export function parse_qmd_content(content: string): string; export function ast_to_qmd(ast_json: string): string; - /** Incrementally write a modified AST back to QMD, preserving unchanged source text. */ - export function incremental_write_qmd(original_qmd: string, new_ast_json: string): string; + /** + * Incrementally write a modified AST back to QMD, preserving unchanged + * source text. + * + * Per Plan 7: the caller is responsible for passing a **baseline** AST + * (`baseline_ast_json`) whose source spans match `original_qmd` and + * whose tier matches `new_ast_json`. The bridge does not re-parse + * `original_qmd`; mixing tiers will corrupt the write. 
+ */ + export function incremental_write_qmd( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ): string; // Response type for parse/write operations export interface AstResponse { @@ -77,6 +89,11 @@ declare module 'wasm-quarto-hub-client' { qmd?: string; error?: string; diagnostics?: AstDiagnostic[]; + /** + * Soft-drop warnings (Plan 7 Q-3-42 / Q-3-43) that rode alongside + * a successful incremental write. + */ + warnings?: AstDiagnostic[]; } export interface AstDiagnostic { diff --git a/package-lock.json b/package-lock.json index cff75e638..928743e8a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3777,14 +3777,6 @@ "@types/node": "*" } }, - "node_modules/@types/trusted-types": { - "version": "2.0.7", - "resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz", - "integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==", - "license": "MIT", - "optional": true, - "peer": true - }, "node_modules/@types/ws": { "version": "8.18.1", "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.18.1.tgz", diff --git a/q2-demos/hub-react-todo/src/types/wasm-quarto-hub-client.d.ts b/q2-demos/hub-react-todo/src/types/wasm-quarto-hub-client.d.ts index 3664242d7..463c7c3a2 100644 --- a/q2-demos/hub-react-todo/src/types/wasm-quarto-hub-client.d.ts +++ b/q2-demos/hub-react-todo/src/types/wasm-quarto-hub-client.d.ts @@ -5,13 +5,18 @@ declare module 'wasm-quarto-hub-client' { export function parse_qmd_content(content: string): string; export function ast_to_qmd(ast_json: string): string; - export function incremental_write_qmd(original_qmd: string, new_ast_json: string): string; + export function incremental_write_qmd( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ): string; export interface AstResponse { success: boolean; ast?: string; qmd?: string; error?: string; + warnings?: unknown[]; } export type InitInput = RequestInfo 
| URL | Response | BufferSource | WebAssembly.Module; diff --git a/q2-demos/hub-react-todo/src/useSyncedAst.ts b/q2-demos/hub-react-todo/src/useSyncedAst.ts index c837ee3d7..2dea297d8 100644 --- a/q2-demos/hub-react-todo/src/useSyncedAst.ts +++ b/q2-demos/hub-react-todo/src/useSyncedAst.ts @@ -90,8 +90,16 @@ export function useSyncedAst(params: SyncedAstParams | null): SyncedAstState { { parseQmd: (content: string) => parseQmdContent(content), writeQmd: (astValue: unknown) => writeQmdFromAst(astValue as RustQmdJson), - incrementalWriteQmd: (originalQmd: string, newAst: unknown) => - incrementalWriteQmd(originalQmd, newAst as RustQmdJson), + incrementalWriteQmd: ( + originalQmd: string, + baselineAst: unknown, + newAst: unknown, + ) => + incrementalWriteQmd( + originalQmd, + baselineAst as RustQmdJson, + newAst as RustQmdJson, + ), fileFilter: (path: string) => path === filePath, }, ) diff --git a/q2-demos/hub-react-todo/src/wasm.ts b/q2-demos/hub-react-todo/src/wasm.ts index ad24eece4..acaa44144 100644 --- a/q2-demos/hub-react-todo/src/wasm.ts +++ b/q2-demos/hub-react-todo/src/wasm.ts @@ -74,20 +74,35 @@ export function writeQmdFromAst(ast: RustQmdJson): string { * Incrementally write a modified AST back to QMD, preserving unchanged * portions of the original source text verbatim. * + * Plan 7 contract: caller must pass the **baseline** AST (whose + * source spans line up with `originalQmd`); the bridge does not + * re-parse `originalQmd`. `baselineAst` may be a parsed object or a + * pre-serialized JSON string. + * * Must call initWasm() before first use. */ -export function incrementalWriteQmd(originalQmd: string, newAst: RustQmdJson): string { +export function incrementalWriteQmd( + originalQmd: string, + baselineAst: RustQmdJson | string, + newAst: RustQmdJson, +): { qmd: string; warnings?: unknown[] } { if (!wasmModule) { throw new Error('WASM not initialized. Call initWasm() first.') } + const baselineAstJson = + typeof baselineAst === 'string' ? 
baselineAst : JSON.stringify(baselineAst) const newAstJson = JSON.stringify(newAst) - const responseJson = wasmModule.incremental_write_qmd(originalQmd, newAstJson) + const responseJson = wasmModule.incremental_write_qmd( + originalQmd, + baselineAstJson, + newAstJson, + ) const response: AstResponse = JSON.parse(responseJson) if (!response.success || !response.qmd) { throw new Error(`Incremental write failed: ${response.error}`) } - return response.qmd + return { qmd: response.qmd, warnings: response.warnings } } diff --git a/q2-demos/kanban/src/types/wasm-quarto-hub-client.d.ts b/q2-demos/kanban/src/types/wasm-quarto-hub-client.d.ts index 3664242d7..463c7c3a2 100644 --- a/q2-demos/kanban/src/types/wasm-quarto-hub-client.d.ts +++ b/q2-demos/kanban/src/types/wasm-quarto-hub-client.d.ts @@ -5,13 +5,18 @@ declare module 'wasm-quarto-hub-client' { export function parse_qmd_content(content: string): string; export function ast_to_qmd(ast_json: string): string; - export function incremental_write_qmd(original_qmd: string, new_ast_json: string): string; + export function incremental_write_qmd( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ): string; export interface AstResponse { success: boolean; ast?: string; qmd?: string; error?: string; + warnings?: unknown[]; } export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module; diff --git a/q2-demos/kanban/src/useSyncedAst.ts b/q2-demos/kanban/src/useSyncedAst.ts index c837ee3d7..2dea297d8 100644 --- a/q2-demos/kanban/src/useSyncedAst.ts +++ b/q2-demos/kanban/src/useSyncedAst.ts @@ -90,8 +90,16 @@ export function useSyncedAst(params: SyncedAstParams | null): SyncedAstState { { parseQmd: (content: string) => parseQmdContent(content), writeQmd: (astValue: unknown) => writeQmdFromAst(astValue as RustQmdJson), - incrementalWriteQmd: (originalQmd: string, newAst: unknown) => - incrementalWriteQmd(originalQmd, newAst as RustQmdJson), + incrementalWriteQmd: ( + 
originalQmd: string, + baselineAst: unknown, + newAst: unknown, + ) => + incrementalWriteQmd( + originalQmd, + baselineAst as RustQmdJson, + newAst as RustQmdJson, + ), fileFilter: (path: string) => path === filePath, }, ) diff --git a/q2-demos/kanban/src/wasm.ts b/q2-demos/kanban/src/wasm.ts index ad24eece4..acaa44144 100644 --- a/q2-demos/kanban/src/wasm.ts +++ b/q2-demos/kanban/src/wasm.ts @@ -74,20 +74,35 @@ export function writeQmdFromAst(ast: RustQmdJson): string { * Incrementally write a modified AST back to QMD, preserving unchanged * portions of the original source text verbatim. * + * Plan 7 contract: caller must pass the **baseline** AST (whose + * source spans line up with `originalQmd`); the bridge does not + * re-parse `originalQmd`. `baselineAst` may be a parsed object or a + * pre-serialized JSON string. + * * Must call initWasm() before first use. */ -export function incrementalWriteQmd(originalQmd: string, newAst: RustQmdJson): string { +export function incrementalWriteQmd( + originalQmd: string, + baselineAst: RustQmdJson | string, + newAst: RustQmdJson, +): { qmd: string; warnings?: unknown[] } { if (!wasmModule) { throw new Error('WASM not initialized. Call initWasm() first.') } + const baselineAstJson = + typeof baselineAst === 'string' ? 
baselineAst : JSON.stringify(baselineAst) const newAstJson = JSON.stringify(newAst) - const responseJson = wasmModule.incremental_write_qmd(originalQmd, newAstJson) + const responseJson = wasmModule.incremental_write_qmd( + originalQmd, + baselineAstJson, + newAstJson, + ) const response: AstResponse = JSON.parse(responseJson) if (!response.success || !response.qmd) { throw new Error(`Incremental write failed: ${response.error}`) } - return response.qmd + return { qmd: response.qmd, warnings: response.warnings } } diff --git a/q2-preview-spa/src/PreviewApp.integration.test.tsx b/q2-preview-spa/src/PreviewApp.integration.test.tsx index f7f4cb70a..fe6bee4fa 100644 --- a/q2-preview-spa/src/PreviewApp.integration.test.tsx +++ b/q2-preview-spa/src/PreviewApp.integration.test.tsx @@ -107,9 +107,10 @@ describe('PreviewApp boot path', () => { const props = capturedIframeProps[capturedIframeProps.length - 1]; expect(props.currentFilePath).toBe('index.qmd'); expect(props.astJson).toBe('{"blocks":[]}'); - // setAst is required by Q2PreviewIframe; Phase A's no-op is fine but - // it must at least be a function so the iframe doesn't crash on - // first DOM-stable edit. + // setAst is required by Q2PreviewIframe; Plan 7 Phase 7 wired the + // real `handleSetAst` (incrementalWriteQmd + echo-prevention). + // The shape check here is the integration-level contract; the + // write path itself is covered by Phase 8's round-trip tests. expect(typeof props.setAst).toBe('function'); }); diff --git a/q2-preview-spa/src/PreviewApp.tsx b/q2-preview-spa/src/PreviewApp.tsx index e7b01a41f..edb98167e 100644 --- a/q2-preview-spa/src/PreviewApp.tsx +++ b/q2-preview-spa/src/PreviewApp.tsx @@ -14,11 +14,15 @@ * * Decisions worth surfacing here: * - * - `setAst` on Q2PreviewIframe is a no-op for now. The iframe takes - * it as a required prop because Phase 2 of q2-preview anticipated a - * WYSIWYG round-trip (the iframe asks the parent to update the - * AST). 
The SPA doesn't have an editor to round-trip into yet, so a - * no-op is correct. + * - `setAst` on Q2PreviewIframe is wired through `incrementalWriteQmd` + * (Plan 7 Phase 7). Component-driven edits in the iframe (e.g. + * kanban drag, future comment buttons) call back with the modified + * AST; we use the current `astJson` as the baseline, write the + * reconciled qmd to the active file via the sync client, and stash + * the FNV-1a hash so the resulting `onFileContent` echo gets + * suppressed (otherwise the SPA would re-render unnecessarily and, + * in races, blow away an in-flight edit). Soft-drop warnings + * (Q-3-42 / Q-3-43) ride into the DiagnosticStrip. * * - `wsUrl` is derived from `window.location` rather than read from * a server endpoint. The CLI always opens the SPA on the same @@ -37,13 +41,16 @@ * round-trip on boot, no new server-side patterns introduced. */ -import { useCallback, useEffect, useState } from 'react'; +import { useCallback, useEffect, useRef, useState } from 'react'; import { initWasm, connect, setSyncHandlers, renderPageForPreview, getBinaryDocById, + getFileContent, + updateFileContent, + incrementalWriteQmd, } from '@quarto/preview-runtime'; import { Q2PreviewIframe } from '@quarto/preview-renderer/iframe/Q2PreviewIframe'; import { extractMetaString } from '@quarto/preview-renderer/framework'; @@ -52,8 +59,33 @@ import type { CaptureRef, FileEntry } from '@quarto/quarto-automerge-schema'; import { ForceRefreshButton } from './components/ForceRefreshButton'; import { PreviewDiagnosticsOverlay } from './components/PreviewDiagnosticsOverlay'; import { StaleCaptureOverlay } from './components/StaleCaptureOverlay'; +import { DiagnosticStrip } from './components/DiagnosticStrip'; import { pickInitialPage } from './pickInitialPage'; +/** + * FNV-1a 32-bit hash, hex-encoded. 
Used for content-match + * echo-prevention in `handleSetAst` (Plan 7 Phase 7): we hash the qmd + * we're about to emit, stash `(path, hash)` in a ref, and suppress the + * matching incoming `onFileContent` so the SPA doesn't re-render off + * its own write. + * + * Why FNV-1a and not SHA-256 or xxHash: this is an in-process + * equality check across a single round-trip (write → samod → echo + * back). Cryptographic strength is irrelevant; the collision domain + * is one file's last-emitted qmd, so 32 bits is comfortable. FNV-1a + * is zero-dependency, fast on short-to-medium strings, and the + * codebase already uses it for the actor-color hash. Single source + * of truth: this function in this file. + */ +function fnv1aHex(s: string): string { + let h = 0x811c9dc5; + for (let i = 0; i < s.length; i++) { + h ^= s.charCodeAt(i); + h = (h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24))) >>> 0; + } + return h.toString(16).padStart(8, '0'); +} + /** * Suffix appended to the document's title in the browser tab so a * `q2 preview` tab is distinguishable from the live / published page @@ -352,13 +384,21 @@ function deriveWsUrl(loc: Location = window.location): string { return `${wsScheme}//${loc.host}/ws`; } -/** No-op `setAst` until WYSIWYG mode is wired (post-Phase-A). */ -const noopSetAst = () => { - /* deliberately empty */ -}; - export default function PreviewApp() { const [state, setState] = useState(INITIAL_STATE); + // Plan 7 Phase 7: soft-drop warnings to surface in DiagnosticStrip. + // Accumulated across edits within a session; dismissed by the + // strip's close button. + const [writeWarnings, setWriteWarnings] = useState([]); + + // Plan 7 Phase 7: content-match echo-prevention. `handleSetAst` + // writes qmd via `updateFileContent`, which round-trips through + // samod and fires `onFileContent` back at us. Without this ref the + // SPA would re-render off its own write, and in pathological races + // could overwrite an in-flight follow-up edit. 
We stash the FNV-1a + // hash of the emitted qmd here; the next `onFileContent` for the + // same path that hashes equal is silently dropped. + const lastEmittedRef = useRef<{ path: string; hash: string } | null>(null); // Force-refresh trigger (bd-b5hf): bumping `contentTick` re-fires // the render useEffect. Reuses the same channel `onFileContent` @@ -370,6 +410,59 @@ export default function PreviewApp() { setState((s) => ({ ...s, contentTick: s.contentTick + 1 })); }, []); + // Plan 7 Phase 7: handleSetAst reads `activeFile` + `astJson` via + // refs so the callback keeps a stable identity for Q2PreviewIframe. + // (The iframe's effect deps include `setAst`; re-binding on every + // astJson change would re-register the postMessage listener.) + const activeFileRef = useRef(null); + const astJsonRef = useRef(null); + useEffect(() => { + activeFileRef.current = state.activeFile; + }, [state.activeFile]); + useEffect(() => { + astJsonRef.current = state.astJson; + }, [state.astJson]); + + // Plan 7 Phase 7: WYSIWYG round-trip. Component-driven edits in the + // iframe (kanban drag, comment buttons, …) call this with the + // modified AST. We use the current `astJson` as the baseline (its + // source spans line up with the qmd in samod), reconcile via + // `incrementalWriteQmd`, and write the result back through + // `updateFileContent`. Soft-drop warnings (Q-3-42 / Q-3-43) flow + // into the DiagnosticStrip. The emitted-qmd hash is stashed in + // `lastEmittedRef` so the echoed `onFileContent` is suppressed. 
+ const handleSetAst = useCallback((newAst: unknown) => { + const path = activeFileRef.current; + const baselineJson = astJsonRef.current; + if (!path || !baselineJson) { + console.warn('q2-preview setAst: no active page or baseline yet'); + return; + } + const originalQmd = getFileContent(path); + if (originalQmd === null) { + console.warn(`q2-preview setAst: no content cached for ${path}`); + return; + } + try { + const { qmd, warnings } = incrementalWriteQmd( + originalQmd, + baselineJson, + newAst as never, + ); + lastEmittedRef.current = { path, hash: fnv1aHex(qmd) }; + updateFileContent(path, qmd); + if (warnings && warnings.length > 0) { + setWriteWarnings((prev) => [...prev, ...warnings]); + } + } catch (err) { + console.error('q2-preview setAst: incremental write failed', err); + } + }, []); + + const handleDismissWarnings = useCallback(() => { + setWriteWarnings([]); + }, []); + // Phase F.1 (bd-kw93.14): the iframe posts NAVIGATE_TO_DOCUMENT // when the user clicks a cross-page artifact-rooted `.html` link. // Update activeFile + pendingAnchor and push a fresh history entry @@ -455,6 +548,20 @@ export default function PreviewApp() { }, onFileContent: (path: string) => { if (cancelled) return; + // Plan 7 Phase 7: echo-prevention. If the incoming + // content is exactly the qmd we just emitted, drop it — + // re-rendering off our own write wastes a tick and can + // race a follow-up edit. The ref carries (path, hash) + // for the last emission; consume it (set to null) so a + // *second* identical write would still re-render. 
+ const last = lastEmittedRef.current; + if (last && last.path === path) { + const incoming = getFileContent(path); + if (incoming !== null && fnv1aHex(incoming) === last.hash) { + lastEmittedRef.current = null; + return; + } + } // Phase D.6 filter: read `activeFile` + `deps` via the // setState callback so the filter sees the *latest* // values (the closure was set up at boot time and would @@ -856,7 +963,7 @@ export default function PreviewApp() { pendingAnchor={state.pendingAnchor} pendingAnchorEpoch={state.pendingAnchorEpoch} onNavigateToDocument={handleNavigate} - setAst={noopSetAst} + setAst={handleSetAst} /> {showStaleOverlay && ( )} + {/* Plan 7 Phase 7: write-side soft-drop warnings (Q-3-42 / + Q-3-43) from `incrementalWriteQmd`. Distinct surface from + the bd-b9kzg render-diagnostics overlay below: those carry + server-side + WASM render diagnostics; this carries + user-edit-rejection signals. Keeping them separate avoids + conflating "your edit was discarded" with "the render + itself complained." */} + {/* bd-b9kzg (extends Phase D.4): non-terminal diagnostics overlay. The overlay defaults to its own internal collapsed state (true) when the `collapsed` prop is diff --git a/q2-preview-spa/src/components/DiagnosticStrip.tsx b/q2-preview-spa/src/components/DiagnosticStrip.tsx new file mode 100644 index 000000000..450426faf --- /dev/null +++ b/q2-preview-spa/src/components/DiagnosticStrip.tsx @@ -0,0 +1,119 @@ +/** + * DiagnosticStrip (Plan 7 Phase 7). + * + * Surfaces soft-drop warnings (Q-3-42 / Q-3-43) returned by + * `incrementalWriteQmd` after a component-driven edit hits an atomic + * region. The SPA has no Monaco squiggle to lean on, so this strip is + * the only diagnostic surface for write-side warnings. + * + * Autosave-context spam mitigation: every keystroke triggers a render + + * write, so a user typing over an atomic-resolved inline would re-emit + * the same Q-3-42 on every tick. 
We group by source range and show the + * first three occurrences per `(start_line, start_column, end_line, + * end_column)`; further hits are silently dropped (the prior entries + * stay visible). Plan 7 §"Autosave-context spam mitigation". + * + * The catalog messages (`Q-3-42`: "Shortcode edit dropped" + body; + * `Q-3-43`: "Generated content edit dropped" + body) already read as + * imperative instructions ("edit the invocation token in source + * instead"), so DiagnosticStrip surfaces title + problem verbatim. + */ + +import type { Diagnostic } from '@quarto/preview-renderer/types/diagnostic'; + +interface DiagnosticStripProps { + /** Soft-drop warnings to surface. Cleared by the caller on dismiss. */ + warnings: Diagnostic[]; + /** Caller-provided dismiss handler. */ + onDismiss: () => void; +} + +/** + * Group warnings by source-range key and cap each group at 3 entries. + * Exported for tests. + */ +export function suppressAfterThree(warnings: Diagnostic[]): Diagnostic[] { + const counts = new Map(); + const out: Diagnostic[] = []; + for (const w of warnings) { + const key = `${w.code ?? ''}:${w.start_line ?? -1}:${w.start_column ?? -1}:${w.end_line ?? -1}:${w.end_column ?? -1}`; + const n = counts.get(key) ?? 0; + if (n < 3) { + out.push(w); + counts.set(key, n + 1); + } + } + return out; +} + +export function DiagnosticStrip({ warnings, onDismiss }: DiagnosticStripProps) { + if (warnings.length === 0) return null; + const visible = suppressAfterThree(warnings); + + return ( +
+
+ + {visible.length === 1 ? '1 edit dropped' : `${visible.length} edits dropped`} + + +
+
    + {visible.map((w, i) => ( +
  • 0 ? '0.25rem' : 0 }}> + + {w.code ? `${w.code}: ` : ''} + {w.title} + + {w.problem ? ( +
    {w.problem}
    + ) : null} +
  • + ))} +
+
+ ); +} diff --git a/ts-packages/preview-renderer/src/framework/dispatch.tsx b/ts-packages/preview-renderer/src/framework/dispatch.tsx index e4640bdc5..68215fbee 100644 --- a/ts-packages/preview-renderer/src/framework/dispatch.tsx +++ b/ts-packages/preview-renderer/src/framework/dispatch.tsx @@ -1,6 +1,6 @@ import React, { useContext } from 'react'; import { RegistryContext } from './RegistryContext'; -import { isAtomicSourceInfo, ATOMIC_SYNTHETIC_KINDS } from '../utils/sourceInfo'; +import { isAtomicSourceInfo, ATOMIC_KINDS } from '../utils/sourceInfo'; import { isAtomicCustomNode } from '../utils/atomicCustomNodes'; import type { BlockNode, @@ -405,7 +405,7 @@ export function Node({ const isCustom = node.t === 'CustomBlock' || node.t === 'CustomInline'; const isAtomic = - isAtomicSourceInfo(node as { s?: number }, sourceInfoPool, ATOMIC_SYNTHETIC_KINDS) + isAtomicSourceInfo(node as { s?: number }, sourceInfoPool, ATOMIC_KINDS) || (isCustom && isAtomicCustomNode((node as CustomBlockNode | CustomInlineNode).type_name)); const effectiveSetLocalAst = isAtomic ? NOOP_SET_LOCAL_AST : setLocalAst; diff --git a/ts-packages/preview-renderer/src/types/sourceInfo.ts b/ts-packages/preview-renderer/src/types/sourceInfo.ts index ec11652b9..1f5a365e9 100644 --- a/ts-packages/preview-renderer/src/types/sourceInfo.ts +++ b/ts-packages/preview-renderer/src/types/sourceInfo.ts @@ -1,6 +1,14 @@ /** - * Wire-format types for the source-info pool, mirroring - * `crates/pampa/src/writers/json.rs:54-91`. + * Wire-format types for the source-info pool. 
Hand-mirror of the Rust + * producers — keep this file aligned with two sources of truth: + * + * - `SourceInfo` enum (canonical producer): + * `crates/quarto-source-map/src/source_info.rs` + * - JSON wire mirror: + * `crates/pampa/src/writers/json.rs` + * - `SerializableSourceMapping` (writer-side enum) + * - `SourceInfoJson` (wire entry shape) + * - `SerializableSourceInfo::to_json` (code-4 serializer) * * The pool is an array of entries indexed by `node.s` (the `s` field on * each Pandoc node in the serialized AST). Each entry has a type code @@ -11,32 +19,60 @@ * - 0: Original — `d` is the file id (FileId.0). * - 1: Substring — `d` is a parent_id into the pool. * - 2: Concat — `d` is an array of [source_info_id, offset_in_concat, length]. - * - 3: FilterProvenance — `d` is [filter_path, line]. - * - 4: Synthetic — `d` is a By marker. Dormant; Plan 5 wires this up. - * - 5: Derived — `d` is { from: parent_id, by: By }. Dormant; Plan 5 wires this up. + * - 3: Legacy — read-only compat for two old shapes; no new writes: + * `[parent_id, ...]` (numeric-headed legacy `Transformed`) + * `[filter_path, line]` (string-headed buggy `FilterProvenance`). + * - 4: Generated — `d` is `{ by: By, from?: AnchorRef[] }`. `r` is `[0, 0]`; + * ranges come from the chain-walk via the `invocation` anchor. * - * Codes 4 and 5 are forward-declared so 2A's accessor module doesn't need - * amending when Plan 5 ships writer support for them. + * Code 5 is unassigned and reserved for future use. */ /** - * A `By` marker identifies the synthesizer responsible for a Synthetic or - * Derived source-info entry. The shape is intentionally coarse — Plan 4 - * introduces specific kinds with structured `data`. Once consumers branch - * on `kind`, this can be narrowed to a discriminated union. + * A `By` marker identifies the producer (transform) responsible for a + * `Generated` entry. Mirrors the Rust `By` struct: a kebab-case `kind` + * tag plus an optional per-kind JSON `data` payload. 
+ * + * Known kinds at the time of writing: `"filter"`, `"shortcode"`, + * `"sectionize"`, `"user-edit"`, `"include"`, `"title-block"`, + * `"footnotes"`, `"appendix"`, `"tree-sitter-postprocess"`, `"raw"`. + * Third-party extensions namespace as `"ext//"`. */ export interface By { kind: string; data?: unknown; } +/** + * A typed, role-labeled pointer into the source-info pool, attached to + * a `Generated` entry via its `from` array. Mirrors the Rust `Anchor` + * struct flattened to its writer-internal `(role, si_id)` shape. + * + * `role` is one of: + * - `"invocation"` — the user-written construct that triggered the + * producer (e.g. the `{{< meta foo >}}` token). + * - `"value-source"` — where the value carried by this node was + * defined, when distinct from the invocation site. + * - `"other:"` — extension-defined or future role we haven't + * enumerated. `` is kebab-case, namespaced as + * `ext//`. The bare `"other:"` form (empty + * suffix) is rejected by the reader. + * + * `si_id` is the pool index of the anchor's target (typically an + * `Original` covering the source bytes the anchor describes). 
+ */ +export interface AnchorRef { + role: string; + si_id: number; +} + export type SourceInfoEntry = - | { t: 0; r: [number, number]; d: number } - | { t: 1; r: [number, number]; d: number } - | { t: 2; r: [number, number]; d: Array<[number, number, number]> } - | { t: 3; r: [number, number]; d: [string, number] } - | { t: 4; r: [0, 0]; d: By } - | { t: 5; r: [0, 0]; d: { from: number; by: By } }; + | { t: 0; r: [number, number]; d: number } // Original + | { t: 1; r: [number, number]; d: number } // Substring + | { t: 2; r: [number, number]; d: Array<[number, number, number]> } // Concat + | { t: 3; r: [number, number]; d: [string, number] | [number, ...number[]] } // Legacy (read-only) + | { t: 4; r: [0, 0]; d: { by: By; from?: AnchorRef[] } }; // Generated +// code 5 — unassigned, reserved for future use export type SourceInfoPool = readonly SourceInfoEntry[]; diff --git a/ts-packages/preview-renderer/src/utils/sourceInfo.test.ts b/ts-packages/preview-renderer/src/utils/sourceInfo.test.ts index e2aab8011..0b400cb1a 100644 --- a/ts-packages/preview-renderer/src/utils/sourceInfo.test.ts +++ b/ts-packages/preview-renderer/src/utils/sourceInfo.test.ts @@ -1,26 +1,27 @@ import { describe, test, expect } from 'vitest'; -import { - entryFor, - isDerived, - isAtomicSourceInfo, - ATOMIC_SYNTHETIC_KINDS, -} from './sourceInfo'; +import { entryFor, isAtomicSourceInfo, ATOMIC_KINDS } from './sourceInfo'; import type { SourceInfoPool } from '../types/sourceInfo'; -// Build a representative pool covering each wire code. +// Build a representative pool covering each wire code shipped by the +// Rust writer post-Plan-5. Code 5 is unassigned — no entry exists. 
const samplePool: SourceInfoPool = [ - { t: 0, r: [0, 10], d: 0 }, // 0: Original - { t: 1, r: [3, 7], d: 0 }, // 1: Substring (parent_id 0) - { t: 2, r: [0, 20], d: [[0, 0, 10], [1, 10, 10]] }, // 2: Concat - { t: 3, r: [5, 15], d: ['filter.lua', 42] }, // 3: FilterProvenance - { t: 4, r: [0, 0], d: { kind: 'IncludeShortcode' } }, // 4: Synthetic - { t: 5, r: [0, 0], d: { from: 0, by: { kind: 'CrossrefResolver' } } }, // 5: Derived + { t: 0, r: [0, 10], d: 0 }, // 0: Original + { t: 1, r: [3, 7], d: 0 }, // 1: Substring (parent_id 0) + { t: 2, r: [0, 20], d: [[0, 0, 10], [0, 10, 10]] }, // 2: Concat + { t: 3, r: [5, 15], d: ['filter.lua', 42] }, // 3: Legacy (string-headed FilterProvenance) + { t: 3, r: [10, 20], d: [0] }, // 4: Legacy (numeric-headed Transformed) + { t: 4, r: [0, 0], d: { by: { kind: 'sectionize' } } }, // 5: Generated, no anchors, no data + { t: 4, r: [0, 0], d: { // 6: Generated with anchor + by: { kind: 'shortcode', data: { name: 'meta' } }, + from: [{ role: 'invocation', si_id: 0 }], + } }, ]; describe('entryFor', () => { test('returns the entry at node.s', () => { expect(entryFor({ s: 0 }, samplePool)).toEqual(samplePool[0]); expect(entryFor({ s: 3 }, samplePool)).toEqual(samplePool[3]); + expect(entryFor({ s: 6 }, samplePool)).toEqual(samplePool[6]); }); test('returns undefined when node lacks an s field', () => { @@ -36,56 +37,62 @@ describe('entryFor', () => { }); }); -describe('isDerived', () => { - test('returns true for code 5 (Derived)', () => { - expect(isDerived({ s: 5 }, samplePool)).toBe(true); - }); - - test('returns false for code 4 (Synthetic)', () => { - expect(isDerived({ s: 4 }, samplePool)).toBe(false); - }); - - test.each([0, 1, 2, 3])('returns false for code %d', (idx) => { - expect(isDerived({ s: idx }, samplePool)).toBe(false); - }); - - test('returns false when entry is missing', () => { - expect(isDerived({}, samplePool)).toBe(false); - expect(isDerived({ s: 99 }, samplePool)).toBe(false); - }); -}); - 
describe('isAtomicSourceInfo', () => { - const atomicKinds = new Set(['CrossrefResolver']); + const atomicKinds = new Set(['shortcode']); - test('returns true for Derived entries (code 5)', () => { - expect(isAtomicSourceInfo({ s: 5 }, samplePool, atomicKinds)).toBe(true); + test('returns true for Generated (code 4) when by.kind is atomic', () => { + // samplePool[6] has by.kind === 'shortcode'. + expect(isAtomicSourceInfo({ s: 6 }, samplePool, atomicKinds)).toBe(true); }); - test('returns true for Synthetic (code 4) when kind is in atomic set', () => { - const pool: SourceInfoPool = [{ t: 4, r: [0, 0], d: { kind: 'CrossrefResolver' } }]; - expect(isAtomicSourceInfo({ s: 0 }, pool, atomicKinds)).toBe(true); + test('returns false for Generated (code 4) when by.kind is not atomic', () => { + // samplePool[5] has by.kind === 'sectionize'. + expect(isAtomicSourceInfo({ s: 5 }, samplePool, atomicKinds)).toBe(false); }); - test('returns false for Synthetic (code 4) when kind is not atomic', () => { - expect(isAtomicSourceInfo({ s: 4 }, samplePool, atomicKinds)).toBe(false); - }); - - test.each([0, 1, 2, 3])('returns false for non-Synthetic non-Derived code %d', (idx) => { + test.each([0, 1, 2, 3, 4])('returns false for non-Generated code %d', (idx) => { expect(isAtomicSourceInfo({ s: idx }, samplePool, atomicKinds)).toBe(false); }); test('returns false when entry is missing', () => { expect(isAtomicSourceInfo({}, samplePool, atomicKinds)).toBe(false); }); + + test('treats absent `from` as empty (canonical access pattern)', () => { + // Build a pool with one Generated entry that has no `from` field + // at all — the writer omits it when the anchor list is empty. + const pool: SourceInfoPool = [ + { t: 4, r: [0, 0], d: { by: { kind: 'shortcode' } } }, + ]; + expect(isAtomicSourceInfo({ s: 0 }, pool, atomicKinds)).toBe(true); + // `entry.d.from ?? []` is the canonical access pattern for + // consumers that want to iterate the anchor list. 
+ const entry = entryFor({ s: 0 }, pool); + if (entry?.t === 4) { + expect(entry.d.from ?? []).toEqual([]); + } else { + throw new Error('expected code-4 entry'); + } + }); }); -describe('ATOMIC_SYNTHETIC_KINDS', () => { +describe('ATOMIC_KINDS', () => { test('is exported as a ReadonlySet', () => { - expect(ATOMIC_SYNTHETIC_KINDS).toBeInstanceOf(Set); + expect(ATOMIC_KINDS).toBeInstanceOf(Set); + }); + + test('contains the Plan-4 atomic-kind set', () => { + // Mirrors `By::is_atomic_kind` on the Rust side + // (crates/quarto-source-map/src/source_info.rs). + expect(ATOMIC_KINDS.has('filter')).toBe(true); + expect(ATOMIC_KINDS.has('shortcode')).toBe(true); + expect(ATOMIC_KINDS.has('title-block')).toBe(true); + expect(ATOMIC_KINDS.has('tree-sitter-postprocess')).toBe(true); }); - test('is empty in 2A — Plan 4/6 will populate', () => { - expect(ATOMIC_SYNTHETIC_KINDS.size).toBe(0); + test('excludes known non-atomic kinds', () => { + expect(ATOMIC_KINDS.has('sectionize')).toBe(false); + expect(ATOMIC_KINDS.has('user-edit')).toBe(false); + expect(ATOMIC_KINDS.has('include')).toBe(false); }); }); diff --git a/ts-packages/preview-renderer/src/utils/sourceInfo.ts b/ts-packages/preview-renderer/src/utils/sourceInfo.ts index d9e95f4a9..982f353cc 100644 --- a/ts-packages/preview-renderer/src/utils/sourceInfo.ts +++ b/ts-packages/preview-renderer/src/utils/sourceInfo.ts @@ -4,9 +4,9 @@ * and by future features that need source-mapped lookups (preimage * navigation, source-mapped diagnostics). * - * Sync contract: `ATOMIC_SYNTHETIC_KINDS` mirrors the kinds returned - * by `By::is_atomic_synthesizer()` on the Rust side (Plan 4 / 6 - * landing). Update both together. + * Sync contract: `ATOMIC_KINDS` mirrors the kinds returned by + * `By::is_atomic_kind()` on the Rust side + * (`crates/quarto-source-map/src/source_info.rs`). Update both together. 
*/ import type { SourceInfoEntry, SourceInfoPool } from '../types/sourceInfo'; @@ -25,21 +25,8 @@ export function entryFor( } /** - * True iff the entry is a Derived (wire code 5) entry. Plan 6 populates - * Derived entries on shortcode resolutions. - */ -export function isDerived( - node: { s?: number }, - pool: SourceInfoPool | undefined, -): boolean { - const entry = entryFor(node, pool); - return entry?.t === 5; -} - -/** - * True iff the entry indicates an atomic transform — either Derived - * (always atomic) or Synthetic (code 4) whose `By::kind` is in the - * atomic-synthesizer set. + * True iff the entry indicates an atomic transform — a `Generated` + * entry (code 4) whose `By::kind` is in the atomic-producer set. * * Used by Plan 2B's atomic-aware dispatcher gate to decide whether * `setLocalAst` should be a no-op for the subtree. @@ -51,19 +38,22 @@ export function isAtomicSourceInfo( ): boolean { const entry = entryFor(node, pool); if (!entry) return false; - if (entry.t === 5) return true; - if (entry.t === 4) return atomicKinds.has(entry.d.kind); + if (entry.t === 4) return atomicKinds.has(entry.d.by.kind); return false; } /** - * Atomic-synthesizer kinds that mark entire Synthetic subtrees as - * read-only on the iframe side. Empty in 2A — Plan 4 / 6 populate this - * set as their `By` variants land. + * Atomic producer kinds that mark entire `Generated` subtrees as + * read-only on the iframe side. * - * Sync contract: mirrors `By::is_atomic_synthesizer()` on the Rust - * side. The Rust function and this set must agree on which kinds are - * atomic; otherwise q2-preview's edit-back gate desyncs from the - * pipeline's expectation. + * Sync contract: mirrors `By::is_atomic_kind()` on the Rust side + * (`crates/quarto-source-map/src/source_info.rs`). The Rust function + * and this set must agree on which kinds are atomic; otherwise + * q2-preview's edit-back gate desyncs from the pipeline's expectation. 
*/ -export const ATOMIC_SYNTHETIC_KINDS: ReadonlySet = new Set(); +export const ATOMIC_KINDS: ReadonlySet = new Set([ + 'filter', + 'shortcode', + 'title-block', + 'tree-sitter-postprocess', +]); diff --git a/ts-packages/preview-runtime/src/index.ts b/ts-packages/preview-runtime/src/index.ts index c75156a18..1963cf5e9 100644 --- a/ts-packages/preview-runtime/src/index.ts +++ b/ts-packages/preview-runtime/src/index.ts @@ -14,3 +14,4 @@ export * from './wasmRenderer'; export * from './automergeSync'; +export * from './pipelineKind'; diff --git a/hub-client/src/utils/pipelineKind.test.ts b/ts-packages/preview-runtime/src/pipelineKind.test.ts similarity index 100% rename from hub-client/src/utils/pipelineKind.test.ts rename to ts-packages/preview-runtime/src/pipelineKind.test.ts diff --git a/hub-client/src/utils/pipelineKind.ts b/ts-packages/preview-runtime/src/pipelineKind.ts similarity index 100% rename from hub-client/src/utils/pipelineKind.ts rename to ts-packages/preview-runtime/src/pipelineKind.ts diff --git a/ts-packages/preview-runtime/src/wasm-quarto-hub-client.d.ts b/ts-packages/preview-runtime/src/wasm-quarto-hub-client.d.ts index 8a256a755..7705274ca 100644 --- a/ts-packages/preview-runtime/src/wasm-quarto-hub-client.d.ts +++ b/ts-packages/preview-runtime/src/wasm-quarto-hub-client.d.ts @@ -74,8 +74,20 @@ declare module 'wasm-quarto-hub-client' { // QMD parsing and AST conversion functions export function parse_qmd_content(content: string): string; export function ast_to_qmd(ast_json: string): string; - /** Incrementally write a modified AST back to QMD, preserving unchanged source text. */ - export function incremental_write_qmd(original_qmd: string, new_ast_json: string): string; + /** + * Incrementally write a modified AST back to QMD, preserving unchanged + * source text. 
+ * + * Per Plan 7: the caller is responsible for passing a **baseline** AST + * (`baseline_ast_json`) whose source spans match `original_qmd` and + * whose tier matches `new_ast_json`. The bridge does not re-parse + * `original_qmd`; mixing tiers will corrupt the write. + */ + export function incremental_write_qmd( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ): string; // Response type for parse/write operations export interface AstResponse { @@ -86,6 +98,11 @@ declare module 'wasm-quarto-hub-client' { qmd?: string; error?: string; diagnostics?: AstDiagnostic[]; + /** + * Soft-drop warnings (Plan 7 Q-3-42 / Q-3-43) that rode alongside + * a successful incremental write. + */ + warnings?: AstDiagnostic[]; } export interface AstDiagnostic { diff --git a/ts-packages/preview-runtime/src/wasmRenderer.ts b/ts-packages/preview-runtime/src/wasmRenderer.ts index fc94d9835..756fd0a41 100644 --- a/ts-packages/preview-runtime/src/wasmRenderer.ts +++ b/ts-packages/preview-runtime/src/wasmRenderer.ts @@ -85,7 +85,11 @@ interface WasmModuleExtended { attribution_json: string | undefined, ) => Promise; write_qmd: (astJson: string) => Promise; - incremental_write_qmd(original_qmd: string, new_ast_json: string): string; + incremental_write_qmd( + original_qmd: string, + baseline_ast_json: string, + new_ast_json: string, + ): string; convert: (document: string, inputFormat: string, outputFormat: string) => Promise; lsp_analyze_document: (path: string) => string; lsp_get_symbols: (path: string) => string; @@ -703,26 +707,73 @@ export async function writeQmd(astJson: string): Promise { } } +/** + * Result of `incrementalWriteQmd`: the rewritten QMD plus any + * soft-drop warnings (Q-3-42 / Q-3-43) that surfaced during + * reconciliation. Warnings ride alongside a *successful* write — the + * substituted edit reached source — and are the caller's + * responsibility to surface (or ignore) per its UX policy. 
+ */ +export interface IncrementalWriteQmdResult { + qmd: string; + warnings?: Diagnostic[]; +} + /** * Incrementally write a modified AST back to QMD, preserving unchanged * portions of the original source text verbatim. * + * Per Plan 7, the caller must pass the **baseline** AST — the AST + * whose source spans correspond to `originalQmd` — so the bridge can + * reconcile without re-parsing (which would discard provenance the + * host has already attached). The baseline AST and the new AST must + * be the same tier (e.g. both post-`parseQmdContent`). + * + * `baselineAst` is accepted as either a parsed AST object + * (`RustQmdJson`) or a pre-serialized JSON string — convenient for + * sync-client callers that already have a stringified cache. The + * bridge serializes the AST object branch internally. + * * Must call initWasm() before first use. */ -export function incrementalWriteQmd(originalQmd: string, newAst: RustQmdJson): string { +export function incrementalWriteQmd( + originalQmd: string, + baselineAst: RustQmdJson | string, + newAst: RustQmdJson, +): IncrementalWriteQmdResult { if (!wasmModule) { throw new Error('WASM not initialized. Call initWasm() first.') } + const baselineAstJson = + typeof baselineAst === 'string' ? 
baselineAst : JSON.stringify(baselineAst) const newAstJson = JSON.stringify(newAst) - const responseJson = wasmModule.incremental_write_qmd(originalQmd, newAstJson) + const responseJson = wasmModule.incremental_write_qmd( + originalQmd, + baselineAstJson, + newAstJson, + ) const response: AstResponse = JSON.parse(responseJson) if (!response.success || !response.qmd) { - throw new Error(`Incremental write failed: ${response.error}`) + // Distinguish the two failure modes — pre-fix this read "undefined": + // - response.success === false → real writer Err (response.error set) + // - response.success === true && response.qmd === "" → writer + // returned Ok with an empty document (every block soft-dropped + // via Q-3-43; bridge omits `error` in this case) + const reason = response.error + ?? (response.qmd === '' + ? 'writer returned empty qmd (warnings: ' + + (response.warnings?.length ?? 0) + + ')' + : 'no qmd field in response') + throw new Error(`Incremental write failed: ${reason}`) } - return response.qmd + return { + qmd: response.qmd, + warnings: response.warnings as Diagnostic[] | undefined, + } } /** diff --git a/ts-packages/quarto-sync-client/src/client.ts b/ts-packages/quarto-sync-client/src/client.ts index c63459e83..abedbe146 100644 --- a/ts-packages/quarto-sync-client/src/client.ts +++ b/ts-packages/quarto-sync-client/src/client.ts @@ -975,8 +975,15 @@ export function createSyncClient(callbacks: SyncClientCallbacks, astOptions?: AS const cached = astCache.get(path); if (astOptions.incrementalWriteQmd && cached) { - // Use incremental writer with cached original source - qmdText = astOptions.incrementalWriteQmd(cached.source, ast); + // Plan 7: pass the cached parsed AST as the baseline so the + // bridge does not have to re-parse `cached.source` (which would + // discard any host-side provenance attached after parse). + // `cached.ast` IS the baseline whose spans match `cached.source`. 
+ // Warnings are surfaced but discarded here — the sync client is + // policy-free; demos / hub-client consume them via their own + // wrappers. + const result = astOptions.incrementalWriteQmd(cached.source, cached.ast, ast); + qmdText = result.qmd; } else { // Fallback to full rewrite qmdText = astOptions.writeQmd(ast); diff --git a/ts-packages/quarto-sync-client/src/types.ts b/ts-packages/quarto-sync-client/src/types.ts index 2c7792523..7eb12946b 100644 --- a/ts-packages/quarto-sync-client/src/types.ts +++ b/ts-packages/quarto-sync-client/src/types.ts @@ -162,11 +162,28 @@ export interface ASTOptions { * portions of the original source text verbatim. Falls back to `writeQmd` * if not provided or if the original source is not cached. * + * Per Plan 7, the caller must supply the **baseline** AST (the one + * whose source spans match `originalQmd`); the sync client passes + * the cached parsed AST for that file. The returned `warnings` are + * structured soft-drop diagnostics (`Q-3-42` / `Q-3-43`) that the + * sync client itself ignores — it stays policy-free. Wrapper code + * in demos / hub-client consumes them. + * + * The diagnostic shape is intentionally `unknown[]` here so the + * sync-client does not pull a render-side type dependency; callers + * typically narrow it to the wasm-bridge `AstDiagnostic` shape. + * * @param originalQmd - The original QMD source text + * @param baselineAst - The cached parsed AST whose spans match `originalQmd` * @param newAst - The modified AST to write - * @returns The new QMD text with unchanged portions preserved + * @returns Object with `qmd` (rewritten source) and optional + * `warnings` (soft-drop diagnostics) */ - incrementalWriteQmd?: (originalQmd: string, newAst: unknown) => string; + incrementalWriteQmd?: ( + originalQmd: string, + baselineAst: unknown, + newAst: unknown, + ) => { qmd: string; warnings?: unknown[] }; /** * Filter which files should be parsed.