61 commits
6c038f9
Add modelopt/torch/_compress CODEOWNERS
kevalmorabia97 Oct 27, 2025
230cee1
Merge branch 'main' into feature/compress
kevalmorabia97 Oct 27, 2025
54c5f0f
Remove llm_ptq example tests from CICD
kevalmorabia97 Oct 27, 2025
9eeee25
E2E test for the experimental compress algorithm based on https://arx…
danielkorzekwa Oct 28, 2025
ad1d18e
Merge branch 'main' into feature/compress
kevalmorabia97 Oct 28, 2025
cef3655
Add convert_llama3_config_to_decilm_config + unit test (#465)
danielkorzekwa Oct 29, 2025
002b8b5
Implement nas.convert() api for the compress algorithm (#482)
danielkorzekwa Oct 31, 2025
1c12fd8
modelopt nas search() implementation for the compress algorithm (#490)
danielkorzekwa Nov 3, 2025
f7d547f
Add decilm modelling code (#505)
danielkorzekwa Nov 12, 2025
50a580c
Compress tutorial (PoC) (#492)
danielkorzekwa Nov 12, 2025
b121945
Add llama converter (no dependency on internal Nvidia code) - part 1/…
danielkorzekwa Nov 13, 2025
866e400
llama converter is self-contained now (no dependency on internal nvid…
danielkorzekwa Nov 14, 2025
0868f1c
Add integration test for attention pruning (#562)
danielkorzekwa Nov 14, 2025
69726cc
Merge branch 'main' into feature/compress
kevalmorabia97 Nov 15, 2025
07ca24d
Merge branch 'main' into feature/compress
kevalmorabia97 Nov 15, 2025
1dde209
Add score_pruning_activations (step 2/6) (#563)
danielkorzekwa Nov 18, 2025
2e559e7
Update README.md
kevalmorabia97 Nov 18, 2025
f10be0d
Add activation hooks used for pruning (#576)
danielkorzekwa Nov 20, 2025
194b532
Add sewing kit and utilities used for pruning scoring - pruning scori…
danielkorzekwa Nov 24, 2025
8c9cdd4
Add L2NormHook and use it in megatron.py (#599)
danielkorzekwa Nov 26, 2025
1f72466
Add pruning checkpoints for the compress algorithm (#607)
danielkorzekwa Nov 27, 2025
97fe7f0
Add build replacement library to the compress algorithm. (#616)
danielkorzekwa Dec 1, 2025
954103e
Add subblock stats to the compress algorithm (#623)
danielkorzekwa Dec 1, 2025
dcc425f
Add 1-block scoring to the compress algorithm (#625)
danielkorzekwa Dec 2, 2025
56d95de
Add checkpoint save/load to ForwardHook + add IterativeChannelContrib…
danielkorzekwa Dec 2, 2025
74aae83
Add MIP step to the compress algorithm (#627)
danielkorzekwa Dec 4, 2025
a1f63bc
Merge branch 'main' into feature/compress
kevalmorabia97 Dec 8, 2025
a99f503
Remove unused mip functions + fix multi-gpu test (#660)
kevalmorabia97 Dec 8, 2025
67489f4
Fix a bug in IterativeChannelContributionHook + tools for activation …
danielkorzekwa Dec 11, 2025
1d8bd20
Remove runtime.py and directly use torch dist utils + remove unused f…
kevalmorabia97 Dec 11, 2025
f7a0cb0
Use shared activation hooks component in the puzzle algorithm (#687)
danielkorzekwa Dec 17, 2025
db866d9
Clean up Puzzle Compress Tutorial (#711)
LianaMikael Dec 22, 2025
2e813bf
Two bug fixes: mix checkpointing and dtype (#718)
danielkorzekwa Dec 22, 2025
83ac3b1
Merge remote-tracking branch 'origin/main' into feature/compress
kevalmorabia97 Jan 13, 2026
0eecfc6
Fix test assertions for 2-gpu (#772)
kevalmorabia97 Jan 13, 2026
43b3cfa
Rename compress to puzzletron (#776)
kevalmorabia97 Jan 14, 2026
4c30bd5
Add NeMo Conversion Scripts to Puzzletron (#784)
LianaMikael Jan 15, 2026
96bb0ba
Merge branch 'main' into feature/compress
kevalmorabia97 Mar 3, 2026
8c84fee
[CI] Update to only run puzzletron tests
kevalmorabia97 Mar 3, 2026
5812777
Merge branch 'main' into feature/puzzletron
kevalmorabia97 Mar 3, 2026
5f77c81
Pin torchprofile==0.0.4 to fix CI
kevalmorabia97 Mar 10, 2026
82df595
Add anymodel-core to feature/puzzletron (#974)
danielkorzekwa Mar 11, 2026
4dc9932
Draft: anymodel activation scoring (#989)
danielkorzekwa Mar 12, 2026
d358eb3
Draft: Merge anymodel pruning (#990)
danielkorzekwa Mar 12, 2026
8e827f3
Draft: Merging anymodel:build_library_and_stats (#993)
danielkorzekwa Mar 12, 2026
eb4b210
Draft: merge any model calc one block scores (#994)
danielkorzekwa Mar 12, 2026
8fe318d
Draft: merge any_model: mip_and_realize_models (#995)
danielkorzekwa Mar 13, 2026
2fbdf0e
Update uv.lock for nspect puzzletron scanning
kevalmorabia97 Mar 13, 2026
1b42f0b
Dkorzekwa/any model other models (#1007)
danielkorzekwa Mar 17, 2026
67999eb
Dkorzekwa/anymodel gptoss (#1020)
danielkorzekwa Mar 17, 2026
660dc17
Merge any_model tutorial (#1035)
danielkorzekwa Mar 19, 2026
01cba6a
Merge mbridge distillation for any_model (#1036)
danielkorzekwa Mar 20, 2026
2b6572c
MR branch for the remaining difference between dkorzekwa/any_model an…
danielkorzekwa Mar 20, 2026
110316a
Dkorzekwa/decilm hf code cleanup (#1071)
danielkorzekwa Mar 23, 2026
4190275
Dkorzekwa/decilm hf code cleanup 2 (#1073)
danielkorzekwa Mar 23, 2026
0708ca2
Dkorzekwa/anymodel subblock stats (#1085)
danielkorzekwa Mar 24, 2026
e018ca0
Add bypass distillation (blockwise local KD) to puzzletron pipeline
Separius Mar 24, 2026
2b99327
Address review comments for bypass distillation MR
Separius Apr 2, 2026
351b44e
improve bypass' tutorial
Separius Apr 2, 2026
346408b
Clean up main.py and puzzletron_nas_plugin.py
Separius Apr 2, 2026
53f2a33
Refactor train() in training_loop.py: extract helper functions
Separius Apr 2, 2026
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -23,6 +23,7 @@ modelopt/torch/nas @NVIDIA/modelopt-torch-nas-prune-codeowners
modelopt/torch/opt @NVIDIA/modelopt-torch-opt-codeowners
modelopt/torch/peft @NVIDIA/modelopt-torch-peft-codeowners
modelopt/torch/prune @NVIDIA/modelopt-torch-nas-prune-codeowners
modelopt/torch/puzzletron @NVIDIA/modelopt-torch-puzzletron-codeowners
modelopt/torch/quantization @NVIDIA/modelopt-torch-quantization-codeowners
modelopt/torch/sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
modelopt/torch/speculative @NVIDIA/modelopt-torch-speculative-codeowners
7 changes: 4 additions & 3 deletions .github/workflows/_example_tests_runner.yml
@@ -51,14 +51,15 @@ jobs:
apt-get update && apt-get install -y git-lfs
git lfs install --system

pip install ".${{ inputs.pip_install_extras }}"
# use `python -m pip` instead of `pip` to avoid conflicts with system pip for nemo containers
python -m pip install ".${{ inputs.pip_install_extras }}"

if [[ "${{ inputs.example }}" == *"diffusers"* ]]; then
echo "Uninstalling apex for diffusers: T5 Int8 (PixArt) + Apex is not supported as per https://github.com/huggingface/transformers/issues/21391"
pip uninstall -y apex || true
python -m pip uninstall -y apex || true
fi

find examples/${{ inputs.example }} -name "requirements.txt" | while read req_file; do pip install -r "$req_file" || exit 1; done
find examples/${{ inputs.example }} -name "requirements.txt" | while read req_file; do python -m pip install -r "$req_file" || exit 1; done
- name: Run tests
run: |
echo "Running tests for: ${{ inputs.example }}"
89 changes: 7 additions & 82 deletions .github/workflows/example_tests.yml
@@ -56,108 +56,33 @@ jobs:
match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
delay: 300s

##### PyTorch Example Tests (speculative_decoding requires 26.01 image) #####
torch-pr:
##### NeMo Example Tests #####
nemo-pr:
needs: [check-file-changes, wait-checks]
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
strategy: &torch_strategy
fail-fast: false
matrix:
example: [llm_distill, llm_qat, llm_sparsity]
include:
- example: speculative_decoding
docker_image: "26.01"
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
example: ${{ matrix.example }}
timeout_minutes: 30
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-h100-latest-1

torch-non-pr:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
strategy: *torch_strategy
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/pytorch:${{ matrix.docker_image || '26.01' }}-py3"
example: ${{ matrix.example }}
timeout_minutes: 30
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2

##### TensorRT-LLM Example Tests #####
trtllm-pr:
needs: [check-file-changes, wait-checks]
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
strategy:
fail-fast: false
matrix:
example: [llm_ptq, vlm_ptq]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-1

trtllm-non-pr:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
strategy:
fail-fast: false
matrix:
example: [llm_autodeploy, llm_eval, llm_ptq, vlm_ptq]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc5"
example: ${{ matrix.example }}
pip_install_extras: "[hf,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2

##### ONNX/TensorRT Example Tests #####
onnx-pr:
needs: [check-file-changes, wait-checks]
if: startsWith(github.ref, 'refs/heads/pull-request/') && needs.check-file-changes.outputs.any_changed == 'true'
strategy: &onnx_strategy
fail-fast: false
matrix:
example: [diffusers, torch_onnx]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
example: ${{ matrix.example }}
pip_install_extras: "[all,dev-test]"
runner: linux-amd64-gpu-l4-latest-1

onnx-non-pr:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
strategy: *onnx_strategy
example: [puzzletron]
uses: ./.github/workflows/_example_tests_runner.yml
secrets: inherit
with:
docker_image: "nvcr.io/nvidia/tensorrt:26.01-py3"
docker_image: "nvcr.io/nvidia/nemo:26.02"
example: ${{ matrix.example }}
pip_install_extras: "[all,dev-test]"
pip_install_extras: "[hf,puzzletron,dev-test]"
runner: linux-amd64-gpu-rtxpro6000-latest-2

##### Required Check for PR #####
example-pr-required-check:
# Run even if example tests are skipped
if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
needs: [check-file-changes, torch-pr, trtllm-pr, onnx-pr]
needs: [check-file-changes, nemo-pr]
runs-on: ubuntu-latest
steps:
- name: Required GPU tests did not succeed
if: |
needs.check-file-changes.result != 'success' ||
(needs.check-file-changes.outputs.any_changed == 'true' && (
needs.torch-pr.result != 'success' ||
needs.trtllm-pr.result != 'success' ||
needs.onnx-pr.result != 'success'
needs.nemo-pr.result != 'success'
))
run: exit 1
20 changes: 11 additions & 9 deletions .github/workflows/gpu_tests.yml
@@ -62,16 +62,16 @@ jobs:
fail-fast: false
matrix:
include:
- example: gpu
timeout: 45
container_image: pytorch:26.01-py3
- example: gpu-megatron
timeout: 45
container_image: pytorch:26.01-py3
- example: gpu-trtllm
- example: gpu-puzzletron
timeout: 30
container_image: tensorrt-llm/release:1.3.0rc5
runs-on: linux-amd64-gpu-rtxpro6000-latest-1
container_image: pytorch:26.01-py3
# - example: gpu-megatron
# timeout: 45
# container_image: pytorch:26.01-py3
# - example: gpu-trtllm
# timeout: 30
# container_image: tensorrt-llm/release:1.3.0rc5
runs-on: linux-amd64-gpu-rtxpro6000-latest-2
timeout-minutes: ${{ matrix.timeout }}
container: &gpu_container
image: nvcr.io/nvidia/${{ matrix.container_image }}
@@ -85,6 +85,8 @@
- name: Setup environment variables
run: |
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
- name: Install dependencies for mip
run: apt-get update && apt-get install -y libffi-dev
- name: Run gpu tests
run: pip install tox-current-env && tox -e cuda13-${{ matrix.example }} --current-env
gpu-tests-non-pr:
19 changes: 17 additions & 2 deletions .pre-commit-config.yaml
@@ -25,9 +25,20 @@ repos:
hooks:
- id: ruff-check
args: [--fix, --exit-non-zero-on-fix]
exclude: ^examples/specdec_bench/specdec_bench/datasets/speed\.py$
# See: commit hooks modifies block_config.py leading to test_puzzletron.py failing (#25) · Issues · omniml / modelopt · GitLab
exclude: >
(?x)^(
^examples/specdec_bench/specdec_bench/datasets/speed\.py$|
modelopt/torch/puzzletron/decilm/deci_lm_hf_code/block_config\.py|
modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py
)$
- id: ruff-format
exclude: ^examples/specdec_bench/specdec_bench/datasets/speed\.py$
exclude: >
(?x)^(
^examples/specdec_bench/specdec_bench/datasets/speed\.py$|
modelopt/torch/puzzletron/decilm/deci_lm_hf_code/block_config\.py|
modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py
)$

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.17.1
@@ -84,6 +95,7 @@ repos:
modelopt/torch/speculative/eagle/utils.py|
modelopt/torch/speculative/plugins/transformers.py|
modelopt/torch/utils/plugins/megatron_mmlu.py|
modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py|
examples/chained_optimizations/bert_prune_distill_quantize.py|
examples/deepseek/quantize_to_nvfp4.py|
examples/deepseek/ptq.py|
@@ -96,10 +108,13 @@
examples/llm_eval/modeling.py|
examples/llm_qat/main.py|
examples/llm_sparsity/weight_sparsity/finetune.py|
examples/puzzletron/evaluation/lm_eval_anymodel.py|
examples/specdec_bench/specdec_bench/models/specbench_medusa.py|
examples/speculative_decoding/main.py|
examples/speculative_decoding/medusa_utils.py|
examples/speculative_decoding/server_generate.py|
examples/puzzletron/evaluation/lm_eval_anymodel.py|
modelopt/torch/puzzletron/anymodel/models/gpt_oss/gpt_oss_pruned_to_mxfp4.py|
experimental/dms/models/qwen3/configuration_qwen3_dms.py|
experimental/dms/models/qwen3/modeling_qwen3_dms.py|
)$
1 change: 1 addition & 0 deletions examples/pruning/README.md
@@ -7,6 +7,7 @@ Pruning can involve removal (prune) of Linear and Conv layers; and Transformer a
This section focuses on applying Model Optimizer's state-of-the-art complementary pruning modes to enable you to search for the best subnet architecture from your provided base model:

1. [Minitron](https://arxiv.org/pdf/2408.11796): A pruning method developed by NVIDIA Research for pruning GPT (and later extended to Mamba, MoE, and Hybrid Transformer Mamba) models in NVIDIA Megatron-LM (M-LM) or Megatron-Bridge (M-Bridge) framework. It uses the activation magnitudes to prune the embedding hidden size; mlp ffn hidden size; transformer attention heads; mamba heads and head dimension; MoE number of experts, ffn hidden size, and shared expert intermediate size; and number of layers of the model.
1. [Puzzletron](../puzzletron/README.md): An advanced pruning method by NVIDIA that uses a Mixed Integer Programming (MIP)-based NAS search algorithm.
1. FastNAS: A pruning method recommended for Computer Vision models. Given a pretrained model, FastNAS finds the subnet which maximizes the score function while meeting the given constraints.
1. GradNAS: A light-weight pruning method recommended for language models like Hugging Face BERT, GPT-J. It uses the gradient information to prune the model's linear layers and attention heads to meet the given constraints.

159 changes: 159 additions & 0 deletions examples/puzzletron/BYPASS.md
@@ -0,0 +1,159 @@
# Bypass Distillation (Blockwise Local Distillation)

Bypass distillation (also called **Blockwise Local Distillation / BLD**) is an optional pipeline
stage that trains alternative transformer block configurations using per-block knowledge
distillation from the teacher model. It significantly improves the quality of aggressively
compressed models by producing better "puzzle pieces" for the MIP solver.

## When to use bypass

Bypass is most beneficial whenever the pruned block structure deviates significantly from the
teacher — either because the weight-initialisation heuristic is too coarse, or because one
sub-block must compensate for something the other no longer provides. Specifically, use bypass
when:

- **KV head reduction (any amount)**: the `AverageKV` initialisation is a naive starting point
that averages existing KV heads together. The resulting weights are a poor local minimum and
bypass distillation is needed to repair the quality loss. This applies even to moderate
reductions (e.g., 8 → 4 heads).
- **Attention removed (`no_op: true`)**: removing an entire attention block leaves the co-located
FFN doing all the work for that block. Bypass trains the FFN to compensate for the missing
attention and recover the representational capacity.
- **FFN removed (`no_op: true`)**: similarly, when an FFN block is removed, bypass trains the
remaining attention to compensate.
- **Extreme FFN / MoE compression**: when the target `intermediate_size` is reduced by more than
~3/4 of the teacher width, or the number of MoE experts is reduced by half or more, simple
weight truncation / expert selection leaves the block far from a good solution and bypass
significantly improves quality. For example, on Llama-3.1-8B (`intermediate_size=14336`),
bypass is strongly recommended for sizes ≤ 3584.
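
To see why `AverageKV` is only a rough starting point, here is a minimal pure-Python sketch of averaging groups of adjacent KV heads (the real implementation operates on the projection weight tensors of the model; the row layout and grouping below are assumptions for illustration):

```python
def average_kv_heads(rows, n_heads, n_new, head_dim):
    """AverageKV-style init sketch: collapse n_heads KV heads into n_new
    heads by averaging each group of adjacent heads, e.g. 8 -> 4.
    `rows` is a flat list of n_heads * head_dim weight rows."""
    assert n_heads % n_new == 0 and len(rows) == n_heads * head_dim
    group = n_heads // n_new
    out = []
    for g in range(n_new):          # each new head g ...
        for d in range(head_dim):   # ... is built row-by-row
            src = [rows[(g * group + j) * head_dim + d] for j in range(group)]
            out.append([sum(col) / group for col in zip(*src)])
    return out
```

Each new head is simply the mean of its source heads, which loses head-specific structure; bypass distillation then trains the block to repair that loss.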

## Time cost

Bypass distillation is a full training loop. Plan for several hours per configuration when using
~1B training tokens on H100 GPUs. Total time scales with
`len(bypass.configs) × training_tokens`. This is comparable to lightweight fine-tuning.
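
As a back-of-the-envelope check, the scaling rule above can be turned into a rough estimate (the default throughput constant is an illustrative assumption, not a measured H100 number):

```python
def bypass_gpu_hours(n_configs, training_tokens, tokens_per_gpu_hour=3.0e8):
    """Rough wall-clock estimate: total cost scales linearly with
    len(bypass.configs) * training_tokens."""
    return n_configs * training_tokens / tokens_per_gpu_hour
```

For example, two configs at ~1B tokens each lands in the "several hours per configuration" range quoted above.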

## Sequential execution

Each entry in `bypass.configs` trains **sequentially** (one config at a time). There is no
parallelism across configurations. Distribute jobs across different runs if time is a
constraint.

## Enabling bypass

In your concrete model YAML, uncomment the bypass line:

```yaml
defaults:
- Llama-3_1-8B
- bypass: defaults # remove the comment to enable bypass distillation
- _self_
```

A shared `bypass/defaults.yaml` is located at
[`configs/bypass/defaults.yaml`](configs/bypass/defaults.yaml). It is used by all models.
Adjust `training.training_tokens` (default is 10K tokens for sanity-check runs; set to `1e+9`
for production runs) and the `auto_configs` or `configs` settings to match your compression
targets.

## Decoupled vs. coupled BLD

**Decoupled BLD** trains only one sub-block type at a time while keeping the other frozen:

| `keys_to_learn` | What is trained |
|---|---|
| `subblock_ffn` | FFN weights only (attention frozen) |
| `subblock_attention` | Attention weights only (FFN frozen) |
| `subblock_mamba` | Mamba SSM weights (hybrid models, e.g. NemotronH) |
| `entire_block` | Full transformer block (coupled BLD) |

**Coupled BLD** (`keys_to_learn: entire_block`) trains the whole block end-to-end and captures
interactions between attention and FFN. The main cost is combinatorial: if you have N FFN sizes
and M attention sizes in your replacement library, coupled BLD requires N × M training runs
instead of N + M for decoupled. Decoupled BLD is therefore the default and usually sufficient.
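
The combinatorial trade-off is easy to make concrete (a sketch, not part of the pipeline):

```python
def bld_run_counts(n_ffn_sizes, n_attn_sizes):
    """Training runs needed to cover a replacement library:
    decoupled trains each sub-block variant once; coupled trains
    every (ffn, attention) pair as an entire block."""
    decoupled = n_ffn_sizes + n_attn_sizes
    coupled = n_ffn_sizes * n_attn_sizes
    return decoupled, coupled
```

With 4 FFN sizes and 3 attention sizes, decoupled BLD needs 7 runs while coupled BLD needs 12; the gap widens quickly as the library grows.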

## Training multiple configurations

Use `bypass.configs` to train multiple block configurations sequentially:

```yaml
bypass:
training:
training_tokens: 1e+9 # ~1B tokens per config
configs:
- model_config_overrides:
ffn:
- intermediate_size: 1792 # aggressive — bypass strongly recommended
attention:
- num_key_value_heads: null
keys_to_learn: subblock_ffn
- model_config_overrides:
ffn:
- intermediate_size: 3584
attention:
- num_key_value_heads: null
keys_to_learn: subblock_ffn
```

> **Note:** Always include `num_key_value_heads: null` under `attention:` even when not
> changing KV heads. Omitting it when `no_op: true` is set on another field can cause
> a config parsing issue.

Trained checkpoints are automatically symlinked into `$PUZZLE_DIR/ckpts/` where the replacement
library builder picks them up in the next pipeline stage.

## Auto-generating configs from the pruning search space

Instead of listing each config manually, use `bypass.auto_configs` to generate configs
automatically from the pruning search space. The default (`auto_configs.attn: true`) trains
one attention-only bypass per KV-head reduction specified in `pruning.n_heads_in_group_list`:

```yaml
bypass:
auto_configs:
attn: true # one subblock_attention config per pruned kv-head count
ffn: false # set true: one subblock_ffn config per size in pruning.intermediate_size_list
blk: false # set true: cartesian product (FFN size × kv-head count), entire_block BLD
training:
training_tokens: 1e+9
```

Teacher-size subblocks are automatically excluded (no redundant training). For `blk`, all
combinations where **both** FFN and attention are at teacher values are skipped.

All three flags can be combined. Order of generated configs: FFN → attn → blk.
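
A sketch of how this expansion could behave. The function, the dict shapes, and the heads-in-group to KV-head mapping (`num_attention_heads // n_heads_in_group`) are assumptions for illustration, not the real API:

```python
def auto_bypass_configs(pruning, teacher, attn=True, ffn=False, blk=False):
    """Expand the pruning search space into bypass configs.
    Teacher-size subblocks are skipped; order is FFN -> attn -> blk."""
    def kv_heads(g):  # assumed mapping from heads-in-group to kv-head count
        return teacher["num_attention_heads"] // g

    configs = []
    if ffn:
        for size in pruning["intermediate_size_list"]:
            if size != teacher["intermediate_size"]:  # no redundant teacher-size run
                configs.append({"ffn": size, "keys_to_learn": "subblock_ffn"})
    if attn:
        for g in pruning["n_heads_in_group_list"]:
            if kv_heads(g) != teacher["num_key_value_heads"]:
                configs.append({"kv": kv_heads(g),
                                "keys_to_learn": "subblock_attention"})
    if blk:
        for size in pruning["intermediate_size_list"]:
            for g in pruning["n_heads_in_group_list"]:
                both_teacher = (size == teacher["intermediate_size"]
                                and kv_heads(g) == teacher["num_key_value_heads"])
                if not both_teacher:  # skip only when BOTH match the teacher
                    configs.append({"ffn": size, "kv": kv_heads(g),
                                    "keys_to_learn": "entire_block"})
    return configs
```

Note how `blk` keeps mixed combinations where one sub-block is still at teacher size, since the other sub-block is pruned and the pair still needs training.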

## Attention no-op + FFN-only bypass

A common aggressive compression pattern removes entire attention blocks (`no_op: true`) and
trains only the FFN in those blocks. Example config:

```yaml
configs:
- model_config_overrides:
ffn:
- intermediate_size: 12288
attention:
- num_key_value_heads: null
no_op: true
keys_to_learn: subblock_ffn
```

When attention is removed, only the FFN parameters are trained. The bypass code automatically
skips attention-related weights (including model-specific ones such as Qwen3's `q_norm`/`k_norm`)
during student weight initialisation.
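
The key filtering can be sketched as a simple predicate over state-dict keys (the marker strings are assumptions; real models may name attention modules differently):

```python
ATTN_KEY_MARKERS = ("self_attn.", "q_norm", "k_norm")  # illustrative markers

def student_init_keys(teacher_keys, attention_is_noop):
    """Keys to copy from the teacher when initialising a bypass student.
    With attention removed (no_op), attention-related weights are skipped."""
    if not attention_is_noop:
        return list(teacher_keys)
    return [k for k in teacher_keys
            if not any(m in k for m in ATTN_KEY_MARKERS)]
```

On Qwen3-style models, `q_norm`/`k_norm` live inside the attention module, so a marker list like this drops them together with the projection weights.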

## Weights & Biases logging

Enable W&B to track per-block distillation loss and validation metrics:

```yaml
bypass:
wandb_log: true
wandb:
project: my-puzzletron-project
entity: my-org
```

W&B logs iteration number, token count, learning rate, and per-block loss at each log interval.
If `wandb` is not installed, logging is silently disabled.