diff --git a/README.md b/README.md index 42fc0da..aff309e 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,13 @@ maximizer = generate_maximizer(bench) For the full list of benchmarks, the common interface, and detailed usage examples, refer to the [documentation](https://JuliaDecisionFocusedLearning.github.io/DecisionFocusedLearningBenchmarks.jl/stable/). +## Installation + +```julia +using Pkg +Pkg.add("DecisionFocusedLearningBenchmarks") +``` + ## Related Packages This package is part of the [JuliaDecisionFocusedLearning](https://github.com/JuliaDecisionFocusedLearning) organization, and built to be compatible with other packages in the ecosystem: diff --git a/docs/make.jl b/docs/make.jl index 4a1ec1b..5b851fd 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -5,11 +5,15 @@ using Literate md_dir = joinpath(@__DIR__, "src") tutorial_dir = joinpath(@__DIR__, "src", "tutorials") benchmarks_dir = joinpath(@__DIR__, "src", "benchmarks") -api_dir = joinpath(@__DIR__, "src", "api") tutorial_files = readdir(tutorial_dir) md_tutorial_files = [split(file, ".")[1] * ".md" for file in tutorial_files] -benchmark_files = [joinpath("benchmarks", e) for e in readdir(benchmarks_dir)] + +categories = [ + "Static problems" => "static", + "Stochastic problems" => "stochastic", + "Dynamic problems" => "dynamic", +] include_tutorial = true @@ -20,6 +24,19 @@ if include_tutorial end end +benchmark_sections = Pair{String,Vector{String}}[] + +for (label, subdir) in categories + dir = joinpath(benchmarks_dir, subdir) + jl_files = filter(f -> endswith(f, ".jl"), readdir(dir)) + md_names = [splitext(f)[1] * ".md" for f in jl_files] + for file in jl_files + Literate.markdown(joinpath(dir, file), dir; documenter=true, execute=false) + end + md_paths = [joinpath("benchmarks", subdir, f) for f in md_names] + push!(benchmark_sections, label => md_paths) +end + makedocs(; modules=[DecisionFocusedLearningBenchmarks], authors="Members of JuliaDecisionFocusedLearning", @@ -32,7 +49,7 @@ makedocs(; 
"Creating custom benchmarks" => "custom_benchmarks.md", ], "Tutorials" => include_tutorial ? md_tutorial_files : [], - "Benchmark problems list" => benchmark_files, + "Benchmarks" => benchmark_sections, "API reference" => "api.md", ], ) @@ -44,6 +61,13 @@ if include_tutorial end end +for (_, subdir) in categories + dir = joinpath(benchmarks_dir, subdir) + for f in filter(f -> endswith(f, ".md"), readdir(dir)) + rm(joinpath(dir, f); force=true) + end +end + deploydocs(; repo="github.com/JuliaDecisionFocusedLearning/DecisionFocusedLearningBenchmarks.jl", devbranch="main", diff --git a/docs/src/api.md b/docs/src/api.md index 0cff868..873c3ab 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -1,157 +1,115 @@ # API Reference -## Interface +## Public -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.Utils] -Private = false -``` +### Interface ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.Utils] -Public = false +Private = false ``` -## Argmax2D +### Argmax2D ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.Argmax2D] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.Argmax2D] -Public = false -``` - -## Argmax +### Argmax ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.Argmax] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.Argmax] -Public = false -``` - -## Contextual Stochastic Argmax +### Contextual Stochastic Argmax ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.ContextualStochasticArgmax] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.ContextualStochasticArgmax] -Public = false -``` - -## Dynamic Vehicle Scheduling +### Dynamic Vehicle Scheduling ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.DynamicVehicleScheduling] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.DynamicVehicleScheduling] -Public = false -``` - -## Dynamic Assortment +### Dynamic Assortment ```@autodocs 
Modules = [DecisionFocusedLearningBenchmarks.DynamicAssortment] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.DynamicAssortment] -Public = false -``` - -## Fixed-size shortest path +### Fixed-size shortest path ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.FixedSizeShortestPath] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.FixedSizeShortestPath] -Public = false -``` - -## Maintenance +### Maintenance ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.Maintenance] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.Maintenance] -Public = false -``` - -## Portfolio Optimization +### Portfolio Optimization ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.PortfolioOptimization] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.PortfolioOptimization] -Public = false -``` - -## Ranking +### Ranking ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.Ranking] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.Ranking] -Public = false -``` - -## Subset selection +### Subset selection ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.SubsetSelection] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.SubsetSelection] -Public = false -``` - -## Stochastic Vehicle Scheduling +### Stochastic Vehicle Scheduling ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.StochasticVehicleScheduling] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.StochasticVehicleScheduling] -Public = false -``` - -## Warcraft +### Warcraft ```@autodocs Modules = [DecisionFocusedLearningBenchmarks.Warcraft] Private = false ``` -```@autodocs -Modules = [DecisionFocusedLearningBenchmarks.Warcraft] +## Internals + +```@autodocs +Modules = [ + DecisionFocusedLearningBenchmarks.Utils, + DecisionFocusedLearningBenchmarks.Argmax, + 
DecisionFocusedLearningBenchmarks.Argmax2D, + DecisionFocusedLearningBenchmarks.ContextualStochasticArgmax, + DecisionFocusedLearningBenchmarks.DynamicVehicleScheduling, + DecisionFocusedLearningBenchmarks.DynamicAssortment, + DecisionFocusedLearningBenchmarks.FixedSizeShortestPath, + DecisionFocusedLearningBenchmarks.Maintenance, + DecisionFocusedLearningBenchmarks.PortfolioOptimization, + DecisionFocusedLearningBenchmarks.Ranking, + DecisionFocusedLearningBenchmarks.SubsetSelection, + DecisionFocusedLearningBenchmarks.StochasticVehicleScheduling, + DecisionFocusedLearningBenchmarks.Warcraft, +] Public = false ``` diff --git a/docs/src/benchmarks/argmax.md b/docs/src/benchmarks/argmax.md deleted file mode 100644 index 00a5e67..0000000 --- a/docs/src/benchmarks/argmax.md +++ /dev/null @@ -1,4 +0,0 @@ -# Argmax - -!!! warning - Documentation for this benchmark is still under development. Please refer to the source code and API for more details. diff --git a/docs/src/benchmarks/contextual_stochastic_argmax.md b/docs/src/benchmarks/contextual_stochastic_argmax.md deleted file mode 100644 index 59f588f..0000000 --- a/docs/src/benchmarks/contextual_stochastic_argmax.md +++ /dev/null @@ -1,37 +0,0 @@ -# Contextual Stochastic Argmax - -[`ContextualStochasticArgmaxBenchmark`](@ref) is a minimalist contextual stochastic optimization benchmark problem. - -The decision maker selects one item out of ``n``. Item values are uncertain at decision time: they depend on a base utility plus a context-correlated perturbation revealed only after the decision is made. An observable context vector, correlated with the perturbation via a fixed linear map ``W``, allows the learner to anticipate the perturbation and pick the right item. - -## Problem Formulation - -**Instance**: ``c_{\text{base}} \sim \mathcal{U}[0,1]^n``, base values for ``n`` items. - -**Context**: ``x_{\text{raw}} \sim \mathcal{N}(0, I_d)``, a ``d``-dimensional signal correlated with item values. 
The feature vector passed to the model is ``x = [c_{\text{base}};\, x_{\text{raw}}] \in \mathbb{R}^{n+d}``. - -**Scenario**: the realized item values are -```math -\xi = c_{\text{base}} + W x_{\text{raw}} + \varepsilon, \quad \varepsilon \sim \mathcal{N}(0, \sigma^2 I_n) -``` -where ``W \in \mathbb{R}^{n \times d}`` is a fixed matrix unknown to the learner. - -**Decision**: ``y \in \{e_1, \ldots, e_n\}`` (one-hot vector selecting one item). - -## Policies - -### DFL Policy - -```math -\xrightarrow[\text{Features}]{x} -\fbox{Neural network $\varphi_w$} -\xrightarrow[\text{Predicted values}]{\hat{\theta}} -\fbox{\texttt{one\_hot\_argmax}} -\xrightarrow[\text{Decision}]{y} -``` - -The neural network predicts item values ``\hat{\theta} \in \mathbb{R}^n`` from the feature vector ``x \in \mathbb{R}^{n+d}``. The default architecture is `Dense(n+d => n; bias=false)`, which can exactly recover the optimal linear predictor ``[I_n \mid W]``, so a well-trained model should reach near-zero gap. - -### SAA Policy - -``y_{\text{SAA}} = \operatorname{argmax}\bigl(\frac{1}{S}\sum_s \xi^{(s)}\bigr)`` — the exact SAA-optimal decision for linear argmax, accessible via `generate_baseline_policies(bench).saa`. diff --git a/docs/src/benchmarks/dvsp.md b/docs/src/benchmarks/dvsp.md deleted file mode 100644 index 2282597..0000000 --- a/docs/src/benchmarks/dvsp.md +++ /dev/null @@ -1,145 +0,0 @@ -# Dynamic Vehicle Scheduling - -The Dynamic Vehicle Scheduling Problem (DVSP) is a sequential decision-making problem where an agent must dynamically dispatch vehicles to serve customers that arrive over time. - -## Problem Description - -### Overview - -In the dynamic vehicle scheduling problem, a fleet operator must decide at each time step which customer to serve immediately and which to postpone to future time steps. -The goal is to serve all customers by the end of the planning horizon while minimizing total travel time. 
- -This is a simplified version of the more complex Dynamic Vehicle Routing Problem with Time Windows (DVRPTW), focusing on the core sequential decision-making aspects without capacity or time window constraints. - -The problem is characterized by: -- **Exogenous noise**: customer arrivals are stochastic and follow a fixed known distribution, independent of the agent's actions -- **Combinatorial action space**: at each time step, the agent must build vehicle routes to serve selected customers, which leads to a huge combinatorial action space - -### Mathematical Formulation - -The dynamic vehicle scheduling problem can be formulated as a finite-horizon Markov Decision Process (MDP): - -**State Space** ``\mathcal{S}``: At time step ``t``, the state ``s_t`` consists of: -```math -s_t = (R_t, D_t, t) -``` -where: -- ``R_t`` are the pending customer (not yet served), where each customer ``r_i \in R_t`` contains: - - ``x_i, y_i``: 2d spatial coordinates of the customer location - - ``\tau_i``: start time when the customer needs to be served - - ``s_i``: service time required to serve the customer -- ``D_t`` indicates which customers must be dispatched this time step (i.e. that cannot be postponed further, otherwise they will be infeasible at the next time step because of their start time) -- ``t \in \{1, 2, \ldots, T\}`` is the current time step - -The state also implicitly includes (constant over time): -- Travel duration matrix ``d_{ij}``: time to travel from location ``i`` to location ``j`` -- Depot location - -**Action Space** ``\mathcal{A}(s_t)``: The action at time step ``t`` is a set of vehicle routes: -```math -a_t = \{r_1, r_2, \ldots, r_k\} -``` -where each route ``r_i`` is a sequence of customer that starts and ends at the depot. - -A route is feasible if: -- It starts and ends at the depot -- It follows time constraints, i.e. customers are served on time - -**Transition Dynamics** ``\mathcal{P}(s_{t+1} | s_t, a_t)``: After executing routes ``a_t``: - -1. 
**Remove served customers** from the pending customer set -2. **Generate new customer arrivals** according to the underlying exogenous distribution -3. **Update must-dispatch set** based on postponement rules - -**Reward Function** ``r(s_t, a_t)``: The immediate reward is the negative total travel time of the routes: - -```math -r(s_t, a_t) = - \sum_{r \in a_t} \sum_{(i,j) \in r} d_{ij} -``` - -where ``d_{ij}`` is the travel duration from location ``i`` to location ``j``, and the sum is over all consecutive location pairs in each route ``r``. - -**Objective**: Find a policy ``\pi: \mathcal{S} \to \mathcal{A}`` that maximizes expected cumulative reward: -```math -\max_\pi \mathbb{E}\left[\sum_{t=1}^T r(s_t, \pi(s_t)) \right] -``` - -## Key Components - -### [`DynamicVehicleSchedulingBenchmark`](@ref) - -The main benchmark configuration with the following parameters: - -- `max_requests_per_epoch`: Maximum number of new customers per time step (default: 10) -- `Δ_dispatch`: Time delay between decision and vehicle dispatch (default: 1.0) -- `epoch_duration`: Duration of each decision time step (default: 1.0) -- `two_dimensional_features`: Whether to use simplified 2D features instead of full feature set (default: false) - -### Instance Generation - -Problem instances are generated from static vehicle routing datasets and include: - -- **Customer locations**: Spatial coordinates for pickup/delivery points -- **Depot location**: Central starting and ending point for all routes -- **Travel times**: Distance/duration matrix between all location pairs -- **Service times**: Service time each customer - -The dynamic version samples new customer arrivals from the static instance, drawing new customers by independently sampling: -- their locations from the set of static customer locations -- service times, uniformly from the range of service times in the static instance - -### Features - -The benchmark provides two feature matrix representations, containing one column per 
postponable customer in the state: - -**Full Features** (27-dimensional): -- Start times for postponable customers (1) -- End times (start + service time) (2) -- Travel time from depot to customer (3) -- Travel time from customer to depot (4) -- Slack time until next time step (5) -- % of must-dispatch customers that can reach this customer on time (6) -- % of customers reachable from this customer on time (7) -- % of customers that can reach this customer on time (8) -- % of customers reachable or that can reach this customer on time (9) -- Quantile-based travel times to other customers (9 quantiles) (10-18) -- Quantiles of % of reachable new customers (9 quantiles) (19-27) - -**2D Features** (simplified): -- Travel time from depot to customer (1) -- Mean travel time to other customers (2) - -## Benchmark Policies - -### Lazy Policy - -The lazy policy postpones all possible customers, serving only those that must be dispatched. - -### Greedy Policy - -The greedy policy serves all pending customers as soon as they arrive, without considering future consequences. - -## Decision-Focused Learning Policy - -```math -\xrightarrow[\text{State}]{s_t} -\fbox{Neural network $\varphi_w$} -\xrightarrow[\text{Prizes}]{\theta} -\fbox{Prize-collecting VSP} -\xrightarrow[\text{Routes}]{a_t} -``` - -**Components**: - -1. **Neural Network** ``\varphi_w``: Takes current state features as input and predicts customer prizes ``\theta = (\theta_1, \ldots, \theta_n)``, one value per postponable customer. -2. **Optimization Layer**: Solves the prize-collecting vehicle scheduling problem to determine optimal routes given the predicted prizes, by maximizing total collected prizes minus travel costs: - ```math - \max_{a_t\in \mathcal{A}(s_t)} \sum_{r \in a_t} \left( \sum_{i \in r} \theta_i - \sum_{(i,j) \in r} d_{ij} \right) - ``` - This can be modeled as a flow linear program on a directed acyclic graph (DAG) and is solved using standard LP solvers. 
- -The neural network architecture adapts to the feature dimensionality: -- **2D features**: `Dense(2 => 1)`, applied in parallel to each postponable customer -- **Full features**: `Dense(27 => 1)` applied in parallel to each postponable customer - -**Note:** one can also use more complex architectures such as a deeper MLP or a graph neural network for better performance. diff --git a/docs/src/benchmarks/dynamic/01_dynamic_assortment.jl b/docs/src/benchmarks/dynamic/01_dynamic_assortment.jl new file mode 100644 index 0000000..fa38776 --- /dev/null +++ b/docs/src/benchmarks/dynamic/01_dynamic_assortment.jl @@ -0,0 +1,125 @@ +# # Dynamic Assortment +# Select which K items to offer at each step to maximize revenue: customer preferences +# evolve dynamically based on purchase history (hype and saturation effects). + +using DecisionFocusedLearningBenchmarks +using Plots + +b = DynamicAssortmentBenchmark() + +# ## Observable input +# +# Generate one environment and roll it out with the expert policy to collect a sample +# trajectory. 
At each step the agent observes item prices, hype levels, saturation, and +# purchase history: +policies = generate_baseline_policies(b) +env = generate_environments(b, 1)[1] +_, trajectory = evaluate_policy!(policies.expert, env) + +# The observable state at step 1: item prices (fixed across steps): +plot_context(b, trajectory[1]) + +# ## A training sample +# +# Each step in a trajectory is a labeled tuple `(x, θ, y)` plus state and reward: +# - `x`: `(d+8) × N` feature matrix per step (prices, hype, saturation, history, time) +# - `θ`: predicted utility score per item +# - `y`: offered assortment at this step (BitVector of length N, true = offered) +# - `instance`: full state tuple (features matrix, purchase history) +# - `reward`: price of the purchased item (0 if no purchase) +# +# One step with the offered assortment highlighted (green = offered): +plot_sample(b, trajectory[1]) + +# A few steps side by side (prices are fixed; assortment composition changes over time): +plot_trajectory(b, trajectory[1:min(4, length(trajectory))]) + +# ## DFL pipeline components + +# The DFL agent chains two components: a neural network predicting utility scores per item: +model = generate_statistical_model(b) # MLP: state features → predicted utility per item +# and a maximizer offering the K items with the highest predicted utilities: +maximizer = generate_maximizer(b) # top-K selection by predicted utility + +# At each step, the model maps the current state (prices, hype, saturation, history) to a +# utility score per item. The maximizer selects the K items with the highest scores. + +# --- +# ## Problem Description +# +# ### Overview +# +# In the **Dynamic Assortment problem**, a retailer has ``N`` items and must select +# ``K`` to offer at each time step. Customer preferences evolve based on purchase history +# through **hype** (recent purchases increase demand) and **saturation** (repeated +# purchases slightly decrease demand). 
+# +# ### Mathematical Formulation +# +# **State** ``s_t = (p, f, h_t, \sigma_t, t, \mathcal{H}_t)`` where: +# - ``p``: fixed item prices +# - ``f``: static item features +# - ``h_t, \sigma_t``: current hype and saturation levels +# - ``t``: current time step +# - ``\mathcal{H}_t``: purchase history (last 5 purchases) +# +# **Action:** ``a_t \subseteq \{1,\ldots,N\}`` with ``|a_t| = K`` +# +# **Customer choice** (multinomial logit): +# ```math +# \mathbb{P}(i \mid a_t, s_t) = \frac{\exp(\theta_i(s_t))}{\sum_{j \in a_t} \exp(\theta_j(s_t)) + 1} +# ``` +# +# **Transition dynamics:** +# - Hype: ``h_{t+1}^{(i)} = h_t^{(i)} \times m^{(i)}`` where the multiplier reflects recent purchases +# - Saturation: increases by ×1.01 for the purchased item +# +# **Reward:** ``r(s_t, a_t) = p_{i^\star}`` (price of the purchased item, 0 if no purchase) +# +# **Objective:** +# ```math +# \max_\pi \; \mathbb{E}\!\left[\sum_{t=1}^T r(s_t, \pi(s_t))\right] +# ``` +# +# ## Key Components +# +# ### [`DynamicAssortmentBenchmark`](@ref) +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `N` | Number of items in catalog | 20 | +# | `d` | Static feature dimension per item | 2 | +# | `K` | Assortment size | 4 | +# | `max_steps` | Steps per episode | 80 | +# | `exogenous` | Whether dynamics are exogenous | `false` | +# +# ### State Observation +# +# Agents observe a ``(d+8) \times N`` normalized feature matrix per step containing: +# current prices, hype, saturation, static features, change in hype/saturation from +# previous step and from initial state, and normalized time step. 
+# +# ## Baseline Policies +# +# | Policy | Description | +# |--------|-------------| +# | Expert | Brute-force enumeration of all ``\binom{N}{K}`` subsets; optimal but slow | +# | Greedy | Selects the ``K`` items with highest prices | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{State}]{s_t} +# \fbox{Neural network $\varphi_w$} +# \xrightarrow[\text{Utilities}]{\theta \in \mathbb{R}^N} +# \fbox{Top-K} +# \xrightarrow[\text{Assortment}]{a_t} +# ``` +# +# **Model:** `Chain(Dense(d+8 → 5), Dense(5 → 1), vec)`: predicts one utility score +# per item from the current state features. +# +# **Maximizer:** `TopKMaximizer(K)`: selects the top ``K`` items by predicted utility. +# +# !!! note "Reference" +# [Structured Reinforcement Learning for Combinatorial Decision-Making](https://arxiv.org/abs/2505.19053) diff --git a/docs/src/benchmarks/dynamic/02_maintenance.jl b/docs/src/benchmarks/dynamic/02_maintenance.jl new file mode 100644 index 0000000..be710aa --- /dev/null +++ b/docs/src/benchmarks/dynamic/02_maintenance.jl @@ -0,0 +1,114 @@ +# # Maintenance +# Decide which components to maintain at each step to minimize failure and maintenance costs: +# components degrade stochastically and the agent has limited maintenance capacity. + +using DecisionFocusedLearningBenchmarks +using Plots + +b = MaintenanceBenchmark(; N=5, K=2) # 5 components, maintain up to 2 per step + +# ## Observable input +# +# Generate one environment and roll it out with the greedy policy to collect a sample +# trajectory. 
At each step the agent observes the degradation level of each component: +policies = generate_baseline_policies(b) +env = generate_environments(b, 1)[1] +_, trajectory = evaluate_policy!(policies.greedy, env) + +# The observable state at step 1: degradation levels per component (1 = new, n = failed): +plot_context(b, trajectory[1]) + +# ## A training sample +# +# Each step in a trajectory is a labeled tuple `(x, θ, y)` plus state and reward: +# - `x`: degradation state vector (values in `1..n` per component) +# - `θ`: urgency score per component (predicted by model) +# - `y`: which components are maintained at this step (BitVector of length N) +# - `instance`: degradation state vector +# - `reward`: negative cost (maintenance and failure costs) at this step +# +# One step with maintenance decisions (green = maintained, red = failed): +plot_sample(b, trajectory[1]) + +# A few steps side by side showing degradation evolving over time: +plot_trajectory(b, trajectory[1:min(4, length(trajectory))]) + +# ## DFL pipeline components + +# The DFL agent chains two components: a neural network predicting urgency scores per component: +model = generate_statistical_model(b) # two-layer MLP: degradation state → urgency scores +# and a maximizer selecting the most urgent components for maintenance: +maximizer = generate_maximizer(b) # top-K selection among components with positive scores + +# At each step, the model maps the current degradation state to an urgency score per component. +# The maximizer selects up to K components with the highest positive scores for maintenance. + +# --- +# ## Problem Description +# +# ### Overview +# +# In the **Maintenance benchmark**, a system has ``N`` identical components, each with +# ``n`` discrete degradation states (1 = new, ``n`` = failed). At each step, the agent +# can maintain up to ``K`` components. Maintained components are reset to state 1. +# Unmaintained components degrade stochastically. 
+# +# ### Mathematical Formulation +# +# **State** ``s_t \in \{1,\ldots,n\}^N``: degradation level of each component. +# +# **Action** ``a_t \subseteq \{1,\ldots,N\}`` with ``|a_t| \leq K`` +# +# **Transition dynamics:** For each component ``i``: +# - If maintained: ``s_{t+1}^i = 1`` +# - If not maintained: ``s_{t+1}^i = \min(s_t^i + 1, n)`` with probability ``p``, else ``s_t^i`` +# +# **Cost:** +# ```math +# c(s_t, a_t) = c_m \cdot |a_t| + c_f \cdot \#\{i : s_t^i = n\} +# ``` +# +# **Objective:** +# ```math +# \min_\pi \; \mathbb{E}\!\left[\sum_{t=1}^T c(s_t, \pi(s_t))\right] +# ``` +# +# ## Key Components +# +# ### [`MaintenanceBenchmark`](@ref) +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `N` | Number of components | 2 | +# | `K` | Max simultaneous maintenance operations | 1 | +# | `n` | Degradation levels per component | 3 | +# | `p` | Degradation probability per step | 0.2 | +# | `c_f` | Failure cost per failed component | 10.0 | +# | `c_m` | Maintenance cost per maintained component | 3.0 | +# | `max_steps` | Steps per episode | 80 | +# +# ### Instance Generation +# +# Each instance has random starting degradation states uniformly drawn from ``\{1,\ldots,n\}``. +# +# ## Baseline Policies +# +# | Policy | Description | +# |--------|-------------| +# | Greedy | Maintains components in the last degradation state before failure, up to capacity | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{State}]{s_t \in \{1,\ldots,n\}^N} +# \fbox{Neural network $\varphi_w$} +# \xrightarrow[\text{Scores}]{\theta \in \mathbb{R}^N} +# \fbox{Top-K (positive)} +# \xrightarrow[\text{Maintenance}]{a_t} +# ``` +# +# **Model:** `Chain(Dense(N → N), Dense(N → N), vec)`: two-layer MLP predicting one +# urgency score per component. +# +# **Maximizer:** `TopKPositiveMaximizer(K)`: selects the ``K`` components with the +# highest positive scores for maintenance. 
diff --git a/docs/src/benchmarks/dynamic/03_dvsp.jl b/docs/src/benchmarks/dynamic/03_dvsp.jl new file mode 100644 index 0000000..aa0f2c6 --- /dev/null +++ b/docs/src/benchmarks/dynamic/03_dvsp.jl @@ -0,0 +1,131 @@ +# # Dynamic Vehicle Scheduling +# Dispatch vehicles to customers arriving over time: at each step the agent decides which +# customers to serve now and which to postpone, minimizing total travel cost. + +using DecisionFocusedLearningBenchmarks +using Plots + +b = DynamicVehicleSchedulingBenchmark() + +# ## Observable input +# +# Generate one environment and roll it out with the greedy policy to collect a sample +# trajectory. At each step the agent observes customer positions, start times, and which +# customers have reached their dispatch deadline: +policies = generate_baseline_policies(b) +env = generate_environments(b, 1)[1] +_, trajectory = evaluate_policy!(policies.greedy, env) + +# The observable state at step 1: depot (green square), must-dispatch customers +# (red stars; deadline reached), postponable customers (blue triangles): +plot_context(b, trajectory[1]) + +# ## A training sample +# +# Each step in a trajectory is a labeled tuple `(x, θ, y)` plus state and reward: +# - `x`: 27-dimensional feature vector per customer (schedule slack, travel times, reachability) +# - `θ`: prize per customer (predicted by the model; used as optimization input) +# - `y`: routes dispatched at this step +# - `instance`: full DVSP state (customer positions, deadlines, current epoch) +# - `reward`: negative travel cost incurred at this step +# +# One step with dispatched routes: +plot_sample(b, trajectory[1]) + +# Multiple steps side by side: customers accumulate and routes change over time: +plot_trajectory(b, trajectory) + +# ## DFL pipeline components + +# The DFL agent chains two components: a neural network predicting a prize per customer: +model = generate_statistical_model(b) # Dense(27 → 1) per customer: state features → prize +# and a maximizer selecting 
routes that balance collected prizes against travel costs: +maximizer = generate_maximizer(b) # prize-collecting VSP solver + +# At each step, the model assigns a prize to each postponable customer. The solver then +# selects routes maximizing collected prizes minus travel costs, deciding which customers +# to serve now and which to defer. + +# --- +# ## Problem Description +# +# ### Overview +# +# In the **Dynamic Vehicle Scheduling Problem (DVSP)**, a fleet operator must decide at +# each time step which customers to serve immediately and which to postpone. The goal is +# to serve all customers by end of the planning horizon while minimizing total travel time. +# +# The problem is characterized by: +# - **Exogenous noise**: customer arrivals are stochastic and follow a fixed distribution +# - **Combinatorial action space**: routes are built over a large set of customers +# +# ### Mathematical Formulation +# +# **State** ``s_t = (R_t, D_t, t)`` where: +# - ``R_t``: pending customers, each with coordinates, start time, service time +# - ``D_t``: must-dispatch customers (cannot be postponed further) +# - ``t``: current time step +# +# **Action** ``a_t``: a set of vehicle routes ``\{r_1, r_2, \ldots, r_k\}``, each starting +# and ending at the depot, satisfying time constraints. 
+# +# **Reward:** +# ```math +# r(s_t, a_t) = -\sum_{r \in a_t} \sum_{(i,j) \in r} d_{ij} +# ``` +# +# **Objective:** +# ```math +# \max_\pi \; \mathbb{E}\!\left[\sum_{t=1}^T r(s_t, \pi(s_t))\right] +# ``` +# +# ## Key Components +# +# ### [`DynamicVehicleSchedulingBenchmark`](@ref) +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `max_requests_per_epoch` | Maximum new customers per time step | 10 | +# | `Δ_dispatch` | Time delay between decision and dispatch | 1.0 | +# | `epoch_duration` | Duration of each time step | 1.0 | +# | `two_dimensional_features` | Use 2D instead of full 27D features | `false` | +# +# ### Features +# +# **Full features (27D per customer):** start/end times, depot travel times, slack, +# reachability ratios, quantile-based travel times to other customers. +# +# **2D features:** travel time from depot + mean travel time to others. +# +# ## Baseline Policies +# +# | Policy | Description | +# |--------|-------------| +# | Lazy | Postpones all possible customers; serves only must-dispatch | +# | Greedy | Serves all pending customers immediately | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{State}]{s_t} +# \fbox{Neural network $\varphi_w$} +# \xrightarrow[\text{Prizes}]{\theta} +# \fbox{Prize-collecting VSP} +# \xrightarrow[\text{Routes}]{a_t} +# ``` +# +# The neural network predicts a prize ``\theta_i`` for each postponable customer. +# The prize-collecting VSP solver then maximizes collected prizes minus travel costs: +# ```math +# \max_{a_t \in \mathcal{A}(s_t)} \sum_{r \in a_t} \left(\sum_{i \in r} \theta_i - \sum_{(i,j) \in r} d_{ij}\right) +# ``` +# +# **Model:** +# - 2D features: `Dense(2 → 1)` applied independently per customer +# - Full features: `Dense(27 → 1)` applied independently per customer +# +# !!! 
note "Reference" +# This problem is a simplified version of the +# [EURO-NeurIPS challenge 2022](https://euro-neurips-vrp-2022.challenges.ortec.com/), +# and solved using DFL in [Combinatorial Optimization enriched Machine Learning to solve the +# Dynamic Vehicle Routing Problem with Time Windows](https://arxiv.org/abs/2304.00789). diff --git a/docs/src/benchmarks/dynamic_assortment.md b/docs/src/benchmarks/dynamic_assortment.md deleted file mode 100644 index 6f5264c..0000000 --- a/docs/src/benchmarks/dynamic_assortment.md +++ /dev/null @@ -1,158 +0,0 @@ -# Dynamic Assortment - -The Dynamic Assortment problem is a sequential decision-making benchmark where an agent must repeatedly select which subset of items to offer to customers over time. The goal is to maximize total revenue while accounting for dynamic customer preferences that evolve based on purchase history. - -## Problem Description - -### Overview - -In the dynamic assortment problem, a retailer has access to a catalog of ``N`` items and must decide which subset of exactly ``K`` items to offer to customers at each time step. Customers make purchasing decisions according to a choice model that depends on public features ``x``: - -- **Item prices**: Fixed monetary cost of each item -- **Item features**: Static characteristics of each item (size ``d``) -- **Hype**: Dynamic popularity that increases when items are purchased recently, and decays over time if not purchased -- **Saturation**: Dynamic measure that slightly increases when specific items are purchased - -Both hype and saturation evolve over time based on the agent's assortment decisions and customer purchases, this providing an endogenous multistage stochastic optimization problem. 
- -### Mathematical Formulation - -The dynamic assortment problem can be formulated as a finite-horizon Markov Decision Process (MDP) with the following components: - -**State Space** ``\mathcal{S}``: At time step ``t``, the state ``s_t`` consists of: -```math -s_t = (p, f, h_t, \sigma_t, t, \mathcal{H}_t) -``` -where: -- ``p \in \mathbb{R}^N`` are the fixed item prices -- ``f \in \mathbb{R}^{d \times N}`` are the static item features -- ``h_t \in \mathbb{R}^N`` are the current hype levels for each item -- ``\sigma_t \in \mathbb{R}^N`` are the current saturation levels for each item -- ``t \in \{1, 2, \ldots, T\}`` is the current time step -- ``\mathcal{H}_t`` is the purchase history (last 5 purchases) - -**Action Space** ``\mathcal{A}``: The action at time ``t`` is an assortment selection: -```math -a_t \subseteq \{1, 2, \ldots, N\} \text{ such that } |a_t| = K -``` - -**Customer Choice Model**: Given assortment ``a_t``, customers choose according to a multinomial logit model: -```math -\forall i\in a_t,\, \mathbb{P}(i | a_t, s_t) = \frac{\exp(\theta_i(s_t))}{\sum_{j\in a_t} \exp(\theta_j(s_t)) + 1} -``` -```math -\mathbb{P}(\text{no purchase} | a_t, s_t) = \frac{1}{\sum_{j\in a_t} \exp(\theta_j(s_t)) + 1} -``` - -where ``\theta_i(s_t)`` is the utility of item ``i`` at state ``s_t``, computed by a hidden utility function: -```math -\theta_i(s_t) = \Phi(p_i, h_t^{(i)}, \sigma_t^{(i)}, f_{\cdot,i}) -``` - -**Transition Dynamics** ``\mathcal{P}(s_{t+1} | s_t, a_t)``: After selecting assortment ``a_t`` and observing customer choice ``i^\star \sim \mathbb{P}(\cdot | a_t, s_t)``, the state evolves as: - -1. 
**Hype Update**: For each item ``i``, compute a hype multiplier based on recent purchase history: - ```math - m^{(i)} = 1 + \sum_{k=1}^{\min(5, |\mathcal{H}_t|)} \mathbf{1}_{i = \mathcal{H}_t[-k]} \cdot \alpha_k - ``` - where ``\mathcal{H}_t[-k]`` is the ``k``-th most recent purchase, and the factors are: - ```math - \alpha_1 = 0.02, \quad \alpha_2 = \alpha_3 = \alpha_4 = \alpha_5 = -0.005 - ``` - Then update: ``h_{t+1}^{(i)} = h_t^{(i)} \times m^{(i)}`` - -2. **Saturation Update**: - ```math - \sigma_{t+1}^{(i)} = \begin{cases} - \sigma_t^{(i)} \times 1.01 & \text{if } i = i^\star \\ - \sigma_t^{(i)} & \text{otherwise} - \end{cases} - ``` - -3. **History Update**: ``\mathcal{H}_{t+1} = \text{append}(\mathcal{H}_t, i^\star)`` (keeping last 5 purchases) - -**Reward Function** ``r(s_t, a_t, s_{t+1})``: The immediate reward is the revenue from the customer's purchase: -```math -r(s_t, a_t, s_{t+1}) = \begin{cases} -p_{i^\star} & \text{if customer purchases item } i^\star \\ -0 & \text{if no purchase} -\end{cases} -``` - -**Objective**: Find a policy ``\pi: \mathcal{S} \to \mathcal{A}`` that maximizes the expected cumulative reward: -```math -\max_\pi \mathbb{E}\left[\sum_{t=1}^T r(s_t, \pi(s_t), s_{t+1}) \right] -``` - -**Terminal Condition**: The episode terminates after ``T`` time steps, with no terminal reward. 
- -## Key Components - -### [`DynamicAssortmentBenchmark`](@ref) - -The main benchmark configuration with the following parameters: - -- `N`: Number of items in the catalog (default: 20) -- `d`: Dimension of static feature vectors (default: 2) -- `K`: Assortment size constraint (default: 4) -- `max_steps`: Number of time steps per episode (default: 80) -- `customer_choice_model`: linear mapping from features to utilities -- `exogenous`: Whether dynamics are exogenous (default: false) - -### Instance Generation - -Each problem instance includes: - -- **Prices**: Random values in [1, 10] for each item, plus 0 for no-purchase -- **Features**: Random static features in [1, 10] for each item -- **Initial State**: Random starting hype and saturation values in [1, 10] - -### Environment Dynamics - -The environment tracks: -- Current time step -- Purchase history (last 5 purchases) -- Current hype and saturation for each item -- Customer utilities computed from current state - -**State Observation**: Agents observe a normalized feature vector containing: -- Current full features (prices, hype, saturation, static features) -- Change in hype/saturation from previous step -- Change in hype/saturation from initial state -- Normalized current time step - -All features are divided by 10 for normalization. - -## Benchmark Policies - -### Expert Policy - -The expert policy computes the optimal assortment by brute-force enumeration: -1. Enumerate all possible K-subsets of the N items -2. For each subset, compute expected revenue using the choice model -3. Return the subset with highest expected revenue - -This provides an optimal baseline but is computationally expensive. - -### Greedy Policy - -The greedy policy selects the K items with the highest prices, ignoring dynamic effects and customer preferences. This provides a simple baseline. 
- -## Decision-Focused Learning Policy - -```math -\xrightarrow[\text{State}]{s_t} -\fbox{Neural network $\varphi_w$} -\xrightarrow[\text{Cost vector}]{\theta} -\fbox{Top K} -\xrightarrow[\text{Assortment}]{a_t} -``` - -**Components**: - -1. **Neural Network** ``\varphi_w``: Takes the current state ``s_t`` as input and predicts item utilities ``\theta = (\theta_1, \ldots, \theta_N)`` -2. **Optimization Layer**: Selects the top ``K`` items with highest predicted utilities to form the assortment ``a_t`` - -## Reference - -Based on the paper: [Structured Reinforcement Learning for Combinatorial Decision-Making](https://arxiv.org/abs/2505.19053) diff --git a/docs/src/benchmarks/fixed_size_shortest_path.md b/docs/src/benchmarks/fixed_size_shortest_path.md deleted file mode 100644 index 049724d..0000000 --- a/docs/src/benchmarks/fixed_size_shortest_path.md +++ /dev/null @@ -1,7 +0,0 @@ -# Shortest paths - -[`FixedSizeShortestPathBenchmark`](@ref) is a benchmark problem that consists of finding the shortest path in a grid graph between the top left and bottom right corners. -In this benchmark, the grid size is the same for all instances. - -!!! warning - Documentation for this benchmark is still under development. Please refer to the source code and API for more details. \ No newline at end of file diff --git a/docs/src/benchmarks/maintenance.md b/docs/src/benchmarks/maintenance.md deleted file mode 100644 index 060099d..0000000 --- a/docs/src/benchmarks/maintenance.md +++ /dev/null @@ -1,107 +0,0 @@ -# Maintenance problem with resource constraint - -The Maintenance problem with resource constraint is a sequential decision-making benchmark where an agent must repeatedly decide which components to maintain over time. The goal is to minimize total expected cost while accounting for independent degradation of components and limited maintenance capacity. 
- - -## Problem Description - -### Overview - -In this benchmark, a system consists of ``N`` identical components, each of which can degrade over ``n`` discrete states. State ``1`` means that the component is new, state $n$ means that the component is failed. At each time step, the agent can maintain up to $K$ components. - -This forms an endogenous multistage stochastic optimization problem, where the agent must plan maintenance actions over the horizon. - -### Mathematical Formulation - -The maintenance problem can be formulated as a finite-horizon Markov Decision Process (MDP) with the following components: - -**State Space** ``\mathcal{S}``: At time step ``t``, the state ``s_t \in [1:n]^N`` is the degradation state for each component. - -**Action Space** ``\mathcal{A}``: The action at time ``t`` is the set of components that are maintained at time ``t``: -```math -a_t \subseteq \{1, 2, \ldots, N\} \text{ such that } |a_t| \leq K -``` -### Transition Dynamics - -The state transitions depend on whether a component is maintained or not: - -For each component \(i\) at time \(t\): - -- **Maintained component** (\(i \in a_t\)): - -\[ -s_{t+1}^i = 1 \quad \text{(perfect maintenance)} -\] - -- **Unmaintained component** (\(i \notin a_t\)): - -\[ -s_{t+1}^i = -\begin{cases} -\min(s_t^i + 1, n) & \text{with probability } p,\\ -s_t^i & \text{with probability } 1-p. -\end{cases} -\] - -Here, \(p\) is the degradation probability, \(s_t^i\) is the current state of component \(i\), and \(n\) is the maximum (failed) state. - ---- - -### Cost Function - -The immediate cost at time \(t\) is: - -```math -c(s_t, a_t) = \Big( c_m \cdot |a_t| + c_f \cdot \#\{ i : s_t^i = n \} \Big) -``` - -Where: - -- $c_m$ is the maintenance cost per component. -- $|a_t|$ is the number of components maintained. -- $c_f$ is the failure cost per failed component. -- $\#\{ i : s_t^i = n \}$ counts the number of components in the failed state. 
- -This formulation captures the total cost for maintaining components and penalizing failures. - -**Objective**: Find a policy $\pi: \mathcal{S} \to \mathcal{A}$ that minimizes the expected cumulative cost: -```math -\min_\pi \mathbb{E}\left[\sum_{t=1}^T c(s_t, \pi(s_t)) \right] -``` - -**Terminal Condition**: The episode terminates after $T$ time steps, with no terminal reward. - -## Key Components - -### [`MaintenanceBenchmark`](@ref) - -The main benchmark configuration with the following parameters: - -- `N`: number of components (default: 2) -- `K`: maximum number of components that can be maintained simultaneously (default: 1) -- `n`: number of degradation states per component (default: 3) -- `p`: degradation probability (default: 0.2) -- `c_f`: failure cost (default: 10.0) -- `c_m`: maintenance cost (default: 3.0) -- `max_steps`: Number of time steps per episode (default: 80) - -### Instance Generation - -Each problem instance includes: - -- **Starting State**: Random starting degradation state in $[1,n]$ for each components. - -### Environment Dynamics - -The environment tracks: -- Current time step -- Current degradation state. - -**State Observation**: Agents observe a normalized feature vector containing the degradation state of each component. - -## Benchmark Policies - -### Greedy Policy - -Greedy policy that maintains components in the last two degradation states, up to the maintenance capacity. This provides a simple baseline. - diff --git a/docs/src/benchmarks/portfolio_optimization.md b/docs/src/benchmarks/portfolio_optimization.md deleted file mode 100644 index da14f5a..0000000 --- a/docs/src/benchmarks/portfolio_optimization.md +++ /dev/null @@ -1,15 +0,0 @@ -# Portfolio Optimization - -[`PortfolioOptimizationBenchmark`](@ref) is a Markovitz portfolio optimization problem, where asset prices are unknown, and only contextual data is available to predict these prices. 
-The goal is to predict asset prices $c$ and maximize the expected return of a portfolio, subject to a risk constraint using this maximization program: -```math -\begin{aligned} -\max\quad & c^\top x\\ -\text{s.t.}\quad & x^\top \Sigma x \leq \gamma\\ -& 1^\top x \leq 1\\ -& x \geq 0 -\end{aligned} -``` - -!!! warning - Documentation for this benchmark is still under development. Please refer to the source code and API for more details. \ No newline at end of file diff --git a/docs/src/benchmarks/ranking.md b/docs/src/benchmarks/ranking.md deleted file mode 100644 index b0069e4..0000000 --- a/docs/src/benchmarks/ranking.md +++ /dev/null @@ -1,4 +0,0 @@ -# Ranking - -!!! warning - Documentation for this benchmark is still under development. Please refer to the source code and API for more details. \ No newline at end of file diff --git a/docs/src/benchmarks/static/01_argmax.jl b/docs/src/benchmarks/static/01_argmax.jl new file mode 100644 index 0000000..817ac70 --- /dev/null +++ b/docs/src/benchmarks/static/01_argmax.jl @@ -0,0 +1,83 @@ +# # Argmax +# Select the single best item from a set of `n` items, given features correlated with hidden +# item scores. This is a minimalist DFL setting: equivalent to multiclass +# classification, but with an argmax layer instead of softmax. Useful as a minimal sandbox for +# understanding DFL concepts. 
+ +using DecisionFocusedLearningBenchmarks +using Plots +using Statistics + +b = ArgmaxBenchmark(; seed=0) + +# ## Observable input +# +# At inference time the decision-maker observes only a feature matrix `x` +# (rows = features, columns = items): +dataset = generate_dataset(b, 100; seed=0) +sample = first(dataset) +plot_context(b, sample) + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: feature matrix (observable at train and test time) +# - `θ`: true item scores (training supervision only, hidden at test time) +# - `y`: optimal one-hot decision derived from `θ` +# +# The full training triple (features, true scores, and optimal decision): +plot_sample(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting scores from features: +model = generate_statistical_model(b) # linear map: features → predicted scores +# and a maximizer turning those scores into a decision: +maximizer = generate_maximizer(b) # one-hot argmax + +# A randomly initialized policy makes essentially random decisions: +θ_pred = model(sample.x) +y_pred = maximizer(θ_pred) +# +plot_sample(b, DataSample(sample; θ=θ_pred, y=y_pred)) + +# The goal of training is to find parameters that maximize accuracy. +# Current accuracy on the dataset: +mean(maximizer(model(s.x)) == s.y for s in dataset) + +# --- +# ## Problem Description +# +# In the **Argmax benchmark**, a feature matrix ``x \in \mathbb{R}^{p \times n}`` is +# observed. A hidden linear encoder maps ``x`` to a score vector +# ``\theta = \text{encoder}(x) \in \mathbb{R}^n``. The task is to select the item with +# the highest score: +# ```math +# y = \mathrm{argmax}(\theta) = \mathop{\mathrm{argmax}}\limits_{y\in\Delta^n} \theta^\top y +# ``` +# The solution ``y`` is encoded as a one-hot vector. +# The score vector ``\theta`` is never observed (only features ``x`` are available). 
+# The DFL pipeline trains a model ``f_w`` so that ``\mathrm{argmax}(f_w(x))`` matches +# ``\mathrm{argmax}(\theta)`` at decision time. +# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `instance_dim` | Number of items | 10 | +# | `nb_features` | Feature dimension `p` | 5 | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x \in \mathbb{R}^{p \times n}} +# \fbox{Linear model $f_w$} +# \xrightarrow[\text{Predicted scores}]{\theta \in \mathbb{R}^n} +# \fbox{argmax} +# \xrightarrow[\text{Selection}]{y \in \{0,1\}^n} +# ``` +# +# **Model:** `Chain(Dense(nb_features → 1; bias=false), vec)`: a single linear layer +# predicting one score per item. +# +# **Maximizer:** `one_hot_argmax`: returns a one-hot vector at the argmax index. diff --git a/docs/src/benchmarks/static/02_argmax2d.jl b/docs/src/benchmarks/static/02_argmax2d.jl new file mode 100644 index 0000000..566c3ec --- /dev/null +++ b/docs/src/benchmarks/static/02_argmax2d.jl @@ -0,0 +1,77 @@ +# # Argmax on a 2D polytope +# Select the best vertex of a random convex polytope in 2D: predict a cost direction θ from +# features, then return the vertex `v` maximizing `θᵀv`. The 2D setting makes this benchmark +# visual: the cost direction and selected vertex can be plotted directly, and the loss +# landscape can be shown as a contour plot over the 2D θ space. 
+ +using DecisionFocusedLearningBenchmarks +using Plots + +b = Argmax2DBenchmark(; seed=0) + +# ## Observable input +# +# At inference time the decision-maker observes the feature vector `x` and the polytope shape, +# but not the hidden cost direction `θ`: +dataset = generate_dataset(b, 50; seed=0) +sample = first(dataset) +plot_context(b, sample) + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: feature vector (observable at train and test time) +# - `θ`: 2D cost direction (training supervision only, hidden at test time) +# - `y`: polytope vertex maximizing `θᵀv` (optimal decision) +# - `instance` (in `context`): polytope vertices (observable problem structure) +# +# The full training triple (polytope, cost direction θ, optimal vertex y): +plot_sample(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting a 2D cost direction: +model = generate_statistical_model(b) # linear map: features → 2D cost vector +# and a maximizer selecting the best polytope vertex for that direction: +maximizer = generate_maximizer(b) # vertex maximizing θᵀv over polytope vertices + +# A randomly initialized policy predicts an arbitrary cost direction: +θ_pred = model(sample.x) +y_pred = maximizer(θ_pred; sample.context...) +plot_sample(b, DataSample(sample; θ=θ_pred, y=y_pred)) + +# --- +# ## Problem Description +# +# In the **Argmax2D benchmark**, each instance defines a random convex polytope +# ``\mathcal{Y}(x) = \mathrm{conv}(v_1, \ldots, v_m)`` in ``\mathbb{R}^2``. +# A hidden encoder maps features ``x \in \mathbb{R}^p`` to a 2D cost vector +# ``\theta \in \mathbb{R}^2``. The task is to find the polytope vertex maximizing +# the dot product: +# ```math +# y^* = \mathop{\mathrm{argmax}}\limits_{v \in \mathcal{Y}(x)} \; \theta^\top v +# ``` +# +# This is a toy 2D combinatorial optimization problem useful for visualizing +# how well a model learns the cost direction. 
+# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `nb_features` | Feature dimension `p` | 5 | +# | `polytope_vertex_range` | Number of polytope vertices (list; one value drawn at random per instance) | `[6]` | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x} +# \fbox{Linear model} +# \xrightarrow{\theta \in \mathbb{R}^2} +# \fbox{Polytope argmax} +# \xrightarrow{y} +# ``` +# +# **Model:** `Dense(nb_features → 2; bias=false)`: predicts a 2D cost direction. +# +# **Maximizer:** finds the vertex of the instance polytope with maximum dot product with θ. diff --git a/docs/src/benchmarks/static/03_ranking.jl b/docs/src/benchmarks/static/03_ranking.jl new file mode 100644 index 0000000..880344b --- /dev/null +++ b/docs/src/benchmarks/static/03_ranking.jl @@ -0,0 +1,75 @@ +# # Ranking +# Rank a set of items. Each item has a hidden score, correlated with observable input +# features. The goal is to learn to sort items by their hidden scores, using observable +# features alone. 
+ +using DecisionFocusedLearningBenchmarks +using Plots + +b = RankingBenchmark() + +# ## Observable input +# +# At inference time the decision-maker observes only the feature matrix `x` +# (rows = features, columns = items): +dataset = generate_dataset(b, 50; seed=0) +sample = first(dataset) +plot_context(b, sample) + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: feature matrix (rows = features, columns = items; observable at train and test time) +# - `θ`: true item costs (training supervision only, hidden at test time) +# - `y`: ordinal ranks derived from `θ` (`y[i] = 1` means item `i` has the lowest cost) +# +# The full training triple (features, true costs, and derived ranking): +plot_sample(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting item scores: +model = generate_statistical_model(b) # linear map: features → predicted costs +# and a maximizer ranking items by those scores: +maximizer = generate_maximizer(b) # ordinal ranking via sortperm + +# A randomly initialized policy produces an arbitrary ranking: +θ_pred = model(sample.x) +y_pred = maximizer(θ_pred) +plot_sample(b, DataSample(sample; θ=θ_pred, y=y_pred)) + +# Optimality gap on the dataset (lower is better): +compute_gap(b, dataset, model, maximizer) + +# --- +# ## Problem Description +# +# In the **Ranking benchmark**, a feature matrix ``x \in \mathbb{R}^{p \times n}`` is +# observed. A hidden linear encoder maps ``x`` to a cost vector +# ``\theta \in \mathbb{R}^n``. The task is to compute the ordinal ranking of the items +# by cost: +# ```math +# y_i = \mathrm{rank}(\theta_i \mid \theta_1, \ldots, \theta_n) = \mathop{\mathrm{argmax}}\limits_{y\in\sigma(n)} \theta^\top y +# ``` +# where ``y_i = 1`` means item ``i`` has the lowest cost. 
+# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `instance_dim` | Number of items to rank | 10 | +# | `nb_features` | Feature dimension `p` | 5 | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x} +# \fbox{Linear model} +# \xrightarrow{\theta} +# \fbox{ranking} +# \xrightarrow{y} +# ``` +# +# **Model:** `Chain(Dense(nb_features → 1; bias=false), vec)`: predicts one score per item. +# +# **Maximizer:** `ranking(θ)`: returns a vector of ordinal ranks via `invperm(sortperm(θ))`. diff --git a/docs/src/benchmarks/static/04_portfolio_optimization.jl b/docs/src/benchmarks/static/04_portfolio_optimization.jl new file mode 100644 index 0000000..a6a734b --- /dev/null +++ b/docs/src/benchmarks/static/04_portfolio_optimization.jl @@ -0,0 +1,89 @@ +# # Portfolio Optimization +# Allocate wealth across assets to maximize expected return subject to a risk constraint: +# asset returns are unknown and must be predicted from contextual features. + +using DecisionFocusedLearningBenchmarks +using Plots + +b = PortfolioOptimizationBenchmark() + +# ## Observable input +# +# At inference time the decision-maker observes only the contextual feature vector `x`: +dataset = generate_dataset(b, 20; seed=0) +sample = first(dataset) +plot_context(b, sample) + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: contextual feature vector (observable at train and test time) +# - `θ`: true expected asset returns (training supervision only, hidden at test time) +# - `y`: optimal portfolio weights solving the Markowitz QP given `θ` +# +# Top: feature vector x. Bottom left: true returns θ. 
Bottom right: optimal weights y: +plot_sample(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting expected asset returns: +model = generate_statistical_model(b) # linear map: features → predicted returns +# and a maximizer allocating the optimal portfolio given those returns: +maximizer = generate_maximizer(b) # Markowitz QP solver (Ipopt via JuMP) + +# A randomly initialized policy predicts arbitrary returns, leading to a suboptimal allocation: +θ_pred = model(sample.x) +y_pred = maximizer(θ_pred) +plot_sample(b, DataSample(sample; θ=θ_pred, y=y_pred)) + +# Optimality gap on the dataset (lower is better): +compute_gap(b, dataset, model, maximizer) + +# --- +# ## Problem Description +# +# A **Markowitz portfolio optimization** problem where asset expected returns are unknown. +# Given contextual features ``x \in \mathbb{R}^p``, the learner predicts returns +# ``\theta \in \mathbb{R}^d`` and solves: +# +# ```math +# \begin{aligned} +# \max_{y} \quad & \theta^\top y \\ +# \text{s.t.} \quad & y^\top \Sigma y \leq \gamma \\ +# & \mathbf{1}^\top y \leq 1 \\ +# & y \geq 0 +# \end{aligned} +# ``` +# +# where ``\Sigma`` is the asset covariance matrix and ``\gamma`` is the risk budget. +# The solver uses [Ipopt.jl](https://github.com/jump-dev/Ipopt.jl) via JuMP. +# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `d` | Number of assets | 50 | +# | `p` | Feature dimension | 5 | +# | `deg` | Polynomial degree for data generation | 1 | +# | `ν` | Noise hyperparameter | 1.0 | +# +# Data is generated following the process in +# [Mandi et al., 2023](https://arxiv.org/abs/2307.13565). 
+# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x \in \mathbb{R}^p} +# \fbox{Linear model} +# \xrightarrow[\text{Predicted returns}]{\theta \in \mathbb{R}^d} +# \fbox{QP solver (Ipopt)} +# \xrightarrow[\text{Portfolio}]{y \in \mathbb{R}^d} +# ``` +# +# **Model:** `Dense(p → d)`, predicts one expected return per asset. +# +# **Maximizer:** Ipopt QP solver enforcing the variance and budget constraints. +# +# !!! note "Reference" +# Mandi et al. (2023), Decision-Focused Learning: Foundations, State of the Art, Benchmark and Future Opportunities. +# [arXiv:2307.13565](https://arxiv.org/abs/2307.13565) diff --git a/docs/src/benchmarks/static/05_subset_selection.jl b/docs/src/benchmarks/static/05_subset_selection.jl new file mode 100644 index 0000000..2d1199e --- /dev/null +++ b/docs/src/benchmarks/static/05_subset_selection.jl @@ -0,0 +1,83 @@ +# # Subset Selection +# Select the `k` most valuable items from a set of `n`: items with unknown values +# must be identified from observable features alone. 
+ +using DecisionFocusedLearningBenchmarks +using Plots + +b = SubsetSelectionBenchmark(; identity_mapping=false) + +# ## Observable input +# +# At inference time the decision-maker observes only the feature vector `x`: +dataset = generate_dataset(b, 50; seed=0) +sample = first(dataset) +plot_context(b, sample) + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: item feature vector (observable at train and test time) +# - `θ`: true item values, derived from `x` via a hidden encoder (training supervision only) +# - `y`: selection indicator (`y[i] = 1` for the `k` highest-value items, 0 otherwise) +# +# The full training triple (features, hidden values, and selection): +plot_sample(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting item scores: +model = generate_statistical_model(b) # linear map: features → predicted item scores +# and a maximizer selecting the top-k items by those scores: +maximizer = generate_maximizer(b) # top-k selection + +# A randomly initialized policy selects items with no relation to their true values: +θ_pred = model(sample.x) +y_pred = maximizer(θ_pred) +plot_sample(b, DataSample(sample; θ=θ_pred, y=y_pred)) + +# Optimality gap on the dataset (lower is better): +compute_gap(b, dataset, model, maximizer) + +# --- +# ## Problem Description +# +# In the **Subset Selection benchmark**, ``n`` items have unknown values ``\theta_i``. +# A feature vector ``x \in \mathbb{R}^n`` is observed (identity mapping by default). +# The task is to select the ``k`` items with the highest values: +# ```math +# \begin{aligned} +# y = \mathrm{top}_k(\theta) = & \mathop{\mathrm{argmax}}\limits_{y \in \{0,1\}^n} \; \theta^\top y \\ +# & \quad\text{s.t.} \quad \sum_{i=1}^n y_i = k +# \end{aligned} +# ``` +# where ``y \in \{0,1\}^n`` with exactly ``k`` ones. 
+# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `n` | Total number of items | 25 | +# | `k` | Number of items to select | 5 | +# | `identity_mapping` | Use identity as the hidden mapping | `true` | +# +# When `identity_mapping=true`, features equal item values directly (`x = θ`). +# When `false`, a random linear layer is used as the hidden mapping. +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x} +# \fbox{Linear model} +# \xrightarrow{\theta} +# \fbox{top-k} +# \xrightarrow{y} +# ``` +# +# **Model:** `Dense(n → n; bias=false)`: predicts a score per item. +# +# **Maximizer:** `top_k(θ, k)`: returns a boolean vector with `true` at the `k` +# highest-scoring positions. +# +# !!! note "Reference" +# Setting from [Decision-Focused Learning: Foundations, State of the Art, Benchmark and Future Opportunities](https://arxiv.org/abs/2307.13565) diff --git a/docs/src/benchmarks/static/06_fixed_size_shortest_path.jl b/docs/src/benchmarks/static/06_fixed_size_shortest_path.jl new file mode 100644 index 0000000..593d356 --- /dev/null +++ b/docs/src/benchmarks/static/06_fixed_size_shortest_path.jl @@ -0,0 +1,86 @@ +# # Shortest Path +# Find the cheapest path from the top-left to the bottom-right of a grid graph: +# edge costs are unknown and must be predicted from instance features. 
+ +using DecisionFocusedLearningBenchmarks +using Plots + +b = FixedSizeShortestPathBenchmark() + +# ## Observable input +# +# At inference time the decision-maker observes the feature vector `x` and the fixed grid +# structure (source top-left, sink bottom-right): +dataset = generate_dataset(b, 50; seed=0) +sample = first(dataset) +plot_context(b, sample) + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: instance feature vector (observable at train and test time) +# - `θ`: true edge costs (training supervision only, hidden at test time) +# - `y`: path indicator vector (`y[e] = 1` if edge `e` is on the optimal path) +# +# Top: feature vector x. Bottom left: edge costs θ. Bottom right: optimal path y (white dots): +plot_sample(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting edge costs: +model = generate_statistical_model(b) # linear map: features → predicted edge costs +# and a maximizer finding the shortest path given those costs: +maximizer = generate_maximizer(b) # Dijkstra shortest path on the grid graph + +# A randomly initialized policy predicts arbitrary costs, yielding a near-straight path: +θ_pred = model(sample.x) +y_pred = maximizer(θ_pred) +plot_sample(b, DataSample(sample; θ=θ_pred, y=y_pred)) + +# Optimality gap on the dataset (lower is better): +compute_gap(b, dataset, model, maximizer) + +# --- +# ## Problem Description +# +# A **fixed-size grid shortest path** problem. The graph is a directed acyclic grid of +# size ``(\text{rows} \times \text{cols})``, with edges pointing right and downward. +# Edge costs ``\theta \in \mathbb{R}^E`` are unknown; only a feature vector +# ``x \in \mathbb{R}^p`` is observed. 
The task is to find the minimum-cost path from +# vertex 1 (top-left) to vertex ``V`` (bottom-right): +# ```math +# y^* = \mathop{\mathrm{argmax}}\limits_{y \in \mathcal{P}} \; -\theta^\top y +# ``` +# where ``y \in \{0,1\}^E`` indicates selected edges and ``\mathcal{P}`` is the set of +# valid source-to-sink paths. +# +# Data is generated following the process in +# [Mandi et al., 2023](https://arxiv.org/abs/2307.13565). +# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `grid_size` | Grid dimensions `(rows, cols)` | `(5, 5)` | +# | `p` | Feature dimension | 5 | +# | `deg` | Polynomial degree for cost generation | 1 | +# | `ν` | Multiplicative noise level (0 = no noise) | 0.0 | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x \in \mathbb{R}^p} +# \fbox{Linear model} +# \xrightarrow[\text{Predicted costs}]{\theta \in \mathbb{R}^E} +# \fbox{Dijkstra / Bellman-Ford} +# \xrightarrow[\text{Path}]{y \in \{0,1\}^E} +# ``` +# +# **Model:** `Chain(Dense(p → E))`: predicts one cost per edge. +# +# **Maximizer:** Dijkstra (default) or Bellman-Ford on negated weights to find the +# longest (maximum-weight) path. +# +# !!! note "Reference" +# Mandi et al. (2023), Decision-Focused Learning: Foundations, State of the Art, Benchmark and Future Opportunities. +# [arXiv:2307.13565](https://arxiv.org/abs/2307.13565) diff --git a/docs/src/benchmarks/static/07_warcraft.jl b/docs/src/benchmarks/static/07_warcraft.jl new file mode 100644 index 0000000..c4f5b04 --- /dev/null +++ b/docs/src/benchmarks/static/07_warcraft.jl @@ -0,0 +1,90 @@ +# # Warcraft +# Find the cheapest path on a 12×12 terrain map: cell travel costs are unknown and must +# be inferred from the RGB terrain image using a neural network. 
+ +using DecisionFocusedLearningBenchmarks +using Plots + +b = WarcraftBenchmark() + +# ## Observable input +# +# At inference time the decision-maker observes only the terrain image `x` (not the costs `θ`): +sample = generate_dataset(b, 1)[1] +plot_context(b, sample) + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: terrain image (12×12×3 RGB array; observable at train and test time) +# - `θ`: true cell travel costs (training supervision only, hidden at test time) +# - `y`: optimal path indicator (`y[i,j] = 1` if cell `(i,j)` is on the path) +# +# Left: terrain image. Middle: true costs θ. Right: optimal path y: +plot_sample(b, sample) + +# ## Untrained policy + +# A DFL policy chains two components: a CNN predicting cell travel costs from the terrain image: +model = generate_statistical_model(b) # ResNet18 CNN: terrain image → 12×12 cost map +# and a maximizer finding the shortest path given those costs: +maximizer = generate_maximizer(b) # Dijkstra shortest path on the 12×12 grid + +# An untrained CNN produces a near-uniform cost map, yielding a near-straight path: +θ_pred = model(sample.x) +y_pred = maximizer(θ_pred) +plot_sample(b, DataSample(sample; θ=θ_pred, y=y_pred)) + +# Optimality gap on this sample (lower is better): +compute_gap(b, [sample], model, maximizer) + +# --- +# ## Problem Description +# +# In the **Warcraft benchmark**, each instance is a 12×12 grid representing a Warcraft +# terrain map. Each cell has an unknown travel cost depending on its terrain type (forest, +# mountain, water, etc.). The task is to find the path from the top-left to the +# bottom-right corner that minimizes total travel cost. +# +# Formally, let ``\theta_{ij}`` be the (unknown) cost of cell ``(i,j)`` and +# ``y_{ij} \in \{0,1\}`` indicate whether cell ``(i,j)`` is on the path. 
The objective is: +# ```math +# y^* = \mathop{\mathrm{argmin}}\limits_{y \in \mathcal{P}} \sum_{(i,j)} \theta_{ij} \, y_{ij} +# ``` +# where ``\mathcal{P}`` is the set of valid grid paths (4-connected, source to sink). +# +# The dataset contains 10 000 labeled terrain images from the Warcraft II tileset. +# It is downloaded automatically on first use via +# [DataDeps.jl](https://github.com/oxinabox/DataDeps.jl). +# +# ## Key Components +# +# **[`WarcraftBenchmark`](@ref)** has no parameters. +# +# | Method | Description | +# |--------|-------------| +# | `generate_dataset(b, n)` | Downloads and loads `n` terrain images with true costs and paths | +# | `generate_statistical_model(b)` | ResNet18 CNN (first 5 layers + adaptive maxpool + neg) | +# | `generate_maximizer(b; dijkstra=true)` | Dijkstra or Bellman-Ford shortest path | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Terrain image}]{x \in \mathbb{R}^{12 \times 12 \times 3}} +# \fbox{ResNet18 CNN} +# \xrightarrow[\text{Cell costs}]{\theta \in \mathbb{R}^{12 \times 12}} +# \fbox{Dijkstra} +# \xrightarrow[\text{Path}]{y \in \{0,1\}^{12 \times 12}} +# ``` +# +# The CNN maps terrain pixel values to predicted cell costs, which are then passed to a +# shortest-path solver. Training end-to-end with +# [InferOpt.jl](https://github.com/JuliaDecisionFocusedLearning/InferOpt.jl) teaches +# the network to produce costs that lead to good paths, not just accurate cost estimates. +# +# !!! tip +# See the [Warcraft tutorial](../../warcraft_tutorial.md) for a complete end-to-end training +# example using `PerturbedMultiplicative` and `FenchelYoungLoss`. +# +# !!! note "Reference" +# Vlastelica et al. (2020), Differentiation of Blackbox Combinatorial Solvers, ICLR 2020. 
diff --git a/docs/src/benchmarks/stochastic/01_contextual_stochastic_argmax.jl b/docs/src/benchmarks/stochastic/01_contextual_stochastic_argmax.jl new file mode 100644 index 0000000..ba90e41 --- /dev/null +++ b/docs/src/benchmarks/stochastic/01_contextual_stochastic_argmax.jl @@ -0,0 +1,107 @@ +# # Contextual Stochastic Argmax +# Select the best item from a set of `n` items with stochastic utilities: each scenario draws +# a different utility vector, but utilities depend on observable context features. This is a +# toy benchmark designed so that a linear model can exactly recover the optimal +# context-to-utility mapping. + +using DecisionFocusedLearningBenchmarks +using Plots + +b = ContextualStochasticArgmaxBenchmark() + +# By default, `generate_dataset` returns unlabeled samples (`y = nothing`) for this benchmark. +# A `target_policy` must be provided to attach labels. Here we use the anticipative +# oracle: it returns the item with the highest realized utility for each scenario, +# giving one labeled sample per scenario per instance. +anticipative = generate_anticipative_solver(b) +policy = + (ctx, scenarios) -> + [DataSample(ctx; y=anticipative(ξ), extra=(; scenario=ξ)) for ξ in scenarios] +dataset = generate_dataset(b, 20; target_policy=policy, seed=0) +sample = first(dataset) + +# ## Observable input +# +# At inference time the model observes `x = [c_base; x_raw]`. `plot_context` shows both +# components: base utilities `c_base` (left) and context features `x_raw` (right): +plot_context(b, sample) + +# ## A training sample +# +# Stochastic benchmarks have no single ground-truth label: the optimal item depends on +# which utility scenario is realized. We label each sample with the anticipative oracle, +# which returns the best item given the realized scenario ξ. 
+#
+# Each labeled sample contains:
+# - `x`: feature vector `[c_base; x_raw]` (observable at train and test time)
+# - `y`: optimal item for the realized scenario ξ (one-hot; anticipative oracle label)
+# - `extra.scenario`: realized utility vector ξ (available only during training)
+#
+# Top: feature vector x. Bottom: realized scenario ξ acting as the utility vector,
+# with the anticipative-optimal item in red:
+plot_sample(b, DataSample(sample; θ=sample.scenario))
+
+# ## Untrained policy
+
+# A DFL policy chains two components: a statistical model predicting expected item utilities:
+model = generate_statistical_model(b) # linear map: features → predicted expected utilities
+# and a maximizer selecting the item with the highest predicted utility:
+maximizer = generate_maximizer(b) # one-hot argmax
+
+# A randomly initialized policy selects items with no relation to their expected utilities.
+# Top: feature vector x. Bottom: predicted utilities θ̂ with the selected item in red:
+θ_pred = model(sample.x)
+y_pred = maximizer(θ_pred)
+plot_sample(b, DataSample(sample; θ=θ_pred, y=y_pred))
+
+# ---
+# ## Problem Description
+#
+# ### Overview
+#
+# In the **Contextual Stochastic Argmax benchmark**, ``n`` items have random utilities
+# that depend on observable context. Per instance:
+# - ``c_\text{base} \sim U[0,1]^n``: base utilities (stored in `context`)
+# - ``x_\text{raw} \sim \mathcal{N}(0, I_d)``: observable context features
+# - Full features: ``x = [c_\text{base}; x_\text{raw}] \in \mathbb{R}^{n+d}``
+#
+# The realized utility (scenario) is drawn as:
+# ```math
+# \xi = c_\text{base} + W \, x_\text{raw} + \varepsilon, \quad \varepsilon \sim \mathcal{N}(0, \sigma^2 I)
+# ```
+# where ``W \in \mathbb{R}^{n \times d}`` is a fixed unknown perturbation matrix.
+# +# The task is to select the item with the highest realized utility: +# ```math +# y^* = \mathrm{argmax}(\xi) +# ``` +# +# A linear model ``\theta = [I \mid W] \cdot x`` can exactly recover the optimal +# solution in expectation. +# +# ## Key Parameters +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `n` | Number of items | 10 | +# | `d` | Context feature dimension | 5 | +# | `noise_std` | Noise standard deviation σ | 0.1 | +# +# ## Baseline Policies +# +# - **SAA**: selects the item with highest mean utility over available scenarios. +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x = [c_\text{base}; x_\text{raw}]} +# \fbox{Linear model} +# \xrightarrow{\theta \in \mathbb{R}^n} +# \fbox{argmax} +# \xrightarrow{y} +# ``` +# +# **Model:** `Dense(n+d → n; bias=false)`: can in principle recover the exact mapping +# ``[I \mid W]`` from training data. +# +# **Maximizer:** `one_hot_argmax`. diff --git a/docs/src/benchmarks/stochastic/02_vsp.jl b/docs/src/benchmarks/stochastic/02_vsp.jl new file mode 100644 index 0000000..c01a373 --- /dev/null +++ b/docs/src/benchmarks/stochastic/02_vsp.jl @@ -0,0 +1,125 @@ +# # Stochastic Vehicle Scheduling +# Assign vehicles to cover a set of tasks while minimizing costs under stochastic delays: +# the DFL agent learns to predict adjusted costs that implicitly hedge against uncertainty. + +using DecisionFocusedLearningBenchmarks +using Plots + +b = StochasticVehicleSchedulingBenchmark(; nb_tasks=10) + +# ## Observable input +# +# Each instance is a city with task locations and scheduled times. Task spatial positions +# and scheduled times are observable at inference time. 
+# `store_city=true` is required to visualize the map (not needed for training): +sample = generate_dataset(b, 1; store_city=true)[1] +plot_context(b, sample) + +# ## A training sample +# +# Each sample is a labeled triple `(x, θ, y)`: +# - `x`: 20-dimensional feature vector per edge, encoding schedule slack and travel times +# - `θ`: adjusted edge costs (training supervision only, hidden at test time) +# - `y`: binary assignment (`y[(u,v)] = 1` if a vehicle travels edge `(u, v)` in the schedule) +# +# Unlike static benchmarks, `y` labels are not available by default and must be attached +# via a `target_policy` (e.g., the deterministic VSP solver). Routes are visualized +# in the untrained policy section below. + +# ## Untrained policy + +# A DFL policy chains two components: a statistical model predicting adjusted edge costs: +model = generate_statistical_model(b) # linear map: task features -> adjusted edge costs +# and a maximizer solving the deterministic VSP given those costs: +maximizer = generate_maximizer(b) # deterministic VSP solver (HiGHS MIP) + +# The untrained model predicts random edge costs; the resulting schedule is arbitrary: +θ_pred = model(sample.x) +y_pred = maximizer(θ_pred; sample.context...) +plot_sample(b, DataSample(sample; θ=θ_pred, y=y_pred)) + +# --- +# ## Problem Description +# +# ### Overview +# +# In the **Vehicle Scheduling Problem (VSP)**, we consider a set of tasks ``V``. Each +# task ``v \in V`` has a scheduled beginning time ``t_v^b`` and end time ``t_v^e``, with +# ``t_v^e > t_v^b``. We denote ``t^{tr}_{(u,v)}`` the travel time from task ``u`` to task +# ``v``. A task ``v`` can follow ``u`` only if: +# ```math +# t_v^b \geq t_u^e + t^{tr}_{(u,v)} +# ``` +# +# An instance of VSP can be modeled as an acyclic directed graph where nodes are tasks +# and edges represent feasible successions. A solution is a set of disjoint paths such +# that all tasks are fulfilled exactly once to minimize total costs. 
+# +# In the **Stochastic VSP (StoVSP)**, after the scheduling decision is set, random delays +# propagate along vehicle tours. The objective becomes minimizing base costs plus expected +# total delay costs over scenarios. +# +# ### Mathematical Formulation +# +# **Variables:** Let ``y_{u,v} \in \{0,1\}`` indicate if a vehicle performs task ``v`` +# immediately after task ``u``. +# +# **Delay Propagation:** For each task ``v`` in scenario ``s``: +# - ``\gamma_v^s``: intrinsic delay of task ``v`` +# - ``d_v^s``: total accumulated delay +# - ``\delta_{u,v}^s = t_v^b - (t_u^e + t^{tr}_{(u,v)})``: slack time +# +# ```math +# d_v^s = \gamma_v^s + \max(d_u^s - \delta_{u,v}^s,\; 0) +# ``` +# +# **Objective:** +# ```math +# \min_{y} \; \sum_{(u,v)} c_{u,v} \, y_{u,v} + \mathbb{E}_{s \in S}\!\left[\sum_v C_d \, d_v^s\right] +# ``` +# +# ## Key Components +# +# ### [`StochasticVehicleSchedulingBenchmark`](@ref) +# +# | Parameter | Description | Default | +# |-----------|-------------|---------| +# | `nb_tasks` | Number of tasks per instance | 25 | +# | `nb_scenarios` | Number of scenarios for objective evaluation | 10 | +# +# ### Instance Generation +# +# Each instance simulates a geographic city with depots and task locations. Tasks have +# realistic scheduled start/end times. Scenarios are random intrinsic delays ``\gamma`` +# drawn from a Log-Normal distribution. Feature vectors are 20-dimensional. 
+# +# ## Baseline Policies +# +# | Policy | Description | +# |--------|-------------| +# | `svs_deterministic_policy` | Solves the deterministic VSP, ignoring delays | +# | `svs_saa_policy` | SAA via column generation over ``K`` scenarios | +# | `svs_saa_mip_policy` | Exact SAA via compact MIP formulation | +# | `svs_local_search_policy` | Heuristic local search over sampled scenarios | +# +# ## DFL Policy +# +# ```math +# \xrightarrow[\text{Features}]{x \in \mathbb{R}^{20}} +# \fbox{Linear model $\varphi_w$} +# \xrightarrow[\text{Predicted cost}]{c} +# \fbox{Deterministic VSP solver} +# \xrightarrow[\text{Routes}]{y} +# ``` +# +# By training end-to-end with the deterministic solver, the linear model learns adjusted +# costs ``c`` that implicitly account for expected stochastic delays, while keeping +# the fast deterministic solver at inference time. +# +# **Model:** `Chain(Dense(20 -> 1; bias=false), vec)`: predicts one adjusted cost per edge. +# +# **Maximizer:** `StochasticVehicleSchedulingMaximizer`: HiGHS MIP solver on the +# deterministic VSP instance. +# +# !!! note "Reference" +# Full details on this problem can be found in [Learning to Approximate Industrial Problems by Operations Research Classic Problems](https://hal.science/hal-02396091/document) diff --git a/docs/src/benchmarks/subset_selection.md b/docs/src/benchmarks/subset_selection.md deleted file mode 100644 index 918e424..0000000 --- a/docs/src/benchmarks/subset_selection.md +++ /dev/null @@ -1,13 +0,0 @@ -# Subset Selection - -[`SubsetSelectionBenchmark`](@ref) is the most trivial benchmark problem in this package. -It is minimalistic and serves as a simple example for debugging and testing purposes. - -## Description -We have a set of ``n`` items, each item having an unknown value. -We want to select a subset of ``k`` items that maximizes the sum of the values of the selected items. 
- -As input, instead of the items costs, we are given a feature vector, such that an unknown linear mapping between the feature vector and the value of the items exists. - -By default, this linear mapping is the identity mapping, i.e., the value of each item is equal to the value of the corresponding feature vector element. -However, this mapping can be changed by setting the `identity_mapping` parameter to `false`. diff --git a/docs/src/benchmarks/vsp.md b/docs/src/benchmarks/vsp.md deleted file mode 100644 index adcb772..0000000 --- a/docs/src/benchmarks/vsp.md +++ /dev/null @@ -1,6 +0,0 @@ -# Stochastic Vehicle Scheduling - -[`StochasticVehicleSchedulingBenchmark`](@ref). - -!!! warning - Documentation for this benchmark is still under development. Please refer to the source code and API for more details. diff --git a/docs/src/benchmarks/warcraft.md b/docs/src/benchmarks/warcraft.md deleted file mode 100644 index c78850e..0000000 --- a/docs/src/benchmarks/warcraft.md +++ /dev/null @@ -1,3 +0,0 @@ -# Warcraft - -See the tutorial for a full demo of [`WarcraftBenchmark`](@ref). diff --git a/docs/src/custom_benchmarks.md b/docs/src/custom_benchmarks.md index 968d9e6..90a12c0 100644 --- a/docs/src/custom_benchmarks.md +++ b/docs/src/custom_benchmarks.md @@ -75,8 +75,8 @@ is_minimization_problem(bench::MyBenchmark) -> Bool # default: true (minimizat objective_value(bench::MyBenchmark, sample::DataSample, y) -> Real compute_gap(bench::MyBenchmark, dataset, model, maximizer) -> Float64 has_visualization(bench::MyBenchmark) -> Bool # default: false; return true when plot methods are implemented/available -plot_instance(bench::MyBenchmark, sample::DataSample; kwargs...) -plot_solution(bench::MyBenchmark, sample::DataSample; kwargs...) +plot_context(bench::MyBenchmark, sample::DataSample; kwargs...) +plot_sample(bench::MyBenchmark, sample::DataSample; kwargs...) 
``` --- diff --git a/docs/src/index.md b/docs/src/index.md index 4294eb6..4ffb150 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -32,12 +32,12 @@ Where: ## Package Overview -**DecisionFocusedLearningBenchmarks.jl** provides a collection of benchmark problems for evaluating decision-focused learning algorithms. The package offers: +**DecisionFocusedLearningBenchmarks.jl** provides a collection of benchmark problems for evaluating decision-focused learning algorithms. The package contains: -- **Collection of benchmark problems** spanning diverse applications +- A **collection of benchmark problems** spanning diverse applications - **Common tools** for creating datasets, statistical models, and optimization algorithms -- **Generic interface** for building custom benchmarks -- Compatibility with [InferOpt.jl](https://github.com/JuliaDecisionFocusedLearning/InferOpt.jl) and the whole [JuliaDecisionFocusedLearning](https://github.com/JuliaDecisionFocusedLearning) ecosystem +- A **generic interface** for building custom benchmarks +- **Compatibility** with [InferOpt.jl](https://github.com/JuliaDecisionFocusedLearning/InferOpt.jl) and the whole [JuliaDecisionFocusedLearning](https://github.com/JuliaDecisionFocusedLearning) ecosystem ## Benchmark Categories @@ -53,8 +53,9 @@ Single-stage optimization problems with no randomness involved: - [`FixedSizeShortestPathBenchmark`](@ref): find shortest path on grid graphs with fixed size - [`WarcraftBenchmark`](@ref): shortest path on image maps -### Stochastic Benchmarks (`AbstractStochasticBenchmark`) +### Stochastic Benchmarks (`AbstractStochasticBenchmark`) Single-stage optimization problems under uncertainty: +- [`ContextualStochasticArgmaxBenchmark`](@ref): contextual argmax with stochastic utilities - [`StochasticVehicleSchedulingBenchmark`](@ref): stochastic vehicle scheduling under delay uncertainty ### Dynamic Benchmarks (`AbstractDynamicBenchmark`) @@ -65,6 +66,13 @@ Multi-stage sequential decision-making 
problems: ## Getting Started +First, make sure to install the package from the Julia registry: + +```julia +using Pkg +Pkg.add("DecisionFocusedLearningBenchmarks") +``` + In a few lines of code, you can create benchmark instances, generate datasets, initialize learning components, and evaluate performance, using the same syntax across all benchmarks: ```julia diff --git a/docs/src/tutorials/warcraft_tutorial.jl b/docs/src/tutorials/warcraft_tutorial.jl index b801d7a..b3a2368 100644 --- a/docs/src/tutorials/warcraft_tutorial.jl +++ b/docs/src/tutorials/warcraft_tutorial.jl @@ -22,7 +22,7 @@ dataset = generate_dataset(b, 50); # Subdatasets can be created through regular slicing: train_dataset, test_dataset = dataset[1:45], dataset[46:50] -# And getting an individual sample will return a [`DataSample`](@ref) with four fields: `x`, `info`, `θ`, and `y`: +# And getting an individual sample will return a [`DataSample`](@ref) with five fields: `x`, `θ`, `y`, `context`, and `extra`: sample = test_dataset[1] # `x` correspond to the input features, i.e. the input image (3D array) in the Warcraft benchmark case: x = sample.x @@ -32,9 +32,11 @@ x = sample.x y_true = sample.y # `context` is not used in this benchmark (no solver kwargs needed), so it is empty: isempty(sample.context) +# `extra` is also not used in this benchmark, so it is empty as well: +isempty(sample.extra) -# For some benchmarks, we provide the following plotting method [`plot_solution`](@ref) to visualize the data: -plot_solution(b, sample) +# For some benchmarks, we provide the following plotting method [`plot_sample`](@ref) to visualize the data: +plot_sample(b, sample) # We can see here the terrain image, the true terrain weights, and the true shortest path avoiding the high cost cells. 
 # ## Building a pipeline
 
@@ -48,10 +50,10 @@ model = generate_statistical_model(b)
 # Finally, the [`generate_maximizer`](@ref) method can be used to generate a combinatorial optimization algorithm that takes the predicted cell weights as input and returns the corresponding shortest path:
 maximizer = generate_maximizer(b; dijkstra=true)
-# In the case o fthe Warcraft benchmark, the method has an additional keyword argument to chose the algorithm to use: Dijkstra's algorithm or Bellman-Ford algorithm.
+# In the case of the Warcraft benchmark, the method has an additional keyword argument to choose the algorithm to use: Dijkstra's algorithm or Bellman-Ford algorithm.
 y = maximizer(θ)
 # As we can see, currently the pipeline predicts random noise as cell weights, and therefore the maximizer returns a straight line path.
-plot_solution(b, DataSample(; x, θ, y))
+plot_sample(b, DataSample(; x, θ, y))
 
 # We can evaluate the current pipeline performance using the optimality gap metric:
 starting_gap = compute_gap(b, test_dataset, model, maximizer)
@@ -85,7 +87,7 @@ final_gap = compute_gap(b, test_dataset, model, maximizer)
 #
 θ = model(x)
 y = maximizer(θ)
-plot_solution(b, DataSample(; x, θ, y))
+plot_sample(b, DataSample(; x, θ, y))
 
 using Test #src
 @test final_gap < starting_gap #src
diff --git a/docs/src/using_benchmarks.md b/docs/src/using_benchmarks.md
index d9ab6fc..e64f6bb 100644
--- a/docs/src/using_benchmarks.md
+++ b/docs/src/using_benchmarks.md
@@ -190,9 +190,9 @@ dataset = generate_dataset(bench, 10)
 sample = dataset[1]
 
 has_visualization(bench) # true
-plot_instance(bench, sample) # problem geometry only
-plot_solution(bench, sample) # sample.y overlaid on the instance
-plot_solution(bench, sample, y) # convenience 3-arg form: override y before plotting
+plot_context(bench, sample) # problem geometry only
+plot_sample(bench, sample) # sample.y overlaid on the instance
+plot_sample(bench, sample, y) # convenience 3-arg form: override y before plotting
 
 # Dynamic
benchmarks only traj = generate_anticipative_solver(bench)(env) @@ -202,8 +202,8 @@ gif(anim, "episode.gif") ``` - `has_visualization(bench)`: returns `true` for benchmarks that implement plot support (if Plots is loaded). -- `plot_instance(bench, sample; kwargs...)`: renders the problem geometry without any solution. -- `plot_solution(bench, sample; kwargs...)`: renders `sample.y` overlaid on the instance. -- `plot_solution(bench, sample, y; kwargs...)`: 3-arg convenience form that overrides `y` before plotting. +- `plot_context(bench, sample; kwargs...)`: renders the problem geometry without any solution. +- `plot_sample(bench, sample; kwargs...)`: renders `sample.y` overlaid on the instance. +- `plot_sample(bench, sample, y; kwargs...)`: 3-arg convenience form that overrides `y` before plotting. - `plot_trajectory(bench, traj; kwargs...)`: dynamic benchmarks only; produces a grid of per-epoch subplots. - `animate_trajectory(bench, traj; kwargs...)`: dynamic benchmarks only, returns a `Plots.Animation` that can be saved with `gif(anim, "file.gif")`. diff --git a/ext/DFLBenchmarksPlotsExt.jl b/ext/DFLBenchmarksPlotsExt.jl index 0a5caae..117d174 100644 --- a/ext/DFLBenchmarksPlotsExt.jl +++ b/ext/DFLBenchmarksPlotsExt.jl @@ -5,25 +5,23 @@ using DocStringExtensions: TYPEDSIGNATURES using LaTeXStrings: @L_str using Plots import DecisionFocusedLearningBenchmarks: - has_visualization, plot_instance, plot_solution, plot_trajectory, animate_trajectory + has_visualization, plot_context, plot_sample, plot_trajectory, animate_trajectory +function _step_str(sample::DataSample) + return hasproperty(sample, :step) ? 
" (step $(sample.step))" : "" +end + +include("plots/argmax_plots.jl") include("plots/argmax2d_plots.jl") +include("plots/ranking_plots.jl") +include("plots/subset_selection_plots.jl") +include("plots/portfolio_plots.jl") +include("plots/shortest_path_plots.jl") +include("plots/contextual_stochastic_argmax_plots.jl") include("plots/warcraft_plots.jl") include("plots/svs_plots.jl") include("plots/dvs_plots.jl") - -""" - plot_solution(bench::AbstractBenchmark, sample::DataSample, y; kwargs...) - -Reconstruct a new sample with `y` overridden and delegate to the 2-arg -[`plot_solution`](@ref). Only available when `Plots` is loaded. -""" -function plot_solution(bench::AbstractBenchmark, sample::DataSample, y; kwargs...) - return plot_solution( - bench, - DataSample(; sample.context..., x=sample.x, θ=sample.θ, y=y, extra=sample.extra); - kwargs..., - ) -end +include("plots/dynamic_assortment_plots.jl") +include("plots/maintenance_plots.jl") end diff --git a/ext/plots/argmax2d_plots.jl b/ext/plots/argmax2d_plots.jl index cdb9800..d609213 100644 --- a/ext/plots/argmax2d_plots.jl +++ b/ext/plots/argmax2d_plots.jl @@ -39,28 +39,21 @@ function _plot_y!(pl, y) color="#CB3C33", markersize=9, markershape=:square, - label=L"f(\theta)", + label=L"y = \mathrm{argmax}_v\; \theta^\top v", ) end has_visualization(::Argmax2DBenchmark) = true -function plot_instance(::Argmax2DBenchmark, sample::DataSample; kwargs...) +function plot_context(::Argmax2DBenchmark, sample::DataSample; kwargs...) pl = _init_plot(; kwargs...) _plot_polytope!(pl, sample.instance) return pl end -function plot_solution(::Argmax2DBenchmark, sample::DataSample; kwargs...) +function plot_sample(::Argmax2DBenchmark, sample::DataSample; kwargs...) pl = _init_plot(; kwargs...) _plot_polytope!(pl, sample.instance) _plot_objective!(pl, sample.θ) return _plot_y!(pl, sample.y) end - -function plot_solution(::Argmax2DBenchmark, sample::DataSample, y; θ=sample.θ, kwargs...) - pl = _init_plot(; kwargs...) 
- _plot_polytope!(pl, sample.instance) - _plot_objective!(pl, θ) - return _plot_y!(pl, y) -end diff --git a/ext/plots/argmax_plots.jl b/ext/plots/argmax_plots.jl new file mode 100644 index 0000000..5b4ba13 --- /dev/null +++ b/ext/plots/argmax_plots.jl @@ -0,0 +1,57 @@ +has_visualization(::ArgmaxBenchmark) = true + +""" +$TYPEDSIGNATURES + +Plot the input features as a heatmap. Columns correspond to items, rows correspond to features. +""" +function plot_context(::ArgmaxBenchmark, sample::DataSample; kwargs...) + x = sample.x # nb_features × n + n = size(x, 2) + return Plots.heatmap( + x; + xlabel="Item", + ylabel="Feature", + title="Features x (observable input)", + xticks=1:n, + kwargs..., + ) +end + +""" +$TYPEDSIGNATURES + +Plot the features `x` as a heatmap, the scores `θ` as a bar chart, and the +decision `y` as a one-hot heatmap. All three share the same item axis. +""" +function plot_sample(::ArgmaxBenchmark, sample::DataSample; kwargs...) + x = sample.x # nb_features × n + θ = sample.θ # length n + y = sample.y # one-hot, length n + n = length(θ) + + p1 = Plots.heatmap( + x; ylabel="Feature", title="x (features, observable)", xticks=(1:n, fill("", n)) + ) + p2 = Plots.bar( + 1:n, + Float64.(θ); + legend=false, + ylabel="Score", + title="θ (scores)", + xticks=(1:n, fill("", n)), + ) + p3 = Plots.heatmap( + reshape(Float64.(y), 1, n); + xlabel="Item", + ylabel="y", + title="y (decision, one-hot)", + yticks=false, + xticks=1:n, + color=:Greens, + colorbar=false, + ) + + l = Plots.@layout [a{0.55h}; b{0.3h}; c{0.15h}] + return Plots.plot(p1, p2, p3; layout=l, size=(600, 480), kwargs...) 
+end diff --git a/ext/plots/contextual_stochastic_argmax_plots.jl b/ext/plots/contextual_stochastic_argmax_plots.jl new file mode 100644 index 0000000..c62234e --- /dev/null +++ b/ext/plots/contextual_stochastic_argmax_plots.jl @@ -0,0 +1,62 @@ +has_visualization(::ContextualStochasticArgmaxBenchmark) = true + +function plot_context(::ContextualStochasticArgmaxBenchmark, sample::DataSample; kwargs...) + c_base = sample.c_base # base utilities (first n components of x) + x_raw = sample.x_raw # context features (last d components of x) + n = length(c_base) + d = length(x_raw) + + p1 = Plots.bar( + 1:n, + c_base; + legend=false, + xlabel="Item", + ylabel="Base utility", + title="c_base (base utilities)", + color=:steelblue, + ) + p2 = Plots.bar( + 1:d, + x_raw; + legend=false, + xlabel="Feature", + ylabel="Value", + title="x_raw (context features)", + color=:darkorange, + ) + return Plots.plot(p1, p2; layout=(1, 2), size=(800, 300), kwargs...) +end + +function plot_sample(::ContextualStochasticArgmaxBenchmark, sample::DataSample; kwargs...) + x = sample.x + θ = sample.θ + y = sample.y + n_x = length(x) + n = length(θ) + n_c = length(sample.c_base) + + x_colors = vcat(fill(:steelblue, n_c), fill(:darkorange, n_x - n_c)) + p_x = Plots.bar( + 1:n_x, + x; + color=x_colors, + legend=false, + xlabel="Feature index", + ylabel="Value", + title="x (blue = c_base, orange = x_raw)", + ) + + colors = [y[i] > 0 ? :firebrick : :steelblue for i in 1:n] + p_θ = Plots.bar( + 1:n, + θ; + color=colors, + legend=false, + xlabel="Item", + ylabel="Utility", + title="θ (selected item in red)", + ) + + l = Plots.@layout [a{0.4h}; b] + return Plots.plot(p_x, p_θ; layout=l, size=(700, 500), kwargs...) 
+end diff --git a/ext/plots/dvs_plots.jl b/ext/plots/dvs_plots.jl index 0b61a5e..7352593 100644 --- a/ext/plots/dvs_plots.jl +++ b/ext/plots/dvs_plots.jl @@ -3,43 +3,24 @@ using Printf: @sprintf has_visualization(::DynamicVehicleSchedulingBenchmark) = true -# ── helpers (moved from static_vsp/plot.jl) ───────────────────────────────── - -function _plot_static_instance( - x_depot, - y_depot, - x_customers, - y_customers; - customer_markersize=4, - depot_markersize=7, - alpha_depot=0.8, - customer_color=:lightblue, - depot_color=:lightgreen, - kwargs..., -) - fig = Plots.plot(; - legend=:topleft, xlabel="x coordinate", ylabel="y coordinate", kwargs... - ) - Plots.scatter!( - fig, - x_customers, - y_customers; - label="Customers", - markercolor=customer_color, - marker=:circle, - markersize=customer_markersize, - ) - Plots.scatter!( - fig, - [x_depot], - [y_depot]; - label="Depot", - markercolor=depot_color, - marker=:rect, - markersize=depot_markersize, - alpha=alpha_depot, - ) - return fig +# ── helpers ──────────────────────────────────────────────────────────────────── + +function _compute_bounds(pd; margin=0.05, legend_margin_factor=0.15) + x_min = minimum(min(data.x_depot, minimum(data.x_customers)) for data in pd) + x_max = maximum(max(data.x_depot, maximum(data.x_customers)) for data in pd) + y_min = minimum(min(data.y_depot, minimum(data.y_customers)) for data in pd) + y_max = maximum(max(data.y_depot, maximum(data.y_customers)) for data in pd) + + xlims = (x_min - margin, x_max + margin) + y_range = y_max - y_min + 2 * margin + legend_margin = y_range * legend_margin_factor + ylims = (y_min - margin, y_max + margin + legend_margin) + + min_start_time = minimum(minimum(data.start_times) for data in pd) + max_start_time = maximum(maximum(data.start_times) for data in pd) + clims = (min_start_time, max_start_time) + + return (; xlims, ylims, clims) end # ── plot_state ─────────────────────────────────────────────────────────────── @@ -69,21 +50,16 @@ function 
plot_state( state ) - plot_args = Dict( - :legend => :topleft, :title => "DVSP State - Epoch $(state.current_epoch)" + xlabel = show_axis_labels ? "x coordinate" : "" + ylabel = show_axis_labels ? "y coordinate" : "" + fig = Plots.plot(; + legend=:topleft, + title="DVSP State - Epoch $(state.current_epoch)", + xlabel=xlabel, + ylabel=ylabel, + kwargs..., ) - if show_axis_labels - plot_args[:xlabel] = "x coordinate" - plot_args[:ylabel] = "y coordinate" - end - - for (k, v) in kwargs - plot_args[k] = v - end - - fig = Plots.plot(; plot_args...) - Plots.scatter!( fig, [x_depot], @@ -96,45 +72,39 @@ function plot_state( markerstrokewidth=markerstrokewidth, ) - scatter_must_dispatch_args = Dict( - :label => "Must-dispatch customers", - :markercolor => must_dispatch_color, - :marker => must_dispatch_marker, - :markersize => customer_markersize, - :markerstrokewidth => markerstrokewidth, - ) - - scatter_postponable_args = Dict( - :label => "Postponable customers", - :markercolor => postponable_color, - :marker => postponable_marker, - :markersize => customer_markersize, - :markerstrokewidth => markerstrokewidth, - ) - if show_colorbar - scatter_must_dispatch_args[:marker_z] = start_times[is_must_dispatch] - scatter_postponable_args[:marker_z] = start_times[.!is_must_dispatch] - scatter_postponable_args[:colormap] = :plasma - scatter_must_dispatch_args[:colormap] = :plasma - scatter_postponable_args[:colorbar] = :right - scatter_must_dispatch_args[:colorbar] = :right + colorbar_args = if show_colorbar + (; colormap=:plasma, colorbar=:right) + else + (;) end - if length(x_customers[is_must_dispatch]) > 0 + if any(is_must_dispatch) Plots.scatter!( fig, x_customers[is_must_dispatch], y_customers[is_must_dispatch]; - scatter_must_dispatch_args..., + label="Must-dispatch", + markercolor=must_dispatch_color, + marker=must_dispatch_marker, + markersize=customer_markersize, + markerstrokewidth=markerstrokewidth, + marker_z=show_colorbar ? 
start_times[is_must_dispatch] : nothing, + colorbar_args..., ) end - if length(x_customers[.!is_must_dispatch]) > 0 + if any(.!is_must_dispatch) Plots.scatter!( fig, x_customers[.!is_must_dispatch], y_customers[.!is_must_dispatch]; - scatter_postponable_args..., + label="Postponable", + markercolor=postponable_color, + marker=postponable_marker, + markersize=customer_markersize, + markerstrokewidth=markerstrokewidth, + marker_z=show_colorbar ? start_times[.!is_must_dispatch] : nothing, + colorbar_args..., ) end @@ -194,13 +164,13 @@ end # ── interface methods ──────────────────────────────────────────────────────── -function plot_instance( +function plot_context( bench::DynamicVehicleSchedulingBenchmark, sample::DataSample; kwargs... ) return plot_state(sample.instance; kwargs...) end -function plot_solution( +function plot_sample( bench::DynamicVehicleSchedulingBenchmark, sample::DataSample; kwargs... ) return plot_routes(sample.instance, sample.y; reward=sample.reward, kwargs...) @@ -234,19 +204,7 @@ function plot_trajectory( end rows = ceil(Int, n_epochs / cols) - x_min = minimum(min(data.x_depot, minimum(data.x_customers)) for data in pd) - x_max = maximum(max(data.x_depot, maximum(data.x_customers)) for data in pd) - y_min = minimum(min(data.y_depot, minimum(data.y_customers)) for data in pd) - y_max = maximum(max(data.y_depot, maximum(data.y_customers)) for data in pd) - - xlims = (x_min - margin, x_max + margin) - y_range = y_max - y_min + 2 * margin - legend_margin = y_range * legend_margin_factor - ylims = (y_min - margin, y_max + margin + legend_margin) - - min_start_time = minimum(minimum(data.start_times) for data in pd) - max_start_time = maximum(maximum(data.start_times) for data in pd) - clims = (min_start_time, max_start_time) + (; xlims, ylims, clims) = _compute_bounds(pd; margin, legend_margin_factor) plots = map(1:n_epochs) do i sample = traj[i] @@ -316,19 +274,7 @@ function animate_trajectory( pd = DVS.build_plot_data(traj) epoch_costs = 
[-sample.reward for sample in traj] - x_min = minimum(min(data.x_depot, minimum(data.x_customers)) for data in pd) - x_max = maximum(max(data.x_depot, maximum(data.x_customers)) for data in pd) - y_min = minimum(min(data.y_depot, minimum(data.y_customers)) for data in pd) - y_max = maximum(max(data.y_depot, maximum(data.y_customers)) for data in pd) - - xlims = (x_min - margin, x_max + margin) - y_range = y_max - y_min + 2 * margin - legend_margin = y_range * legend_margin_factor - ylims = (y_min - margin, y_max + margin + legend_margin) - - min_start_time = minimum(minimum(data.start_times) for data in pd) - max_start_time = maximum(maximum(data.start_times) for data in pd) - clims = (min_start_time, max_start_time) + (; xlims, ylims, clims) = _compute_bounds(pd; margin, legend_margin_factor) if show_cost_bar x_min, x_max = xlims diff --git a/ext/plots/dynamic_assortment_plots.jl b/ext/plots/dynamic_assortment_plots.jl new file mode 100644 index 0000000..3aa310d --- /dev/null +++ b/ext/plots/dynamic_assortment_plots.jl @@ -0,0 +1,49 @@ +has_visualization(::DynamicAssortmentBenchmark) = true + +function plot_context(::DynamicAssortmentBenchmark, sample::DataSample; kwargs...) + prices = sample.instance[1][1, :] .* 10 + N = length(prices) + return Plots.bar( + 1:N, + prices; + legend=false, + xlabel="Item", + ylabel="Price", + title="Item prices$(_step_str(sample))", + color=:steelblue, + kwargs..., + ) +end + +function plot_sample(::DynamicAssortmentBenchmark, sample::DataSample; kwargs...) + prices = sample.instance[1][1, :] .* 10 + y = sample.y + N = length(prices) + colors = [y[i] ? 
:seagreen : :lightgray for i in 1:N] + return Plots.bar( + 1:N, + prices; + legend=false, + xlabel="Item", + ylabel="Price", + title="Assortment$(_step_str(sample))", + color=colors, + kwargs..., + ) +end + +function plot_trajectory( + bench::DynamicAssortmentBenchmark, + trajectory::Vector{<:DataSample}; + max_steps=6, + cols=3, + kwargs..., +) + n = min(length(trajectory), max_steps) + rows = ceil(Int, n / cols) + steps = round.(Int, range(1, length(trajectory); length=n)) + plots = [plot_sample(bench, trajectory[t]) for t in steps] + return Plots.plot( + plots...; layout=(rows, cols), size=(cols * 300, rows * 250), kwargs... + ) +end diff --git a/ext/plots/maintenance_plots.jl b/ext/plots/maintenance_plots.jl new file mode 100644 index 0000000..490a166 --- /dev/null +++ b/ext/plots/maintenance_plots.jl @@ -0,0 +1,57 @@ +has_visualization(::MaintenanceBenchmark) = true + +function _degradation_colors(state, n) + return [s == n ? :firebrick : :steelblue for s in state] +end + +function plot_context(bench::MaintenanceBenchmark, sample::DataSample; kwargs...) + state = sample.instance + N = length(state) + n = bench.n + return Plots.bar( + 1:N, + state; + legend=false, + xlabel="Component", + ylabel="Degradation level", + title="Degradation state$(_step_str(sample))", + ylim=(0, n + 0.5), + color=_degradation_colors(state, n), + kwargs..., + ) +end + +function plot_sample(bench::MaintenanceBenchmark, sample::DataSample; kwargs...) + state = sample.instance + y = sample.y + N = length(state) + n = bench.n + colors = [y[i] ? 
:seagreen : c for (i, c) in enumerate(_degradation_colors(state, n))] + return Plots.bar( + 1:N, + state; + legend=false, + xlabel="Component", + ylabel="Degradation level", + title="Maintenance$(_step_str(sample))", + ylim=(0, n + 0.5), + color=colors, + kwargs..., + ) +end + +function plot_trajectory( + bench::MaintenanceBenchmark, + trajectory::Vector{<:DataSample}; + max_steps=6, + cols=3, + kwargs..., +) + n = min(length(trajectory), max_steps) + rows = ceil(Int, n / cols) + steps = round.(Int, range(1, length(trajectory); length=n)) + plots = [plot_sample(bench, trajectory[t]) for t in steps] + return Plots.plot( + plots...; layout=(rows, cols), size=(cols * 300, rows * 250), kwargs... + ) +end diff --git a/ext/plots/portfolio_plots.jl b/ext/plots/portfolio_plots.jl new file mode 100644 index 0000000..84df40a --- /dev/null +++ b/ext/plots/portfolio_plots.jl @@ -0,0 +1,57 @@ +has_visualization(::PortfolioOptimizationBenchmark) = true + +function plot_context(::PortfolioOptimizationBenchmark, sample::DataSample; kwargs...) + x = sample.x + p = length(x) + return Plots.bar( + 1:p, + Float64.(x); + legend=false, + xlabel="Feature", + ylabel="Value", + title="Features x (observable input)", + color=:steelblue, + xticks=1:p, + kwargs..., + ) +end + +function plot_sample(::PortfolioOptimizationBenchmark, sample::DataSample; kwargs...) 
+ x = sample.x + θ = sample.θ + y = sample.y + p = length(x) + d = length(θ) + + p_x = Plots.bar( + 1:p, + Float64.(x); + legend=false, + xlabel="Feature", + ylabel="Value", + title="x (features, observable)", + color=:steelblue, + xticks=1:p, + ) + p1 = Plots.bar( + 1:d, + θ; + legend=false, + xlabel="Asset", + ylabel="Expected return", + title="θ (expected returns)", + color=:steelblue, + ) + p2 = Plots.bar( + 1:d, + y; + legend=false, + xlabel="Asset", + ylabel="Portfolio weight", + title="y (portfolio weights)", + color=:seagreen, + ) + + l = Plots.@layout [a{0.3h}; [b c]] + return Plots.plot(p_x, p1, p2; layout=l, size=(800, 500), kwargs...) +end diff --git a/ext/plots/ranking_plots.jl b/ext/plots/ranking_plots.jl new file mode 100644 index 0000000..2eda042 --- /dev/null +++ b/ext/plots/ranking_plots.jl @@ -0,0 +1,46 @@ +has_visualization(::RankingBenchmark) = true + +function plot_context(::RankingBenchmark, sample::DataSample; kwargs...) + x = sample.x # nb_features × n + n = size(x, 2) + return Plots.heatmap( + x; + xlabel="Item", + ylabel="Feature", + title="Features x (observable input)", + xticks=1:n, + kwargs..., + ) +end + +function plot_sample(::RankingBenchmark, sample::DataSample; kwargs...) + x = sample.x # nb_features × n + θ = sample.θ # length n + y = sample.y # y[i] = rank of item i (1 = best) + n = length(θ) + + p1 = Plots.heatmap( + x; ylabel="Feature", title="x (features, observable)", xticks=(1:n, fill("", n)) + ) + p2 = Plots.bar( + 1:n, + Float64.(θ); + legend=false, + ylabel="Cost", + title="θ (costs)", + xticks=(1:n, fill("", n)), + ) + p3 = Plots.bar( + 1:n, + Float64.(y); + legend=false, + xlabel="Item", + ylabel="Rank", + title="y (rank, lower = better)", + color=:steelblue, + xticks=1:n, + ) + + l = Plots.@layout [a{0.55h}; b{0.225h}; c{0.225h}] + return Plots.plot(p1, p2, p3; layout=l, size=(600, 500), kwargs...) 
+end diff --git a/ext/plots/shortest_path_plots.jl b/ext/plots/shortest_path_plots.jl new file mode 100644 index 0000000..2649ce5 --- /dev/null +++ b/ext/plots/shortest_path_plots.jl @@ -0,0 +1,136 @@ +import Graphs: edges, src, dst + +has_visualization(::FixedSizeShortestPathBenchmark) = true + +""" +Map edge weights to a (rows × cols) vertex weight matrix by averaging incident edge weights, +and return a boolean (rows × cols) matrix marking vertices on the path. +""" +function _grid_matrices(bench::FixedSizeShortestPathBenchmark, θ, y) + rows, cols = bench.grid_size + n_v = rows * cols + g = bench.graph + + # Vertex weights: mean of absolute weights of incident edges + v_weights = zeros(Float64, n_v) + v_counts = zeros(Int, n_v) + for (i, e) in enumerate(edges(g)) + v_weights[src(e)] += abs(θ[i]) + v_counts[src(e)] += 1 + v_weights[dst(e)] += abs(θ[i]) + v_counts[dst(e)] += 1 + end + v_weights ./= max.(v_counts, 1) + + # Path vertices + on_path = falses(n_v) + for (i, e) in enumerate(edges(g)) + if y[i] + on_path[src(e)] = true + on_path[dst(e)] = true + end + end + + # Reshape to (rows, cols): vertex v → row ceil(v/cols), col ((v-1)%cols)+1 + weight_grid = reshape(v_weights, cols, rows)' + path_grid = reshape(on_path, cols, rows)' + return weight_grid, path_grid +end + +function _plot_grid( + bench::FixedSizeShortestPathBenchmark; + grid=nothing, + title="", + colorbar=false, + color=:viridis, + path_grid=nothing, + kwargs..., +) + rows, cols = bench.grid_size + if isnothing(grid) + grid = ones(rows, cols) + end + pl = Plots.heatmap( + grid; + yflip=true, + aspect_ratio=:equal, + title=title, + colorbar=colorbar, + framestyle=:none, + color=color, + kwargs..., + ) + Plots.vline!(pl, (0.5):1:(cols + 0.5); color=:gray, lw=0.5, label=false) + Plots.hline!(pl, (0.5):1:(rows + 0.5); color=:gray, lw=0.5, label=false) + if !isnothing(path_grid) + path_xs = Int[] + path_ys = Int[] + for r in 1:rows, c in 1:cols + if path_grid[r, c] + push!(path_xs, c) + push!(path_ys, 
r) + end + end + Plots.scatter!( + pl, + path_xs, + path_ys; + color=:white, + markersize=6, + markerstrokewidth=0, + label=false, + ) + end + Plots.scatter!( + pl, [1], [1]; color=:seagreen, markersize=8, markershape=:square, label=false + ) + Plots.scatter!( + pl, [cols], [rows]; color=:crimson, markersize=8, markershape=:square, label=false + ) + return pl +end + +function plot_context(bench::FixedSizeShortestPathBenchmark, sample::DataSample; kwargs...) + x = sample.x + p_feat = length(x) + rows, cols = bench.grid_size + + p_x = Plots.bar( + 1:p_feat, + Float64.(x); + legend=false, + xlabel="Feature", + ylabel="Value", + title="x (features)", + color=:steelblue, + xticks=1:p_feat, + ) + p_grid = _plot_grid(bench; title="Grid graph ($(rows)×$(cols))", color=:grays) + + l = Plots.@layout [a{0.35w} b] + return Plots.plot(p_x, p_grid; layout=l, size=(700, 300), kwargs...) +end + +function plot_sample(bench::FixedSizeShortestPathBenchmark, sample::DataSample; kwargs...) + x = sample.x + p_feat = length(x) + weight_grid, path_grid = _grid_matrices(bench, sample.θ, sample.y) + + p_x = Plots.bar( + 1:p_feat, + Float64.(x); + legend=false, + xlabel="Feature", + ylabel="Value", + title="x (features)", + color=:steelblue, + xticks=1:p_feat, + ) + p1 = _plot_grid(bench; grid=weight_grid, title="Edge weights θ", colorbar=true) + p2 = _plot_grid( + bench; grid=weight_grid, title="Shortest path y", color=:Blues, path_grid=path_grid + ) + + l = Plots.@layout [a{0.25h}; [b c]] + return Plots.plot(p_x, p1, p2; layout=l, size=(700, 500), kwargs...) +end diff --git a/ext/plots/subset_selection_plots.jl b/ext/plots/subset_selection_plots.jl new file mode 100644 index 0000000..0b4eaba --- /dev/null +++ b/ext/plots/subset_selection_plots.jl @@ -0,0 +1,56 @@ +has_visualization(::SubsetSelectionBenchmark) = true + +function plot_context(::SubsetSelectionBenchmark, sample::DataSample; kwargs...) 
+ x = sample.x # length n feature vector + n = length(x) + return Plots.bar( + 1:n, + Float64.(x); + legend=false, + xlabel="Item", + ylabel="Feature value", + title="Features x (observable input)", + color=:steelblue, + xticks=1:n, + kwargs..., + ) +end + +function plot_sample(::SubsetSelectionBenchmark, sample::DataSample; kwargs...) + x = sample.x # length n feature vector + θ = sample.θ # length n hidden values + y = sample.y # y[i] = true if item i is selected + n = length(θ) + + p1 = Plots.bar( + 1:n, + Float64.(x); + legend=false, + ylabel="Feature value", + title="x (features, observable)", + color=:steelblue, + xticks=(1:n, fill("", n)), + ) + p2 = Plots.bar( + 1:n, + Float64.(θ); + legend=false, + ylabel="Value", + title="θ (true values)", + color=:steelblue, + xticks=(1:n, fill("", n)), + ) + p3 = Plots.heatmap( + reshape(Float64.(y), 1, n); + xlabel="Item", + ylabel="y", + title="y (selected items)", + yticks=false, + xticks=1:n, + color=:Greens, + colorbar=false, + ) + + l = Plots.@layout [a{0.35h}; b{0.35h}; c{0.3h}] + return Plots.plot(p1, p2, p3; layout=l, size=(600, 480), kwargs...) +end diff --git a/ext/plots/svs_plots.jl b/ext/plots/svs_plots.jl index 9a6161e..f660ff2 100644 --- a/ext/plots/svs_plots.jl +++ b/ext/plots/svs_plots.jl @@ -3,17 +3,17 @@ import DecisionFocusedLearningBenchmarks.StochasticVehicleScheduling: has_visualization(::StochasticVehicleSchedulingBenchmark) = true -function plot_instance( - ::StochasticVehicleSchedulingBenchmark, sample::DataSample; kwargs... -) - @assert hasproperty(sample.instance, :city) "Sample does not contain city information." - (; tasks, district_width, width) = sample.instance.city +# ── helpers ──────────────────────────────────────────────────────────────────── + +function _plot_city(city; colormap=:turbo, task_markersize=7, depot_markersize=9, kwargs...) 
+ (; tasks, district_width, width) = city ticks = 0:district_width:width - max_time = maximum(t.end_time for t in sample.instance.city.tasks[1:(end - 1)]) + max_time = maximum(t.end_time for t in tasks[2:(end - 1)]) fig = Plots.plot(; xlabel="x", ylabel="y", - gridlinewidth=3, + gridlinewidth=1, + gridlinealpha=0.3, aspect_ratio=:equal, size=(500, 500), xticks=ticks, @@ -25,80 +25,90 @@ function plot_instance( colorbar_title="Time", kwargs..., ) - Plots.scatter!( - fig, - [tasks[1].start_point.x], - [tasks[1].start_point.y]; - label=nothing, - marker=:rect, - markersize=10, - ) - Plots.annotate!( - fig, (tasks[1].start_point.x, tasks[1].start_point.y, Plots.text("0", 10)) - ) - for (i_task, task) in enumerate(tasks[2:(end - 1)]) + for task in tasks[2:(end - 1)] (; start_point, end_point) = task - points = [(start_point.x, start_point.y), (end_point.x, end_point.y)] - Plots.plot!(fig, points; color=:black, label=nothing) + Plots.plot!( + fig, + [start_point.x, end_point.x], + [start_point.y, end_point.y]; + color=:gray70, + linewidth=1, + label=nothing, + ) Plots.scatter!( fig, - points[1]; - markersize=10, + [start_point.x], + [start_point.y]; + markersize=task_markersize, marker=:rect, marker_z=task.start_time, - colormap=:turbo, + colormap=colormap, label=nothing, ) Plots.scatter!( fig, - points[2]; - markersize=10, + [end_point.x], + [end_point.y]; + markersize=task_markersize, marker=:rect, marker_z=task.end_time, - colormap=:turbo, + colormap=colormap, label=nothing, ) - Plots.annotate!(fig, (points[1]..., Plots.text("$(i_task)", 10))) end + Plots.scatter!( + fig, + [tasks[1].start_point.x], + [tasks[1].start_point.y]; + label=nothing, + marker=:rect, + markersize=depot_markersize, + markercolor=:black, + ) return fig end -function plot_solution( - ::StochasticVehicleSchedulingBenchmark, sample::DataSample; kwargs... -) - @assert hasproperty(sample.instance, :city) "Sample does not contain city information." 
- (; tasks, district_width, width) = sample.instance.city - ticks = 0:district_width:width - solution = Solution(sample.y, sample.instance) - path_list = compute_path_list(solution) - fig = Plots.plot(; - xlabel="x", - ylabel="y", - legend=false, - gridlinewidth=3, - aspect_ratio=:equal, - size=(500, 500), - xticks=ticks, - yticks=ticks, - xlims=(-1, width + 1), - ylims=(-1, width + 1), - kwargs..., - ) +function _plot_routes(fig, city, path_list; route_linewidth=2, route_alpha=0.7) + (; tasks) = city for path in path_list X = Float64[] Y = Float64[] - (; start_point, end_point) = tasks[path[1]] - (; x, y) = end_point - push!(X, x) - push!(Y, y) - for task in path[2:end] - (; start_point, end_point) = tasks[task] + (; end_point) = tasks[path[1]] + push!(X, end_point.x) + push!(Y, end_point.y) + for task_idx in path[2:end] + (; start_point, end_point) = tasks[task_idx] push!(X, start_point.x) push!(Y, start_point.y) push!(X, end_point.x) push!(Y, end_point.y) end - Plots.plot!(fig, X, Y; marker=:circle) + Plots.plot!( + fig, + X, + Y; + linewidth=route_linewidth, + alpha=route_alpha, + label=false, + z_order=:back, + ) end return fig end + +# ── interface methods ────────────────────────────────────────────────────────── + +function plot_context(::StochasticVehicleSchedulingBenchmark, sample::DataSample; kwargs...) + @assert hasproperty(sample.instance, :city) "Sample does not contain city information." + return _plot_city(sample.instance.city; kwargs...) +end + +function plot_sample(::StochasticVehicleSchedulingBenchmark, sample::DataSample; kwargs...) + @assert hasproperty(sample.instance, :city) "Sample does not contain city information." + city = sample.instance.city + fig = _plot_city(city; kwargs...) 
+ solution = Solution(sample.y, sample.instance) + path_list = compute_path_list(solution) + _plot_routes(fig, city, path_list) + return fig +end diff --git a/ext/plots/warcraft_plots.jl b/ext/plots/warcraft_plots.jl index 2029225..a88725e 100644 --- a/ext/plots/warcraft_plots.jl +++ b/ext/plots/warcraft_plots.jl @@ -3,7 +3,7 @@ using Images: Gray has_visualization(::WarcraftBenchmark) = true -function plot_instance(::WarcraftBenchmark, sample::DataSample; kwargs...) +function plot_context(::WarcraftBenchmark, sample::DataSample; kwargs...) im = dropdims(sample.x; dims=4) img = W.convert_image_for_plot(im) return Plots.plot( @@ -11,12 +11,12 @@ function plot_instance(::WarcraftBenchmark, sample::DataSample; kwargs...) ) end -function plot_solution( +function plot_sample( ::WarcraftBenchmark, sample::DataSample; θ_true=sample.θ, - θ_title="Weights", - y_title="Path", + θ_title="Cell costs θ", + y_title="Path y", kwargs..., ) x = sample.x diff --git a/src/ContextualStochasticArgmax/ContextualStochasticArgmax.jl b/src/ContextualStochasticArgmax/ContextualStochasticArgmax.jl index 49d22e2..5a84825 100644 --- a/src/ContextualStochasticArgmax/ContextualStochasticArgmax.jl +++ b/src/ContextualStochasticArgmax/ContextualStochasticArgmax.jl @@ -121,6 +121,16 @@ include("policies.jl") """ $TYPEDSIGNATURES +Return the named baseline policies for [`ContextualStochasticArgmaxBenchmark`](@ref). +Each policy has signature `(ctx_sample, scenarios) -> Vector{DataSample}`. +""" +function Utils.generate_baseline_policies(::ContextualStochasticArgmaxBenchmark) + return (; saa=Policy("SAA", "argmax of mean scenarios", csa_saa_policy)) +end + +""" +$TYPEDSIGNATURES + Generates the anticipative solver for the benchmark. 
""" function Utils.generate_anticipative_solver(::ContextualStochasticArgmaxBenchmark) diff --git a/src/ContextualStochasticArgmax/policies.jl b/src/ContextualStochasticArgmax/policies.jl index 1dc2d28..2cf7ae5 100644 --- a/src/ContextualStochasticArgmax/policies.jl +++ b/src/ContextualStochasticArgmax/policies.jl @@ -17,16 +17,6 @@ function csa_saa_policy(ctx_sample, scenarios) ] end -""" -$TYPEDSIGNATURES - -Return the named baseline policies for [`ContextualStochasticArgmaxBenchmark`](@ref). -Each policy has signature `(ctx_sample, scenarios) -> Vector{DataSample}`. -""" -function Utils.generate_baseline_policies(::ContextualStochasticArgmaxBenchmark) - return (; saa=Policy("SAA", "argmax of mean scenarios", csa_saa_policy)) -end - """ $TYPEDEF diff --git a/src/DecisionFocusedLearningBenchmarks.jl b/src/DecisionFocusedLearningBenchmarks.jl index 3ef3448..7380544 100644 --- a/src/DecisionFocusedLearningBenchmarks.jl +++ b/src/DecisionFocusedLearningBenchmarks.jl @@ -82,7 +82,7 @@ export is_exogenous, is_endogenous export is_minimization_problem export objective_value -export has_visualization, plot_instance, plot_solution, plot_trajectory, animate_trajectory +export has_visualization, plot_context, plot_sample, plot_trajectory, animate_trajectory export compute_gap # Export all benchmarks diff --git a/src/DynamicVehicleScheduling/plot.jl b/src/DynamicVehicleScheduling/plot.jl index f6e39ad..06e62d8 100644 --- a/src/DynamicVehicleScheduling/plot.jl +++ b/src/DynamicVehicleScheduling/plot.jl @@ -1,4 +1,4 @@ -function plot_instance(env::DVSPEnv; kwargs...) +function plot_context(env::DVSPEnv; kwargs...) return plot_instance(env.instance.static_instance; kwargs...) 
end diff --git a/src/FixedSizeShortestPath/FixedSizeShortestPath.jl b/src/FixedSizeShortestPath/FixedSizeShortestPath.jl index 86d5bdb..f72ecc9 100644 --- a/src/FixedSizeShortestPath/FixedSizeShortestPath.jl +++ b/src/FixedSizeShortestPath/FixedSizeShortestPath.jl @@ -138,7 +138,7 @@ function Utils.generate_statistical_model( ) Random.seed!(seed) (; p, graph) = bench - return Chain(Dense(p, ne(graph))) + return Dense(p, ne(graph)) end export FixedSizeShortestPathBenchmark diff --git a/src/Utils/Utils.jl b/src/Utils/Utils.jl index e9eb16e..50f5129 100644 --- a/src/Utils/Utils.jl +++ b/src/Utils/Utils.jl @@ -42,7 +42,7 @@ export generate_baseline_policies export generate_anticipative_solver, generate_parametric_anticipative_solver export is_minimization_problem -export has_visualization, plot_instance, plot_solution, plot_trajectory, animate_trajectory +export has_visualization, plot_context, plot_sample, plot_trajectory, animate_trajectory export compute_gap export grid_graph, get_path, path_to_matrix export neg_tensor, squeeze_last_dims, average_tensor diff --git a/src/Utils/data_sample.jl b/src/Utils/data_sample.jl index 1147761..1a7002d 100644 --- a/src/Utils/data_sample.jl +++ b/src/Utils/data_sample.jl @@ -85,6 +85,23 @@ end """ $TYPEDSIGNATURES +Copy constructor for `DataSample` with optional overrides. +""" +function DataSample( + sample::DataSample; + x=sample.x, + θ=sample.θ, + y=sample.y, + extra=sample.extra, + context=sample.context, + kwargs..., +) + return DataSample(; x=x, θ=θ, y=y, extra=extra, context..., kwargs...) +end + +""" +$TYPEDSIGNATURES + Extended property access for `DataSample`. Allows accessing `context` and `extra` fields directly as properties. 
diff --git a/src/Utils/interface/abstract_benchmark.jl b/src/Utils/interface/abstract_benchmark.jl index bedf527..b89ef91 100644 --- a/src/Utils/interface/abstract_benchmark.jl +++ b/src/Utils/interface/abstract_benchmark.jl @@ -59,24 +59,24 @@ function generate_baseline_policies end """ has_visualization(::AbstractBenchmark) -> Bool -Return `true` if `plot_instance` and `plot_solution` are implemented for this benchmark +Return `true` if `plot_context` and `plot_sample` are implemented for this benchmark (requires `Plots` to be loaded). Default is `false`. """ has_visualization(::AbstractBenchmark) = false """ - plot_instance(bench::AbstractBenchmark, sample::DataSample; kwargs...) + plot_context(bench::AbstractBenchmark, sample::DataSample; kwargs...) -Plot the problem instance (no solution). Only available when `Plots` is loaded. +Plot the observable context before making a decision (no solution). Only available when `Plots` is loaded. """ -function plot_instance end +function plot_context end """ - plot_solution(bench::AbstractBenchmark, sample::DataSample; kwargs...) + plot_sample(bench::AbstractBenchmark, sample::DataSample; kwargs...) Plot the instance with `sample.y` overlaid. Only available when `Plots` is loaded. 
""" -function plot_solution end +function plot_sample end """ objective_value(bench::AbstractBenchmark, sample::DataSample, y) -> Real diff --git a/src/Utils/interface/static_benchmark.jl b/src/Utils/interface/static_benchmark.jl index d809e05..b7baf79 100644 --- a/src/Utils/interface/static_benchmark.jl +++ b/src/Utils/interface/static_benchmark.jl @@ -28,7 +28,7 @@ Also implement: - [`objective_value`](@ref)`(bench, sample, y)`: must be implemented by every static benchmark # Optional methods (no default, require `Plots` to be loaded) -- [`plot_instance`](@ref), [`plot_solution`](@ref) +- [`plot_context`](@ref), [`plot_sample`](@ref) - [`generate_baseline_policies`](@ref) """ abstract type AbstractStaticBenchmark <: AbstractBenchmark end @@ -56,7 +56,7 @@ end generate_dataset(::AbstractStaticBenchmark, dataset_size::Int; target_policy=nothing, kwargs...) -> Vector{<:DataSample} Generate a `Vector` of [`DataSample`](@ref) of length `dataset_size` for given benchmark. -Content of the dataset can be visualized using [`plot_solution`](@ref), when it applies. +Content of the dataset can be visualized using [`plot_sample`](@ref), when it applies. By default, it uses [`generate_sample`](@ref) to create each sample in the dataset, and passes any keyword arguments to it. `target_policy` is applied if provided, it is called on each sample diff --git a/src/Utils/policy.jl b/src/Utils/policy.jl index 5eb0c6d..f20a6fb 100644 --- a/src/Utils/policy.jl +++ b/src/Utils/policy.jl @@ -39,12 +39,14 @@ function evaluate_policy!( end total_reward = 0.0 labeled_dataset = DataSample[] + step = 0 while !is_terminated(env) + step += 1 y = policy(env; kwargs...) 
features, state = observe(env) state_copy = deepcopy(state) # To avoid mutation issues reward = step!(env, y) - sample = DataSample(; x=features, y=y, instance=state_copy, extra=(; reward)) + sample = DataSample(; x=features, y=y, instance=state_copy, extra=(; reward, step)) if isempty(labeled_dataset) labeled_dataset = typeof(sample)[sample] else diff --git a/test/argmax.jl b/test/argmax.jl index d772c4d..aca8f98 100644 --- a/test/argmax.jl +++ b/test/argmax.jl @@ -33,4 +33,15 @@ y = maximizer(θ) @test length(y) == instance_dim end + + @testset "Plots" begin + using Plots + @test has_visualization(b) + fig1 = plot_context(b, dataset[1]) + @test fig1 isa Plots.Plot + fig2 = plot_sample(b, dataset[1]) + @test fig2 isa Plots.Plot + fig3 = plot_sample(b, DataSample(dataset[1]; y=dataset[2].y)) + @test fig3 isa Plots.Plot + end end diff --git a/test/argmax_2d.jl b/test/argmax_2d.jl index e3bd6ff..5447e7b 100644 --- a/test/argmax_2d.jl +++ b/test/argmax_2d.jl @@ -17,11 +17,11 @@ @test gap >= 0 @test has_visualization(b) - figure = plot_solution(b, dataset[1]) + figure = plot_sample(b, dataset[1]) @test figure isa Plots.Plot - figure2 = plot_instance(b, dataset[1]) + figure2 = plot_context(b, dataset[1]) @test figure2 isa Plots.Plot - figure3 = plot_solution(b, dataset[1], dataset[2].y) + figure3 = plot_sample(b, DataSample(dataset[1]; y=dataset[2].y)) @test figure3 isa Plots.Plot for (i, sample) in enumerate(dataset) diff --git a/test/contextual_stochastic_argmax.jl b/test/contextual_stochastic_argmax.jl index 1e5e59a..79533cc 100644 --- a/test/contextual_stochastic_argmax.jl +++ b/test/contextual_stochastic_argmax.jl @@ -113,3 +113,22 @@ end s = DataSample(; x=randn(Float32, 8), y=maximizer(randn(Float32, 5))) @test_throws Exception objective_value(b, s, s.y) end + +@testset "ContextualStochasticArgmax - Plots" begin + using DecisionFocusedLearningBenchmarks + using Plots + + b = ContextualStochasticArgmaxBenchmark(; n=5, d=3, seed=0) + policies = 
generate_baseline_policies(b) + dataset = generate_dataset(b, 2; nb_scenarios=2, target_policy=policies.saa) + model = generate_statistical_model(b; seed=0) + sample = DataSample(dataset[1]; θ=model(dataset[1].x)) + + @test has_visualization(b) + fig1 = plot_context(b, dataset[1]) + @test fig1 isa Plots.Plot + fig2 = plot_sample(b, sample) + @test fig2 isa Plots.Plot + fig3 = plot_sample(b, DataSample(sample; y=dataset[2].y)) + @test fig3 isa Plots.Plot +end diff --git a/test/dynamic_assortment.jl b/test/dynamic_assortment.jl index 93c4f1e..40a27ca 100644 --- a/test/dynamic_assortment.jl +++ b/test/dynamic_assortment.jl @@ -356,3 +356,23 @@ end @test length(y) == DAP.item_count(b) @test sum(y) == DAP.assortment_size(b) end + +@testset "DynamicAssortment - Plots" begin + using DecisionFocusedLearningBenchmarks + using Plots + + b = DynamicAssortmentBenchmark(; N=4, d=2, K=2, max_steps=5, exogenous=true) + envs = generate_environments(b, 2; seed=0) + policies = generate_baseline_policies(b) + _, traj = evaluate_policy!(policies[1], envs) + + @test has_visualization(b) + fig1 = plot_context(b, traj[1]) + @test fig1 isa Plots.Plot + fig2 = plot_sample(b, traj[1]) + @test fig2 isa Plots.Plot + fig3 = plot_sample(b, DataSample(traj[1]; y=traj[2].y)) + @test fig3 isa Plots.Plot + fig4 = plot_trajectory(b, traj) + @test fig4 isa Plots.Plot +end diff --git a/test/dynamic_vsp_plots.jl b/test/dynamic_vsp_plots.jl index 32cbc4a..46b8584 100644 --- a/test/dynamic_vsp_plots.jl +++ b/test/dynamic_vsp_plots.jl @@ -9,19 +9,19 @@ # Get a trajectory via the anticipative solver y = generate_anticipative_solver(b)(env; nb_epochs=3) - # Test plot_instance (shows first epoch state) - fig1 = plot_instance(b, y[1]) + # Test plot_context (shows first epoch state) + fig1 = plot_context(b, y[1]) @test fig1 isa Plots.Plot # Test plot_trajectory (grid of epoch subplots) fig2 = plot_trajectory(b, y) @test fig2 isa Plots.Plot - # Test plot_solution via baseline policy + # Test plot_sample via 
baseline policy policies = generate_baseline_policies(b) lazy = policies[1] _, d = evaluate_policy!(lazy, env) - fig3 = plot_solution(b, d[1]) + fig3 = plot_sample(b, d[1]) @test fig3 isa Plots.Plot # Test animate_trajectory — returns Animation, save separately with gif() diff --git a/test/fixed_size_shortest_path.jl b/test/fixed_size_shortest_path.jl index eacdd64..33fe679 100644 --- a/test/fixed_size_shortest_path.jl +++ b/test/fixed_size_shortest_path.jl @@ -32,4 +32,15 @@ y = maximizer(θ) @test length(y) == length(y_true) end + + @testset "Plots" begin + using Plots + @test has_visualization(b) + fig1 = plot_context(b, dataset[1]) + @test fig1 isa Plots.Plot + fig2 = plot_sample(b, dataset[1]) + @test fig2 isa Plots.Plot + fig3 = plot_sample(b, DataSample(dataset[1]; y=dataset[2].y)) + @test fig3 isa Plots.Plot + end end diff --git a/test/maintenance.jl b/test/maintenance.jl index a2a9983..8aa4b15 100644 --- a/test/maintenance.jl +++ b/test/maintenance.jl @@ -225,3 +225,23 @@ end θ = fill(-1.0, 10) @test maximizer(θ) == falses(10) end + +@testset "Maintenance - Plots" begin + using DecisionFocusedLearningBenchmarks + using Plots + + b = MaintenanceBenchmark() + envs = generate_environments(b, 2; seed=0) + policies = generate_baseline_policies(b) + _, traj = evaluate_policy!(policies[1], envs) + + @test has_visualization(b) + fig1 = plot_context(b, traj[1]) + @test fig1 isa Plots.Plot + fig2 = plot_sample(b, traj[1]) + @test fig2 isa Plots.Plot + fig3 = plot_sample(b, DataSample(traj[1]; y=traj[2].y)) + @test fig3 isa Plots.Plot + fig4 = plot_trajectory(b, traj) + @test fig4 isa Plots.Plot +end diff --git a/test/portfolio_optimization.jl b/test/portfolio_optimization.jl index b436c81..922e14f 100644 --- a/test/portfolio_optimization.jl +++ b/test/portfolio_optimization.jl @@ -29,4 +29,15 @@ gap = compute_gap(b, dataset[1:5], model, maximizer) @test isfinite(gap) + + @testset "Plots" begin + using Plots + @test has_visualization(b) + fig1 = plot_context(b, 
dataset[1]) + @test fig1 isa Plots.Plot + fig2 = plot_sample(b, dataset[1]) + @test fig2 isa Plots.Plot + fig3 = plot_sample(b, DataSample(dataset[1]; y=dataset[2].y)) + @test fig3 isa Plots.Plot + end end diff --git a/test/ranking.jl b/test/ranking.jl index b3c2a3b..68aa6cf 100644 --- a/test/ranking.jl +++ b/test/ranking.jl @@ -34,4 +34,15 @@ gap = compute_gap(b, dataset[1:5], model, maximizer) @test isfinite(gap) @test gap >= 0 + + @testset "Plots" begin + using Plots + @test has_visualization(b) + fig1 = plot_context(b, dataset[1]) + @test fig1 isa Plots.Plot + fig2 = plot_sample(b, dataset[1]) + @test fig2 isa Plots.Plot + fig3 = plot_sample(b, DataSample(dataset[1]; y=dataset[2].y)) + @test fig3 isa Plots.Plot + end end diff --git a/test/subset_selection.jl b/test/subset_selection.jl index 90d3150..fa85e21 100644 --- a/test/subset_selection.jl +++ b/test/subset_selection.jl @@ -40,4 +40,15 @@ gap = compute_gap(b, dataset[1:5], model, maximizer) @test isfinite(gap) @test gap >= 0 + + @testset "Plots" begin + using Plots + @test has_visualization(b_identity) + fig1 = plot_context(b_identity, dataset[1]) + @test fig1 isa Plots.Plot + fig2 = plot_sample(b_identity, dataset[1]) + @test fig2 isa Plots.Plot + fig3 = plot_sample(b_identity, DataSample(dataset[1]; y=dataset[2].y)) + @test fig3 isa Plots.Plot + end end diff --git a/test/vsp.jl b/test/vsp.jl index 5a18429..0d778d2 100644 --- a/test/vsp.jl +++ b/test/vsp.jl @@ -49,9 +49,9 @@ @test length(ls_dataset[1].extra.scenarios) == K # Plots work unchanged - figure_1 = plot_instance(b, saa_dataset[1]) + figure_1 = plot_context(b, saa_dataset[1]) @test figure_1 isa Plots.Plot - figure_2 = plot_solution(b, saa_dataset[1]) + figure_2 = plot_sample(b, saa_dataset[1]) @test figure_2 isa Plots.Plot maximizer = generate_maximizer(b) diff --git a/test/warcraft.jl b/test/warcraft.jl index 94a4678..49f5ba0 100644 --- a/test/warcraft.jl +++ b/test/warcraft.jl @@ -14,11 +14,11 @@ dijkstra_maximizer = generate_maximizer(b; 
dijkstra=true) @test has_visualization(b) - figure = plot_solution(b, dataset[1]) + figure = plot_sample(b, dataset[1]) @test figure isa Plots.Plot - figure2 = plot_instance(b, dataset[1]) + figure2 = plot_context(b, dataset[1]) @test figure2 isa Plots.Plot - figure3 = plot_solution(b, dataset[1], dataset[2].y) + figure3 = plot_sample(b, DataSample(dataset[1]; y=dataset[2].y)) @test figure3 isa Plots.Plot gap = compute_gap(b, dataset, model, dijkstra_maximizer) @test gap >= 0