Skip to content

Commit 17875ea

Browse files
sradcoAI Assistant
andcommitted
collector: add nvmesubsystem collector for NVMe-oF path health
Add a new disabled-by-default collector that reads /sys/class/nvme-subsystem/ to expose NVMe over Fabrics subsystem connectivity metrics. This complements the existing nvme collector (which reports per-controller hardware stats) by monitoring the subsystem-level path redundancy — how many controller paths are live, connecting, or dead for each NVMe subsystem.

Exposed metrics:
- node_nvmesubsystem_info
- node_nvmesubsystem_paths
- node_nvmesubsystem_paths_live
- node_nvmesubsystem_path_state

Signed-off-by: Shirly Radco <sradco@redhat.com>
Co-authored-by: AI Assistant <noreply@cursor.com>
1 parent a1cbf81 commit 17875ea

File tree

6 files changed

+395
-2
lines changed

6 files changed

+395
-2
lines changed

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So
202202
meminfo\_numa | Exposes memory statistics from `/sys/devices/system/node/node[0-9]*/meminfo`, `/sys/devices/system/node/node[0-9]*/numastat`. | Linux
203203
mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
204204
network_route | Exposes the routing table as metrics | Linux
205+
nvmesubsystem | Exposes NVMe-oF subsystem path health from `/sys/class/nvme-subsystem/`. | Linux
205206
pcidevice | Exposes pci devices' information including their link status and parent devices. | Linux
206207
perf | Exposes perf based metrics (Warning: Metrics are dependent on kernel configuration and settings). | Linux
207208
processes | Exposes aggregate process statistics from `/proc`. | Linux
@@ -339,6 +340,25 @@ echo 'role{role="application_server"} 1' > /path/to/directory/role.prom.$$
339340
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom
340341
```
341342

343+
### NVMe Subsystem Collector
344+
345+
The `nvmesubsystem` collector exposes NVMe-oF (NVMe over Fabrics) subsystem
346+
path health by reading `/sys/class/nvme-subsystem/`. It complements the
347+
existing `nvme` collector (which reports per-controller hardware stats) by
348+
monitoring the **connectivity layer** — how many controller paths are live,
349+
connecting, or dead for each NVMe subsystem.
350+
351+
Enable it with `--collector.nvmesubsystem`.
352+
353+
#### Exposed metrics
354+
355+
| Metric | Description |
356+
|--------|-------------|
357+
| `node_nvmesubsystem_info` | Info metric with subsystem NQN, model, serial and I/O policy as labels. |
358+
| `node_nvmesubsystem_paths` | Number of controller paths for the subsystem. |
359+
| `node_nvmesubsystem_paths_live` | Number of controller paths currently in `live` state. |
360+
| `node_nvmesubsystem_path_state` | Per-controller path state (1 for the current state, 0 for others). |
361+
342362
### Filtering enabled collectors
343363

344364
The `node_exporter` will expose all metrics from enabled collectors by default. This is the recommended way to collect metrics to avoid errors when comparing metrics of different families.

collector/fixtures/sys.ttar

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2255,6 +2255,104 @@ Lines: 1
22552255
4096
22562256
Mode: 644
22572257
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2258+
Directory: sys/class/nvme-subsystem
2259+
Mode: 755
2260+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2261+
Directory: sys/class/nvme-subsystem/nvme-subsys0
2262+
Mode: 755
2263+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2264+
Path: sys/class/nvme-subsystem/nvme-subsys0/iopolicy
2265+
Lines: 1
2266+
round-robinEOF
2267+
Mode: 644
2268+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2269+
Path: sys/class/nvme-subsystem/nvme-subsys0/model
2270+
Lines: 1
2271+
Dell PowerStoreEOF
2272+
Mode: 644
2273+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2274+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme0
2275+
Mode: 755
2276+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2277+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/address
2278+
Lines: 1
2279+
nn-0x200000109b123456:pn-0x100000109b123456EOF
2280+
Mode: 644
2281+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2282+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/state
2283+
Lines: 1
2284+
liveEOF
2285+
Mode: 644
2286+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2287+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/transport
2288+
Lines: 1
2289+
fcEOF
2290+
Mode: 644
2291+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2292+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme1
2293+
Mode: 755
2294+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2295+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/address
2296+
Lines: 1
2297+
nn-0x200000109b123457:pn-0x100000109b123457EOF
2298+
Mode: 644
2299+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2300+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/state
2301+
Lines: 1
2302+
liveEOF
2303+
Mode: 644
2304+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2305+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/transport
2306+
Lines: 1
2307+
fcEOF
2308+
Mode: 644
2309+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2310+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme2
2311+
Mode: 755
2312+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2313+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/address
2314+
Lines: 1
2315+
nn-0x200000109b123458:pn-0x100000109b123458EOF
2316+
Mode: 644
2317+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2318+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/state
2319+
Lines: 1
2320+
liveEOF
2321+
Mode: 644
2322+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2323+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/transport
2324+
Lines: 1
2325+
fcEOF
2326+
Mode: 644
2327+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2328+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme3
2329+
Mode: 755
2330+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2331+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/address
2332+
Lines: 1
2333+
nn-0x200000109b123459:pn-0x100000109b123459EOF
2334+
Mode: 644
2335+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2336+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/state
2337+
Lines: 1
2338+
deadEOF
2339+
Mode: 644
2340+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2341+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/transport
2342+
Lines: 1
2343+
fcEOF
2344+
Mode: 644
2345+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2346+
Path: sys/class/nvme-subsystem/nvme-subsys0/serial
2347+
Lines: 1
2348+
SN12345678EOF
2349+
Mode: 644
2350+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2351+
Path: sys/class/nvme-subsystem/nvme-subsys0/subsysnqn
2352+
Lines: 1
2353+
nqn.2014-08.org.nvmexpress:uuid:a34c4f3a-0d6f-5cec-dead-beefcafebabeEOF
2354+
Mode: 644
2355+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
22582356
Directory: sys/class/power_supply
22592357
Mode: 755
22602358
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

collector/nvmesubsystem_linux.go

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nonvmesubsystem
15+
16+
package collector
17+
18+
import (
19+
"errors"
20+
"fmt"
21+
"log/slog"
22+
"os"
23+
24+
"github.com/prometheus/client_golang/prometheus"
25+
"github.com/prometheus/procfs/sysfs"
26+
)
27+
28+
// nvmeControllerStates is the complete set of values the "state" label of the
// path_state metric can take. One series is emitted per controller for every
// entry, so each state that normalizeControllerState can return must be listed
// here; otherwise a controller in that state would have no series set to 1.
//
// "unknown" is the catch-all bucket for any state string that is not
// recognised and is kept last.
var nvmeControllerStates = []string{
	"live", "connecting", "resetting", "dead",
	"deleting", "deleting (no IO)", "new",
	"unknown",
}

// normalizeControllerState maps a raw controller state string to one of the
// entries of nvmeControllerStates. Recognised states are returned unchanged;
// anything else (including the empty string) is reported as "unknown".
//
// The lookup is driven by nvmeControllerStates itself so the two can never
// drift apart.
func normalizeControllerState(raw string) string {
	for _, known := range nvmeControllerStates {
		if raw == known {
			return raw
		}
	}
	return "unknown"
}
42+
43+
type nvmeSubsystemCollector struct {
44+
fs sysfs.FS
45+
logger *slog.Logger
46+
47+
subsystemInfo *prometheus.Desc
48+
subsystemPaths *prometheus.Desc
49+
subsystemPathsLive *prometheus.Desc
50+
pathState *prometheus.Desc
51+
}
52+
53+
func init() {
54+
registerCollector("nvmesubsystem", defaultDisabled, NewNVMeSubsystemCollector)
55+
}
56+
57+
// NewNVMeSubsystemCollector returns a new Collector exposing NVMe-oF subsystem
58+
// path health from /sys/class/nvme-subsystem/.
59+
func NewNVMeSubsystemCollector(logger *slog.Logger) (Collector, error) {
60+
const subsystem = "nvmesubsystem"
61+
62+
fs, err := sysfs.NewFS(*sysPath)
63+
if err != nil {
64+
return nil, fmt.Errorf("failed to open sysfs: %w", err)
65+
}
66+
67+
return &nvmeSubsystemCollector{
68+
fs: fs,
69+
logger: logger,
70+
subsystemInfo: prometheus.NewDesc(
71+
prometheus.BuildFQName(namespace, subsystem, "info"),
72+
"Non-numeric information about an NVMe subsystem.",
73+
[]string{"subsystem", "nqn", "model", "serial", "iopolicy"}, nil,
74+
),
75+
subsystemPaths: prometheus.NewDesc(
76+
prometheus.BuildFQName(namespace, subsystem, "paths"),
77+
"Number of controller paths for an NVMe subsystem.",
78+
[]string{"subsystem"}, nil,
79+
),
80+
subsystemPathsLive: prometheus.NewDesc(
81+
prometheus.BuildFQName(namespace, subsystem, "paths_live"),
82+
"Number of controller paths in live state for an NVMe subsystem.",
83+
[]string{"subsystem"}, nil,
84+
),
85+
pathState: prometheus.NewDesc(
86+
prometheus.BuildFQName(namespace, subsystem, "path_state"),
87+
"Current NVMe controller path state (1 for the current state, 0 for all others).",
88+
[]string{"subsystem", "controller", "transport", "state"}, nil,
89+
),
90+
}, nil
91+
}
92+
93+
func (c *nvmeSubsystemCollector) Update(ch chan<- prometheus.Metric) error {
94+
subsystems, err := c.fs.NVMeSubsystemClass()
95+
if err != nil {
96+
if errors.Is(err, os.ErrNotExist) || errors.Is(err, os.ErrPermission) {
97+
c.logger.Debug("Could not read NVMe subsystem info", "err", err)
98+
return ErrNoData
99+
}
100+
return fmt.Errorf("failed to scan NVMe subsystems: %w", err)
101+
}
102+
103+
for _, subsys := range subsystems {
104+
ch <- prometheus.MustNewConstMetric(c.subsystemInfo, prometheus.GaugeValue, 1,
105+
subsys.Name, subsys.NQN, subsys.Model, subsys.Serial, subsys.IOPolicy)
106+
107+
total := float64(len(subsys.Controllers))
108+
var live float64
109+
for _, ctrl := range subsys.Controllers {
110+
state := normalizeControllerState(ctrl.State)
111+
if state == "live" {
112+
live++
113+
}
114+
115+
for _, s := range nvmeControllerStates {
116+
val := 0.0
117+
if s == state {
118+
val = 1.0
119+
}
120+
ch <- prometheus.MustNewConstMetric(c.pathState, prometheus.GaugeValue, val,
121+
subsys.Name, ctrl.Name, ctrl.Transport, s)
122+
}
123+
}
124+
125+
ch <- prometheus.MustNewConstMetric(c.subsystemPaths, prometheus.GaugeValue, total, subsys.Name)
126+
ch <- prometheus.MustNewConstMetric(c.subsystemPathsLive, prometheus.GaugeValue, live, subsys.Name)
127+
}
128+
129+
return nil
130+
}

0 commit comments

Comments
 (0)