Skip to content

Commit 635b613

Browse files
sradcoAI Assistant
andcommitted
collector: add nvmesubsystem collector for NVMe-oF path health
Add a new disabled-by-default collector that reads /sys/class/nvme-subsystem/ to expose NVMe over Fabrics subsystem connectivity metrics. This complements the existing nvme collector (which reports per-controller hardware stats) by monitoring the subsystem-level path redundancy — how many controller paths are live, connecting, or dead for each NVMe subsystem. Exposed metrics: - node_nvmesubsystem_info - node_nvmesubsystem_paths_total - node_nvmesubsystem_paths_live - node_nvmesubsystem_path_state Signed-off-by: Shirly Radco <sradco@redhat.com> Co-authored-by: AI Assistant <noreply@cursor.com>
1 parent a1cbf81 commit 635b613

File tree

4 files changed

+515
-0
lines changed

4 files changed

+515
-0
lines changed

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So
202202
meminfo\_numa | Exposes memory statistics from `/sys/devices/system/node/node[0-9]*/meminfo`, `/sys/devices/system/node/node[0-9]*/numastat`. | Linux
203203
mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
204204
network_route | Exposes the routing table as metrics | Linux
205+
nvmesubsystem | Exposes NVMe-oF subsystem path health from `/sys/class/nvme-subsystem/`. | Linux
205206
pcidevice | Exposes pci devices' information including their link status and parent devices. | Linux
206207
perf | Exposes perf based metrics (Warning: Metrics are dependent on kernel configuration and settings). | Linux
207208
processes | Exposes aggregate process statistics from `/proc`. | Linux
@@ -339,6 +340,25 @@ echo 'role{role="application_server"} 1' > /path/to/directory/role.prom.$$
339340
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom
340341
```
341342

343+
### NVMe Subsystem Collector
344+
345+
The `nvmesubsystem` collector exposes NVMe-oF (NVMe over Fabrics) subsystem
346+
path health by reading `/sys/class/nvme-subsystem/`. It complements the
347+
existing `nvme` collector (which reports per-controller hardware stats) by
348+
monitoring the **connectivity layer** — how many controller paths are live,
349+
connecting, or dead for each NVMe subsystem.
350+
351+
Enable it with `--collector.nvmesubsystem`.
352+
353+
#### Exposed metrics
354+
355+
| Metric | Description |
356+
|--------|-------------|
357+
| `node_nvmesubsystem_info` | Info metric with subsystem NQN, model, serial and I/O policy as labels. |
358+
| `node_nvmesubsystem_paths_total` | Total number of controller paths for the subsystem. |
359+
| `node_nvmesubsystem_paths_live` | Number of controller paths currently in `live` state. |
360+
| `node_nvmesubsystem_path_state` | Per-controller path state (1 for the current state, 0 for others). |
361+
342362
### Filtering enabled collectors
343363

344364
The `node_exporter` will expose all metrics from enabled collectors by default. This is the recommended way to collect metrics to avoid errors when comparing metrics of different families.

collector/fixtures/sys.ttar

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2255,6 +2255,104 @@ Lines: 1
22552255
4096
22562256
Mode: 644
22572257
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2258+
Directory: sys/class/nvme-subsystem
2259+
Mode: 755
2260+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2261+
Directory: sys/class/nvme-subsystem/nvme-subsys0
2262+
Mode: 755
2263+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2264+
Path: sys/class/nvme-subsystem/nvme-subsys0/iopolicy
2265+
Lines: 1
2266+
round-robinEOF
2267+
Mode: 644
2268+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2269+
Path: sys/class/nvme-subsystem/nvme-subsys0/model
2270+
Lines: 1
2271+
Dell PowerStoreEOF
2272+
Mode: 644
2273+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2274+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme0
2275+
Mode: 755
2276+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2277+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/address
2278+
Lines: 1
2279+
nn-0x200000109b123456:pn-0x100000109b123456EOF
2280+
Mode: 644
2281+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2282+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/state
2283+
Lines: 1
2284+
liveEOF
2285+
Mode: 644
2286+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2287+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/transport
2288+
Lines: 1
2289+
fcEOF
2290+
Mode: 644
2291+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2292+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme1
2293+
Mode: 755
2294+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2295+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/address
2296+
Lines: 1
2297+
nn-0x200000109b123457:pn-0x100000109b123457EOF
2298+
Mode: 644
2299+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2300+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/state
2301+
Lines: 1
2302+
liveEOF
2303+
Mode: 644
2304+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2305+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/transport
2306+
Lines: 1
2307+
fcEOF
2308+
Mode: 644
2309+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2310+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme2
2311+
Mode: 755
2312+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2313+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/address
2314+
Lines: 1
2315+
nn-0x200000109b123458:pn-0x100000109b123458EOF
2316+
Mode: 644
2317+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2318+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/state
2319+
Lines: 1
2320+
liveEOF
2321+
Mode: 644
2322+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2323+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/transport
2324+
Lines: 1
2325+
fcEOF
2326+
Mode: 644
2327+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2328+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme3
2329+
Mode: 755
2330+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2331+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/address
2332+
Lines: 1
2333+
nn-0x200000109b123459:pn-0x100000109b123459EOF
2334+
Mode: 644
2335+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2336+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/state
2337+
Lines: 1
2338+
deadEOF
2339+
Mode: 644
2340+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2341+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/transport
2342+
Lines: 1
2343+
fcEOF
2344+
Mode: 644
2345+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2346+
Path: sys/class/nvme-subsystem/nvme-subsys0/serial
2347+
Lines: 1
2348+
SN12345678EOF
2349+
Mode: 644
2350+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2351+
Path: sys/class/nvme-subsystem/nvme-subsys0/subsysnqn
2352+
Lines: 1
2353+
nqn.2014-08.org.nvmexpress:uuid:a34c4f3a-0d6f-5cec-dead-beefcafebabeEOF
2354+
Mode: 644
2355+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
22582356
Directory: sys/class/power_supply
22592357
Mode: 755
22602358
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

collector/nvmesubsystem_linux.go

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nonvmesubsystem
15+
16+
package collector
17+
18+
import (
19+
"fmt"
20+
"log/slog"
21+
"os"
22+
"path/filepath"
23+
"regexp"
24+
"strings"
25+
26+
"github.com/prometheus/client_golang/prometheus"
27+
)
28+
29+
type nvmeSubsystemCollector struct {
30+
logger *slog.Logger
31+
scanSubsystems func() ([]nvmeSubsystem, error)
32+
33+
subsystemInfo *prometheus.Desc
34+
subsystemPathsTotal *prometheus.Desc
35+
subsystemPathsLive *prometheus.Desc
36+
pathState *prometheus.Desc
37+
}
38+
39+
type nvmeSubsystem struct {
40+
Name string
41+
NQN string
42+
Model string
43+
Serial string
44+
IOPolicy string
45+
Controllers []nvmeController
46+
}
47+
48+
// nvmeController is a single controller directory inside a subsystem.
// Name is the directory name; State, Transport and Address hold the trimmed
// contents of the attribute files of the same (lower-case) names.
type nvmeController struct {
	Name, State, Transport, Address string
}
54+
55+
var (
	// nvmeControllerRE matches directory names made of "nvme" followed by
	// one or more digits (for example "nvme0").
	nvmeControllerRE = regexp.MustCompile(`^nvme\d+$`)

	// nvmeControllerStates is the closed set of values used for the "state"
	// label of the path_state metric. One series is emitted per entry for
	// every controller, so every value normalizeControllerState can return
	// must be listed here; otherwise a controller in that state would be
	// exported with all of its state series set to 0.
	nvmeControllerStates = []string{
		"live", "connecting", "resetting", "dead",
		"deleting", "deleting (no IO)", "new",
		"unknown",
	}
)

// normalizeControllerState maps the raw contents of a controller "state"
// file onto one of the entries of nvmeControllerStates. Recognised states
// are returned unchanged; anything else (including an empty string from an
// unreadable file) becomes "unknown".
func normalizeControllerState(raw string) string {
	// Membership is checked against the exported label set itself so the
	// two can never drift apart.
	for _, known := range nvmeControllerStates {
		if raw == known {
			return raw
		}
	}
	return "unknown"
}
73+
74+
func init() {
75+
registerCollector("nvmesubsystem", defaultDisabled, NewNVMeSubsystemCollector)
76+
}
77+
78+
// NewNVMeSubsystemCollector returns a new Collector exposing NVMe-oF subsystem
79+
// path health from /sys/class/nvme-subsystem/.
80+
func NewNVMeSubsystemCollector(logger *slog.Logger) (Collector, error) {
81+
const subsystem = "nvmesubsystem"
82+
83+
c := &nvmeSubsystemCollector{
84+
logger: logger,
85+
subsystemInfo: prometheus.NewDesc(
86+
prometheus.BuildFQName(namespace, subsystem, "info"),
87+
"Non-numeric information about an NVMe subsystem.",
88+
[]string{"subsystem", "nqn", "model", "serial", "iopolicy"}, nil,
89+
),
90+
subsystemPathsTotal: prometheus.NewDesc(
91+
prometheus.BuildFQName(namespace, subsystem, "paths_total"),
92+
"Total number of controller paths for an NVMe subsystem.",
93+
[]string{"subsystem"}, nil,
94+
),
95+
subsystemPathsLive: prometheus.NewDesc(
96+
prometheus.BuildFQName(namespace, subsystem, "paths_live"),
97+
"Number of controller paths in live state for an NVMe subsystem.",
98+
[]string{"subsystem"}, nil,
99+
),
100+
pathState: prometheus.NewDesc(
101+
prometheus.BuildFQName(namespace, subsystem, "path_state"),
102+
"Current NVMe controller path state (1 for the current state, 0 for all others).",
103+
[]string{"subsystem", "controller", "transport", "state"}, nil,
104+
),
105+
}
106+
107+
c.scanSubsystems = func() ([]nvmeSubsystem, error) {
108+
return scanNVMeSubsystems(*sysPath)
109+
}
110+
111+
return c, nil
112+
}
113+
114+
func (c *nvmeSubsystemCollector) Update(ch chan<- prometheus.Metric) error {
115+
subsystems, err := c.scanSubsystems()
116+
if err != nil {
117+
return fmt.Errorf("failed to scan NVMe subsystems: %w", err)
118+
}
119+
120+
for _, subsys := range subsystems {
121+
ch <- prometheus.MustNewConstMetric(c.subsystemInfo, prometheus.GaugeValue, 1,
122+
subsys.Name, subsys.NQN, subsys.Model, subsys.Serial, subsys.IOPolicy)
123+
124+
total := float64(len(subsys.Controllers))
125+
var live float64
126+
for _, ctrl := range subsys.Controllers {
127+
state := normalizeControllerState(ctrl.State)
128+
if state == "live" {
129+
live++
130+
}
131+
132+
for _, s := range nvmeControllerStates {
133+
val := 0.0
134+
if s == state {
135+
val = 1.0
136+
}
137+
ch <- prometheus.MustNewConstMetric(c.pathState, prometheus.GaugeValue, val,
138+
subsys.Name, ctrl.Name, ctrl.Transport, s)
139+
}
140+
}
141+
142+
ch <- prometheus.MustNewConstMetric(c.subsystemPathsTotal, prometheus.GaugeValue, total, subsys.Name)
143+
ch <- prometheus.MustNewConstMetric(c.subsystemPathsLive, prometheus.GaugeValue, live, subsys.Name)
144+
}
145+
146+
return nil
147+
}
148+
149+
func scanNVMeSubsystems(sysfsBase string) ([]nvmeSubsystem, error) {
150+
subsysBase := filepath.Join(sysfsBase, "class", "nvme-subsystem")
151+
152+
entries, err := os.ReadDir(subsysBase)
153+
if err != nil {
154+
return nil, err
155+
}
156+
157+
var subsystems []nvmeSubsystem
158+
for _, entry := range entries {
159+
if !strings.HasPrefix(entry.Name(), "nvme-subsys") {
160+
continue
161+
}
162+
subsysPath := filepath.Join(subsysBase, entry.Name())
163+
subsys, err := parseNVMeSubsystem(entry.Name(), subsysPath)
164+
if err != nil {
165+
continue
166+
}
167+
subsystems = append(subsystems, *subsys)
168+
}
169+
170+
return subsystems, nil
171+
}
172+
173+
func parseNVMeSubsystem(name, path string) (*nvmeSubsystem, error) {
174+
subsys := &nvmeSubsystem{Name: name}
175+
176+
subsys.NQN = readSysfsString(filepath.Join(path, "subsysnqn"))
177+
subsys.Model = readSysfsString(filepath.Join(path, "model"))
178+
subsys.Serial = readSysfsString(filepath.Join(path, "serial"))
179+
subsys.IOPolicy = readSysfsString(filepath.Join(path, "iopolicy"))
180+
181+
entries, err := os.ReadDir(path)
182+
if err != nil {
183+
return subsys, nil
184+
}
185+
186+
for _, entry := range entries {
187+
if !nvmeControllerRE.MatchString(entry.Name()) {
188+
continue
189+
}
190+
ctrlPath := filepath.Join(path, entry.Name())
191+
ctrl := nvmeController{
192+
Name: entry.Name(),
193+
State: readSysfsString(filepath.Join(ctrlPath, "state")),
194+
Transport: readSysfsString(filepath.Join(ctrlPath, "transport")),
195+
Address: readSysfsString(filepath.Join(ctrlPath, "address")),
196+
}
197+
subsys.Controllers = append(subsys.Controllers, ctrl)
198+
}
199+
200+
return subsys, nil
201+
}
202+
203+
// readSysfsString returns the contents of the file at path with leading and
// trailing whitespace removed. Any read error yields an empty string, so
// callers cannot tell a missing file from an empty one.
func readSysfsString(path string) string {
	raw, err := os.ReadFile(path)
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}

0 commit comments

Comments
 (0)