Skip to content

Commit 8acec17

Browse files
sradcoAI Assistant
andcommitted
collector: add nvmesubsystem collector for NVMe-oF path health
Add a new disabled-by-default collector that reads /sys/class/nvme-subsystem/ to expose NVMe over Fabrics subsystem connectivity metrics. This complements the existing nvme collector (which reports per-controller hardware stats) by monitoring the subsystem-level path redundancy — how many controller paths are live, connecting, or dead for each NVMe subsystem. Exposed metrics: - node_nvmesubsystem_info - node_nvmesubsystem_paths - node_nvmesubsystem_paths_live - node_nvmesubsystem_path_state Signed-off-by: Shirly Radco <sradco@redhat.com> Co-authored-by: AI Assistant <noreply@cursor.com>
1 parent a1cbf81 commit 8acec17

File tree

4 files changed

+520
-0
lines changed

4 files changed

+520
-0
lines changed

README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ logind | Exposes session counts from [logind](http://www.freedesktop.org/wiki/So
202202
meminfo\_numa | Exposes memory statistics from `/sys/devices/system/node/node[0-9]*/meminfo`, `/sys/devices/system/node/node[0-9]*/numastat`. | Linux
203203
mountstats | Exposes filesystem statistics from `/proc/self/mountstats`. Exposes detailed NFS client statistics. | Linux
204204
network_route | Exposes the routing table as metrics | Linux
205+
nvmesubsystem | Exposes NVMe-oF subsystem path health from `/sys/class/nvme-subsystem/`. | Linux
205206
pcidevice | Exposes pci devices' information including their link status and parent devices. | Linux
206207
perf | Exposes perf based metrics (Warning: Metrics are dependent on kernel configuration and settings). | Linux
207208
processes | Exposes aggregate process statistics from `/proc`. | Linux
@@ -339,6 +340,25 @@ echo 'role{role="application_server"} 1' > /path/to/directory/role.prom.$$
339340
mv /path/to/directory/role.prom.$$ /path/to/directory/role.prom
340341
```
341342

343+
### NVMe Subsystem Collector
344+
345+
The `nvmesubsystem` collector exposes NVMe-oF (NVMe over Fabrics) subsystem
346+
path health by reading `/sys/class/nvme-subsystem/`. It complements the
347+
existing `nvme` collector (which reports per-controller hardware stats) by
348+
monitoring the **connectivity layer** — how many controller paths are live,
349+
connecting, or dead for each NVMe subsystem.
350+
351+
Enable it with `--collector.nvmesubsystem`.
352+
353+
#### Exposed metrics
354+
355+
| Metric | Description |
356+
|--------|-------------|
357+
| `node_nvmesubsystem_info` | Info metric with subsystem NQN, model, serial and I/O policy as labels. |
358+
| `node_nvmesubsystem_paths` | Number of controller paths for the subsystem. |
359+
| `node_nvmesubsystem_paths_live` | Number of controller paths currently in `live` state. |
360+
| `node_nvmesubsystem_path_state` | Per-controller path state (1 for the current state, 0 for others). |
361+
342362
### Filtering enabled collectors
343363

344364
The `node_exporter` will expose all metrics from enabled collectors by default. This is the recommended way to collect metrics to avoid errors when comparing metrics of different families.

collector/fixtures/sys.ttar

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2255,6 +2255,104 @@ Lines: 1
22552255
4096
22562256
Mode: 644
22572257
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2258+
Directory: sys/class/nvme-subsystem
2259+
Mode: 755
2260+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2261+
Directory: sys/class/nvme-subsystem/nvme-subsys0
2262+
Mode: 755
2263+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2264+
Path: sys/class/nvme-subsystem/nvme-subsys0/iopolicy
2265+
Lines: 1
2266+
round-robinEOF
2267+
Mode: 644
2268+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2269+
Path: sys/class/nvme-subsystem/nvme-subsys0/model
2270+
Lines: 1
2271+
Dell PowerStoreEOF
2272+
Mode: 644
2273+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2274+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme0
2275+
Mode: 755
2276+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2277+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/address
2278+
Lines: 1
2279+
nn-0x200000109b123456:pn-0x100000109b123456EOF
2280+
Mode: 644
2281+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2282+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/state
2283+
Lines: 1
2284+
liveEOF
2285+
Mode: 644
2286+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2287+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme0/transport
2288+
Lines: 1
2289+
fcEOF
2290+
Mode: 644
2291+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2292+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme1
2293+
Mode: 755
2294+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2295+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/address
2296+
Lines: 1
2297+
nn-0x200000109b123457:pn-0x100000109b123457EOF
2298+
Mode: 644
2299+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2300+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/state
2301+
Lines: 1
2302+
liveEOF
2303+
Mode: 644
2304+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2305+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme1/transport
2306+
Lines: 1
2307+
fcEOF
2308+
Mode: 644
2309+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2310+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme2
2311+
Mode: 755
2312+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2313+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/address
2314+
Lines: 1
2315+
nn-0x200000109b123458:pn-0x100000109b123458EOF
2316+
Mode: 644
2317+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2318+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/state
2319+
Lines: 1
2320+
liveEOF
2321+
Mode: 644
2322+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2323+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme2/transport
2324+
Lines: 1
2325+
fcEOF
2326+
Mode: 644
2327+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2328+
Directory: sys/class/nvme-subsystem/nvme-subsys0/nvme3
2329+
Mode: 755
2330+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2331+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/address
2332+
Lines: 1
2333+
nn-0x200000109b123459:pn-0x100000109b123459EOF
2334+
Mode: 644
2335+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2336+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/state
2337+
Lines: 1
2338+
deadEOF
2339+
Mode: 644
2340+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2341+
Path: sys/class/nvme-subsystem/nvme-subsys0/nvme3/transport
2342+
Lines: 1
2343+
fcEOF
2344+
Mode: 644
2345+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2346+
Path: sys/class/nvme-subsystem/nvme-subsys0/serial
2347+
Lines: 1
2348+
SN12345678EOF
2349+
Mode: 644
2350+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
2351+
Path: sys/class/nvme-subsystem/nvme-subsys0/subsysnqn
2352+
Lines: 1
2353+
nqn.2014-08.org.nvmexpress:uuid:a34c4f3a-0d6f-5cec-dead-beefcafebabeEOF
2354+
Mode: 644
2355+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
22582356
Directory: sys/class/power_supply
22592357
Mode: 755
22602358
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

collector/nvmesubsystem_linux.go

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
// Copyright The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nonvmesubsystem
15+
16+
package collector
17+
18+
import (
19+
"errors"
20+
"fmt"
21+
"log/slog"
22+
"os"
23+
"path/filepath"
24+
"regexp"
25+
"strings"
26+
27+
"github.com/prometheus/client_golang/prometheus"
28+
)
29+
30+
type nvmeSubsystemCollector struct {
31+
logger *slog.Logger
32+
scanSubsystems func() ([]nvmeSubsystem, error)
33+
34+
subsystemInfo *prometheus.Desc
35+
subsystemPaths *prometheus.Desc
36+
subsystemPathsLive *prometheus.Desc
37+
pathState *prometheus.Desc
38+
}
39+
40+
type nvmeSubsystem struct {
41+
Name string
42+
NQN string
43+
Model string
44+
Serial string
45+
IOPolicy string
46+
Controllers []nvmeController
47+
}
48+
49+
// nvmeController is one controller path belonging to an NVMe subsystem.
type nvmeController struct {
	Name      string // directory entry name, e.g. "nvme0"
	State     string // raw contents of the "state" attribute
	Transport string // contents of the "transport" attribute, e.g. "fc"
	Address   string // contents of the "address" attribute
}
55+
56+
var (
	// nvmeControllerRE matches controller directory names such as "nvme0".
	// Names with any suffix after the digits (e.g. "nvme0n1") do not match.
	nvmeControllerRE = regexp.MustCompile(`^nvme\d+$`)

	// nvmeControllerStates is the complete set of values used for the
	// "state" label of the path_state metric. Every value that
	// normalizeControllerState can return must be listed here; otherwise a
	// controller in that state would be exported with 0 for every state and
	// no series would ever be 1. The previously exported states keep their
	// original relative order; the extra states are inserted before the
	// "unknown" fallback.
	nvmeControllerStates = []string{
		"live", "connecting", "resetting", "dead",
		"new", "deleting", "deleting (no IO)",
		"unknown",
	}
)

// normalizeControllerState maps the raw contents of a controller's "state"
// attribute onto one of the values in nvmeControllerStates. Recognised states
// are returned unchanged; anything else, including the empty string produced
// by an unreadable attribute, becomes "unknown".
func normalizeControllerState(raw string) string {
	// Derive the answer from nvmeControllerStates so that the exported label
	// set and the normalisation can never disagree.
	for _, known := range nvmeControllerStates {
		if raw == known {
			return raw
		}
	}
	return "unknown"
}
74+
75+
func init() {
76+
registerCollector("nvmesubsystem", defaultDisabled, NewNVMeSubsystemCollector)
77+
}
78+
79+
// NewNVMeSubsystemCollector returns a new Collector exposing NVMe-oF subsystem
80+
// path health from /sys/class/nvme-subsystem/.
81+
func NewNVMeSubsystemCollector(logger *slog.Logger) (Collector, error) {
82+
const subsystem = "nvmesubsystem"
83+
84+
c := &nvmeSubsystemCollector{
85+
logger: logger,
86+
subsystemInfo: prometheus.NewDesc(
87+
prometheus.BuildFQName(namespace, subsystem, "info"),
88+
"Non-numeric information about an NVMe subsystem.",
89+
[]string{"subsystem", "nqn", "model", "serial", "iopolicy"}, nil,
90+
),
91+
subsystemPaths: prometheus.NewDesc(
92+
prometheus.BuildFQName(namespace, subsystem, "paths"),
93+
"Number of controller paths for an NVMe subsystem.",
94+
[]string{"subsystem"}, nil,
95+
),
96+
subsystemPathsLive: prometheus.NewDesc(
97+
prometheus.BuildFQName(namespace, subsystem, "paths_live"),
98+
"Number of controller paths in live state for an NVMe subsystem.",
99+
[]string{"subsystem"}, nil,
100+
),
101+
pathState: prometheus.NewDesc(
102+
prometheus.BuildFQName(namespace, subsystem, "path_state"),
103+
"Current NVMe controller path state (1 for the current state, 0 for all others).",
104+
[]string{"subsystem", "controller", "transport", "state"}, nil,
105+
),
106+
}
107+
108+
c.scanSubsystems = func() ([]nvmeSubsystem, error) {
109+
return scanNVMeSubsystems(*sysPath)
110+
}
111+
112+
return c, nil
113+
}
114+
115+
func (c *nvmeSubsystemCollector) Update(ch chan<- prometheus.Metric) error {
116+
subsystems, err := c.scanSubsystems()
117+
if err != nil {
118+
if errors.Is(err, os.ErrNotExist) || errors.Is(err, os.ErrPermission) {
119+
c.logger.Debug("Could not read NVMe subsystem info", "err", err)
120+
return ErrNoData
121+
}
122+
return fmt.Errorf("failed to scan NVMe subsystems: %w", err)
123+
}
124+
125+
for _, subsys := range subsystems {
126+
ch <- prometheus.MustNewConstMetric(c.subsystemInfo, prometheus.GaugeValue, 1,
127+
subsys.Name, subsys.NQN, subsys.Model, subsys.Serial, subsys.IOPolicy)
128+
129+
total := float64(len(subsys.Controllers))
130+
var live float64
131+
for _, ctrl := range subsys.Controllers {
132+
state := normalizeControllerState(ctrl.State)
133+
if state == "live" {
134+
live++
135+
}
136+
137+
for _, s := range nvmeControllerStates {
138+
val := 0.0
139+
if s == state {
140+
val = 1.0
141+
}
142+
ch <- prometheus.MustNewConstMetric(c.pathState, prometheus.GaugeValue, val,
143+
subsys.Name, ctrl.Name, ctrl.Transport, s)
144+
}
145+
}
146+
147+
ch <- prometheus.MustNewConstMetric(c.subsystemPaths, prometheus.GaugeValue, total, subsys.Name)
148+
ch <- prometheus.MustNewConstMetric(c.subsystemPathsLive, prometheus.GaugeValue, live, subsys.Name)
149+
}
150+
151+
return nil
152+
}
153+
154+
func scanNVMeSubsystems(sysfsBase string) ([]nvmeSubsystem, error) {
155+
subsysBase := filepath.Join(sysfsBase, "class", "nvme-subsystem")
156+
157+
entries, err := os.ReadDir(subsysBase)
158+
if err != nil {
159+
return nil, err
160+
}
161+
162+
var subsystems []nvmeSubsystem
163+
for _, entry := range entries {
164+
if !strings.HasPrefix(entry.Name(), "nvme-subsys") {
165+
continue
166+
}
167+
subsysPath := filepath.Join(subsysBase, entry.Name())
168+
subsys, err := parseNVMeSubsystem(entry.Name(), subsysPath)
169+
if err != nil {
170+
continue
171+
}
172+
subsystems = append(subsystems, *subsys)
173+
}
174+
175+
return subsystems, nil
176+
}
177+
178+
func parseNVMeSubsystem(name, path string) (*nvmeSubsystem, error) {
179+
subsys := &nvmeSubsystem{Name: name}
180+
181+
subsys.NQN = readSysfsString(filepath.Join(path, "subsysnqn"))
182+
subsys.Model = readSysfsString(filepath.Join(path, "model"))
183+
subsys.Serial = readSysfsString(filepath.Join(path, "serial"))
184+
subsys.IOPolicy = readSysfsString(filepath.Join(path, "iopolicy"))
185+
186+
entries, err := os.ReadDir(path)
187+
if err != nil {
188+
return subsys, nil
189+
}
190+
191+
for _, entry := range entries {
192+
if !nvmeControllerRE.MatchString(entry.Name()) {
193+
continue
194+
}
195+
ctrlPath := filepath.Join(path, entry.Name())
196+
ctrl := nvmeController{
197+
Name: entry.Name(),
198+
State: readSysfsString(filepath.Join(ctrlPath, "state")),
199+
Transport: readSysfsString(filepath.Join(ctrlPath, "transport")),
200+
Address: readSysfsString(filepath.Join(ctrlPath, "address")),
201+
}
202+
subsys.Controllers = append(subsys.Controllers, ctrl)
203+
}
204+
205+
return subsys, nil
206+
}
207+
208+
// readSysfsString returns the contents of the file at path with leading and
// trailing whitespace removed. Any read error yields the empty string, so
// callers cannot distinguish a missing attribute from an empty one.
func readSysfsString(path string) string {
	raw, err := os.ReadFile(path)
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}

0 commit comments

Comments
 (0)