Skip to content

Commit ff59b83

Browse files
nvidia-gpu: fix crash caused by device instance ID collision
When multiple devices of the same device type return identical instance IDs in response of the QueryDeviceIdentification MCTP VDM command, their device names become similar due to the instance ID being appended. This leads to multiple devices sharing the same dbus object path for their sensors, resulting in a dbus object path collision and an exception. This patch ensures that no dbus objects are created in the device class constructor, and that the init method is only invoked when a new device is instantiated. Additionally, it introduces checks for exceptions that may occur during device creation. Tested: Build an image for nvl32-obmc machine with the following patch cherry picked. https://gerrit.openbmc.org/c/openbmc/openbmc/+/85490 The patch cherry-picks the following patches that are currently under review. ``` 1. device tree https://lore.kernel.org/all/aRbLqH8pLWCQryhu@molberding.nvidia.com/ 2. mctpd patches CodeConstruct/mctp#85 3. u-boot changes https://lore.kernel.org/openbmc/20251121-msx4-v1-0-fc0118b666c1@nvidia.com/T/#t 4. kernel changes as specified in the openbmc patch (for espi) 5. entity-manager changes https://gerrit.openbmc.org/c/openbmc/entity-manager/+/85455 6. platform-init changes https://gerrit.openbmc.org/c/openbmc/platform-init/+/85456 7. spi changes https://lore.kernel.org/all/20251121-w25q01jv_fixup-v1-1-3d175050db73@nvidia.com/ ``` nvidiagpusensor application does not crash. Change-Id: I44afe0233ad5dfe634b42f906c736ada82640b79 Signed-off-by: Harshit Aghera <haghera@nvidia.com>
1 parent fb24f1f commit ff59b83

3 files changed

Lines changed: 66 additions & 34 deletions

File tree

src/nvidia-gpu/NvidiaDeviceDiscovery.cpp

Lines changed: 57 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -86,16 +86,26 @@ void processQueryDeviceIdResponse(
8686
"EID", eid, "DEVTYPE", responseDeviceType, "IID",
8787
responseInstanceId);
8888

89-
auto gpuName = configs.name + '_' +
90-
std::to_string(responseInstanceId);
91-
92-
auto gpu = gpuDevices
93-
.insert(std::make_pair(
94-
gpuName, std::make_shared<GpuDevice>(
95-
configs, gpuName, path, conn, eid,
96-
io, mctpRequester, objectServer)))
97-
.first;
98-
gpu->second->init();
89+
const std::string gpuName =
90+
std::format("{}_{}", configs.name, responseInstanceId);
91+
92+
std::shared_ptr<GpuDevice>& gpu = gpuDevices[gpuName];
93+
94+
if (gpu == nullptr)
95+
{
96+
gpu = std::make_shared<GpuDevice>(configs, gpuName, path, conn,
97+
eid, io, mctpRequester,
98+
objectServer);
99+
100+
gpu->init();
101+
}
102+
else
103+
{
104+
lg2::info(
105+
"GPU Device with name {NAME} already exists. Skipping creating a new device.",
106+
"NAME", gpuName);
107+
}
108+
99109
break;
100110
}
101111

@@ -106,16 +116,26 @@ void processQueryDeviceIdResponse(
106116
"EID", eid, "DEVTYPE", responseDeviceType, "IID",
107117
responseInstanceId);
108118

109-
auto smaName = configs.name + "_SMA_" +
110-
std::to_string(responseInstanceId);
119+
const std::string smaName =
120+
std::format("{}_SMA_{}", configs.name, responseInstanceId);
121+
122+
std::shared_ptr<SmaDevice>& sma = smaDevices[smaName];
123+
124+
if (sma == nullptr)
125+
{
126+
sma = std::make_shared<SmaDevice>(configs, smaName, path, conn,
127+
eid, io, mctpRequester,
128+
objectServer);
129+
130+
sma->init();
131+
}
132+
else
133+
{
134+
lg2::info(
135+
"SMA Device with name {NAME} already exists. Skipping creating a new device.",
136+
"NAME", smaName);
137+
}
111138

112-
auto sma = smaDevices
113-
.insert(std::make_pair(
114-
smaName, std::make_shared<SmaDevice>(
115-
configs, smaName, path, conn, eid,
116-
io, mctpRequester, objectServer)))
117-
.first;
118-
sma->second->init();
119139
break;
120140
}
121141

@@ -126,17 +146,26 @@ void processQueryDeviceIdResponse(
126146
"EID", eid, "DEVTYPE", responseDeviceType, "IID",
127147
responseInstanceId);
128148

129-
std::string pcieName =
149+
const std::string pcieName =
130150
std::format("Nvidia_ConnectX_{}", responseInstanceId);
131151

132-
auto pcieDevice =
133-
pcieDevices
134-
.insert(std::make_pair(
135-
pcieName, std::make_shared<PcieDevice>(
136-
configs, pcieName, path, conn, eid, io,
137-
mctpRequester, objectServer)))
138-
.first;
139-
pcieDevice->second->init();
152+
std::shared_ptr<PcieDevice>& pcie = pcieDevices[pcieName];
153+
154+
if (pcie == nullptr)
155+
{
156+
pcie = std::make_shared<PcieDevice>(
157+
configs, pcieName, path, conn, eid, io, mctpRequester,
158+
objectServer);
159+
160+
pcie->init();
161+
}
162+
else
163+
{
164+
lg2::info(
165+
"PCIe Device with name {NAME} already exists. Skipping creating a new device.",
166+
"NAME", pcieName);
167+
}
168+
140169
break;
141170
}
142171
}

src/nvidia-gpu/NvidiaGpuDevice.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,19 @@ GpuDevice::GpuDevice(const SensorConfigs& configs, const std::string& name,
5252
sdbusplus::asio::object_server& objectServer) :
5353
eid(eid), sensorPollMs(std::chrono::milliseconds{configs.pollRate}),
5454
waitTimer(io, std::chrono::steady_clock::duration(0)),
55-
mctpRequester(mctpRequester), conn(conn), objectServer(objectServer),
56-
configs(configs), name(escapeName(name)), path(path)
55+
mctpRequester(mctpRequester), io(io), conn(conn),
56+
objectServer(objectServer), configs(configs), name(escapeName(name)),
57+
path(path)
58+
{}
59+
60+
void GpuDevice::init()
5761
{
5862
inventory = std::make_shared<Inventory>(
5963
conn, objectServer, name, mctpRequester,
6064
gpu::DeviceIdentification::DEVICE_GPU, eid, io);
61-
}
65+
inventory->init();
6266

63-
void GpuDevice::init()
64-
{
6567
makeSensors();
66-
inventory->init();
6768
}
6869

6970
void GpuDevice::makeSensors()

src/nvidia-gpu/NvidiaGpuDevice.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ class GpuDevice : public std::enable_shared_from_this<GpuDevice>
6464

6565
mctp::MctpRequester& mctpRequester;
6666

67+
boost::asio::io_context& io;
68+
6769
std::shared_ptr<sdbusplus::asio::connection> conn;
6870

6971
sdbusplus::asio::object_server& objectServer;

0 commit comments

Comments
 (0)