Harden OneHot operator input validation and output size computation

Gopalakrishnan Nallasamy · Gopalakrishnan Nallasamy · commit 0c704ceb7ed5 · 2026-04-16T16:13:09.000-07:00
- Add overflow checks in PrepareOutputShape: an explicit int64 bounds check on the total output element count, and SafeInt for the prefix_dim_size multiplication, to prevent unbounded allocation when depth or the indices shape would overflow int64

- Guard against division by zero when prefix_dim_size is zero

- Add CUDA int32 range validation before fast_divmod to avoid silent truncation in gsl::narrow_cast for suffix_dim_size and depth_val * suffix_dim_size

- Check for nullptr from Output() in both CPU and CUDA Compute paths

- Add unit tests: depth overflow (two variants), negative depth, depth=1 edge case, scalar-indices rejection (ONNX spec requires rank >= 1), and opset 9 coverage
diff --git a/onnxruntime/core/providers/cpu/tensor/onehot.cc b/onnxruntime/core/providers/cpu/tensor/onehot.cc
@@ -16,9 +16,12 @@ limitations under the License.
 
 #include "core/providers/cpu/tensor/onehot.h"
 #include "core/common/eigen_common_wrapper.h"
+#include "core/common/safeint.h"
 #include "core/platform/env.h"
 #include "core/providers/common.h"
 
+#include <limits>
+
 #ifndef EIGEN_USE_THREADS
 #define EIGEN_USE_THREADS
 #endif
@@ -100,11 +103,28 @@ Status PrepareOutputShape(const Tensor* indices, const int64_t depth_val, const
 
   output_shape.insert(output_shape.begin() + true_axis, depth_val);
 
-  prefix_dim_size = 1;
+  // Validate that the total output tensor element count does not overflow int64.
+  {
+    int64_t total_elements = 1;
+    for (auto dim : output_shape) {
+      if (dim > 0 && total_elements > std::numeric_limits<int64_t>::max() / dim) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "OneHot: output tensor size would overflow for the given indices shape "
+                               "and depth value (", depth_val, ").");
+      }
+      total_elements *= dim;
+    }
+  }
+
+  // Use SafeInt for prefix_dim_size computation to guard against overflow.
+  SafeInt<int64_t> safe_prefix = 1;
   for (int64_t i = 0; i < true_axis; ++i) {
-    prefix_dim_size *= indices_dims[onnxruntime::narrow<size_t>(i)];
+    safe_prefix *= indices_dims[onnxruntime::narrow<size_t>(i)];
   }
-  suffix_dim_size = indices_shape.Size() / prefix_dim_size;
+  prefix_dim_size = safe_prefix;
+
+  // Guard against division by zero when indices have a zero-sized dimension before the axis.
+  suffix_dim_size = (prefix_dim_size > 0) ? (indices_shape.Size() / prefix_dim_size) : 0;
 
   return Status::OK();
 }
@@ -166,6 +186,7 @@ Status OneHotOp<in_type, out_type, depth_type>::Compute(OpKernelContext* p_op_ke
   // allocate output
   const auto* values_data = values->Data<out_type>();
   Tensor* output = p_op_kernel_context->Output(0, TensorShape(output_shape));
+  ORT_RETURN_IF_NOT(output, "OneHot: failed to allocate output tensor. Output shape may be too large.");
 
   // edge case where we have a dim with a value of 0
   if (output->Shape().Size() == 0)
diff --git a/onnxruntime/core/providers/cuda/tensor/onehot.cc b/onnxruntime/core/providers/cuda/tensor/onehot.cc
@@ -3,6 +3,8 @@
 
 #include "core/providers/cuda/tensor/onehot.h"
 
+#include <limits>
+
 using namespace onnxruntime::common;
 
 namespace onnxruntime {
@@ -55,11 +57,22 @@ Status OneHotOp<in_type, out_type, depth_type>::ComputeInternal(OpKernelContext*
   // allocate output
   const auto* values_data = reinterpret_cast<const CudaT_Out*>(values->Data<out_type>());
   Tensor* output = ctx->Output(0, TensorShape(output_shape));
+  ORT_RETURN_IF_NOT(output, "OneHot: failed to allocate output tensor. Output shape may be too large.");
 
   // edge case where we have a dim with a value of 0
   if (output->Shape().Size() == 0)
     return Status::OK();
 
+  // Validate that dimensions used by CUDA kernels fit in int32 range.
+  // fast_divmod requires int32 operands.
+  constexpr int64_t kInt32Max = std::numeric_limits<int>::max();
+  ORT_RETURN_IF_NOT(suffix_dim_size <= kInt32Max,
+                    "OneHot: suffix dimension size (", suffix_dim_size,
+                    ") exceeds int32 range supported by the CUDA kernel.");
+  ORT_RETURN_IF_NOT(depth_val <= kInt32Max / std::max(suffix_dim_size, int64_t{1}),
+                    "OneHot: depth (", depth_val, ") * suffix dimension size (", suffix_dim_size,
+                    ") exceeds int32 range supported by the CUDA kernel.");
+
   const fast_divmod fdm_suffix(gsl::narrow_cast<int>(suffix_dim_size));
   const auto* indices_data = indices->Data<in_type>();
   auto* output_data = reinterpret_cast<CudaT_Out*>(output->MutableData<out_type>());
diff --git a/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc b/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include <limits>
+
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
 #include "test/common/trt_op_test_utils.h"
@@ -499,6 +501,78 @@ TEST(OneHotOpTest, DimWithZero) {
   test.Run();
 }
 
+// Test that extremely large depth values that would cause output tensor size overflow are rejected.
+TEST(OneHotOpTest, DepthTooLarge_OutputSizeOverflow) {
+  OpTester test("OneHot", 11);
+  // indices shape [2, 3] with depth = INT64_MAX causes output shape [2, 3, INT64_MAX]
+  // which would overflow when computing total element count.
+  test.AddInput<int64_t>("indices", {2, 3}, {1, 2, 3, 4, 5, 6});
+  test.AddInput<int64_t>("depth", {1}, {std::numeric_limits<int64_t>::max()});
+  test.AddInput<int64_t>("values", {2}, {0, 1});
+  test.AddOutput<int64_t>("output", {2, 3, 1}, {0, 0, 0, 0, 0, 0});
+  test.Run(OpTester::ExpectResult::kExpectFailure, "output tensor size would overflow");
+}
+
+// Test that a very large depth value that overflows the element count with a large 1-D indices tensor is rejected.
+TEST(OneHotOpTest, DepthTooLarge_OutputSizeOverflow_LargeIndices) {
+  OpTester test("OneHot", 11);
+  // indices shape [1000] with depth = INT64_MAX / 500 causes overflow in element count.
+  const int64_t large_depth = std::numeric_limits<int64_t>::max() / 500;
+  std::vector<int64_t> indices(1000, 0);
+  std::vector<int64_t> dummy_output(1000, 0);
+  test.AddInput<int64_t>("indices", {1000}, indices);
+  test.AddInput<int64_t>("depth", {1}, {large_depth});
+  test.AddInput<int64_t>("values", {2}, {0, 1});
+  test.AddOutput<int64_t>("output", {1000, 1}, dummy_output);
+  test.Run(OpTester::ExpectResult::kExpectFailure, "output tensor size would overflow");
+}
+
+// Test that a negative depth value is rejected.
+TEST(OneHotOpTest, NegativeDepth) {
+  OpTester test("OneHot", 11);
+  test.AddInput<int64_t>("indices", {2, 3}, {1, 2, 3, 4, 5, 6});
+  test.AddInput<int64_t>("depth", {1}, {-5});
+  test.AddInput<int64_t>("values", {2}, {0, 1});
+  test.AddOutput<int64_t>("output", {2, 3, 1}, {0, 0, 0, 0, 0, 0});
+  test.Run(OpTester::ExpectResult::kExpectFailure, "Depth is negative");
+}
+
+// Test minimum valid depth value of 1.
+TEST(OneHotOpTest, DepthOne) {
+  OpTester test("OneHot", 11);
+  test.AddInput<int64_t>("indices", {3}, {0, 0, 0});
+  test.AddInput<int64_t>("depth", {1}, {1});
+  test.AddInput<int64_t>("values", {2}, {0, 1});
+  test.AddOutput<int64_t>("output", {3, 1}, {1, 1, 1});
+  test.Run();
+}
+
+// Test scalar (rank-0) indices are rejected per ONNX spec (indices must have rank >= 1).
+TEST(OneHotOpTest, ScalarIndicesRejected) {
+  OpTester test("OneHot", 11);
+  test.AddInput<int64_t>("indices", {}, {2});
+  test.AddInput<int64_t>("depth", {1}, {5});
+  test.AddInput<int64_t>("values", {2}, {0, 1});
+  test.AddOutput<int64_t>("output", {5}, {0, 0, 1, 0, 0});
+  test.Run(OpTester::ExpectResult::kExpectFailure, "Indices tensor must have rank >= 1");
+}
+
+// Test default-axis one-hot encoding with opset 9.
+TEST(OneHotOpTest, DefaultAxis_Opset9) {
+  OpTester test("OneHot", 9);
+  test.AddInput<int64_t>("indices", {2, 3}, {1, 9, 8, 2, 4, 6});
+  test.AddInput<int64_t>("depth", {1}, {10});
+  test.AddInput<int64_t>("values", {2}, {0, 1});
+  test.AddOutput<int64_t>("output", {2, 3, 10},
+                          {0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+                           0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+                           0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                           0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+                           0, 0, 0, 0, 0, 0, 1, 0, 0, 0});
+  test.Run();
+}
+
 #ifdef USE_CUDA
 
 TEST(OneHotOpTest, DefaultAxis_int64_MLFloat16_int64 /*indices, output, depth*/) {