Skip to content

Commit bfbdcf1

Browse files
committed
[SPARK-55502][PYTHON] Unify UDF and UDTF Arrow conversion error handling
### What changes were proposed in this pull request? Backport SPARK-55502 to branch-4.0: unify error messages for UDF and UDTF Arrow conversion errors to match master. **Key changes**: - UDF path: updated error messages from "Exception thrown when converting pandas.Series..." to user-friendly "Failed to convert..." / "Cannot convert..." format - UDTF path: replaced `UDTF_ARROW_TYPE_CAST_ERROR` error class with "Exception thrown when converting pandas.Series..." format (matching master's legacy path) - Removed unused `UDTF_ARROW_TYPE_CAST_ERROR` error condition - Updated test expectations to match new error messages ### Why are the changes needed? The cross-version CI test (master-server + branch-4.0-client) fails because master updated the error messages in SPARK-55502, but branch-4.0 tests still expect the old format. ### Does this PR introduce _any_ user-facing change? Yes, error messages change for UDF Arrow conversion errors (same changes as master). ### How was this patch tested? Updated existing unit tests. ### Was this patch authored or co-authored using generative AI tooling? Yes
1 parent a8973f3 commit bfbdcf1

7 files changed

Lines changed: 54 additions & 51 deletions

File tree

python/pyspark/errors/error-conditions.json

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -990,11 +990,6 @@
990990
"Return type of the user-defined function should be <expected>, but is <actual>."
991991
]
992992
},
993-
"UDTF_ARROW_TYPE_CAST_ERROR": {
994-
"message": [
995-
"Cannot convert the output value of the column '<col_name>' with type '<col_type>' to the specified return type of the column: '<arrow_type>'. Please check if the data types match and try again."
996-
]
997-
},
998993
"UDTF_CONSTRUCTOR_INVALID_IMPLEMENTS_ANALYZE_METHOD": {
999994
"message": [
1000995
"Failed to evaluate the user-defined table function '<name>' because its constructor is invalid: the function implements the 'analyze' method, but its constructor has more than two arguments (including the 'self' reference). Please update the table function so that its constructor accepts exactly one 'self' argument, or one 'self' argument plus another argument for the result of the 'analyze' method, and try the query again."

python/pyspark/sql/pandas/serializers.py

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -321,24 +321,26 @@ def _create_array(self, series, arrow_type, spark_type=None, arrow_cast=False):
321321
else:
322322
raise
323323
except TypeError as e:
324-
error_msg = (
325-
"Exception thrown when converting pandas.Series (%s) "
326-
"with name '%s' to Arrow Array (%s)."
327-
)
328-
raise PySparkTypeError(error_msg % (series.dtype, series.name, arrow_type)) from e
324+
raise PySparkTypeError(
325+
f"Cannot convert the output value of the column "
326+
f"'{series.name}' with type '{series.dtype}' to the "
327+
f"specified return type of the column: '{arrow_type}'."
328+
f" Please check if the data types match and try again."
329+
) from e
329330
except ValueError as e:
330331
error_msg = (
331-
"Exception thrown when converting pandas.Series (%s) "
332-
"with name '%s' to Arrow Array (%s)."
332+
f"Failed to convert the value of the column "
333+
f"'{series.name}' with type '{series.dtype}' to Arrow "
334+
f"type '{arrow_type}'."
333335
)
334336
if self._safecheck:
335-
error_msg = error_msg + (
336-
" It can be caused by overflows or other "
337-
"unsafe conversions warned by Arrow. Arrow safe type check "
338-
"can be disabled by using SQL config "
337+
error_msg += (
338+
" It can be caused by overflows or other unsafe "
339+
"conversions warned by Arrow. Arrow safe type "
340+
"check can be disabled by using SQL config "
339341
"`spark.sql.execution.pandas.convertToArrowArraySafely`."
340342
)
341-
raise PySparkValueError(error_msg % (series.dtype, series.name, arrow_type)) from e
343+
raise PySparkValueError(error_msg) from e
342344

343345
def _create_batch(self, series):
344346
"""
@@ -695,18 +697,22 @@ def _create_array(self, series, arrow_type, spark_type=None, arrow_cast=False):
695697
)
696698
else:
697699
raise
698-
except pa.lib.ArrowException:
699-
# Display the most user-friendly error messages instead of showing
700-
# arrow's error message. This also works better with Spark Connect
701-
# where the exception messages are by default truncated.
702-
raise PySparkRuntimeError(
703-
errorClass="UDTF_ARROW_TYPE_CAST_ERROR",
704-
messageParameters={
705-
"col_name": series.name,
706-
"col_type": str(series.dtype),
707-
"arrow_type": arrow_type,
708-
},
709-
) from None
700+
except pa.lib.ArrowException as e:
701+
error_msg = (
702+
"Exception thrown when converting pandas.Series (%s) "
703+
"with name '%s' to Arrow Array (%s)."
704+
% (series.dtype, series.name, arrow_type)
705+
)
706+
if isinstance(e, TypeError):
707+
raise PySparkTypeError(error_msg) from e
708+
if self._safecheck:
709+
error_msg += (
710+
" It can be caused by overflows or other unsafe "
711+
"conversions warned by Arrow. Arrow safe type "
712+
"check can be disabled by using SQL config "
713+
"`spark.sql.execution.pandas.convertToArrowArraySafely`."
714+
)
715+
raise PySparkValueError(error_msg) from e
710716

711717
def __repr__(self):
712718
return "ArrowStreamPandasUDTFSerializer"

python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -251,8 +251,8 @@ def check_apply_in_pandas_returning_incompatible_type(self):
251251
# sometimes we see ValueErrors
252252
with self.subTest(convert="string to double"):
253253
expected = (
254-
r"ValueError: Exception thrown when converting pandas.Series \(object\) "
255-
r"with name 'k' to Arrow Array \(double\)."
254+
r"ValueError: Failed to convert the value of the column 'k' "
255+
r"with type 'object' to Arrow type 'double'."
256256
)
257257
if safely:
258258
expected = expected + (
@@ -271,8 +271,9 @@ def check_apply_in_pandas_returning_incompatible_type(self):
271271
# sometimes we see TypeErrors
272272
with self.subTest(convert="double to string"):
273273
expected = (
274-
r"TypeError: Exception thrown when converting pandas.Series \(float64\) "
275-
r"with name 'k' to Arrow Array \(string\).\n"
274+
r"TypeError: Cannot convert the output value of the column 'k' "
275+
r"with type 'float64' to the specified return type of the column: "
276+
r"'string'. Please check if the data types match and try again.\n"
276277
)
277278
self._test_merge_error(
278279
fn=lambda lft, rgt: pd.DataFrame({"id": [1], "k": [2.0]}),

python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -357,8 +357,8 @@ def check_apply_in_pandas_returning_incompatible_type(self):
357357
# sometimes we see ValueErrors
358358
with self.subTest(convert="string to double"):
359359
expected = (
360-
r"ValueError: Exception thrown when converting pandas.Series \(object\) "
361-
r"with name 'mean' to Arrow Array \(double\)."
360+
r"ValueError: Failed to convert the value of the column 'mean' "
361+
r"with type 'object' to Arrow type 'double'."
362362
)
363363
if safely:
364364
expected = expected + (
@@ -377,8 +377,9 @@ def check_apply_in_pandas_returning_incompatible_type(self):
377377
with self.subTest(convert="double to string"):
378378
with self.assertRaisesRegex(
379379
PythonException,
380-
r"TypeError: Exception thrown when converting pandas.Series \(float64\) "
381-
r"with name 'mean' to Arrow Array \(string\).\n",
380+
r"TypeError: Cannot convert the output value of the column 'mean' "
381+
r"with type 'float64' to the specified return type of the column: "
382+
r"'string'. Please check if the data types match and try again.\n",
382383
):
383384
self._test_apply_in_pandas(
384385
lambda key, pdf: pd.DataFrame([key + (pdf.v.mean(),)]),

python/pyspark/sql/tests/pandas/test_pandas_map.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -263,8 +263,8 @@ def func(iterator):
263263
yield pdf.assign(id="test_string")
264264

265265
expected = (
266-
r"ValueError: Exception thrown when converting pandas.Series "
267-
r"\(object\) with name 'id' to Arrow Array \(double\)."
266+
r"ValueError: Failed to convert the value of the column 'id' "
267+
r"with type 'object' to Arrow type 'double'."
268268
)
269269
if safely:
270270
expected = expected + (
@@ -293,8 +293,8 @@ def func(iterator):
293293
)
294294
if safely:
295295
expected = (
296-
r"ValueError: Exception thrown when converting pandas.Series "
297-
r"\(float64\) with name 'id' to Arrow Array \(int32\)."
296+
r"ValueError: Failed to convert the value of the column 'id' "
297+
r"with type 'float64' to Arrow type 'int32'."
298298
" It can be caused by overflows or other "
299299
"unsafe conversions warned by Arrow. Arrow safe type check "
300300
"can be disabled by using SQL config "

python/pyspark/sql/tests/pandas/test_pandas_udf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ def udf(column):
315315
# Since 0.11.0, PyArrow supports the feature to raise an error for unsafe cast.
316316
with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": True}):
317317
with self.assertRaisesRegex(
318-
Exception, "Exception thrown when converting pandas.Series"
318+
Exception, "Failed to convert the value"
319319
):
320320
df.select(["A"]).withColumn("udf", udf("A")).collect()
321321

@@ -335,7 +335,7 @@ def udf(column):
335335
# When enabling safe type check, Arrow 0.11.0+ disallows overflow cast.
336336
with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": True}):
337337
with self.assertRaisesRegex(
338-
Exception, "Exception thrown when converting pandas.Series"
338+
Exception, "Failed to convert the value"
339339
):
340340
df.withColumn("udf", udf("id")).collect()
341341

python/pyspark/sql/tests/test_udtf.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2878,7 +2878,7 @@ class TestUDTF:
28782878
def eval(self):
28792879
yield 1,
28802880

2881-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
2881+
err = "Exception thrown when converting pandas.Series"
28822882

28832883
for ret_type, expected in [
28842884
("x: boolean", [Row(x=True)]),
@@ -2905,7 +2905,7 @@ class TestUDTF:
29052905
def eval(self):
29062906
yield "1",
29072907

2908-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
2908+
err = "Exception thrown when converting pandas.Series"
29092909

29102910
for ret_type, expected in [
29112911
("x: boolean", [Row(x=True)]),
@@ -2934,7 +2934,7 @@ class TestUDTF:
29342934
def eval(self):
29352935
yield "hello",
29362936

2937-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
2937+
err = "Exception thrown when converting pandas.Series"
29382938

29392939
for ret_type, expected in [
29402940
("x: boolean", err),
@@ -2963,7 +2963,7 @@ class TestUDTF:
29632963
def eval(self):
29642964
yield [0, 1.1, 2],
29652965

2966-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
2966+
err = "Exception thrown when converting pandas.Series"
29672967

29682968
for ret_type, expected in [
29692969
("x: boolean", err),
@@ -2996,7 +2996,7 @@ class TestUDTF:
29962996
def eval(self):
29972997
yield {"a": 0, "b": 1.1, "c": 2},
29982998

2999-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
2999+
err = "Exception thrown when converting pandas.Series"
30003000

30013001
for ret_type, expected in [
30023002
("x: boolean", err),
@@ -3028,7 +3028,7 @@ class TestUDTF:
30283028
def eval(self):
30293029
yield {"a": 0, "b": 1.1, "c": 2},
30303030

3031-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
3031+
err = "Exception thrown when converting pandas.Series"
30323032

30333033
for ret_type, expected in [
30343034
("x: boolean", err),
@@ -3059,7 +3059,7 @@ class TestUDTF:
30593059
def eval(self):
30603060
yield Row(a=0, b=1.1, c=2),
30613061

3062-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
3062+
err = "Exception thrown when converting pandas.Series"
30633063

30643064
for ret_type, expected in [
30653065
("x: boolean", err),
@@ -3096,7 +3096,7 @@ def eval(self):
30963096
"x: array<int>",
30973097
]:
30983098
with self.subTest(ret_type=ret_type):
3099-
with self.assertRaisesRegex(PythonException, "UDTF_ARROW_TYPE_CAST_ERROR"):
3099+
with self.assertRaisesRegex(PythonException, "Exception thrown when converting pandas.Series"):
31003100
udtf(TestUDTF, returnType=ret_type)().collect()
31013101

31023102

0 commit comments

Comments (0)