Skip to content

Commit bfbdcf1

Browse files
committed
[SPARK-55502][PYTHON] Unify UDF and UDTF Arrow conversion error handling
### What changes were proposed in this pull request? Backport SPARK-55502 to branch-4.0: unify error messages for UDF and UDTF Arrow conversion errors to match master. **Key changes**: - UDF path: updated error messages from "Exception thrown when converting pandas.Series..." to user-friendly "Failed to convert..." / "Cannot convert..." format - UDTF path: replaced `UDTF_ARROW_TYPE_CAST_ERROR` error class with "Exception thrown when converting pandas.Series..." format (matching master's legacy path) - Removed unused `UDTF_ARROW_TYPE_CAST_ERROR` error condition - Updated test expectations to match new error messages ### Why are the changes needed? The cross-version CI test (master-server + branch-4.0-client) fails because master updated the error messages in SPARK-55502, but branch-4.0 tests still expect the old format. ### Does this PR introduce _any_ user-facing change? Yes, error messages change for UDF Arrow conversion errors (same changes as master). ### How was this patch tested? Updated existing unit tests. ### Was this patch authored or co-authored using generative AI tooling? Yes
1 parent a8973f3 commit bfbdcf1

7 files changed

Lines changed: 54 additions & 51 deletions

File tree

python/pyspark/errors/error-conditions.json

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -990,11 +990,6 @@
990990
"Return type of the user-defined function should be <expected>, but is <actual>."
991991
]
992992
},
993-
"UDTF_ARROW_TYPE_CAST_ERROR": {
994-
"message": [
995-
"Cannot convert the output value of the column '<col_name>' with type '<col_type>' to the specified return type of the column: '<arrow_type>'. Please check if the data types match and try again."
996-
]
997-
},
998993
"UDTF_CONSTRUCTOR_INVALID_IMPLEMENTS_ANALYZE_METHOD": {
999994
"message": [
1000995
"Failed to evaluate the user-defined table function '<name>' because its constructor is invalid: the function implements the 'analyze' method, but its constructor has more than two arguments (including the 'self' reference). Please update the table function so that its constructor accepts exactly one 'self' argument, or one 'self' argument plus another argument for the result of the 'analyze' method, and try the query again."

python/pyspark/sql/pandas/serializers.py

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -321,24 +321,26 @@ def _create_array(self, series, arrow_type, spark_type=None, arrow_cast=False):
321321
else:
322322
raise
323323
except TypeError as e:
324-
error_msg = (
325-
"Exception thrown when converting pandas.Series (%s) "
326-
"with name '%s' to Arrow Array (%s)."
327-
)
328-
raise PySparkTypeError(error_msg % (series.dtype, series.name, arrow_type)) from e
324+
raise PySparkTypeError(
325+
f"Cannot convert the output value of the column "
326+
f"'{series.name}' with type '{series.dtype}' to the "
327+
f"specified return type of the column: '{arrow_type}'."
328+
f" Please check if the data types match and try again."
329+
) from e
329330
except ValueError as e:
330331
error_msg = (
331-
"Exception thrown when converting pandas.Series (%s) "
332-
"with name '%s' to Arrow Array (%s)."
332+
f"Failed to convert the value of the column "
333+
f"'{series.name}' with type '{series.dtype}' to Arrow "
334+
f"type '{arrow_type}'."
333335
)
334336
if self._safecheck:
335-
error_msg = error_msg + (
336-
" It can be caused by overflows or other "
337-
"unsafe conversions warned by Arrow. Arrow safe type check "
338-
"can be disabled by using SQL config "
337+
error_msg += (
338+
" It can be caused by overflows or other unsafe "
339+
"conversions warned by Arrow. Arrow safe type "
340+
"check can be disabled by using SQL config "
339341
"`spark.sql.execution.pandas.convertToArrowArraySafely`."
340342
)
341-
raise PySparkValueError(error_msg % (series.dtype, series.name, arrow_type)) from e
343+
raise PySparkValueError(error_msg) from e
342344

343345
def _create_batch(self, series):
344346
"""
@@ -695,18 +697,22 @@ def _create_array(self, series, arrow_type, spark_type=None, arrow_cast=False):
695697
)
696698
else:
697699
raise
698-
except pa.lib.ArrowException:
699-
# Display the most user-friendly error messages instead of showing
700-
# arrow's error message. This also works better with Spark Connect
701-
# where the exception messages are by default truncated.
702-
raise PySparkRuntimeError(
703-
errorClass="UDTF_ARROW_TYPE_CAST_ERROR",
704-
messageParameters={
705-
"col_name": series.name,
706-
"col_type": str(series.dtype),
707-
"arrow_type": arrow_type,
708-
},
709-
) from None
700+
except pa.lib.ArrowException as e:
701+
error_msg = (
702+
"Exception thrown when converting pandas.Series (%s) "
703+
"with name '%s' to Arrow Array (%s)."
704+
% (series.dtype, series.name, arrow_type)
705+
)
706+
if isinstance(e, TypeError):
707+
raise PySparkTypeError(error_msg) from e
708+
if self._safecheck:
709+
error_msg += (
710+
" It can be caused by overflows or other unsafe "
711+
"conversions warned by Arrow. Arrow safe type "
712+
"check can be disabled by using SQL config "
713+
"`spark.sql.execution.pandas.convertToArrowArraySafely`."
714+
)
715+
raise PySparkValueError(error_msg) from e
710716

711717
def __repr__(self):
712718
return "ArrowStreamPandasUDTFSerializer"

python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -251,8 +251,8 @@ def check_apply_in_pandas_returning_incompatible_type(self):
251251
# sometimes we see ValueErrors
252252
with self.subTest(convert="string to double"):
253253
expected = (
254-
r"ValueError: Exception thrown when converting pandas.Series \(object\) "
255-
r"with name 'k' to Arrow Array \(double\)."
254+
r"ValueError: Failed to convert the value of the column 'k' "
255+
r"with type 'object' to Arrow type 'double'."
256256
)
257257
if safely:
258258
expected = expected + (
@@ -271,8 +271,9 @@ def check_apply_in_pandas_returning_incompatible_type(self):
271271
# sometimes we see TypeErrors
272272
with self.subTest(convert="double to string"):
273273
expected = (
274-
r"TypeError: Exception thrown when converting pandas.Series \(float64\) "
275-
r"with name 'k' to Arrow Array \(string\).\n"
274+
r"TypeError: Cannot convert the output value of the column 'k' "
275+
r"with type 'float64' to the specified return type of the column: "
276+
r"'string'. Please check if the data types match and try again.\n"
276277
)
277278
self._test_merge_error(
278279
fn=lambda lft, rgt: pd.DataFrame({"id": [1], "k": [2.0]}),

python/pyspark/sql/tests/pandas/test_pandas_grouped_map.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -357,8 +357,8 @@ def check_apply_in_pandas_returning_incompatible_type(self):
357357
# sometimes we see ValueErrors
358358
with self.subTest(convert="string to double"):
359359
expected = (
360-
r"ValueError: Exception thrown when converting pandas.Series \(object\) "
361-
r"with name 'mean' to Arrow Array \(double\)."
360+
r"ValueError: Failed to convert the value of the column 'mean' "
361+
r"with type 'object' to Arrow type 'double'."
362362
)
363363
if safely:
364364
expected = expected + (
@@ -377,8 +377,9 @@ def check_apply_in_pandas_returning_incompatible_type(self):
377377
with self.subTest(convert="double to string"):
378378
with self.assertRaisesRegex(
379379
PythonException,
380-
r"TypeError: Exception thrown when converting pandas.Series \(float64\) "
381-
r"with name 'mean' to Arrow Array \(string\).\n",
380+
r"TypeError: Cannot convert the output value of the column 'mean' "
381+
r"with type 'float64' to the specified return type of the column: "
382+
r"'string'. Please check if the data types match and try again.\n",
382383
):
383384
self._test_apply_in_pandas(
384385
lambda key, pdf: pd.DataFrame([key + (pdf.v.mean(),)]),

python/pyspark/sql/tests/pandas/test_pandas_map.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -263,8 +263,8 @@ def func(iterator):
263263
yield pdf.assign(id="test_string")
264264

265265
expected = (
266-
r"ValueError: Exception thrown when converting pandas.Series "
267-
r"\(object\) with name 'id' to Arrow Array \(double\)."
266+
r"ValueError: Failed to convert the value of the column 'id' "
267+
r"with type 'object' to Arrow type 'double'."
268268
)
269269
if safely:
270270
expected = expected + (
@@ -293,8 +293,8 @@ def func(iterator):
293293
)
294294
if safely:
295295
expected = (
296-
r"ValueError: Exception thrown when converting pandas.Series "
297-
r"\(float64\) with name 'id' to Arrow Array \(int32\)."
296+
r"ValueError: Failed to convert the value of the column 'id' "
297+
r"with type 'float64' to Arrow type 'int32'."
298298
" It can be caused by overflows or other "
299299
"unsafe conversions warned by Arrow. Arrow safe type check "
300300
"can be disabled by using SQL config "

python/pyspark/sql/tests/pandas/test_pandas_udf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ def udf(column):
315315
# Since 0.11.0, PyArrow supports the feature to raise an error for unsafe cast.
316316
with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": True}):
317317
with self.assertRaisesRegex(
318-
Exception, "Exception thrown when converting pandas.Series"
318+
Exception, "Failed to convert the value"
319319
):
320320
df.select(["A"]).withColumn("udf", udf("A")).collect()
321321

@@ -335,7 +335,7 @@ def udf(column):
335335
# When enabling safe type check, Arrow 0.11.0+ disallows overflow cast.
336336
with self.sql_conf({"spark.sql.execution.pandas.convertToArrowArraySafely": True}):
337337
with self.assertRaisesRegex(
338-
Exception, "Exception thrown when converting pandas.Series"
338+
Exception, "Failed to convert the value"
339339
):
340340
df.withColumn("udf", udf("id")).collect()
341341

python/pyspark/sql/tests/test_udtf.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2878,7 +2878,7 @@ class TestUDTF:
28782878
def eval(self):
28792879
yield 1,
28802880

2881-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
2881+
err = "Exception thrown when converting pandas.Series"
28822882

28832883
for ret_type, expected in [
28842884
("x: boolean", [Row(x=True)]),
@@ -2905,7 +2905,7 @@ class TestUDTF:
29052905
def eval(self):
29062906
yield "1",
29072907

2908-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
2908+
err = "Exception thrown when converting pandas.Series"
29092909

29102910
for ret_type, expected in [
29112911
("x: boolean", [Row(x=True)]),
@@ -2934,7 +2934,7 @@ class TestUDTF:
29342934
def eval(self):
29352935
yield "hello",
29362936

2937-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
2937+
err = "Exception thrown when converting pandas.Series"
29382938

29392939
for ret_type, expected in [
29402940
("x: boolean", err),
@@ -2963,7 +2963,7 @@ class TestUDTF:
29632963
def eval(self):
29642964
yield [0, 1.1, 2],
29652965

2966-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
2966+
err = "Exception thrown when converting pandas.Series"
29672967

29682968
for ret_type, expected in [
29692969
("x: boolean", err),
@@ -2996,7 +2996,7 @@ class TestUDTF:
29962996
def eval(self):
29972997
yield {"a": 0, "b": 1.1, "c": 2},
29982998

2999-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
2999+
err = "Exception thrown when converting pandas.Series"
30003000

30013001
for ret_type, expected in [
30023002
("x: boolean", err),
@@ -3028,7 +3028,7 @@ class TestUDTF:
30283028
def eval(self):
30293029
yield {"a": 0, "b": 1.1, "c": 2},
30303030

3031-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
3031+
err = "Exception thrown when converting pandas.Series"
30323032

30333033
for ret_type, expected in [
30343034
("x: boolean", err),
@@ -3059,7 +3059,7 @@ class TestUDTF:
30593059
def eval(self):
30603060
yield Row(a=0, b=1.1, c=2),
30613061

3062-
err = "UDTF_ARROW_TYPE_CAST_ERROR"
3062+
err = "Exception thrown when converting pandas.Series"
30633063

30643064
for ret_type, expected in [
30653065
("x: boolean", err),
@@ -3096,7 +3096,7 @@ def eval(self):
30963096
"x: array<int>",
30973097
]:
30983098
with self.subTest(ret_type=ret_type):
3099-
with self.assertRaisesRegex(PythonException, "UDTF_ARROW_TYPE_CAST_ERROR"):
3099+
with self.assertRaisesRegex(PythonException, "Exception thrown when converting pandas.Series"):
31003100
udtf(TestUDTF, returnType=ret_type)().collect()
31013101

31023102

0 commit comments

Comments (0)