Skip to content

Commit 92553f1

Browse files
committed
One more format with leading whitespace
1 parent 39953ed commit 92553f1

2 files changed

Lines changed: 84 additions & 6 deletions

File tree

native/spark-expr/src/conversion_funcs/string.rs

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1243,11 +1243,24 @@ fn timestamp_parser<T: TimeZone>(
12431243
if trimmed.is_empty() {
12441244
return Ok(None);
12451245
}
1246-
// For T-hour-only format (T\d{1,2}, no colon), Spark rejects leading whitespace
1247-
// but accepts trailing whitespace. Check the raw (pre-trim) value for a leading
1248-
// space before allowing the match.
1249-
if value.len() > value.trim_start().len() && RE_TIME_ONLY_H.is_match(trimmed) {
1250-
return Ok(None);
1246+
// Spark rejects leading whitespace for ALL T-prefixed time-only strings
1247+
// (T<h>, T<h>:<m>, T<h>:<m>:<s>, T<h>:<m>:<s>.<f>), but accepts trailing whitespace.
1248+
// Check the raw (pre-trim) value for leading whitespace before any T-time-only match.
1249+
if value.len() > value.trim_start().len()
1250+
&& (RE_TIME_ONLY_H.is_match(trimmed)
1251+
|| RE_TIME_ONLY_HM.is_match(trimmed)
1252+
|| RE_TIME_ONLY_HMS.is_match(trimmed)
1253+
|| RE_TIME_ONLY_HMSU.is_match(trimmed))
1254+
{
1255+
return if eval_mode == EvalMode::Ansi {
1256+
Err(SparkError::InvalidInputInCastToDatetime {
1257+
value: value.to_string(),
1258+
from_type: "STRING".to_string(),
1259+
to_type: "TIMESTAMP".to_string(),
1260+
})
1261+
} else {
1262+
Ok(None)
1263+
};
12511264
}
12521265
let value = trimmed;
12531266
// Spark accepts a leading '+' year sign on full date-time strings (e.g. "+2020-01-01T12:34:56")
@@ -1875,6 +1888,34 @@ mod tests {
18751888
);
18761889
}
18771890

1891+
#[test]
1892+
fn test_leading_whitespace_t_hm() {
1893+
let tz = &timezone::Tz::from_str("UTC").unwrap();
1894+
// Spark rejects leading whitespace for ALL T-prefixed time-only patterns.
1895+
for ws_input in &[" T2:30", "\tT2:30", "\nT2:30", " T2", "\tT2", "\nT2"] {
1896+
assert!(
1897+
timestamp_parser(ws_input, EvalMode::Legacy, tz)
1898+
.unwrap()
1899+
.is_none(),
1900+
"'{ws_input}' should be null in Legacy mode"
1901+
);
1902+
// In ANSI mode the same inputs must raise an error (not silently return null).
1903+
assert!(
1904+
timestamp_parser(ws_input, EvalMode::Ansi, tz).is_err(),
1905+
"'{ws_input}' should error in ANSI mode"
1906+
);
1907+
}
1908+
// Without leading whitespace, these must be valid.
1909+
for ok_input in &["T2:30", "T2"] {
1910+
assert!(
1911+
timestamp_parser(ok_input, EvalMode::Legacy, tz)
1912+
.unwrap()
1913+
.is_some(),
1914+
"'{ok_input}' should be valid"
1915+
);
1916+
}
1917+
}
1918+
18781919
#[test]
18791920
fn plus_sign_year_test() {
18801921
let tz = &timezone::Tz::from_str("UTC").unwrap();

spark/src/test/scala/org/apache/comet/CometCastSuite.scala

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1065,6 +1065,42 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
10651065
}
10661066
}
10671067

1068+
test("cast StringType to TimestampType - T-hour-only whitespace handling") {
1069+
// RE_TIME_ONLY_H ("T<h>", no colon) has a Spark-specific rule:
1070+
// - leading whitespace → null
1071+
// - trailing whitespace → valid (treated as trimmed)
1072+
// T<h>:<m> follows the same leading-whitespace rule, whereas full datetime strings
1073+
// are trimmed and stay valid; both are included for completeness.
1074+
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") {
1075+
val values = Seq(
1076+
// Bare T-hour-only: no leading whitespace (valid)
1077+
"T2", // single-digit hour
1078+
"T23", // two-digit hour
1079+
"T0", // midnight
1080+
// Bare T-hour-only: trailing whitespace only (valid)
1081+
"T2 ", // trailing space
1082+
"T2\t", // trailing tab
1083+
"T2\n", // trailing newline
1084+
// Bare T-hour-only: leading whitespace (null per Spark)
1085+
" T2", // leading space
1086+
"\tT2", // leading tab
1087+
"\nT2", // leading newline
1088+
"\r\nT2", // leading CRLF
1089+
"\t T2", // tab then space
1090+
" T2", // double space
1091+
// T-hour:minute with leading whitespace — Spark ALSO returns null (same rule applies)
1092+
" T2:30",
1093+
"\tT2:30",
1094+
"\nT2:30",
1095+
// Full datetime: leading whitespace (Spark trims, so valid)
1096+
" 2020-01-01T12:34:56",
1097+
"\t2020-01-01T12:34:56",
1098+
"\n2020-01-01T12:34:56",
1099+
"\r\n2020-01-01T12:34:56")
1100+
castTimestampTest(values.toDF("a"), DataTypes.TimestampType, assertNative = true)
1101+
}
1102+
}
1103+
10681104
// CAST from BinaryType
10691105

10701106
test("cast BinaryType to StringType") {
@@ -1557,7 +1593,8 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
15571593
case (None, Some(e)) =>
15581594
throw e
15591595
case (Some(e), None) =>
1560-
fail(s"Comet should have failed with ${e.getCause.getMessage}")
1596+
val msg = if (e.getCause != null) e.getCause.getMessage else e.getMessage
1597+
fail(s"Comet should have failed with $msg")
15611598
case (Some(sparkException), Some(cometException)) =>
15621599
val sparkMessage =
15631600
if (sparkException.getCause != null) sparkException.getCause.getMessage

0 commit comments

Comments
 (0)