Skip to content

Commit cfd8a1c

Browse files
authored
fix: add timezone and special formats support for cast string to timestamp (#3730)
* fix: add timezone and special formats support for cast string to timestamp * improve splitting of date_part
1 parent 3f374c3 commit cfd8a1c

2 files changed

Lines changed: 249 additions & 72 deletions

File tree

native/spark-expr/src/conversion_funcs/string.rs

Lines changed: 226 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,11 @@ use std::str::FromStr;
3434
use std::sync::{Arc, LazyLock};
3535

3636
macro_rules! cast_utf8_to_timestamp {
37-
($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr) => {{
37+
// $tz is a Timezone:Tz object and contains the session timezone.
38+
// $to_tz_str is a string containing the to_type timezone
39+
($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr, $to_tz_str:expr) => {{
3840
let len = $array.len();
39-
let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC");
41+
let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone($to_tz_str);
4042
let mut cast_err: Option<SparkError> = None;
4143
for i in 0..len {
4244
if $array.is_null(i) {
@@ -675,16 +677,21 @@ pub(crate) fn cast_string_to_timestamp(
675677
.downcast_ref::<GenericStringArray<i32>>()
676678
.expect("Expected a string array");
677679

678-
let tz = &timezone::Tz::from_str(timezone_str).unwrap();
680+
let tz = &timezone::Tz::from_str(timezone_str)
681+
.map_err(|_| SparkError::Internal(format!("Invalid timezone string: {timezone_str}")))?;
679682

680683
let cast_array: ArrayRef = match to_type {
681-
DataType::Timestamp(_, _) => cast_utf8_to_timestamp!(
682-
string_array,
683-
eval_mode,
684-
TimestampMicrosecondType,
685-
timestamp_parser,
686-
tz
687-
)?,
684+
DataType::Timestamp(_, tz_opt) => {
685+
let to_tz = tz_opt.as_deref().unwrap_or("UTC");
686+
cast_utf8_to_timestamp!(
687+
string_array,
688+
eval_mode,
689+
TimestampMicrosecondType,
690+
timestamp_parser,
691+
tz,
692+
to_tz
693+
)?
694+
}
688695
_ => unreachable!("Invalid data type {:?} in cast from string", to_type),
689696
};
690697
Ok(cast_array)
@@ -967,20 +974,31 @@ fn get_timestamp_values<T: TimeZone>(
967974
timestamp_type: &str,
968975
tz: &T,
969976
) -> SparkResult<Option<i64>> {
970-
let values: Vec<_> = value.split(['T', '-', ':', '.']).collect();
971-
let year = values[0].parse::<i32>().unwrap_or_default();
977+
// Handle negative year: strip leading '-' and remember the sign.
978+
let (sign, date_part) = if let Some(stripped) = value.strip_prefix('-') {
979+
(-1i32, stripped)
980+
} else {
981+
(1i32, value)
982+
};
983+
let mut parts = date_part.split(['T', ' ', '-', ':', '.']);
984+
let year = sign
985+
* parts
986+
.next()
987+
.unwrap_or("")
988+
.parse::<i32>()
989+
.unwrap_or_default();
972990

973991
// NaiveDate (used internally by chrono's with_ymd_and_hms) is bounded to ±262142.
974992
if !(-262143..=262142).contains(&year) {
975993
return Ok(None);
976994
}
977995

978-
let month = values.get(1).map_or(1, |m| m.parse::<u32>().unwrap_or(1));
979-
let day = values.get(2).map_or(1, |d| d.parse::<u32>().unwrap_or(1));
980-
let hour = values.get(3).map_or(0, |h| h.parse::<u32>().unwrap_or(0));
981-
let minute = values.get(4).map_or(0, |m| m.parse::<u32>().unwrap_or(0));
982-
let second = values.get(5).map_or(0, |s| s.parse::<u32>().unwrap_or(0));
983-
let microsecond = values.get(6).map_or(0, |ms| ms.parse::<u32>().unwrap_or(0));
996+
let month = parts.next().map_or(1, |m| m.parse::<u32>().unwrap_or(1));
997+
let day = parts.next().map_or(1, |d| d.parse::<u32>().unwrap_or(1));
998+
let hour = parts.next().map_or(0, |h| h.parse::<u32>().unwrap_or(0));
999+
let minute = parts.next().map_or(0, |m| m.parse::<u32>().unwrap_or(0));
1000+
let second = parts.next().map_or(0, |s| s.parse::<u32>().unwrap_or(0));
1001+
let microsecond = parts.next().map_or(0, |ms| ms.parse::<u32>().unwrap_or(0));
9841002

9851003
let mut timestamp_info = TimeStampInfo::default();
9861004

@@ -1041,28 +1059,19 @@ fn parse_timestamp_to_micros<T: TimeZone>(
10411059
timestamp_info.second,
10421060
);
10431061

1044-
// Check if datetime is not None
1045-
let tz_datetime = match datetime.single() {
1062+
// Spark uses the offset before daylight savings change so we need to use earliest()
1063+
// Return None for LocalResult::None which is the invalid time in a DST spring forward gap).
1064+
let tz_datetime = match datetime.earliest() {
10461065
Some(dt) => dt
10471066
.with_timezone(tz)
10481067
.with_nanosecond(timestamp_info.microsecond * 1000),
1049-
None => {
1050-
return Err(SparkError::Internal(
1051-
"Failed to parse timestamp".to_string(),
1052-
));
1053-
}
1054-
};
1055-
1056-
let result = match tz_datetime {
1057-
Some(dt) => dt.timestamp_micros(),
1058-
None => {
1059-
return Err(SparkError::Internal(
1060-
"Failed to parse timestamp".to_string(),
1061-
));
1062-
}
1068+
None => return Ok(None),
10631069
};
10641070

1065-
Ok(Some(result))
1071+
match tz_datetime {
1072+
Some(dt) => Ok(Some(dt.timestamp_micros())),
1073+
None => Ok(None),
1074+
}
10661075
}
10671076

10681077
fn parse_str_to_year_timestamp<T: TimeZone>(value: &str, tz: &T) -> SparkResult<Option<i64>> {
@@ -1096,21 +1105,6 @@ fn parse_str_to_microsecond_timestamp<T: TimeZone>(
10961105
get_timestamp_values(value, "microsecond", tz)
10971106
}
10981107

1099-
type TimestampPattern<T> = (&'static Regex, fn(&str, &T) -> SparkResult<Option<i64>>);
1100-
1101-
static RE_YEAR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\d{4,7}$").unwrap());
1102-
static RE_MONTH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}$").unwrap());
1103-
static RE_DAY: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}$").unwrap());
1104-
static RE_HOUR: LazyLock<Regex> =
1105-
LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{1,2}$").unwrap());
1106-
static RE_MINUTE: LazyLock<Regex> =
1107-
LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}$").unwrap());
1108-
static RE_SECOND: LazyLock<Regex> =
1109-
LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$").unwrap());
1110-
static RE_MICROSECOND: LazyLock<Regex> =
1111-
LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
1112-
static RE_TIME_ONLY: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^T\d{1,2}$").unwrap());
1113-
11141108
fn timestamp_parser<T: TimeZone>(
11151109
value: &str,
11161110
eval_mode: EvalMode,
@@ -1120,15 +1114,103 @@ fn timestamp_parser<T: TimeZone>(
11201114
if value.is_empty() {
11211115
return Ok(None);
11221116
}
1123-
let patterns: &[TimestampPattern<T>] = &[
1124-
(&RE_YEAR, parse_str_to_year_timestamp),
1117+
1118+
// Handle Z or ±HH:MM offset suffix: strip it and parse with the explicit fixed offset.
1119+
if let Some((stripped, offset_secs)) = extract_offset_suffix(value) {
1120+
let fixed_tz = chrono::FixedOffset::east_opt(offset_secs)
1121+
.ok_or_else(|| SparkError::Internal("Invalid timezone offset".to_string()))?;
1122+
return timestamp_parser_with_tz(stripped, eval_mode, &fixed_tz);
1123+
}
1124+
1125+
timestamp_parser_with_tz(value, eval_mode, tz)
1126+
}
1127+
1128+
/// If `value` ends with a UTC offset suffix (`Z`, `+HH:MM`, or `-HH:MM`), returns the
1129+
/// stripped string and the offset in seconds. Returns `None` if no offset suffix is present.
1130+
fn extract_offset_suffix(value: &str) -> Option<(&str, i32)> {
1131+
if let Some(stripped) = value.strip_suffix('Z') {
1132+
return Some((stripped, 0));
1133+
}
1134+
// Check for ±HH:MM at the end (exactly 6 chars: sign + 2 digits + ':' + 2 digits)
1135+
if value.len() >= 6 {
1136+
let suffix_start = value.len() - 6;
1137+
let suffix = &value[suffix_start..];
1138+
let sign_byte = suffix.as_bytes()[0];
1139+
if (sign_byte == b'+' || sign_byte == b'-') && suffix.as_bytes()[3] == b':' {
1140+
if let (Ok(h), Ok(m)) = (suffix[1..3].parse::<i32>(), suffix[4..6].parse::<i32>()) {
1141+
let sign = if sign_byte == b'+' { 1i32 } else { -1i32 };
1142+
return Some((&value[..suffix_start], sign * (h * 3600 + m * 60)));
1143+
}
1144+
}
1145+
}
1146+
None
1147+
}
1148+
1149+
type TimestampParsePattern<T> = (&'static Regex, fn(&str, &T) -> SparkResult<Option<i64>>);
1150+
1151+
static RE_YEAR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,7}$").unwrap());
1152+
static RE_MONTH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}$").unwrap());
1153+
static RE_DAY: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}$").unwrap());
1154+
static RE_HOUR: LazyLock<Regex> =
1155+
LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{1,2}$").unwrap());
1156+
static RE_MINUTE: LazyLock<Regex> =
1157+
LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}$").unwrap());
1158+
static RE_SECOND: LazyLock<Regex> =
1159+
LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}$").unwrap());
1160+
static RE_MICROSECOND: LazyLock<Regex> =
1161+
LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
1162+
static RE_TIME_ONLY_H: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^T\d{1,2}$").unwrap());
1163+
static RE_TIME_ONLY_HM: LazyLock<Regex> =
1164+
LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}$").unwrap());
1165+
static RE_TIME_ONLY_HMS: LazyLock<Regex> =
1166+
LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}:\d{2}$").unwrap());
1167+
static RE_TIME_ONLY_HMSU: LazyLock<Regex> =
1168+
LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
1169+
static RE_BARE_HM: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}$").unwrap());
1170+
static RE_BARE_HMS: LazyLock<Regex> =
1171+
LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}:\d{2}$").unwrap());
1172+
static RE_BARE_HMSU: LazyLock<Regex> =
1173+
LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
1174+
1175+
fn timestamp_parser_with_tz<T: TimeZone>(
1176+
value: &str,
1177+
eval_mode: EvalMode,
1178+
tz: &T,
1179+
) -> SparkResult<Option<i64>> {
1180+
// Both T-separator and space-separator date-time forms are supported.
1181+
// Negative years are handled by get_timestamp_values detecting a leading '-'.
1182+
let patterns: &[TimestampParsePattern<T>] = &[
1183+
// Year only: 4-7 digits, optionally negative
1184+
(
1185+
&RE_YEAR,
1186+
parse_str_to_year_timestamp as fn(&str, &T) -> SparkResult<Option<i64>>,
1187+
),
1188+
// Year-month
11251189
(&RE_MONTH, parse_str_to_month_timestamp),
1190+
// Year-month-day
11261191
(&RE_DAY, parse_str_to_day_timestamp),
1192+
// Date T-or-space hour (1 or 2 digits)
11271193
(&RE_HOUR, parse_str_to_hour_timestamp),
1194+
// Date T-or-space hour:minute
11281195
(&RE_MINUTE, parse_str_to_minute_timestamp),
1196+
// Date T-or-space hour:minute:second
11291197
(&RE_SECOND, parse_str_to_second_timestamp),
1198+
// Date T-or-space hour:minute:second.fraction
11301199
(&RE_MICROSECOND, parse_str_to_microsecond_timestamp),
1131-
(&RE_TIME_ONLY, parse_str_to_time_only_timestamp),
1200+
// Time-only: T hour (1 or 2 digits, no colon)
1201+
(&RE_TIME_ONLY_H, parse_str_to_time_only_timestamp),
1202+
// Time-only: T hour:minute
1203+
(&RE_TIME_ONLY_HM, parse_str_to_time_only_timestamp),
1204+
// Time-only: T hour:minute:second
1205+
(&RE_TIME_ONLY_HMS, parse_str_to_time_only_timestamp),
1206+
// Time-only: T hour:minute:second.fraction
1207+
(&RE_TIME_ONLY_HMSU, parse_str_to_time_only_timestamp),
1208+
// Bare time-only: hour:minute (without T prefix)
1209+
(&RE_BARE_HM, parse_str_to_time_only_timestamp),
1210+
// Bare time-only: hour:minute:second
1211+
(&RE_BARE_HMS, parse_str_to_time_only_timestamp),
1212+
// Bare time-only: hour:minute:second.fraction
1213+
(&RE_BARE_HMSU, parse_str_to_time_only_timestamp),
11321214
];
11331215

11341216
let mut timestamp = None;
@@ -1157,23 +1239,43 @@ fn timestamp_parser<T: TimeZone>(
11571239
}
11581240

11591241
fn parse_str_to_time_only_timestamp<T: TimeZone>(value: &str, tz: &T) -> SparkResult<Option<i64>> {
1160-
let values: Vec<&str> = value.split('T').collect();
1161-
let time_values: Vec<u32> = values[1]
1162-
.split(':')
1163-
.map(|v| v.parse::<u32>().unwrap_or(0))
1164-
.collect();
1242+
// The 'T' is optional in the time format; strip it if specified.
1243+
let time_part = value.strip_prefix('T').unwrap_or(value);
1244+
1245+
// Parse time components: hour[:minute[:second[.fraction]]]
1246+
// Use splitn(3) so "12:34:56.789" splits into ["12", "34", "56.789"].
1247+
let colon_parts: Vec<&str> = time_part.splitn(3, ':').collect();
1248+
let hour: u32 = colon_parts[0].parse().unwrap_or(0);
1249+
let minute: u32 = colon_parts.get(1).and_then(|s| s.parse().ok()).unwrap_or(0);
1250+
let (second, nanosecond) = if let Some(sec_frac) = colon_parts.get(2) {
1251+
let dot_idx = sec_frac.find('.');
1252+
let sec: u32 = sec_frac[..dot_idx.unwrap_or(sec_frac.len())]
1253+
.parse()
1254+
.unwrap_or(0);
1255+
let ns: u32 = if let Some(dot) = dot_idx {
1256+
let frac = &sec_frac[dot + 1..];
1257+
// Interpret up to 6 digits as microseconds, padding with trailing zeros.
1258+
let trimmed = &frac[..frac.len().min(6)];
1259+
let padded = format!("{:0<6}", trimmed);
1260+
padded.parse::<u32>().unwrap_or(0) * 1000
1261+
} else {
1262+
0
1263+
};
1264+
(sec, ns)
1265+
} else {
1266+
(0, 0)
1267+
};
11651268

11661269
let datetime = tz.from_utc_datetime(&chrono::Utc::now().naive_utc());
1167-
let timestamp = datetime
1270+
let result = datetime
11681271
.with_timezone(tz)
1169-
.with_hour(time_values.first().copied().unwrap_or_default())
1170-
.and_then(|dt| dt.with_minute(*time_values.get(1).unwrap_or(&0)))
1171-
.and_then(|dt| dt.with_second(*time_values.get(2).unwrap_or(&0)))
1172-
.and_then(|dt| dt.with_nanosecond(*time_values.get(3).unwrap_or(&0) * 1_000))
1173-
.map(|dt| dt.timestamp_micros())
1174-
.unwrap_or_default();
1175-
1176-
Ok(Some(timestamp))
1272+
.with_hour(hour)
1273+
.and_then(|dt| dt.with_minute(minute))
1274+
.and_then(|dt| dt.with_second(second))
1275+
.and_then(|dt| dt.with_nanosecond(nanosecond))
1276+
.map(|dt| dt.timestamp_micros());
1277+
1278+
Ok(result)
11771279
}
11781280

11791281
//a string to date parser - port of spark's SparkDateTimeUtils#stringToDate.
@@ -1343,7 +1445,8 @@ mod tests {
13431445
eval_mode,
13441446
TimestampMicrosecondType,
13451447
timestamp_parser,
1346-
tz
1448+
tz,
1449+
"UTC"
13471450
)
13481451
.unwrap();
13491452

@@ -1373,7 +1476,8 @@ mod tests {
13731476
eval_mode,
13741477
TimestampMicrosecondType,
13751478
timestamp_parser,
1376-
tz
1479+
tz,
1480+
"UTC"
13771481
);
13781482
assert!(
13791483
result.is_err(),
@@ -1497,6 +1601,59 @@ mod tests {
14971601
timestamp_parser("10000-01-01T12:34:56.123456", EvalMode::Legacy, tz).unwrap(),
14981602
Some(253402346096123456)
14991603
);
1604+
// Space separator (same values as T separator)
1605+
assert_eq!(
1606+
timestamp_parser("2020-01-01 12", EvalMode::Legacy, tz).unwrap(),
1607+
Some(1577880000000000)
1608+
);
1609+
assert_eq!(
1610+
timestamp_parser("2020-01-01 12:34", EvalMode::Legacy, tz).unwrap(),
1611+
Some(1577882040000000)
1612+
);
1613+
assert_eq!(
1614+
timestamp_parser("2020-01-01 12:34:56", EvalMode::Legacy, tz).unwrap(),
1615+
Some(1577882096000000)
1616+
);
1617+
assert_eq!(
1618+
timestamp_parser("2020-01-01 12:34:56.123456", EvalMode::Legacy, tz).unwrap(),
1619+
Some(1577882096123456)
1620+
);
1621+
// Z suffix (UTC)
1622+
assert_eq!(
1623+
timestamp_parser("2020-01-01T12:34:56Z", EvalMode::Legacy, tz).unwrap(),
1624+
Some(1577882096000000)
1625+
);
1626+
// Positive offset suffix
1627+
assert_eq!(
1628+
timestamp_parser("2020-01-01T12:34:56+05:30", EvalMode::Legacy, tz).unwrap(),
1629+
Some(1577862296000000) // 12:34:56 UTC+5:30 = 07:04:56 UTC
1630+
);
1631+
// T-prefixed time-only with colon
1632+
assert!(timestamp_parser("T12:34", EvalMode::Legacy, tz)
1633+
.unwrap()
1634+
.is_some());
1635+
assert!(timestamp_parser("T12:34:56", EvalMode::Legacy, tz)
1636+
.unwrap()
1637+
.is_some());
1638+
assert!(timestamp_parser("T12:34:56.123456", EvalMode::Legacy, tz)
1639+
.unwrap()
1640+
.is_some());
1641+
// Bare time-only (hour:minute without T prefix)
1642+
assert!(timestamp_parser("12:34", EvalMode::Legacy, tz)
1643+
.unwrap()
1644+
.is_some());
1645+
assert!(timestamp_parser("12:34:56", EvalMode::Legacy, tz)
1646+
.unwrap()
1647+
.is_some());
1648+
// Negative year
1649+
assert!(timestamp_parser("-0001", EvalMode::Legacy, tz)
1650+
.unwrap()
1651+
.is_some());
1652+
assert!(
1653+
timestamp_parser("-0001-01-01T12:34:56", EvalMode::Legacy, tz)
1654+
.unwrap()
1655+
.is_some()
1656+
);
15001657
}
15011658

15021659
#[test]

0 commit comments

Comments
 (0)