Skip to content

Commit d065ce5

Browse files
parthchandra
authored and andygrove committed
fix: Make cast string to timestamp compatible with Spark (apache#3884)
* fix: Make cast string to timestamp compatible with Spark. Add additional formats and handle edge cases. Update the compatibility guide with Spark-version-specific behaviour for casting string to timestamp.
1 parent 911f6e1 commit d065ce5

9 files changed

Lines changed: 925 additions & 124 deletions

File tree

docs/source/user-guide/latest/compatibility.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,15 @@ Cast operations in Comet fall into three levels of support:
145145
Spark.
146146
- **N/A**: Spark does not support this cast.
147147

148+
### String to Timestamp
149+
150+
Comet's native `CAST(string AS TIMESTAMP)` implementation supports all timestamp formats accepted
151+
by Apache Spark, including ISO 8601 date-time strings, date-only strings, time-only strings
152+
(`HH:MM:SS`), embedded timezone offsets (e.g. `+07:30`, `GMT-01:00`, `UTC`), named timezone
153+
suffixes (e.g. `Europe/Moscow`), and the full Spark timestamp year range
154+
(-290308 to 294247). Note that `CAST(string AS DATE)` is only compatible for years between
155+
262143 BC and 262142 AD due to an underlying library limitation.
156+
148157
### Legacy Mode
149158

150159
<!--BEGIN:CAST_LEGACY_TABLE-->

native/core/src/execution/planner.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,12 @@ impl PhysicalPlanner {
406406
Ok(Arc::new(Cast::new(
407407
child,
408408
datatype,
409-
SparkCastOptions::new(eval_mode, &expr.timezone, expr.allow_incompat),
409+
SparkCastOptions::new_with_version(
410+
eval_mode,
411+
&expr.timezone,
412+
expr.allow_incompat,
413+
expr.is_spark4_plus,
414+
),
410415
spark_expr.expr_id,
411416
query_context,
412417
)))

native/proto/src/proto/expr.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,9 @@ message Cast {
266266
string timezone = 3;
267267
EvalMode eval_mode = 4;
268268
bool allow_incompat = 5;
269+
// True when running against Spark 4.0+. Controls version-specific cast behaviour
270+
// such as the handling of leading whitespace before T-prefixed time-only strings.
271+
bool is_spark4_plus = 6;
269272
}
270273

271274
message BinaryExpr {

native/spark-expr/src/conversion_funcs/cast.rs

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ pub struct SparkCastOptions {
131131
pub timezone: String,
132132
/// Allow casts that are supported but not guaranteed to be 100% compatible
133133
pub allow_incompat: bool,
134+
/// True when running against Spark 4.0+. Enables version-specific cast behaviour
135+
/// such as the handling of leading whitespace before T-prefixed time-only strings.
136+
pub is_spark4_plus: bool,
134137
/// Support casting unsigned ints to signed ints (used by Parquet SchemaAdapter)
135138
pub allow_cast_unsigned_ints: bool,
136139
/// We also use the cast logic for adapting Parquet schemas, so this flag is used
@@ -148,6 +151,7 @@ impl SparkCastOptions {
148151
eval_mode,
149152
timezone: timezone.to_string(),
150153
allow_incompat,
154+
is_spark4_plus: false,
151155
allow_cast_unsigned_ints: false,
152156
is_adapting_schema: false,
153157
null_string: "null".to_string(),
@@ -160,12 +164,25 @@ impl SparkCastOptions {
160164
eval_mode,
161165
timezone: "".to_string(),
162166
allow_incompat,
167+
is_spark4_plus: false,
163168
allow_cast_unsigned_ints: false,
164169
is_adapting_schema: false,
165170
null_string: "null".to_string(),
166171
binary_output_style: None,
167172
}
168173
}
174+
175+
pub fn new_with_version(
176+
eval_mode: EvalMode,
177+
timezone: &str,
178+
allow_incompat: bool,
179+
is_spark4_plus: bool,
180+
) -> Self {
181+
Self {
182+
is_spark4_plus,
183+
..Self::new(eval_mode, timezone, allow_incompat)
184+
}
185+
}
169186
}
170187

171188
/// Spark-compatible cast implementation. Defers to DataFusion's cast where that is known
@@ -296,9 +313,13 @@ pub(crate) fn cast_array(
296313
let cast_result = match (&from_type, to_type) {
297314
(Utf8, Boolean) => spark_cast_utf8_to_boolean::<i32>(&array, eval_mode),
298315
(LargeUtf8, Boolean) => spark_cast_utf8_to_boolean::<i64>(&array, eval_mode),
299-
(Utf8, Timestamp(_, _)) => {
300-
cast_string_to_timestamp(&array, to_type, eval_mode, &cast_options.timezone)
301-
}
316+
(Utf8, Timestamp(_, _)) => cast_string_to_timestamp(
317+
&array,
318+
to_type,
319+
eval_mode,
320+
&cast_options.timezone,
321+
cast_options.is_spark4_plus,
322+
),
302323
(Utf8, Date32) => cast_string_to_date(&array, to_type, eval_mode),
303324
(Date32, Int32) => {
304325
// Date32 is stored as days since epoch (i32), so this is a simple reinterpret cast

0 commit comments

Comments
 (0)