@@ -436,13 +436,15 @@ fn throw_exception(env: &mut JNIEnv, error: &CometError, backtrace: Option<Strin
436436 // Handle direct SparkError - serialize to JSON
437437 CometError :: Spark ( spark_error) => throw_spark_error_as_json ( env, spark_error) ,
438438 _ => {
439- // Check for file-not-found errors that may arrive through other wrapping paths
440439 let error_msg = error. to_string ( ) ;
440+ // Check for file-not-found errors that may arrive through other wrapping paths
441441 if error_msg. contains ( "not found" )
442442 && error_msg. contains ( "No such file or directory" )
443443 {
444444 let spark_error = SparkError :: FileNotFound { message : error_msg } ;
445445 throw_spark_error_as_json ( env, & spark_error)
446+ } else if let Some ( spark_error) = try_convert_duplicate_field_error ( & error_msg) {
447+ throw_spark_error_as_json ( env, & spark_error)
446448 } else {
447449 let exception = error. to_exception ( ) ;
448450 match backtrace {
@@ -474,6 +476,54 @@ fn throw_spark_error_as_json(
474476 )
475477}
476478
479+ /// Try to convert a DataFusion "Unable to get field named" error into a SparkError.
480+ /// DataFusion produces this error when reading Parquet files with duplicate field names
481+ /// in case-insensitive mode. For example, if a Parquet file has columns "B" and "b",
482+ /// DataFusion may deduplicate them and report: Unable to get field named "b". Valid
483+ /// fields: ["A", "B"]. When the requested field has a case-insensitive match among the
484+ /// valid fields, we convert this to Spark's _LEGACY_ERROR_TEMP_2093 error.
485+ fn try_convert_duplicate_field_error ( error_msg : & str ) -> Option < SparkError > {
486+ // Match: Schema error: Unable to get field named "X". Valid fields: [...]
487+ lazy_static ! {
488+ static ref FIELD_RE : Regex =
489+ Regex :: new( r#"Unable to get field named "([^"]+)"\. Valid fields: \[(.+)\]"# ) . unwrap( ) ;
490+ }
491+ if let Some ( caps) = FIELD_RE . captures ( error_msg) {
492+ let requested_field = caps. get ( 1 ) ?. as_str ( ) ;
493+ let requested_lower = requested_field. to_lowercase ( ) ;
494+ // Parse field names from the Valid fields list: ["A", "B"] or [A, B, b]
495+ let valid_fields_raw = caps. get ( 2 ) ?. as_str ( ) ;
496+ let all_fields: Vec < String > = valid_fields_raw
497+ . split ( ',' )
498+ . map ( |s| s. trim ( ) . trim_matches ( '"' ) . to_string ( ) )
499+ . collect ( ) ;
500+ // Find fields that match case-insensitively
501+ let mut matched: Vec < String > = all_fields
502+ . into_iter ( )
503+ . filter ( |f| f. to_lowercase ( ) == requested_lower)
504+ . collect ( ) ;
505+ // Need at least one case-insensitive match to treat this as a duplicate field error.
506+ // DataFusion may deduplicate columns case-insensitively, so the valid fields list
507+ // might contain only one variant (e.g. "B" when file has both "B" and "b").
508+ // If requested field differs from the match, both existed in the original file.
509+ if matched. is_empty ( ) {
510+ return None ;
511+ }
512+ // Add the requested field name if it's not already in the list (different case)
513+ if !matched. iter ( ) . any ( |f| f == requested_field) {
514+ matched. push ( requested_field. to_string ( ) ) ;
515+ }
516+ let required_field_name = requested_field. to_string ( ) ;
517+ let matched_fields = format ! ( "[{}]" , matched. join( ", " ) ) ;
518+ Some ( SparkError :: DuplicateFieldCaseInsensitive {
519+ required_field_name,
520+ matched_fields,
521+ } )
522+ } else {
523+ None
524+ }
525+ }
526+
477527#[ derive( Debug , Error ) ]
478528enum StacktraceError {
479529 #[ error( "Unable to initialize message: {0}" ) ]
0 commit comments