Describe the bug
When a SQL query contains multiple correlated scalar subqueries in the SELECT clause, the scalar_subquery_to_join optimizer rule fails with:
Schema error: Ambiguous reference to unqualified field __always_true
To Reproduce
// Reproduction case for DataFusion bug: Ambiguous reference to __always_true
// with multiple correlated scalar subqueries
use datafusion::arrow::array::{Float64Array, StringArray};
use datafusion::arrow::datatypes::{DataType, Field, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::datasource::MemTable;
use datafusion::prelude::*;
use std::sync::Arc;
#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
println!("DataFusion Multiple Correlated Scalar Subqueries Bug Reproduction\n");
// Create a simple table with region and duration columns
let schema = Arc::new(Schema::new(vec![
Field::new("region", DataType::Utf8, false),
Field::new("duration", DataType::Float64, false),
]));
let region = StringArray::from(vec!["US", "EU", "US", "EU", "APAC"]);
let duration = Float64Array::from(vec![10.0, 20.0, 15.0, 25.0, 30.0]);
let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(region), Arc::new(duration)])?;
// Create session context
let config = SessionConfig::new().with_target_partitions(1);
let ctx = SessionContext::new_with_config(config);
// Register the table
let table = MemTable::try_new(schema, vec![vec![batch]])?;
ctx.register_table("activity", Arc::new(table))?;
println!("=== Test 1: Single correlated scalar subquery (WORKS) ===");
let sql_single = "
SELECT
region,
SUM(duration) as total_duration,
(SELECT COUNT(*) FROM activity WHERE region = a.region) as count
FROM activity a
GROUP BY region
";
match ctx.sql(sql_single).await {
Ok(df) => match df.collect().await {
Ok(results) => {
println!("Single subquery works!");
println!(
"Results: {} rows\n",
results.iter().map(|b| b.num_rows()).sum::<usize>()
);
}
Err(e) => println!("Execution error: {}\n", e),
},
Err(e) => println!("Planning error: {}\n", e),
}
println!("=== Test 2: Multiple correlated scalar subqueries (FAILS) ===");
let sql_multiple = "
SELECT
region,
SUM(duration) as total_duration,
(SELECT COUNT(*) FROM activity WHERE region = a.region) as count,
(SELECT MAX(duration) FROM activity WHERE region = a.region) as max_duration
FROM activity a
GROUP BY region
";
match ctx.sql(sql_multiple).await {
Ok(df) => match df.collect().await {
Ok(results) => {
println!("Multiple subqueries work!");
println!(
"Results: {} rows\n",
results.iter().map(|b| b.num_rows()).sum::<usize>()
);
}
Err(e) => {
println!("Execution error:");
println!("{}\n", e);
}
},
Err(e) => {
println!("❌ Planning error: {}\n", e);
}
}
println!("=== Test 3: Multiple subqueries with different tables (for comparison) ===");
// Create a second table
let schema2 = Arc::new(Schema::new(vec![
Field::new("region", DataType::Utf8, false),
Field::new("revenue", DataType::Float64, false),
]));
let region2 = StringArray::from(vec!["US", "EU", "APAC"]);
let revenue = Float64Array::from(vec![100.0, 200.0, 150.0]);
let batch2 = RecordBatch::try_new(schema2.clone(), vec![Arc::new(region2), Arc::new(revenue)])?;
let table2 = MemTable::try_new(schema2, vec![vec![batch2]])?;
ctx.register_table("sales", Arc::new(table2))?;
let sql_different_tables = "
SELECT
region,
SUM(duration) as total_duration,
(SELECT COUNT(*) FROM activity WHERE region = a.region) as activity_count,
(SELECT MAX(revenue) FROM sales WHERE region = a.region) as max_revenue
FROM activity a
GROUP BY region
";
match ctx.sql(sql_different_tables).await {
Ok(df) => match df.collect().await {
Ok(results) => {
println!("Multiple subqueries on different tables work!");
println!(
"Results: {} rows\n",
results.iter().map(|b| b.num_rows()).sum::<usize>()
);
}
Err(e) => println!("Execution error: {}\n", e),
},
Err(e) => println!("Planning error: {}\n", e),
}
Ok(())
}
This outputs errors for the 2 lasts queries like :
Optimizer rule 'scalar_subquery_to_join' failed
caused by
Schema error: Ambiguous reference to unqualified field __always_true
Expected behavior
The query should execute successfully, returning aggregated results with the scalar subquery values.
It's working with single correlated subquery.
Additional context
No response
Describe the bug
When a SQL query contains multiple correlated scalar subqueries in the SELECT clause, the
scalar_subquery_to_joinoptimizer rule fails with:To Reproduce
This outputs errors for the 2 lasts queries like :
Expected behavior
The query should execute successfully, returning aggregated results with the scalar subquery values.
It's working with single correlated subquery.
Additional context
No response