From 85d59bb2ed971bd532a0c063178fb2da2631ed45 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 9 Apr 2026 13:07:50 +0100 Subject: [PATCH 1/9] Add support for null aware mark-joins --- datafusion/common/src/join_type.rs | 7 +- datafusion/core/src/physical_planner.rs | 107 ++++- datafusion/expr/src/logical_plan/builder.rs | 35 +- datafusion/expr/src/logical_plan/mod.rs | 4 +- datafusion/expr/src/logical_plan/plan.rs | 40 +- .../src/decorrelate_predicate_subquery.rs | 129 ++++- .../src/extract_equijoin_predicate.rs | 2 +- .../physical-plan/src/joins/hash_join/exec.rs | 444 ++++++++++++++++-- .../src/joins/hash_join/stream.rs | 199 +++++++- .../src/joins/symmetric_hash_join.rs | 2 + datafusion/physical-plan/src/joins/utils.rs | 99 +++- .../test_files/null_aware_anti_join.slt | 11 - .../test_files/null_aware_mark_join.slt | 351 ++++++++++++++ 13 files changed, 1308 insertions(+), 122 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/null_aware_mark_join.slt diff --git a/datafusion/common/src/join_type.rs b/datafusion/common/src/join_type.rs index c77a1475ed227..de4e4dc84bb7e 100644 --- a/datafusion/common/src/join_type.rs +++ b/datafusion/common/src/join_type.rs @@ -61,9 +61,10 @@ pub enum JoinType { /// [1]. This join type is used to decorrelate EXISTS subqueries used inside disjunctive /// predicates. /// - /// Note: This we currently do not implement the full null semantics for the mark join described - /// in [1] which will be needed if we and ANY subqueries. In our version the mark column will - /// only be true for had a match and false when no match was found, never null. + /// For scalar `NOT IN`, DataFusion can plan a null-aware hash mark join where the + /// mark column is nullable: TRUE for a match, NULL for SQL UNKNOWN, and FALSE + /// otherwise. Row-valued multi-column `NOT IN` and non-hash residual predicate + /// null-aware mark semantics are not implemented. /// /// [1]: http://btw2017.informatik.uni-stuttgart.de/slidesandpapers/F1-10-37/paper_web.pdf LeftMark, diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 190a08da12222..174748026ec56 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1626,6 +1626,9 @@ impl DefaultPhysicalPlanner { let prefer_hash_join = session_state.config_options().optimizer.prefer_hash_join; + let can_repartition_join = session_state.config().target_partitions() > 1 + && session_state.config().repartition_joins() + && !*null_aware; // TODO: Allow PWMJ to deal with residual equijoin conditions let join: Arc = if join_on.is_empty() { @@ -1754,10 +1757,7 @@ impl DefaultPhysicalPlanner { None, )?) } - } else if session_state.config().target_partitions() > 1 - && session_state.config().repartition_joins() - && !prefer_hash_join - { + } else if can_repartition_join && !prefer_hash_join { // Use SortMergeJoin if hash join is not preferred let join_on_len = join_on.len(); Arc::new(SortMergeJoinExec::try_new( @@ -1769,24 +1769,15 @@ impl DefaultPhysicalPlanner { vec![SortOptions::default(); join_on_len], *null_equality, )?) - } else if session_state.config().target_partitions() > 1 - && session_state.config().repartition_joins() - && prefer_hash_join - && !*null_aware - // Null-aware joins must use CollectLeft - { - Arc::new(HashJoinExec::try_new( - physical_left, - physical_right, - join_on, - join_filter, - join_type, - None, - PartitionMode::Auto, - *null_equality, - *null_aware, - )?) } else { + // Null-aware joins need global probe-side state, so keep + // them in CollectLeft mode. + let partition_mode = if can_repartition_join { + PartitionMode::Auto + } else { + PartitionMode::CollectLeft + }; + Arc::new(HashJoinExec::try_new( physical_left, physical_right, @@ -1794,7 +1785,7 @@ impl DefaultPhysicalPlanner { join_filter, join_type, None, - PartitionMode::CollectLeft, + partition_mode, *null_equality, *null_aware, )?) @@ -3466,11 +3457,27 @@ mod tests { ctx.sql(query).await?.create_physical_plan().await } + async fn plan_sql_with_config( + query: &str, + config: SessionConfig, + ) -> Result> { + let ctx = SessionContext::new_with_config(config); + ctx.sql(query).await?.create_physical_plan().await + } + async fn collect_sql(query: &str) -> Result> { let ctx = SessionContext::new(); ctx.sql(query).await?.collect().await } + async fn collect_sql_with_config( + query: &str, + config: SessionConfig, + ) -> Result> { + let ctx = SessionContext::new_with_config(config); + ctx.sql(query).await?.collect().await + } + #[tokio::test] async fn test_all_operators() -> Result<()> { let logical_plan = test_csv_scan() @@ -3605,6 +3612,62 @@ mod tests { Ok(()) } + #[tokio::test] + async fn correlated_not_in_is_null_uses_null_aware_hash_mark_join() -> Result<()> { + let query = " + SELECT value + FROM ( + VALUES + (1, 1, 'a'), + (3, 1, 'b'), + (1, 2, 'c'), + (NULL, 1, 'd'), + (5, 3, 'e'), + (2, 1, 'f'), + (NULL, 2, 'g') + ) AS outer_corr_table(id, grp, value) + WHERE (id NOT IN ( + SELECT id + FROM ( + VALUES + (2, 1), + (NULL, 1), + (1, 2) + ) AS inner_corr_table(id, grp) + WHERE inner_corr_table.grp = outer_corr_table.grp + )) IS NULL + ORDER BY value"; + + let config = SessionConfig::new() + .with_target_partitions(4) + .set_bool("datafusion.optimizer.prefer_hash_join", false); + + let plan = plan_sql_with_config(query, config.clone()).await?; + let formatted = displayable(plan.as_ref()).indent(true).to_string(); + assert_contains!( + &formatted, + "HashJoinExec: mode=CollectLeft, join_type=LeftMark" + ); + assert!(!formatted.contains("SortMergeJoinExec"), "{formatted}"); + + let batches = collect_sql_with_config(query, config).await?; + assert_batches_eq!( + &[ + "+-------+", + "| value |", + "+-------+", + "| a |", + "| b |", + "| d |", + "| g |", + "+-------+", + ], + &batches + ); + + Ok(()) + } + #[tokio::test] async fn scalar_subquery_in_projection_and_filter_plans() -> Result<()> { let plan = plan_sql( diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 2ecb12c30afad..5707efa5b0a13 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1154,8 +1154,12 @@ impl LogicalPlanBuilder { .zip(right_keys) .map(|(l, r)| (Expr::Column(l), Expr::Column(r))) .collect(); - let join_schema = - build_join_schema(self.plan.schema(), right.schema(), &join_type)?; + let join_schema = build_join_schema_with_null_aware( + self.plan.schema(), + right.schema(), + &join_type, + null_aware, + )?; // Inner type without join condition is cross join if join_type != JoinType::Inner && on.is_empty() && filter.is_none() { @@ -1652,7 +1656,7 @@ pub fn unique_field_aliases(fields: &Fields) -> Vec> { .collect() } -fn mark_field(schema: &DFSchema) -> (Option, Arc) { +fn mark_field(schema: &DFSchema, nullable: bool) -> (Option, Arc) { let mut table_references = schema .iter() .filter_map(|(qualifier, _)| qualifier) @@ -1666,16 +1670,33 @@ fn mark_field(schema: &DFSchema) -> (Option, Arc) { ( table_reference, - Arc::new(Field::new("mark", DataType::Boolean, false)), + Arc::new(Field::new("mark", DataType::Boolean, nullable)), ) } /// Creates a schema for a join operation. -/// The fields from the left side are first +/// The fields from the left side are first. +/// +/// The `mark` column of a mark join is non-nullable. Use +/// [`build_join_schema_with_null_aware`] for null-aware mark joins, whose mark +/// column can be `NULL` (SQL UNKNOWN). pub fn build_join_schema( left: &DFSchema, right: &DFSchema, join_type: &JoinType, +) -> Result { + build_join_schema_with_null_aware(left, right, join_type, false) +} + +/// Like [`build_join_schema`], but makes the `LeftMark`/`RightMark` `mark` +/// column nullable when `null_aware` is set, so it can represent SQL UNKNOWN +/// for null-aware `NOT IN` semantics. `null_aware` has no effect on non-mark +/// join types. +pub fn build_join_schema_with_null_aware( + left: &DFSchema, + right: &DFSchema, + join_type: &JoinType, + null_aware: bool, ) -> Result { fn nullify_fields<'a>( fields: impl Iterator, &'a Arc)>, @@ -1738,7 +1759,7 @@ pub fn build_join_schema( } JoinType::LeftMark => left_fields .map(|(q, f)| (q.cloned(), Arc::clone(f))) - .chain(once(mark_field(right))) + .chain(once(mark_field(right, null_aware))) .collect(), JoinType::RightSemi | JoinType::RightAnti => { // Only use the right side for the schema @@ -1748,7 +1769,7 @@ pub fn build_join_schema( } JoinType::RightMark => right_fields .map(|(q, f)| (q.cloned(), Arc::clone(f))) - .chain(once(mark_field(left))) + .chain(once(mark_field(left, null_aware))) .collect(), }; let func_dependencies = left.functional_dependencies().join( diff --git a/datafusion/expr/src/logical_plan/mod.rs b/datafusion/expr/src/logical_plan/mod.rs index 4766c3f33379f..b84e9b23d3c5c 100644 --- a/datafusion/expr/src/logical_plan/mod.rs +++ b/datafusion/expr/src/logical_plan/mod.rs @@ -28,8 +28,8 @@ pub mod tree_node; pub use builder::{ LogicalPlanBuilder, LogicalPlanBuilderOptions, LogicalTableSource, UNNAMED_TABLE, - build_join_schema, requalify_sides_if_needed, table_scan, union, - wrap_projection_for_join_if_necessary, + build_join_schema, build_join_schema_with_null_aware, requalify_sides_if_needed, + table_scan, union, wrap_projection_for_join_if_necessary, }; pub use ddl::{ CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateFunction, diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 9ca6941a61ce6..c0324f6f096ba 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -47,7 +47,8 @@ use crate::utils::{ use crate::{ BinaryExpr, CreateMemoryTable, CreateView, Execute, Expr, ExprSchemable, GroupingSet, LogicalPlanBuilder, Operator, Prepare, TableProviderFilterPushDown, TableSource, - WindowFunctionDefinition, build_join_schema, expr_vec_fmt, requalify_sides_if_needed, + WindowFunctionDefinition, build_join_schema_with_null_aware, expr_vec_fmt, + requalify_sides_if_needed, }; use crate::statistics::StatisticsRequest; @@ -667,8 +668,12 @@ impl LogicalPlan { null_equality, null_aware, }) => { - let schema = - build_join_schema(left.schema(), right.schema(), &join_type)?; + let schema = build_join_schema_with_null_aware( + left.schema(), + right.schema(), + &join_type, + null_aware, + )?; let new_on: Vec<_> = on .into_iter() @@ -944,7 +949,12 @@ impl LogicalPlan { .. }) => { let (left, right) = self.only_two_inputs(inputs)?; - let schema = build_join_schema(left.schema(), right.schema(), join_type)?; + let schema = build_join_schema_with_null_aware( + left.schema(), + right.schema(), + join_type, + *null_aware, + )?; let equi_expr_count = on.len() * 2; assert!(expr.len() >= equi_expr_count); @@ -4228,13 +4238,13 @@ pub struct Join { pub schema: DFSchemaRef, /// Defines the null equality for the join. pub null_equality: NullEquality, - /// Whether this is a null-aware anti join (for NOT IN semantics). + /// Whether this join needs null-aware NOT IN semantics. /// - /// Only applies to LeftAnti joins. When true, implements SQL NOT IN semantics where: - /// - If the right side (subquery) contains any NULL in join keys, no rows are output - /// - Left side rows with NULL in join keys are not output + /// For `LeftAnti`, if the right side contains any NULL in join keys, no rows are output and + /// left rows with NULL join keys are also excluded. /// - /// This is required for correct NOT IN subquery behavior with three-valued logic. + /// For `LeftMark`, the generated `mark` column becomes nullable so unmatched rows can produce + /// `NULL` rather than `false` when SQL three-valued logic requires it. pub null_aware: bool, } @@ -4253,7 +4263,7 @@ impl Join { /// * `join_type` - Type of join (Inner, Left, Right, etc.) /// * `join_constraint` - Join constraint (On, Using) /// * `null_equality` - How to handle nulls in join comparisons - /// * `null_aware` - Whether this is a null-aware anti join (for NOT IN semantics) + /// * `null_aware` - Whether this join needs null-aware NOT IN semantics /// /// # Returns /// @@ -4269,7 +4279,12 @@ impl Join { null_equality: NullEquality, null_aware: bool, ) -> Result { - let join_schema = build_join_schema(left.schema(), right.schema(), &join_type)?; + let join_schema = build_join_schema_with_null_aware( + left.schema(), + right.schema(), + &join_type, + null_aware, + )?; Ok(Join { left, @@ -4320,10 +4335,11 @@ impl Join { .map(|(l, r)| (Expr::Column(l), Expr::Column(r))) .collect(); - let join_schema = build_join_schema( + let join_schema = build_join_schema_with_null_aware( left_sch.schema(), right_sch.schema(), &original_join.join_type, + original_join.null_aware, )?; Ok(( diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 0609109ec6e58..85b808dbcd3ac 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -21,6 +21,7 @@ use std::ops::Deref; use std::sync::Arc; use crate::decorrelate::PullUpCorrelatedExpr; +use crate::extract_equijoin_predicate::split_eq_and_noneq_join_predicate; use crate::optimizer::ApplyOrder; use crate::utils::replace_qualified_name; use crate::{OptimizerConfig, OptimizerRule}; @@ -448,9 +449,40 @@ fn build_join( sub_query_alias.clone() }; - // Mark joins don't use null-aware semantics (they use three-valued logic with mark column) + let mark_filter_is_hashable_only = + if join_type == JoinType::LeftMark && in_predicate_opt.is_some() { + let (_, residual_filter) = split_eq_and_noneq_join_predicate( + join_filter.clone(), + left.schema(), + right_projected.schema(), + )?; + residual_filter.is_none() + } else { + false + }; + + // For scalar NOT IN mark joins, propagate null-aware semantics into the + // nullable mark column when the predicate can be implemented by hash keys. + // Non-equality correlated filters stay on the legacy path because hash join + // execution cannot mark UNKNOWN candidates for residual predicates. + let null_aware = join_type == JoinType::LeftMark + && in_predicate_opt.is_some() + && mark_filter_is_hashable_only + && join_keys_may_be_null( + &join_filter, + left.schema(), + right_projected.schema(), + )?; + let new_plan = LogicalPlanBuilder::from(left.clone()) - .join_on(right_projected, join_type, Some(join_filter))? + .join_detailed_with_options( + right_projected, + join_type, + (Vec::::new(), Vec::::new()), + Some(join_filter), + NullEquality::NullEqualsNothing, + null_aware, + )? .build()?; debug!( @@ -571,6 +603,43 @@ mod tests { )) } + fn nullable_scalar_mark_scan(name: &str) -> Result { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("grp", DataType::Int32, true), + ]); + table_scan(Some(name), &schema, None)?.build() + } + + fn has_null_aware_left_mark_join(plan: &LogicalPlan) -> bool { + if let LogicalPlan::Join(join) = plan + && join.join_type == JoinType::LeftMark + { + return join.null_aware; + } + + plan.inputs().into_iter().any(has_null_aware_left_mark_join) + } + + fn has_non_null_aware_left_mark_join(plan: &LogicalPlan) -> bool { + if let LogicalPlan::Join(join) = plan + && join.join_type == JoinType::LeftMark + { + return !join.null_aware; + } + + plan.inputs() + .into_iter() + .any(has_non_null_aware_left_mark_join) + } + + fn optimize_with_decorrelate(plan: LogicalPlan) -> Result { + let optimizer = crate::Optimizer::with_rules(vec![Arc::new( + DecorrelatePredicateSubquery::new(), + )]); + optimizer.optimize(plan, &crate::OptimizerContext::new(), |_, _| {}) + } + /// Test for several IN subquery expressions #[test] fn in_subquery_multiple() -> Result<()> { @@ -1201,6 +1270,62 @@ mod tests { ) } + #[test] + fn correlated_not_in_mark_join_is_null_aware_for_hashable_filter() -> Result<()> { + let outer_scan = nullable_scalar_mark_scan("outer_t")?; + let inner_scan = nullable_scalar_mark_scan("inner_t")?; + + let subquery = Arc::new( + LogicalPlanBuilder::from(inner_scan) + .filter( + out_ref_col(DataType::Int32, "outer_t.grp").eq(col("inner_t.grp")), + )? + .project(vec![col("inner_t.id")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(outer_scan) + .filter(not_in_subquery(col("outer_t.id"), subquery).is_null())? + .build()?; + + let optimized = optimize_with_decorrelate(plan)?; + assert!( + has_null_aware_left_mark_join(&optimized), + "{}", + optimized.display_indent_schema() + ); + + Ok(()) + } + + #[test] + fn correlated_not_in_mark_join_is_not_null_aware_for_residual_filter() -> Result<()> { + let outer_scan = nullable_scalar_mark_scan("outer_t")?; + let inner_scan = nullable_scalar_mark_scan("inner_t")?; + + let subquery = Arc::new( + LogicalPlanBuilder::from(inner_scan) + .filter( + out_ref_col(DataType::Int32, "outer_t.grp").lt(col("inner_t.grp")), + )? + .project(vec![col("inner_t.id")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(outer_scan) + .filter(not_in_subquery(col("outer_t.id"), subquery).is_null())? + .build()?; + + let optimized = optimize_with_decorrelate(plan)?; + assert!( + has_non_null_aware_left_mark_join(&optimized), + "{}", + optimized.display_indent_schema() + ); + + Ok(()) + } + #[test] fn in_subquery_both_side_expr() -> Result<()> { let table_scan = test_table_scan()?; diff --git a/datafusion/optimizer/src/extract_equijoin_predicate.rs b/datafusion/optimizer/src/extract_equijoin_predicate.rs index 0a50761e8a9f7..03a5aaa20402c 100644 --- a/datafusion/optimizer/src/extract_equijoin_predicate.rs +++ b/datafusion/optimizer/src/extract_equijoin_predicate.rs @@ -174,7 +174,7 @@ impl OptimizerRule for ExtractEquijoinPredicate { /// /// According to the above rule, `expr1` is the equijoin predicate, while `expr2` and `expr3` are not. /// The function returns Ok(\[expr1\], Some(expr2 AND expr3)) -fn split_eq_and_noneq_join_predicate( +pub(crate) fn split_eq_and_noneq_join_predicate( filter: Expr, left_schema: &DFSchema, right_schema: &DFSchema, diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index b1d387ea74557..ed276cd2dcfe6 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -59,7 +59,7 @@ use crate::{ common::can_project, joins::utils::{ BuildProbeJoinMetrics, ColumnIndex, JoinFilter, JoinHashMapType, - build_join_schema, check_join_is_valid, estimate_join_statistics, + build_join_schema_with_null_aware, check_join_is_valid, estimate_join_statistics, need_produce_result_in_final, symmetric_join_output_partitioning, }, metrics::{ExecutionPlanMetricsSet, MetricsSet}, @@ -190,12 +190,19 @@ pub(super) struct JoinLeftData { /// The hash table with indices into `batch` /// Arc is used to allow sharing with SharedBuildAccumulator for hash map pushdown pub(super) map: Arc, + /// Hash table over correlated scope keys for scalar null-aware mark joins. + /// + /// For null-aware `LeftMark`, key 0 is the scalar `NOT IN` value key and + /// keys 1..N are correlated equality scope keys. + null_aware_mark_scope_map: Option>, /// The input rows for the build side batch: RecordBatch, /// The build side on expressions values values: Vec, /// Shared bitmap builder for visited left indices visited_indices_bitmap: SharedBitmapBuilder, + /// Shared bitmap builder for null marks + null_indices_bitmap: SharedBitmapBuilder, /// Counter of running probe-threads, potentially /// able to update `visited_indices_bitmap` probe_threads_counter: AtomicUsize, @@ -211,7 +218,7 @@ pub(super) struct JoinLeftData { /// Membership testing strategy for filter pushdown /// Contains either InList values for small build sides or hash table reference for large build sides pub(super) membership: PushdownStrategy, - /// Shared atomic flag indicating if any probe partition saw data (for null-aware anti joins) + /// Shared atomic flag indicating if any probe partition saw data (for null-aware anti/mark joins) /// This is shared across all probe partitions to provide global knowledge pub(super) probe_side_non_empty: AtomicBool, /// Shared atomic flag indicating if any probe partition saw NULL in join keys (for null-aware anti joins) @@ -224,6 +231,10 @@ impl JoinLeftData { &self.map } + pub(super) fn null_aware_mark_scope_map(&self) -> Option<&Map> { + self.null_aware_mark_scope_map.as_deref() + } + /// returns a reference to the build side batch pub(super) fn batch(&self) -> &RecordBatch { &self.batch @@ -256,6 +267,10 @@ impl JoinLeftData { &self.visited_indices_bitmap } + pub(super) fn null_indices_bitmap(&self) -> &SharedBitmapBuilder { + &self.null_indices_bitmap + } + /// returns a reference to the InList values for filter pushdown pub(super) fn membership(&self) -> &PushdownStrategy { &self.membership @@ -421,18 +436,26 @@ impl HashJoinExecBuilder { // Validate null_aware flag if exec.null_aware { let join_type = exec.join_type(); - if !matches!(join_type, JoinType::LeftAnti) { + if !matches!(join_type, JoinType::LeftAnti | JoinType::LeftMark) { return plan_err!( - "null_aware can only be true for LeftAnti joins, got {join_type}" + "null_aware can only be true for LeftAnti or LeftMark joins, got {join_type}" ); } let on = exec.on(); - if on.len() != 1 { + if *join_type == JoinType::LeftAnti && on.len() != 1 { return plan_err!( - "null_aware anti join only supports single column join key, got {} columns", + "null_aware LeftAnti joins only support single column join key, got {} columns", on.len() ); } + // Null-aware joins need global probe-side state (and a single build-side scope + // map for correlated LeftMark), so Partitioned would miss cross-partition rows + // and is forbidden. + if matches!(exec.partition_mode(), PartitionMode::Partitioned) { + return plan_err!( + "null_aware joins require PartitionMode::CollectLeft, got PartitionMode::Partitioned" + ); + } } if preserve_properties { @@ -467,8 +490,12 @@ impl HashJoinExecBuilder { } check_join_is_valid(&left_schema, &right_schema, &on)?; - let (join_schema, column_indices) = - build_join_schema(&left_schema, &right_schema, &join_type); + let (join_schema, column_indices) = build_join_schema_with_null_aware( + &left_schema, + &right_schema, + &join_type, + null_aware, + ); let join_schema = Arc::new(join_schema); @@ -764,7 +791,19 @@ pub struct HashJoinExec { column_indices: Vec, /// The equality null-handling behavior of the join algorithm. pub null_equality: NullEquality, - /// Flag to indicate if this is a null-aware anti join + /// Flag to indicate if this join uses null-aware equality semantics. + /// + /// Set for the physical lowering of scalar `NOT IN` subqueries (producing + /// `JoinType::LeftAnti` when uncorrelated or `JoinType::LeftMark` when + /// correlated). When `true`, NULLs in the join keys follow SQL `NOT IN` + /// three-valued logic rather than ordinary equi-join semantics. + /// + /// Key-ordering convention (relied on positionally, not enforced): for a + /// null-aware `LeftMark` join with more than one key, `on[0]` is the scalar + /// `NOT IN` value key and `on[1..N]` are the correlated equality scope keys. + /// Reordering these keys would silently produce wrong results, which is why + /// such joins are pinned to `PartitionMode::CollectLeft` (the only key + /// reorderer acts solely on `PartitionMode::Partitioned`). pub null_aware: bool, /// Cache holding plan properties like equivalences, output partitioning etc. cache: Arc, @@ -1389,6 +1428,9 @@ impl ExecutionPlan for HashJoinExec { Arc::clone(context.session_config().options()), self.null_equality, array_map_created_count, + self.null_aware + && self.join_type == JoinType::LeftMark + && on_left.len() > 1, )) })?, PartitionMode::Partitioned => { @@ -1409,6 +1451,9 @@ impl ExecutionPlan for HashJoinExec { Arc::clone(context.session_config().options()), self.null_equality, array_map_created_count, + self.null_aware + && self.join_type == JoinType::LeftMark + && on_left.len() > 1, )) } PartitionMode::Auto => { @@ -1911,6 +1956,29 @@ fn should_collect_min_max_for_perfect_hash( Ok(ArrayMap::is_supported_type(&data_type)) } +fn new_join_hashmap( + num_rows: usize, + reservation: &mut MemoryReservation, + metrics: &BuildProbeJoinMetrics, +) -> Result> { + let fixed_size_u32 = size_of::(); + let fixed_size_u64 = size_of::(); + + if num_rows > u32::MAX as usize { + let estimated_hashtable_size = + estimate_memory_size::<(u64, u64)>(num_rows, fixed_size_u64)?; + reservation.try_grow(estimated_hashtable_size)?; + metrics.build_mem_used.add(estimated_hashtable_size); + Ok(Box::new(JoinHashMapU64::with_capacity(num_rows))) + } else { + let estimated_hashtable_size = + estimate_memory_size::<(u32, u64)>(num_rows, fixed_size_u32)?; + reservation.try_grow(estimated_hashtable_size)?; + metrics.build_mem_used.add(estimated_hashtable_size); + Ok(Box::new(JoinHashMapU32::with_capacity(num_rows))) + } +} + /// Collects all batches from the left (build) side stream and creates a hash map for joining. /// /// This function is responsible for: @@ -1928,6 +1996,8 @@ fn should_collect_min_max_for_perfect_hash( /// * `with_visited_indices_bitmap` - Whether to track visited indices (for outer joins) /// * `probe_threads_count` - Number of threads that will probe this hash table /// * `should_compute_dynamic_filters` - Whether to compute min/max bounds for dynamic filtering +/// * `with_null_aware_mark_state` - Whether to build the per-build-row null-indices bitmap +/// and correlation-scope map used by correlated null-aware `LeftMark` joins /// /// # Dynamic Filter Coordination /// When `should_compute_dynamic_filters` is true, this function computes the min/max bounds @@ -1952,6 +2022,7 @@ async fn collect_left_input( config: Arc, null_equality: NullEquality, array_map_created_count: Count, + with_null_aware_mark_state: bool, ) -> Result { let schema = left_stream.schema(); @@ -2031,25 +2102,10 @@ async fn collect_left_input( } else { // Estimation of memory size, required for hashtable, prior to allocation. // Final result can be verified using `RawTable.allocation_info()` - let fixed_size_u32 = size_of::(); - let fixed_size_u64 = size_of::(); - // Use `u32` indices for the JoinHashMap when num_rows ≤ u32::MAX, otherwise use the // `u64` indice variant // Arc is used instead of Box to allow sharing with SharedBuildAccumulator for hash map pushdown - let mut hashmap: Box = if num_rows > u32::MAX as usize { - let estimated_hashtable_size = - estimate_memory_size::<(u64, u64)>(num_rows, fixed_size_u64)?; - reservation.try_grow(estimated_hashtable_size)?; - metrics.build_mem_used.add(estimated_hashtable_size); - Box::new(JoinHashMapU64::with_capacity(num_rows)) - } else { - let estimated_hashtable_size = - estimate_memory_size::<(u32, u64)>(num_rows, fixed_size_u32)?; - reservation.try_grow(estimated_hashtable_size)?; - metrics.build_mem_used.add(estimated_hashtable_size); - Box::new(JoinHashMapU32::with_capacity(num_rows)) - }; + let mut hashmap = new_join_hashmap(num_rows, &mut reservation, &metrics)?; let mut hashes_buffer = Vec::new(); let mut offset = 0; @@ -2082,19 +2138,59 @@ async fn collect_left_input( (Map::HashMap(hashmap), batch, left_values) }; - // Reserve additional memory for visited indices bitmap and create shared builder - let visited_indices_bitmap = if with_visited_indices_bitmap { + let allocate_bitmap = || -> Result { let bitmap_size = bit_util::ceil(batch.num_rows(), 8); reservation.try_grow(bitmap_size)?; metrics.build_mem_used.add(bitmap_size); - let mut bitmap_buffer = BooleanBufferBuilder::new(batch.num_rows()); - bitmap_buffer.append_n(num_rows, false); - bitmap_buffer + let mut bitmap = BooleanBufferBuilder::new(batch.num_rows()); + bitmap.append_n(num_rows, false); + Ok(bitmap) + }; + + // Reserve additional memory for visited indices bitmap and create shared builder + let visited_indices_bitmap = if with_visited_indices_bitmap { + allocate_bitmap()? } else { BooleanBufferBuilder::new(0) }; + let null_indices_bitmap = if with_null_aware_mark_state { + allocate_bitmap()? + } else { + BooleanBufferBuilder::new(0) + }; + + let null_aware_mark_scope_map = if with_null_aware_mark_state { + // Null-aware `LeftMark` convention: `on_left[0]` is the value key and + // `on_left[1..]` the scope keys, so the scope map needs more than one key. + debug_assert!( + on_left.len() > 1, + "null-aware LeftMark needs on_left[0]=value, on_left[1..]=scope, got {} key(s)", + on_left.len() + ); + // This secondary map is keyed only by correlation scope keys. The + // primary join map may still use ArrayMap for full-key TRUE matches, + // but scope-only NULL marking uses a HashMap for arbitrary key shapes. + let mut hashmap = new_join_hashmap(num_rows, &mut reservation, &metrics)?; + + let mut hashes_buffer = vec![0; batch.num_rows()]; + update_hash( + &on_left[1..], + &batch, + &mut *hashmap, + 0, + &random_state, + &mut hashes_buffer, + 0, + true, + )?; + + Some(Arc::new(Map::HashMap(hashmap))) + } else { + None + }; + let map = Arc::new(join_hash_map); let membership = if num_rows == 0 { @@ -2129,9 +2225,11 @@ async fn collect_left_input( let data = JoinLeftData { map, + null_aware_mark_scope_map, batch, values: left_values, visited_indices_bitmap: Mutex::new(visited_indices_bitmap), + null_indices_bitmap: Mutex::new(null_indices_bitmap), probe_threads_counter: AtomicUsize::new(probe_threads_count), _reservation: reservation, bounds, @@ -6500,7 +6598,7 @@ mod tests { Ok(()) } - /// Test that null_aware validation rejects non-LeftAnti join types + /// Test that null_aware validation rejects unsupported join types #[tokio::test] async fn test_null_aware_validation_wrong_join_type() { let left = @@ -6531,7 +6629,7 @@ mod tests { result .unwrap_err() .to_string() - .contains("null_aware can only be true for LeftAnti joins") + .contains("null_aware can only be true for LeftAnti or LeftMark joins") ); } @@ -6566,13 +6664,293 @@ mod tests { true, // null_aware = true (invalid for multi-column) ); + assert!(result.is_err()); + assert!( + result.unwrap_err().to_string().contains( + "null_aware LeftAnti joins only support single column join key" + ) + ); + } + + /// Test that null_aware validation rejects PartitionMode::Partitioned (must be CollectLeft). + #[tokio::test] + async fn test_null_aware_validation_partitioned() { + let left = build_table(("a", &vec![1]), ("b", &vec![2]), ("c", &vec![3])); + let right = build_table(("x", &vec![1]), ("y", &vec![2]), ("z", &vec![3])); + + let on = vec![ + ( + Arc::new(Column::new_with_schema("a", &left.schema()).unwrap()) as _, + Arc::new(Column::new_with_schema("x", &right.schema()).unwrap()) as _, + ), + ( + Arc::new(Column::new_with_schema("b", &left.schema()).unwrap()) as _, + Arc::new(Column::new_with_schema("y", &right.schema()).unwrap()) as _, + ), + ]; + + // LeftMark (not LeftAnti) so the single-key check is skipped and the partition-mode check is reached. + let result = HashJoinExec::try_new( + left, + right, + on, + None, + &JoinType::LeftMark, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + true, // null_aware = true (invalid with Partitioned) + ); + assert!(result.is_err()); assert!( result .unwrap_err() .to_string() - .contains("null_aware anti join only supports single column join key") + .contains("null_aware joins require PartitionMode::CollectLeft") + ); + } + + /// Test null-aware left mark join when probe side contains NULL. + /// Expected: + /// - matched rows => true + /// - unmatched non-NULL rows => NULL + /// - NULL build keys with non-empty probe side => NULL + #[apply(hash_join_exec_configs)] + #[tokio::test] + async fn test_null_aware_left_mark_probe_null(batch_size: usize) -> Result<()> { + let task_ctx = prepare_task_ctx(batch_size, false); + + let left = build_table_two_cols( + ("c1", &vec![Some(1), Some(4), None]), + ("dummy", &vec![Some(10), Some(40), Some(0)]), + ); + + let right = build_table_two_cols( + ("c2", &vec![Some(1), Some(2), None]), + ("dummy", &vec![Some(100), Some(200), Some(300)]), + ); + + let on = vec![( + Arc::new(Column::new_with_schema("c1", &left.schema())?) as _, + Arc::new(Column::new_with_schema("c2", &right.schema())?) as _, + )]; + + let join = HashJoinExec::try_new( + left, + right, + on, + None, + &JoinType::LeftMark, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNothing, + true, // null_aware = true + )?; + + let stream = join.execute(0, task_ctx)?; + let batches = common::collect(stream).await?; + + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r" + +----+-------+------+ + | c1 | dummy | mark | + +----+-------+------+ + | | 0 | | + | 1 | 10 | true | + | 4 | 40 | | + +----+-------+------+ + "); + } + + Ok(()) + } + + /// Test null-aware left mark join when probe side is empty. + /// Expected: all rows are marked false, including NULL build keys. + #[apply(hash_join_exec_configs)] + #[tokio::test] + async fn test_null_aware_left_mark_empty_probe(batch_size: usize) -> Result<()> { + let task_ctx = prepare_task_ctx(batch_size, false); + + let left = build_table_two_cols( + ("c1", &vec![Some(1), None]), + ("dummy", &vec![Some(10), Some(0)]), + ); + + let right = build_table_two_cols( + ("c2", &Vec::>::new()), + ("dummy", &Vec::>::new()), + ); + + let on = vec![( + Arc::new(Column::new_with_schema("c1", &left.schema())?) as _, + Arc::new(Column::new_with_schema("c2", &right.schema())?) as _, + )]; + + let join = HashJoinExec::try_new( + left, + right, + on, + None, + &JoinType::LeftMark, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNothing, + true, // null_aware = true + )?; + + let stream = join.execute(0, task_ctx)?; + let batches = common::collect(stream).await?; + + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r" + +----+-------+-------+ + | c1 | dummy | mark | + +----+-------+-------+ + | | 0 | false | + | 1 | 10 | false | + +----+-------+-------+ + "); + } + + Ok(()) + } + + /// Test scalar correlated null-aware left mark join. + /// + /// The first key is the scalar NOT IN value key. The second key is the + /// correlated scope key, so the NULL on the probe side only affects group 1. + #[apply(hash_join_exec_configs)] + #[tokio::test] + async fn test_null_aware_left_mark_correlated_scope(batch_size: usize) -> Result<()> { + let task_ctx = prepare_task_ctx(batch_size, false); + + let left = build_table_two_cols( + ("id", &vec![Some(1), Some(2), Some(3), None, None, Some(5)]), + ( + "grp", + &vec![Some(1), Some(1), Some(1), Some(1), Some(2), Some(3)], + ), + ); + + let right = build_table_two_cols( + ("id", &vec![Some(2), None, Some(1)]), + ("grp", &vec![Some(1), Some(1), Some(2)]), + ); + + let on = vec![ + ( + Arc::new(Column::new_with_schema("id", &left.schema())?) as _, + Arc::new(Column::new_with_schema("id", &right.schema())?) as _, + ), + ( + Arc::new(Column::new_with_schema("grp", &left.schema())?) as _, + Arc::new(Column::new_with_schema("grp", &right.schema())?) as _, + ), + ]; + + let join = HashJoinExec::try_new( + left, + right, + on, + None, + &JoinType::LeftMark, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNothing, + true, + )?; + + let stream = join.execute(0, task_ctx)?; + let batches = common::collect(stream).await?; + + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r" + +----+-----+-------+ + | id | grp | mark | + +----+-----+-------+ + | | 1 | | + | | 2 | | + | 1 | 1 | | + | 2 | 1 | true | + | 3 | 1 | | + | 5 | 3 | false | + +----+-----+-------+ + "); + } + + Ok(()) + } + + /// Scalar correlated null-aware left mark join where neither side's value + /// key (`id`) contains NULL. + /// + /// This exercises the fast path in `mark_null_candidates_for_probe_batch` + /// that skips the correlation-scope NULL lookup when no value key is NULL. + /// With no NULL value keys, SQL `NOT IN` can never be UNKNOWN, so the mark + /// must be `true`/`false` only and never NULL, regardless of correlation + /// scope or empty subqueries (`grp = 3` has no matching probe rows). + #[apply(hash_join_exec_configs)] + #[tokio::test] + async fn test_null_aware_left_mark_correlated_no_value_nulls( + batch_size: usize, + ) -> Result<()> { + let task_ctx = prepare_task_ctx(batch_size, false); + + let left = build_table_two_cols( + ("id", &vec![Some(1), Some(2), Some(3), Some(4)]), + ("grp", &vec![Some(1), Some(1), Some(2), Some(3)]), + ); + + let right = build_table_two_cols( + ("id", &vec![Some(2), Some(5)]), + ("grp", &vec![Some(1), Some(2)]), ); + + let on = vec![ + ( + Arc::new(Column::new_with_schema("id", &left.schema())?) as _, + Arc::new(Column::new_with_schema("id", &right.schema())?) as _, + ), + ( + Arc::new(Column::new_with_schema("grp", &left.schema())?) as _, + Arc::new(Column::new_with_schema("grp", &right.schema())?) as _, + ), + ]; + + let join = HashJoinExec::try_new( + left, + right, + on, + None, + &JoinType::LeftMark, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNothing, + true, + )?; + + let stream = join.execute(0, task_ctx)?; + let batches = common::collect(stream).await?; + + // Every mark is true/false; no UNKNOWN is produced when value keys are + // non-null. `id=2` matches within `grp=1` (true); `id=4`'s `grp=3` has + // an empty correlated subquery (false); the rest find no equal value. + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r" + +----+-----+-------+ + | id | grp | mark | + +----+-----+-------+ + | 1 | 1 | false | + | 2 | 1 | true | + | 3 | 2 | false | + | 4 | 3 | false | + +----+-----+-------+ + "); + } + + Ok(()) } #[test] diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index 2aa6e69dff807..7dfafd0166121 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -43,7 +43,7 @@ use crate::{ BuildProbeJoinMetrics, ColumnIndex, JoinFilter, JoinHashMapType, StatefulStreamResult, adjust_indices_by_join_type, apply_join_filter_to_indices, build_batch_empty_build_side, build_batch_from_indices, - need_produce_result_in_final, + build_null_aware_left_mark_column, need_produce_result_in_final, }, }; @@ -325,6 +325,11 @@ pub(super) struct HashJoinStream { probe_indices_buffer: Vec, /// Scratch space for build indices during hash lookup build_indices_buffer: Vec, + + null_mark_hashes_buffer: Vec, + null_mark_probe_indices_buffer: Vec, + null_mark_build_indices_buffer: Vec, + /// Specifies whether the right side has an ordering to potentially preserve right_side_ordered: bool, /// Owns this partition's build-data report lifecycle. @@ -334,7 +339,7 @@ pub(super) struct HashJoinStream { /// Output buffer for coalescing small batches into larger ones with optional fetch limit. /// Uses `LimitedBatchCoalescer` to efficiently combine batches and absorb limit with 'fetch' output_buffer: LimitedBatchCoalescer, - /// Whether this is a null-aware anti join + /// Whether this is a null-aware anti or mark joins null_aware: bool, } @@ -509,6 +514,9 @@ impl HashJoinStream { hashes_buffer, probe_indices_buffer: Vec::with_capacity(batch_size), build_indices_buffer: Vec::with_capacity(batch_size), + null_mark_hashes_buffer: Vec::with_capacity(batch_size), + null_mark_probe_indices_buffer: Vec::with_capacity(batch_size), + null_mark_build_indices_buffer: Vec::with_capacity(batch_size), right_side_ordered, build_report: BuildReportHandle::new(partition, mode, build_accumulator), mode, @@ -741,10 +749,11 @@ impl HashJoinStream { let timer = self.join_metrics.join_time.timer(); - // Null-aware anti join semantics: - // For LeftAnti: output LEFT (build) rows where LEFT.key NOT IN RIGHT.key - // 1. If RIGHT (probe) contains NULL in any batch, no LEFT rows should be output - // 2. LEFT rows with NULL keys should not be output (handled in final stage) + // Null-aware join bookkeeping: + // - LeftAnti needs global knowledge of probe-side NULLs/non-emptiness to implement NOT IN. + // - Uncorrelated LeftMark uses the same global probe-side state. + // - Correlated LeftMark records per-build-row NULL candidates below using + // a separate correlation-scope lookup. if self.null_aware { // Mark that we've seen a probe batch with actual rows (probe side is non-empty) // Only set this if batch has rows - empty batches don't count @@ -756,8 +765,7 @@ impl HashJoinStream { .store(true, Ordering::Relaxed); } - // Check if probe side (RIGHT) contains NULL - // Since null_aware validation ensures single column join, we only check the first column + // Check if the scalar NOT IN value key from the probe side contains NULL. let probe_key_column = &state.values[0]; if probe_key_column.null_count() > 0 { // Found NULL in probe side - set shared flag to prevent any output @@ -767,11 +775,12 @@ impl HashJoinStream { .store(true, Ordering::Relaxed); } - // If probe side has NULL (detected in this or any other partition), return empty result - if build_side - .left_data - .probe_side_has_null - .load(Ordering::Relaxed) + // LeftAnti can short-circuit once the probe side contains NULL. + if self.join_type == JoinType::LeftAnti + && build_side + .left_data + .probe_side_has_null + .load(Ordering::Relaxed) { timer.done(); self.state = HashJoinStreamState::FetchProbeBatch; @@ -796,6 +805,22 @@ impl HashJoinStream { return Ok(StatefulStreamResult::Continue); } + if self.null_aware + && self.join_type == JoinType::LeftMark + && state.values.len() > 1 + && state.offset == (0, None) + { + mark_null_candidates_for_probe_batch( + build_side, + state, + &self.random_state, + self.batch_size, + &mut self.null_mark_hashes_buffer, + &mut self.null_mark_probe_indices_buffer, + &mut self.null_mark_build_indices_buffer, + )?; + } + // get the matched by join keys indices let (left_indices, right_indices, next_offset) = match build_side.left_data.map() { @@ -919,6 +944,7 @@ impl HashJoinStream { &self.column_indices, join_side, self.join_type, + None, )?; let push_status = self.output_buffer.push_batch(batch)?; @@ -954,6 +980,7 @@ impl HashJoinStream { let timer = self.join_metrics.join_time.timer(); if !need_produce_result_in_final(self.join_type) { + timer.done(); self.state = HashJoinStreamState::Completed; return Ok(StatefulStreamResult::Continue); } @@ -963,6 +990,7 @@ impl HashJoinStream { // For null-aware anti join, if probe side had NULL, no rows should be output // Check shared atomic state to get global knowledge across all partitions if self.null_aware + && self.join_type == JoinType::LeftAnti && build_side .left_data .probe_side_has_null @@ -972,7 +1000,9 @@ impl HashJoinStream { self.state = HashJoinStreamState::Completed; return Ok(StatefulStreamResult::Continue); } + if !build_side.left_data.report_probe_completed() { + timer.done(); self.state = HashJoinStreamState::Completed; return Ok(StatefulStreamResult::Continue); } @@ -1002,32 +1032,56 @@ impl HashJoinStream { let filtered_indices: Vec = left_side .iter() .filter_map(|idx| { - let idx_usize = idx.unwrap() as usize; + let idx = idx.expect( + "LeftAnti final indices should always contain build-side rows", + ); + let idx_usize = idx as usize; if build_key_column.is_null(idx_usize) { None // Skip rows with NULL keys } else { - Some(idx.unwrap()) + Some(idx) } }) .collect(); left_side = UInt64Array::from(filtered_indices); - - // Update right_side to match the new length - let mut builder = arrow::array::UInt32Builder::with_capacity(left_side.len()); - builder.append_nulls(left_side.len()); - right_side = builder.finish(); + right_side = UInt32Array::new_null(left_side.len()); } self.join_metrics.input_batches.add(1); self.join_metrics.input_rows.add(left_side.len()); - timer.done(); - - self.state = HashJoinStreamState::Completed; - // Push final unmatched indices to output buffer if !left_side.is_empty() { + let mark_column = if self.null_aware && self.join_type == JoinType::LeftMark { + let probe_side_has_null = build_side + .left_data + .probe_side_has_null + .load(Ordering::Relaxed); + let probe_side_non_empty = build_side + .left_data + .probe_side_non_empty + .load(Ordering::Relaxed); + let build_key_column = &build_side.left_data.values()[0]; + let null_indices_bitmap = + if build_side.left_data.null_aware_mark_scope_map().is_some() { + Some(build_side.left_data.null_indices_bitmap().lock()) + } else { + None + }; + + Some(build_null_aware_left_mark_column( + &left_side, + &right_side, + build_key_column.as_ref(), + null_indices_bitmap.as_deref(), + probe_side_has_null, + probe_side_non_empty, + )) + } else { + None + }; + let empty_right_batch = RecordBatch::new_empty(self.right.schema()); let batch = build_batch_from_indices( &self.schema, @@ -1038,6 +1092,7 @@ impl HashJoinStream { &self.column_indices, JoinSide::Left, self.join_type, + mark_column.as_ref(), )?; let push_status = self.output_buffer.push_batch(batch)?; @@ -1047,10 +1102,106 @@ impl HashJoinStream { } } + timer.done(); + self.state = HashJoinStreamState::Completed; + Ok(StatefulStreamResult::Continue) } } +fn mark_null_candidates_for_probe_batch( + build_side: &BuildSideReadyState, + state: &ProcessProbeBatchState, + random_state: &RandomState, + batch_size: usize, + hashes_buffer: &mut Vec, + probe_indices_buffer: &mut Vec, + build_indices_buffer: &mut Vec, +) -> Result<()> { + let Some(null_aware_mark_scope_map) = + build_side.left_data.null_aware_mark_scope_map() + else { + return Ok(()); + }; + + let Map::HashMap(scope_hashmap) = null_aware_mark_scope_map else { + // `collect_left_input` constructs the correlation-scope map as a + // HashMap even when the primary full-key join map uses ArrayMap. + return internal_err!("null-aware mark scope map must be a hash map"); + }; + + // Key layout (consumed positionally below): index 0 is the `NOT IN` value + // key; indices 1..N are the correlation scope keys. + debug_assert!( + build_side.left_data.values().len() > 1, + "build keys must be [value, scope..]" + ); + debug_assert!( + state.values.len() > 1, + "probe keys must be [value, scope..]" + ); + debug_assert_eq!( + build_side.left_data.values().len(), + state.values.len(), + "build/probe key counts must match" + ); + + let build_value_key = &build_side.left_data.values()[0]; + let probe_value_key = &state.values[0]; + let build_scope_values = &build_side.left_data.values()[1..]; + let probe_scope_values = &state.values[1..]; + + if build_value_key.null_count() == 0 && probe_value_key.null_count() == 0 { + return Ok(()); + } + + hashes_buffer.clear(); + hashes_buffer.resize(state.batch.num_rows(), 0); + create_hashes(probe_scope_values, random_state, hashes_buffer)?; + + let mut null_bitmap = build_side.left_data.null_indices_bitmap().lock(); + let mut offset = (0, None); + loop { + let (build_indices, probe_indices, next_offset) = lookup_join_hashmap( + scope_hashmap.as_ref(), + build_scope_values, + probe_scope_values, + NullEquality::NullEqualsNothing, + hashes_buffer, + batch_size, + offset, + probe_indices_buffer, + build_indices_buffer, + )?; + + if !build_indices.is_empty() { + build_indices.iter().zip(probe_indices.iter()).for_each( + |(build_idx, probe_idx)| { + let build_idx = build_idx + .expect("scope lookup should produce non-null build indices") + as usize; + let probe_idx = probe_idx + .expect("scope lookup should produce non-null probe indices") + as usize; + + if probe_value_key.is_null(probe_idx) + || build_value_key.is_null(build_idx) + { + null_bitmap.set_bit(build_idx, true); + } + }, + ); + } + + let Some(next_offset) = next_offset else { + break; + }; + offset = next_offset; + } + + Ok(()) +} + impl Stream for HashJoinStream { type Item = Result; diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index a56ad1712aa8e..321d7e686c823 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -937,6 +937,7 @@ pub(crate) fn build_side_determined_results( column_indices, build_hash_joiner.build_side, join_type, + None, ) .map(|batch| (batch.num_rows() > 0).then_some(batch)) } else { @@ -1040,6 +1041,7 @@ pub(crate) fn join_with_probe_batch( column_indices, build_hash_joiner.build_side, join_type, + None, ) .map(|batch| (batch.num_rows() > 0).then_some(batch)) } diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 7ecace6b0e530..683c92c1950d2 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -257,11 +257,28 @@ fn output_join_field(old_field: &Field, join_type: &JoinType, is_left: bool) -> } /// Creates a schema for a join operation. -/// The fields from the left side are first +/// The fields from the left side are first. +/// +/// The `mark` column of a mark join is non-nullable. Use +/// [`build_join_schema_with_null_aware`] for null-aware mark joins, whose mark +/// column can be `NULL` (SQL UNKNOWN). pub fn build_join_schema( left: &Schema, right: &Schema, join_type: &JoinType, +) -> (Schema, Vec) { + build_join_schema_with_null_aware(left, right, join_type, false) +} + +/// Like [`build_join_schema`], but makes the `LeftMark`/`RightMark` `mark` +/// column nullable when `null_aware` is set, so it can represent SQL UNKNOWN +/// for null-aware `NOT IN` semantics. `null_aware` has no effect on non-mark +/// join types. +pub fn build_join_schema_with_null_aware( + left: &Schema, + right: &Schema, + join_type: &JoinType, + null_aware: bool, ) -> (Schema, Vec) { let left_fields = || { left.fields() @@ -304,7 +321,7 @@ pub fn build_join_schema( JoinType::LeftSemi | JoinType::LeftAnti => left_fields().unzip(), JoinType::LeftMark => { let right_field = once(( - Field::new("mark", DataType::Boolean, false), + Field::new("mark", DataType::Boolean, null_aware), ColumnIndex { index: 0, side: JoinSide::None, @@ -315,7 +332,7 @@ pub fn build_join_schema( JoinType::RightSemi | JoinType::RightAnti => right_fields().unzip(), JoinType::RightMark => { let left_field = once(( - Field::new("mark", DataType::Boolean, false), + Field::new("mark", DataType::Boolean, null_aware), ColumnIndex { index: 0, side: JoinSide::None, @@ -1263,6 +1280,7 @@ pub(crate) fn apply_join_filter_to_indices( filter.column_indices(), build_side, join_type, + None, )?; let filter_result = filter .expression() @@ -1285,6 +1303,7 @@ pub(crate) fn apply_join_filter_to_indices( filter.column_indices(), build_side, join_type, + None, )?; filter @@ -1326,6 +1345,7 @@ pub(crate) fn build_batch_from_indices( column_indices: &[ColumnIndex], build_side: JoinSide, join_type: JoinType, + mark_column: Option<&ArrayRef>, ) -> Result { if schema.fields().is_empty() { // For RightAnti and RightSemi joins, after `adjust_indices_by_join_type` @@ -1345,8 +1365,12 @@ pub(crate) fn build_batch_from_indices( for column_index in column_indices { let array = if column_index.side == JoinSide::None { - // For mark joins, the mark column is a true if the indices is not null, otherwise it will be false - Arc::new(compute::is_not_null(probe_indices)?) + // For mark joins, callers can provide a custom mark column. Otherwise, + // matched rows are `true` and unmatched rows are `false`. + match mark_column { + Some(mark_col) => Arc::clone(mark_col), + None => Arc::new(compute::is_not_null(probe_indices)?), + } } else if column_index.side == build_side { let array = build_input_buffer.column(column_index.index); if array.is_empty() || build_indices.null_count() == build_indices.len() { @@ -1373,6 +1397,71 @@ pub(crate) fn build_batch_from_indices( Ok(RecordBatch::try_new(Arc::new(schema.clone()), columns)?) } +/// Builds the nullable mark column for a null-aware `LeftMark` join. +/// +/// This follows the left mark hash join described in Neumann, Leis, and Kemper, +/// "The Complete Story of Joins (in HyPer)", Section 5.6: +/// +/// +/// `build_indices` and `probe_indices` are the final aligned indices derived from the +/// visited bitmap. At this point: +/// - valid `probe_indices` mean the build row matched at least one probe row, so the mark is `TRUE` +/// - null `probe_indices` mean the build row was unmatched, so the result depends on SQL +/// three-valued logic +/// +/// For the uncorrelated single-key implementation, unmatched rows are classified as follows: +/// 1. if the build key is `NULL` and the probe side is non-empty, the mark is `NULL` +/// 2. if the build key is `NULL` and the probe side is empty, the mark is `FALSE` +/// 3. if the build key is non-null and the probe side contained a `NULL`, the mark is `NULL` +/// 4. otherwise, the mark is `FALSE` +/// +/// For correlated scalar `NOT IN`, `null_indices_bitmap` carries the same UNKNOWN +/// decision per build row, scoped by the correlated equality keys. +/// +/// This is the helper equivalent of the paper's "null bucket" and `hadNull` handling. +/// It is intentionally scoped to scalar null-aware mark joins. +pub(crate) fn build_null_aware_left_mark_column( + build_indices: &UInt64Array, + probe_indices: &UInt32Array, + build_key_column: &dyn Array, + null_indices_bitmap: Option<&BooleanBufferBuilder>, + probe_side_has_null: bool, + probe_side_non_empty: bool, +) -> ArrayRef { + Arc::new( + build_indices + .iter() + .enumerate() + .map(|(output_idx, build_idx)| { + if probe_indices.is_valid(output_idx) { + Some(true) + } else { + let build_idx = build_idx.expect( + "LeftMark final indices should always contain build-side rows", + ) as usize; + if let Some(null_indices_bitmap) = null_indices_bitmap { + if null_indices_bitmap.get_bit(build_idx) { + None + } else { + Some(false) + } + } else if build_key_column.is_null(build_idx) { + if probe_side_non_empty { + None + } else { + Some(false) + } + } else if probe_side_has_null { + None + } else { + Some(false) + } + } + }) + .collect::(), + ) as ArrayRef +} + /// Returns a new [RecordBatch] for a probe batch when no probe row can find a /// match: the build-side map is empty, either because the build side has no /// rows or because none of its rows has a matchable (non-NULL) join key. diff --git a/datafusion/sqllogictest/test_files/null_aware_anti_join.slt b/datafusion/sqllogictest/test_files/null_aware_anti_join.slt index b18f3b3ae7a99..e73293f544375 100644 --- a/datafusion/sqllogictest/test_files/null_aware_anti_join.slt +++ b/datafusion/sqllogictest/test_files/null_aware_anti_join.slt @@ -147,16 +147,6 @@ WHERE id NOT IN (SELECT id FROM inner_table_no_null) ## Test 9: Multiple NOT IN conditions (OR) ############# -# KNOWN LIMITATION: Mark joins used for OR conditions don't support null-aware semantics. -# The NULL row is incorrectly returned here. According to SQL semantics: -# - NULL NOT IN (2, 4) = UNKNOWN -# - NULL NOT IN (1, 3) = UNKNOWN -# - UNKNOWN OR UNKNOWN = UNKNOWN (should be filtered out) -# But mark joins treat NULL keys as non-matching (FALSE), so: -# - NULL mark column = FALSE -# - NOT FALSE OR NOT FALSE = TRUE OR TRUE = TRUE (incorrectly included) -# TODO: Implement null-aware support for mark joins to fix this - query IT rowsort SELECT * FROM outer_table WHERE id NOT IN (SELECT id FROM inner_table_no_null) @@ -166,7 +156,6 @@ WHERE id NOT IN (SELECT id FROM inner_table_no_null) 2 b 3 c 4 d -NULL e ############# ## Test 10: NOT IN with WHERE clause in subquery diff --git a/datafusion/sqllogictest/test_files/null_aware_mark_join.slt b/datafusion/sqllogictest/test_files/null_aware_mark_join.slt new file mode 100644 index 0000000000000..72b2f0eecf14b --- /dev/null +++ b/datafusion/sqllogictest/test_files/null_aware_mark_join.slt @@ -0,0 +1,351 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +statement ok +CREATE TABLE outer_table(id INT, value TEXT) AS VALUES +(1, 'a'), +(2, 'b'), +(3, 'c'), +(4, 'd'), +(NULL, 'e'); + +statement ok +CREATE TABLE inner_table_no_null(id INT) AS VALUES +(2), +(4); + +statement ok +CREATE TABLE inner_table_with_null(id INT) AS VALUES +(2), +(NULL); + +statement ok +CREATE TABLE outer_corr_table(id INT, grp INT, value TEXT) AS VALUES +(1, 1, 'a'), +(3, 1, 'b'), +(1, 2, 'c'), +(NULL, 1, 'd'), +(5, 3, 'e'), +(2, 1, 'f'), +(NULL, 2, 'g'); + +statement ok +CREATE TABLE inner_corr_table(id INT, grp INT) AS VALUES +(2, 1), +(NULL, 1), +(1, 2); + +# Out-of-scope row-valued NOT IN repro kept for future work. +# DuckDB issue #19182 covers partially NULL tuple comparison semantics; this PR +# only targets scalar null-aware mark joins. +# statement ok +# CREATE TABLE outer_tuple_table(f1 INT, f2 INT) AS VALUES +# (0, 0), +# (1, 0); +# +# statement ok +# CREATE TABLE inner_tuple_table(c1 BIGINT, c2 BIGINT) AS VALUES +# (0, NULL); + +statement ok +CREATE TABLE empty_table(id INT) AS +SELECT * +FROM (VALUES (1)) AS seed(id) +WHERE id < 0; + +############################# +## Hash join null-aware mark +############################# + +statement ok +set datafusion.execution.target_partitions = 2; + +statement ok +set datafusion.optimizer.repartition_joins = true; + +statement ok +set datafusion.optimizer.prefer_hash_join = true; + +query TT +EXPLAIN +SELECT id, value +FROM outer_table +WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; +---- +logical_plan +01)Projection: outer_table.id, outer_table.value +02)--Filter: NOT __correlated_sq_1.mark IS NULL +03)----LeftMark Join: outer_table.id = __correlated_sq_1.id +04)------TableScan: outer_table projection=[id, value] +05)------SubqueryAlias: __correlated_sq_1 +06)--------TableScan: inner_table_with_null projection=[id] +physical_plan +01)FilterExec: NOT mark@2 IS NULL, projection=[id@0, value@1] +02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)] +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)------DataSourceExec: partitions=1, partition_sizes=[1] + +query T rowsort +SELECT value +FROM outer_table +WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; +---- +a +c +d +e + +query TT +EXPLAIN +SELECT value +FROM outer_table +WHERE (id NOT IN (SELECT id FROM inner_table_no_null)) IS NULL; +---- +logical_plan +01)Projection: outer_table.value +02)--Filter: NOT __correlated_sq_1.mark IS NULL +03)----Projection: outer_table.value, __correlated_sq_1.mark +04)------LeftMark Join: outer_table.id = __correlated_sq_1.id +05)--------TableScan: outer_table projection=[id, value] +06)--------SubqueryAlias: __correlated_sq_1 +07)----------TableScan: inner_table_no_null projection=[id] +physical_plan +01)FilterExec: NOT mark@1 IS NULL, projection=[value@0] +02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2] +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)------DataSourceExec: partitions=1, partition_sizes=[1] + +query T rowsort +SELECT value +FROM outer_table +WHERE (id NOT IN (SELECT id FROM inner_table_no_null)) IS NULL; +---- +e + +query T rowsort +SELECT value +FROM outer_table +WHERE (id NOT IN (SELECT id FROM empty_table)) IS TRUE; +---- +a +b +c +d +e + +# Regression test for correlated NOT IN inside a mark join. +query T rowsort +SELECT value +FROM outer_corr_table +WHERE (id NOT IN ( + SELECT id + FROM inner_corr_table + WHERE inner_corr_table.grp = outer_corr_table.grp +)) IS NULL; +---- +a +b +d +g + +query T rowsort +SELECT value +FROM outer_corr_table +WHERE (id NOT IN ( + SELECT id + FROM inner_corr_table + WHERE inner_corr_table.grp = outer_corr_table.grp +)) IS TRUE; +---- +e + +# Regression test for row-valued NOT IN with a partially NULL tuple. +# This matches DuckDB issue #19182 and the Postgres-style behavior from +# https://github.com/duckdb/duckdb/pull/22683. It is commented out because +# tuple-valued NOT IN needs row-comparison NULL semantics that are outside this +# scalar null-aware mark join PR. +# query II rowsort +# SELECT f1, f2 +# FROM outer_tuple_table +# WHERE (f1, f2) NOT IN ( +# SELECT c1, c2 +# FROM inner_tuple_table +# ); +# ---- +# 1 0 +# +# query II rowsort +# SELECT f1, f2 +# FROM outer_tuple_table +# WHERE ((f1, f2) NOT IN ( +# SELECT c1, c2 +# FROM inner_tuple_table +# )) IS NULL; +# ---- +# 0 0 + +# These queries would fall back to a sort-merge, but we force them +# into hash joins to make sure we null awareness. + +statement ok +set datafusion.execution.target_partitions = 2; + +statement ok +set datafusion.optimizer.repartition_joins = true; + +statement ok +set datafusion.optimizer.prefer_hash_join = false; + +query TT +EXPLAIN +SELECT id, value +FROM outer_table +WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; +---- +logical_plan +01)Projection: outer_table.id, outer_table.value +02)--Filter: NOT __correlated_sq_1.mark IS NULL +03)----LeftMark Join: outer_table.id = __correlated_sq_1.id +04)------TableScan: outer_table projection=[id, value] +05)------SubqueryAlias: __correlated_sq_1 +06)--------TableScan: inner_table_with_null projection=[id] +physical_plan +01)FilterExec: NOT mark@2 IS NULL, projection=[id@0, value@1] +02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)] +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)------DataSourceExec: partitions=1, partition_sizes=[1] + +query TT +EXPLAIN +SELECT value +FROM outer_table +WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; +---- +logical_plan +01)Projection: outer_table.value +02)--Filter: NOT __correlated_sq_1.mark IS NULL +03)----Projection: outer_table.value, __correlated_sq_1.mark +04)------LeftMark Join: outer_table.id = __correlated_sq_1.id +05)--------TableScan: outer_table projection=[id, value] +06)--------SubqueryAlias: __correlated_sq_1 +07)----------TableScan: inner_table_with_null projection=[id] +physical_plan +01)FilterExec: NOT mark@1 IS NULL, projection=[value@0] +02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2] +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)------DataSourceExec: partitions=1, partition_sizes=[1] + +query T rowsort +SELECT value +FROM outer_table +WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; +---- +a +c +d +e + +query T rowsort +SELECT value +FROM outer_table +WHERE (id NOT IN (SELECT id FROM inner_table_no_null)) IS NULL; +---- +e + +#################################### +## Nested loop mark join with NULLs +#################################### + +statement ok +set datafusion.execution.target_partitions = 1; + +statement ok +set datafusion.optimizer.prefer_hash_join = true; + +query TT +EXPLAIN +SELECT value +FROM outer_table +WHERE (EXISTS ( + SELECT 1 + FROM inner_table_no_null + WHERE outer_table.id < inner_table_no_null.id +)) IS TRUE; +---- +logical_plan +01)Projection: outer_table.value +02)--Filter: __correlated_sq_1.mark IS TRUE +03)----Projection: outer_table.value, __correlated_sq_1.mark +04)------LeftMark Join: Filter: outer_table.id < __correlated_sq_1.id +05)--------TableScan: outer_table projection=[id, value] +06)--------SubqueryAlias: __correlated_sq_1 +07)----------TableScan: inner_table_no_null projection=[id] +physical_plan +01)FilterExec: mark@1 IS NOT DISTINCT FROM true, projection=[value@0] +02)--NestedLoopJoinExec: join_type=RightMark, filter=id@0 < id@1, projection=[value@1, mark@2] +03)----DataSourceExec: partitions=1, partition_sizes=[1] +04)----DataSourceExec: partitions=1, partition_sizes=[1] + +query T rowsort +SELECT value +FROM outer_table +WHERE (EXISTS ( + SELECT 1 + FROM inner_table_no_null + WHERE outer_table.id < inner_table_no_null.id +)) IS TRUE; +---- +a +b +c + +statement ok +reset datafusion.optimizer.prefer_hash_join; + +statement ok +reset datafusion.optimizer.repartition_joins; + +statement ok +set datafusion.execution.target_partitions = 4; + +statement ok +DROP TABLE empty_table; + +statement ok +DROP TABLE inner_table_with_null; + +statement ok +DROP TABLE inner_table_no_null; + +statement ok +DROP TABLE inner_corr_table; + +statement ok +DROP TABLE outer_corr_table; + +# statement ok +# DROP TABLE inner_tuple_table; +# +# statement ok +# DROP TABLE outer_tuple_table; + +statement ok +DROP TABLE outer_table; From 391bc33be7edafacfb352b3d4504ea519a6a83e3 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 8 Jun 2026 11:39:04 +0100 Subject: [PATCH 2/9] fix typo Signed-off-by: Adam Gutglick --- datafusion/physical-plan/src/joins/hash_join/stream.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index 7dfafd0166121..c734560cd7197 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -339,7 +339,7 @@ pub(super) struct HashJoinStream { /// Output buffer for coalescing small batches into larger ones with optional fetch limit. /// Uses `LimitedBatchCoalescer` to efficiently combine batches and absorb limit with 'fetch' output_buffer: LimitedBatchCoalescer, - /// Whether this is a null-aware anti or mark joins + /// Whether this is a null-aware anti or mark join null_aware: bool, } From e50c3735ffe2055745c91a7c96bf3ebc11aa8df7 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 11 Jun 2026 16:32:07 +0100 Subject: [PATCH 3/9] More changes --- .../src/decorrelate_predicate_subquery.rs | 143 ++++++++--- .../physical-plan/src/joins/hash_join/exec.rs | 223 +++++++++++++++++- .../src/joins/hash_join/stream.rs | 132 ++++++++--- 3 files changed, 413 insertions(+), 85 deletions(-) diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 85b808dbcd3ac..5f091247a3e2f 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -37,8 +37,8 @@ use datafusion_expr::expr_rewriter::create_col_from_scalar_expr; use datafusion_expr::logical_plan::{JoinType, Subquery}; use datafusion_expr::utils::{conjunction, expr_to_columns, split_conjunction_owned}; use datafusion_expr::{ - BinaryExpr, Expr, Filter, LogicalPlan, LogicalPlanBuilder, Operator, exists, - in_subquery, lit, not, not_exists, not_in_subquery, + BinaryExpr, Expr, ExprSchemable, Filter, LogicalPlan, LogicalPlanBuilder, Operator, + exists, in_subquery, lit, not, not_exists, not_in_subquery, }; use log::debug; @@ -355,6 +355,31 @@ fn join_keys_may_be_null( Ok(false) } +/// Check whether the value keys of a scalar `IN`/`NOT IN` predicate may +/// produce NULLs. +/// +/// Unlike [`join_keys_may_be_null`], this only examines the in-predicate's two +/// sides: SQL UNKNOWN can only arise from a NULL in the compared values. A +/// NULL in a correlation key merely makes the correlated subquery empty for +/// that outer row, which `IN`/`NOT IN` resolve to FALSE/TRUE, never UNKNOWN. +fn in_predicate_values_may_be_null( + in_predicate: &Expr, + left_schema: &DFSchemaRef, + right_schema: &DFSchemaRef, +) -> Result { + let Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Eq, + right, + }) = in_predicate + else { + // Unexpected shape: conservatively assume the values may be NULL. + return Ok(true); + }; + + Ok(left.nullable(left_schema.as_ref())? || right.nullable(right_schema.as_ref())?) +} + fn build_join( left: &LogicalPlan, subquery: &LogicalPlan, @@ -386,35 +411,38 @@ fn build_join( replace_qualified_name(filter, &all_correlated_cols, &alias).map(Some) })?; - let join_filter = match (join_filter_opt, in_predicate_opt.cloned()) { - ( - Some(join_filter), - Some(Expr::BinaryExpr(BinaryExpr { - left, - op: Operator::Eq, - right, - })), - ) => { - let right_col = create_col_from_scalar_expr(right.deref(), alias)?; - let in_predicate = Expr::eq(left.deref().clone(), Expr::Column(right_col)); - in_predicate.and(join_filter) - } - (Some(join_filter), _) => join_filter, - ( - _, - Some(Expr::BinaryExpr(BinaryExpr { - left, - op: Operator::Eq, - right, - })), - ) => { - let right_col = create_col_from_scalar_expr(right.deref(), alias)?; - - Expr::eq(left.deref().clone(), Expr::Column(right_col)) - } - (None, None) => lit(true), - _ => return Ok(None), - }; + let (join_filter, aliased_in_predicate) = + match (join_filter_opt, in_predicate_opt.cloned()) { + ( + Some(join_filter), + Some(Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Eq, + right, + })), + ) => { + let right_col = create_col_from_scalar_expr(right.deref(), alias)?; + let in_predicate = + Expr::eq(left.deref().clone(), Expr::Column(right_col)); + (in_predicate.clone().and(join_filter), Some(in_predicate)) + } + (Some(join_filter), _) => (join_filter, None), + ( + _, + Some(Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Eq, + right, + })), + ) => { + let right_col = create_col_from_scalar_expr(right.deref(), alias)?; + let in_predicate = + Expr::eq(left.deref().clone(), Expr::Column(right_col)); + (in_predicate.clone(), Some(in_predicate)) + } + (None, None) => (lit(true), None), + _ => return Ok(None), + }; if matches!(join_type, JoinType::LeftMark | JoinType::RightMark) { let right_schema = sub_query_alias.schema(); @@ -465,14 +493,18 @@ fn build_join( // nullable mark column when the predicate can be implemented by hash keys. // Non-equality correlated filters stay on the legacy path because hash join // execution cannot mark UNKNOWN candidates for residual predicates. + // Only the in-predicate's value keys decide nullability: NULLs in + // correlation keys can never make `IN`/`NOT IN` evaluate to UNKNOWN. let null_aware = join_type == JoinType::LeftMark - && in_predicate_opt.is_some() && mark_filter_is_hashable_only - && join_keys_may_be_null( - &join_filter, - left.schema(), - right_projected.schema(), - )?; + && match &aliased_in_predicate { + Some(in_predicate) => in_predicate_values_may_be_null( + in_predicate, + left.schema(), + right_projected.schema(), + )?, + None => false, + }; let new_plan = LogicalPlanBuilder::from(left.clone()) .join_detailed_with_options( @@ -1326,6 +1358,43 @@ mod tests { Ok(()) } + #[test] + fn correlated_not_in_mark_join_not_null_aware_for_non_nullable_value_key() + -> Result<()> { + // The value key `id` is non-nullable on both sides; only the + // correlation key `grp` is nullable. A NULL correlation key just makes + // the correlated subquery empty, so NOT IN can never be UNKNOWN and + // the mark join does not need null-aware semantics. + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("grp", DataType::Int32, true), + ]); + let outer_scan = table_scan(Some("outer_t"), &schema, None)?.build()?; + let inner_scan = table_scan(Some("inner_t"), &schema, None)?.build()?; + + let subquery = Arc::new( + LogicalPlanBuilder::from(inner_scan) + .filter( + out_ref_col(DataType::Int32, "outer_t.grp").eq(col("inner_t.grp")), + )? + .project(vec![col("inner_t.id")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(outer_scan) + .filter(not_in_subquery(col("outer_t.id"), subquery).is_null())? + .build()?; + + let optimized = optimize_with_decorrelate(plan)?; + assert!( + has_non_null_aware_left_mark_join(&optimized), + "{}", + optimized.display_indent_schema() + ); + + Ok(()) + } + #[test] fn in_subquery_both_side_expr() -> Result<()> { let table_scan = test_table_scan()?; diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index ed276cd2dcfe6..43b33fb911ec2 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -65,7 +65,7 @@ use crate::{ metrics::{ExecutionPlanMetricsSet, MetricsSet}, }; -use arrow::array::{ArrayRef, BooleanBufferBuilder}; +use arrow::array::{ArrayRef, BooleanBufferBuilder, UInt64Array}; use arrow::compute::concat_batches; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; @@ -88,7 +88,7 @@ use datafusion_physical_expr::expressions::{Column, DynamicFilterPhysicalExpr, l use datafusion_physical_expr::projection::{ProjectionRef, combine_projections}; use datafusion_physical_expr::{PhysicalExpr, PhysicalExprRef}; -use datafusion_common::hash_utils::RandomState; +use datafusion_common::hash_utils::{RandomState, create_hashes}; use datafusion_physical_expr_common::physical_expr::fmt_sql; use datafusion_physical_expr_common::utils::evaluate_expressions_to_arrays; use futures::TryStreamExt; @@ -185,6 +185,26 @@ fn try_create_array_map( Ok(Some((array_map, batch, left_values))) } +/// Correlation-scope hash map over only the build rows whose scalar `NOT IN` +/// value key is NULL, used by correlated null-aware `LeftMark` joins. +/// +/// Such rows produce a NULL (UNKNOWN) mark whenever *any* probe row shares +/// their correlation scope, so every probe row must be tested against them. +/// Restricting this map to the NULL-valued build rows keeps that lookup +/// proportional to the number of NULLs instead of enumerating every scope +/// match of every probe row. +pub(super) struct NullValueScopeMap { + /// Hash table keyed by the correlation scope values of the NULL-valued + /// build rows. Stored positions index into `scope_values`/`build_indices`, + /// not the full build batch. + pub(super) map: Box, + /// Correlation scope key values of the NULL-valued build rows. + pub(super) scope_values: Vec, + /// Maps positions in `map`/`scope_values` back to row indices in the full + /// build batch. + pub(super) build_indices: UInt64Array, +} + /// HashTable and input data for the left (build side) of a join pub(super) struct JoinLeftData { /// The hash table with indices into `batch` @@ -193,8 +213,14 @@ pub(super) struct JoinLeftData { /// Hash table over correlated scope keys for scalar null-aware mark joins. /// /// For null-aware `LeftMark`, key 0 is the scalar `NOT IN` value key and - /// keys 1..N are correlated equality scope keys. - null_aware_mark_scope_map: Option>, + /// keys 1..N are correlated equality scope keys. This map covers all build + /// rows and is probed only with NULL-valued probe rows; the complementary + /// direction uses `null_value_scope_map`. + null_aware_mark_scope_map: Option>, + /// Scope map restricted to the build rows whose value key is NULL (see + /// [`NullValueScopeMap`]). `None` when the build side has no NULL value + /// keys. + null_value_scope_map: Option, /// The input rows for the build side batch: RecordBatch, /// The build side on expressions values @@ -231,10 +257,14 @@ impl JoinLeftData { &self.map } - pub(super) fn null_aware_mark_scope_map(&self) -> Option<&Map> { + pub(super) fn null_aware_mark_scope_map(&self) -> Option<&dyn JoinHashMapType> { self.null_aware_mark_scope_map.as_deref() } + pub(super) fn null_value_scope_map(&self) -> Option<&NullValueScopeMap> { + self.null_value_scope_map.as_ref() + } + /// returns a reference to the build side batch pub(super) fn batch(&self) -> &RecordBatch { &self.batch @@ -1997,7 +2027,7 @@ fn new_join_hashmap( /// * `probe_threads_count` - Number of threads that will probe this hash table /// * `should_compute_dynamic_filters` - Whether to compute min/max bounds for dynamic filtering /// * `with_null_aware_mark_state` - Whether to build the per-build-row null-indices bitmap -/// and correlation-scope map used by correlated null-aware `LeftMark` joins +/// and correlation-scope maps used by correlated null-aware `LeftMark` joins /// /// # Dynamic Filter Coordination /// When `should_compute_dynamic_filters` is true, this function computes the min/max bounds @@ -2161,7 +2191,8 @@ async fn collect_left_input( BooleanBufferBuilder::new(0) }; - let null_aware_mark_scope_map = if with_null_aware_mark_state { + let (null_aware_mark_scope_map, null_value_scope_map) = if with_null_aware_mark_state + { // Null-aware `LeftMark` convention: `on_left[0]` is the value key and // `on_left[1..]` the scope keys, so the scope map needs more than one key. debug_assert!( @@ -2172,13 +2203,15 @@ async fn collect_left_input( // This secondary map is keyed only by correlation scope keys. The // primary join map may still use ArrayMap for full-key TRUE matches, // but scope-only NULL marking uses a HashMap for arbitrary key shapes. - let mut hashmap = new_join_hashmap(num_rows, &mut reservation, &metrics)?; + // It is probed only with the NULL-valued probe rows, so the lookup work + // is proportional to the probe-side NULLs. + let mut scope_map = new_join_hashmap(num_rows, &mut reservation, &metrics)?; let mut hashes_buffer = vec![0; batch.num_rows()]; update_hash( &on_left[1..], &batch, - &mut *hashmap, + &mut *scope_map, 0, &random_state, &mut hashes_buffer, @@ -2186,9 +2219,39 @@ async fn collect_left_input( true, )?; - Some(Arc::new(Map::HashMap(hashmap))) + // Build rows whose value key is NULL get a dedicated scope map: any + // probe row sharing their scope makes their mark UNKNOWN, so they are + // probed with *every* probe row. Restricting the map to the NULL-valued + // rows keeps that lookup proportional to the build-side NULLs. + let value_key = &left_values[0]; + let null_value_scope_map = if value_key.null_count() > 0 { + let null_mask = arrow::compute::is_null(value_key.as_ref())?; + let build_indices = UInt64Array::from_iter_values( + null_mask.values().set_indices().map(|i| i as u64), + ); + let scope_values = left_values[1..] + .iter() + .map(|values| Ok(arrow::compute::filter(values.as_ref(), &null_mask)?)) + .collect::>>()?; + + let null_rows = build_indices.len(); + let mut map = new_join_hashmap(null_rows, &mut reservation, &metrics)?; + let mut hashes_buffer = vec![0; null_rows]; + create_hashes(&scope_values, &random_state, &mut hashes_buffer)?; + map.update_from_iter(Box::new(hashes_buffer.iter().enumerate().rev()), 0); + + Some(NullValueScopeMap { + map, + scope_values, + build_indices, + }) + } else { + None + }; + + (Some(scope_map), null_value_scope_map) } else { - None + (None, None) }; let map = Arc::new(join_hash_map); @@ -2226,6 +2289,7 @@ async fn collect_left_input( let data = JoinLeftData { map, null_aware_mark_scope_map, + null_value_scope_map, batch, values: left_values, visited_indices_bitmap: Mutex::new(visited_indices_bitmap), @@ -6953,6 +7017,143 @@ mod tests { Ok(()) } + /// Scalar correlated null-aware left mark join where only the build side's + /// value key contains NULL. + /// + /// This isolates the NULL-value scope map direction of + /// `mark_null_candidates_for_probe_batch`: NULL-valued build rows become + /// UNKNOWN as soon as any probe row shares their correlation scope, while + /// non-null build rows stay `true`/`false`. + #[apply(hash_join_exec_configs)] + #[tokio::test] + async fn test_null_aware_left_mark_correlated_build_null_only( + batch_size: usize, + ) -> Result<()> { + let task_ctx = prepare_task_ctx(batch_size, false); + + let left = build_table_two_cols( + ("id", &vec![Some(1), None, None, Some(3)]), + ("grp", &vec![Some(1), Some(1), Some(2), Some(3)]), + ); + + let right = build_table_two_cols( + ("id", &vec![Some(2), Some(1)]), + ("grp", &vec![Some(1), Some(2)]), + ); + + let on = vec![ + ( + Arc::new(Column::new_with_schema("id", &left.schema())?) as _, + Arc::new(Column::new_with_schema("id", &right.schema())?) as _, + ), + ( + Arc::new(Column::new_with_schema("grp", &left.schema())?) as _, + Arc::new(Column::new_with_schema("grp", &right.schema())?) as _, + ), + ]; + + let join = HashJoinExec::try_new( + left, + right, + on, + None, + &JoinType::LeftMark, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNothing, + true, + )?; + + let stream = join.execute(0, task_ctx)?; + let batches = common::collect(stream).await?; + + // `(NULL, 1)` and `(NULL, 2)` have non-empty correlated scopes, so + // their marks are UNKNOWN. `(1, 1)` finds no equal value and no NULL in + // its scope (false); `(3, 3)` has an empty correlated scope (false). + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r" + +----+-----+-------+ + | id | grp | mark | + +----+-----+-------+ + | | 1 | | + | | 2 | | + | 1 | 1 | false | + | 3 | 3 | false | + +----+-----+-------+ + "); + } + + Ok(()) + } + + /// Scalar correlated null-aware left mark join where only the probe side's + /// value key contains NULL. + /// + /// This isolates the NULL-valued-probe-row direction of + /// `mark_null_candidates_for_probe_batch`: every unmatched build row whose + /// correlation scope contains a NULL-valued probe row becomes UNKNOWN. + #[apply(hash_join_exec_configs)] + #[tokio::test] + async fn test_null_aware_left_mark_correlated_probe_null_only( + batch_size: usize, + ) -> Result<()> { + let task_ctx = prepare_task_ctx(batch_size, false); + + let left = build_table_two_cols( + ("id", &vec![Some(1), Some(2), Some(5), Some(7)]), + ("grp", &vec![Some(1), Some(1), Some(2), Some(3)]), + ); + + let right = build_table_two_cols( + ("id", &vec![Some(2), None, Some(4)]), + ("grp", &vec![Some(1), Some(1), Some(2)]), + ); + + let on = vec![ + ( + Arc::new(Column::new_with_schema("id", &left.schema())?) as _, + Arc::new(Column::new_with_schema("id", &right.schema())?) as _, + ), + ( + Arc::new(Column::new_with_schema("grp", &left.schema())?) as _, + Arc::new(Column::new_with_schema("grp", &right.schema())?) as _, + ), + ]; + + let join = HashJoinExec::try_new( + left, + right, + on, + None, + &JoinType::LeftMark, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNothing, + true, + )?; + + let stream = join.execute(0, task_ctx)?; + let batches = common::collect(stream).await?; + + // `(1, 1)` is unmatched and its scope contains a NULL value (UNKNOWN); + // `(2, 1)` matches (true); `(5, 2)`'s scope holds only `4` (false); + // `(7, 3)` has an empty correlated scope (false). + allow_duplicates! { + assert_snapshot!(batches_to_sort_string(&batches), @r" + +----+-----+-------+ + | id | grp | mark | + +----+-----+-------+ + | 1 | 1 | | + | 2 | 1 | true | + | 5 | 2 | false | + | 7 | 3 | false | + +----+-----+-------+ + "); + } + + Ok(()) + } + #[test] fn test_lr_is_preserved() { assert_eq!(lr_is_preserved(JoinType::Inner), (true, true)); diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index c734560cd7197..857ebd687f3db 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -47,7 +47,7 @@ use crate::{ }, }; -use arrow::array::{Array, ArrayRef, UInt32Array, UInt64Array}; +use arrow::array::{Array, ArrayRef, BooleanBufferBuilder, UInt32Array, UInt64Array}; use arrow::buffer::NullBuffer; use arrow::datatypes::{Schema, SchemaRef}; use arrow::record_batch::RecordBatch; @@ -1109,6 +1109,21 @@ impl HashJoinStream { } } +/// Records which build rows of a correlated null-aware `LeftMark` join are +/// UNKNOWN candidates for this probe batch. +/// +/// Key layout (consumed positionally): index 0 is the `NOT IN` value key; +/// indices 1..N are the correlation scope keys. A build row's mark must be +/// NULL (SQL UNKNOWN) instead of FALSE when it is unmatched and either: +/// 1. its value key is NULL and any probe row shares its correlation scope, or +/// 2. some probe row in its correlation scope has a NULL value key. +/// +/// Each case only does work proportional to the rows that actually involve a +/// NULL value key: case 1 probes the build-side NULL-value scope map (which +/// contains only NULL-valued build rows) with all probe rows, and case 2 +/// probes the full scope map with only the NULL-valued probe rows. Enumerating +/// every scope match of every probe row instead would degrade to +/// O(build * probe) for low-cardinality correlation keys. fn mark_null_candidates_for_probe_batch( build_side: &BuildSideReadyState, state: &ProcessProbeBatchState, @@ -1118,20 +1133,10 @@ fn mark_null_candidates_for_probe_batch( probe_indices_buffer: &mut Vec, build_indices_buffer: &mut Vec, ) -> Result<()> { - let Some(null_aware_mark_scope_map) = - build_side.left_data.null_aware_mark_scope_map() - else { + let Some(scope_map) = build_side.left_data.null_aware_mark_scope_map() else { return Ok(()); }; - let Map::HashMap(scope_hashmap) = null_aware_mark_scope_map else { - // `collect_left_input` constructs the correlation-scope map as a - // HashMap even when the primary full-key join map uses ArrayMap. - return internal_err!("null-aware mark scope map must be a hash map"); - }; - - // Key layout (consumed positionally below): index 0 is the `NOT IN` value - // key; indices 1..N are the correlation scope keys. debug_assert!( build_side.left_data.values().len() > 1, "build keys must be [value, scope..]" @@ -1146,24 +1151,90 @@ fn mark_null_candidates_for_probe_batch( "build/probe key counts must match" ); - let build_value_key = &build_side.left_data.values()[0]; let probe_value_key = &state.values[0]; let build_scope_values = &build_side.left_data.values()[1..]; let probe_scope_values = &state.values[1..]; - if build_value_key.null_count() == 0 && probe_value_key.null_count() == 0 { + let null_value_scope_map = build_side.left_data.null_value_scope_map(); + let probe_has_null_values = probe_value_key.null_count() > 0; + if null_value_scope_map.is_none() && !probe_has_null_values { return Ok(()); } - hashes_buffer.clear(); - hashes_buffer.resize(state.batch.num_rows(), 0); - create_hashes(probe_scope_values, random_state, hashes_buffer)?; - let mut null_bitmap = build_side.left_data.null_indices_bitmap().lock(); + + // Case 1: build rows with a NULL value key are UNKNOWN as soon as any + // probe row shares their correlation scope. + if let Some(null_value_scope_map) = null_value_scope_map { + hashes_buffer.clear(); + hashes_buffer.resize(state.batch.num_rows(), 0); + create_hashes(probe_scope_values, random_state, hashes_buffer)?; + + let build_indices = &null_value_scope_map.build_indices; + scan_scope_matches_into_bitmap( + null_value_scope_map.map.as_ref(), + &null_value_scope_map.scope_values, + probe_scope_values, + hashes_buffer, + batch_size, + probe_indices_buffer, + build_indices_buffer, + // The map indexes only the NULL-valued build rows; translate its + // positions back to row indices in the full build batch. + |position| build_indices.value(position as usize), + &mut null_bitmap, + )?; + } + + // Case 2: NULL-valued probe rows make every build row in their correlation + // scope an UNKNOWN candidate. + if probe_has_null_values { + let null_mask = arrow::compute::is_null(probe_value_key.as_ref())?; + let probe_null_scope_values = probe_scope_values + .iter() + .map(|values| Ok(arrow::compute::filter(values.as_ref(), &null_mask)?)) + .collect::>>()?; + + hashes_buffer.clear(); + hashes_buffer.resize(null_mask.true_count(), 0); + create_hashes(&probe_null_scope_values, random_state, hashes_buffer)?; + + scan_scope_matches_into_bitmap( + scope_map, + build_scope_values, + &probe_null_scope_values, + hashes_buffer, + batch_size, + probe_indices_buffer, + build_indices_buffer, + |position| position, + &mut null_bitmap, + )?; + } + + Ok(()) +} + +/// Scans all correlation-scope matches between `build_scope_values` and +/// `probe_scope_values` and sets the bit of every matched build row in +/// `null_bitmap`, translating matched map positions through +/// `map_position_to_build_row`. +#[expect(clippy::too_many_arguments)] +fn scan_scope_matches_into_bitmap( + scope_map: &dyn JoinHashMapType, + build_scope_values: &[ArrayRef], + probe_scope_values: &[ArrayRef], + hashes_buffer: &[u64], + batch_size: usize, + probe_indices_buffer: &mut Vec, + build_indices_buffer: &mut Vec, + map_position_to_build_row: impl Fn(u64) -> u64, + null_bitmap: &mut BooleanBufferBuilder, +) -> Result<()> { let mut offset = (0, None); loop { - let (build_indices, probe_indices, next_offset) = lookup_join_hashmap( - scope_hashmap.as_ref(), + let (build_indices, _probe_indices, next_offset) = lookup_join_hashmap( + scope_map, build_scope_values, probe_scope_values, NullEquality::NullEqualsNothing, @@ -1174,23 +1245,10 @@ fn mark_null_candidates_for_probe_batch( build_indices_buffer, )?; - if !build_indices.is_empty() { - build_indices.iter().zip(probe_indices.iter()).for_each( - |(build_idx, probe_idx)| { - let build_idx = build_idx - .expect("scope lookup should produce non-null build indices") - as usize; - let probe_idx = probe_idx - .expect("scope lookup should produce non-null probe indices") - as usize; - - if probe_value_key.is_null(probe_idx) - || build_value_key.is_null(build_idx) - { - null_bitmap.set_bit(build_idx, true); - } - }, - ); + for build_idx in build_indices.iter() { + let build_idx = + build_idx.expect("scope lookup should produce non-null build indices"); + null_bitmap.set_bit(map_position_to_build_row(build_idx) as usize, true); } let Some(next_offset) = next_offset else { From ba1c72a49340ac44125776e9239e33ec748c468a Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 11 Jun 2026 19:58:56 +0100 Subject: [PATCH 4/9] simplify Signed-off-by: Adam Gutglick --- .../src/decorrelate_predicate_subquery.rs | 121 ++++----- .../physical-plan/src/joins/hash_join/exec.rs | 257 ++++++------------ .../src/joins/hash_join/stream.rs | 12 +- .../test_files/null_aware_mark_join.slt | 8 +- 4 files changed, 148 insertions(+), 250 deletions(-) diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 5f091247a3e2f..15914bcc7a955 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -411,38 +411,29 @@ fn build_join( replace_qualified_name(filter, &all_correlated_cols, &alias).map(Some) })?; - let (join_filter, aliased_in_predicate) = - match (join_filter_opt, in_predicate_opt.cloned()) { - ( - Some(join_filter), - Some(Expr::BinaryExpr(BinaryExpr { - left, - op: Operator::Eq, - right, - })), - ) => { - let right_col = create_col_from_scalar_expr(right.deref(), alias)?; - let in_predicate = - Expr::eq(left.deref().clone(), Expr::Column(right_col)); - (in_predicate.clone().and(join_filter), Some(in_predicate)) - } - (Some(join_filter), _) => (join_filter, None), - ( - _, - Some(Expr::BinaryExpr(BinaryExpr { - left, - op: Operator::Eq, - right, - })), - ) => { - let right_col = create_col_from_scalar_expr(right.deref(), alias)?; - let in_predicate = - Expr::eq(left.deref().clone(), Expr::Column(right_col)); - (in_predicate.clone(), Some(in_predicate)) - } - (None, None) => (lit(true), None), - _ => return Ok(None), - }; + // Rewrite the scalar in-predicate's right side to reference the aliased + // subquery output (e.g. `outer.id = __correlated_sq_1.id`). `None` when + // there is no in-predicate or it is not a plain equality. + let aliased_in_predicate = match in_predicate_opt { + Some(Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::Eq, + right, + })) => { + let right_col = create_col_from_scalar_expr(right.deref(), alias)?; + Some(Expr::eq(left.deref().clone(), Expr::Column(right_col))) + } + _ => None, + }; + + let join_filter = match (join_filter_opt, &aliased_in_predicate) { + (Some(join_filter), Some(in_predicate)) => in_predicate.clone().and(join_filter), + (Some(join_filter), None) => join_filter, + (None, Some(in_predicate)) => in_predicate.clone(), + // A non-equality in-predicate cannot become a join filter on its own. + (None, None) if in_predicate_opt.is_some() => return Ok(None), + (None, None) => lit(true), + }; if matches!(join_type, JoinType::LeftMark | JoinType::RightMark) { let right_schema = sub_query_alias.schema(); @@ -477,34 +468,31 @@ fn build_join( sub_query_alias.clone() }; - let mark_filter_is_hashable_only = - if join_type == JoinType::LeftMark && in_predicate_opt.is_some() { - let (_, residual_filter) = split_eq_and_noneq_join_predicate( - join_filter.clone(), - left.schema(), - right_projected.schema(), - )?; - residual_filter.is_none() - } else { - false - }; - // For scalar NOT IN mark joins, propagate null-aware semantics into the // nullable mark column when the predicate can be implemented by hash keys. // Non-equality correlated filters stay on the legacy path because hash join // execution cannot mark UNKNOWN candidates for residual predicates. // Only the in-predicate's value keys decide nullability: NULLs in // correlation keys can never make `IN`/`NOT IN` evaluate to UNKNOWN. - let null_aware = join_type == JoinType::LeftMark - && mark_filter_is_hashable_only - && match &aliased_in_predicate { - Some(in_predicate) => in_predicate_values_may_be_null( - in_predicate, - left.schema(), - right_projected.schema(), - )?, - None => false, - }; + let null_aware = if join_type == JoinType::LeftMark && in_predicate_opt.is_some() + { + let (_, residual_filter) = split_eq_and_noneq_join_predicate( + join_filter.clone(), + left.schema(), + right_projected.schema(), + )?; + residual_filter.is_none() + && match &aliased_in_predicate { + Some(in_predicate) => in_predicate_values_may_be_null( + in_predicate, + left.schema(), + right_projected.schema(), + )?, + None => false, + } + } else { + false + }; let new_plan = LogicalPlanBuilder::from(left.clone()) .join_detailed_with_options( @@ -643,26 +631,19 @@ mod tests { table_scan(Some(name), &schema, None)?.build() } - fn has_null_aware_left_mark_join(plan: &LogicalPlan) -> bool { - if let LogicalPlan::Join(join) = plan - && join.join_type == JoinType::LeftMark - { - return join.null_aware; - } - - plan.inputs().into_iter().any(has_null_aware_left_mark_join) - } - - fn has_non_null_aware_left_mark_join(plan: &LogicalPlan) -> bool { + fn has_left_mark_join_with_null_aware( + plan: &LogicalPlan, + expected_null_aware: bool, + ) -> bool { if let LogicalPlan::Join(join) = plan && join.join_type == JoinType::LeftMark { - return !join.null_aware; + return join.null_aware == expected_null_aware; } plan.inputs() .into_iter() - .any(has_non_null_aware_left_mark_join) + .any(|input| has_left_mark_join_with_null_aware(input, expected_null_aware)) } fn optimize_with_decorrelate(plan: LogicalPlan) -> Result { @@ -1322,7 +1303,7 @@ mod tests { let optimized = optimize_with_decorrelate(plan)?; assert!( - has_null_aware_left_mark_join(&optimized), + has_left_mark_join_with_null_aware(&optimized, true), "{}", optimized.display_indent_schema() ); @@ -1350,7 +1331,7 @@ mod tests { let optimized = optimize_with_decorrelate(plan)?; assert!( - has_non_null_aware_left_mark_join(&optimized), + has_left_mark_join_with_null_aware(&optimized, false), "{}", optimized.display_indent_schema() ); @@ -1387,7 +1368,7 @@ mod tests { let optimized = optimize_with_decorrelate(plan)?; assert!( - has_non_null_aware_left_mark_join(&optimized), + has_left_mark_join_with_null_aware(&optimized, false), "{}", optimized.display_indent_schema() ); diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index 43b33fb911ec2..6c4734d3d8164 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -205,22 +205,29 @@ pub(super) struct NullValueScopeMap { pub(super) build_indices: UInt64Array, } +/// Extra build-side state for correlated null-aware `LeftMark` joins. +/// +/// Key layout: `on[0]` is the scalar `NOT IN` value key, `on[1..]` the +/// correlation scope keys; see the `null_aware` field of `HashJoinExec`. +pub(super) struct NullAwareMarkState { + /// Hash table keyed only by the correlation scope keys, covering all build + /// rows. Probed only with NULL-valued probe rows; the complementary + /// direction uses `null_value_scope_map`. + pub(super) scope_map: Box, + /// Scope map restricted to the build rows whose value key is NULL (see + /// [`NullValueScopeMap`]). `None` when the build side has no NULL value + /// keys. + pub(super) null_value_scope_map: Option, +} + /// HashTable and input data for the left (build side) of a join pub(super) struct JoinLeftData { /// The hash table with indices into `batch` /// Arc is used to allow sharing with SharedBuildAccumulator for hash map pushdown pub(super) map: Arc, - /// Hash table over correlated scope keys for scalar null-aware mark joins. - /// - /// For null-aware `LeftMark`, key 0 is the scalar `NOT IN` value key and - /// keys 1..N are correlated equality scope keys. This map covers all build - /// rows and is probed only with NULL-valued probe rows; the complementary - /// direction uses `null_value_scope_map`. - null_aware_mark_scope_map: Option>, - /// Scope map restricted to the build rows whose value key is NULL (see - /// [`NullValueScopeMap`]). `None` when the build side has no NULL value - /// keys. - null_value_scope_map: Option, + /// Extra build-side state for correlated null-aware `LeftMark` joins (see + /// [`NullAwareMarkState`]); `None` for all other joins. + null_aware_state: Option, /// The input rows for the build side batch: RecordBatch, /// The build side on expressions values @@ -257,12 +264,8 @@ impl JoinLeftData { &self.map } - pub(super) fn null_aware_mark_scope_map(&self) -> Option<&dyn JoinHashMapType> { - self.null_aware_mark_scope_map.as_deref() - } - - pub(super) fn null_value_scope_map(&self) -> Option<&NullValueScopeMap> { - self.null_value_scope_map.as_ref() + pub(super) fn null_aware_state(&self) -> Option<&NullAwareMarkState> { + self.null_aware_state.as_ref() } /// returns a reference to the build side batch @@ -2191,10 +2194,9 @@ async fn collect_left_input( BooleanBufferBuilder::new(0) }; - let (null_aware_mark_scope_map, null_value_scope_map) = if with_null_aware_mark_state - { - // Null-aware `LeftMark` convention: `on_left[0]` is the value key and - // `on_left[1..]` the scope keys, so the scope map needs more than one key. + let null_aware_state = if with_null_aware_mark_state { + // Per the key layout on `HashJoinExec::null_aware`, the scope map + // needs more than one key. debug_assert!( on_left.len() > 1, "null-aware LeftMark needs on_left[0]=value, on_left[1..]=scope, got {} key(s)", @@ -2249,9 +2251,12 @@ async fn collect_left_input( None }; - (Some(scope_map), null_value_scope_map) + Some(NullAwareMarkState { + scope_map, + null_value_scope_map, + }) } else { - (None, None) + None }; let map = Arc::new(join_hash_map); @@ -2288,8 +2293,7 @@ async fn collect_left_input( let data = JoinLeftData { map, - null_aware_mark_scope_map, - null_value_scope_map, + null_aware_state, batch, values: left_values, visited_indices_bitmap: Mutex::new(visited_indices_bitmap), @@ -6775,6 +6779,41 @@ mod tests { ); } + /// Builds a null-aware `LeftMark` `HashJoinExec` on `on_names` key pairs + /// (`on_names[0]` is the `NOT IN` value key, the rest are correlation + /// scope keys), executes partition 0, and collects the result batches. + async fn null_aware_mark_join_collect( + batch_size: usize, + left: Arc, + right: Arc, + on_names: &[(&str, &str)], + ) -> Result> { + let task_ctx = prepare_task_ctx(batch_size, false); + + let mut on: JoinOn = vec![]; + for &(left_name, right_name) in on_names { + on.push(( + Arc::new(Column::new_with_schema(left_name, &left.schema())?) as _, + Arc::new(Column::new_with_schema(right_name, &right.schema())?) as _, + )); + } + + let join = HashJoinExec::try_new( + left, + right, + on, + None, + &JoinType::LeftMark, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNothing, + true, // null_aware + )?; + + let stream = join.execute(0, task_ctx)?; + common::collect(stream).await + } + /// Test null-aware left mark join when probe side contains NULL. /// Expected: /// - matched rows => true @@ -6783,8 +6822,6 @@ mod tests { #[apply(hash_join_exec_configs)] #[tokio::test] async fn test_null_aware_left_mark_probe_null(batch_size: usize) -> Result<()> { - let task_ctx = prepare_task_ctx(batch_size, false); - let left = build_table_two_cols( ("c1", &vec![Some(1), Some(4), None]), ("dummy", &vec![Some(10), Some(40), Some(0)]), @@ -6795,25 +6832,9 @@ mod tests { ("dummy", &vec![Some(100), Some(200), Some(300)]), ); - let on = vec![( - Arc::new(Column::new_with_schema("c1", &left.schema())?) as _, - Arc::new(Column::new_with_schema("c2", &right.schema())?) as _, - )]; - - let join = HashJoinExec::try_new( - left, - right, - on, - None, - &JoinType::LeftMark, - None, - PartitionMode::CollectLeft, - NullEquality::NullEqualsNothing, - true, // null_aware = true - )?; - - let stream = join.execute(0, task_ctx)?; - let batches = common::collect(stream).await?; + let batches = + null_aware_mark_join_collect(batch_size, left, right, &[("c1", "c2")]) + .await?; allow_duplicates! { assert_snapshot!(batches_to_sort_string(&batches), @r" @@ -6835,8 +6856,6 @@ mod tests { #[apply(hash_join_exec_configs)] #[tokio::test] async fn test_null_aware_left_mark_empty_probe(batch_size: usize) -> Result<()> { - let task_ctx = prepare_task_ctx(batch_size, false); - let left = build_table_two_cols( ("c1", &vec![Some(1), None]), ("dummy", &vec![Some(10), Some(0)]), @@ -6847,25 +6866,9 @@ mod tests { ("dummy", &Vec::>::new()), ); - let on = vec![( - Arc::new(Column::new_with_schema("c1", &left.schema())?) as _, - Arc::new(Column::new_with_schema("c2", &right.schema())?) as _, - )]; - - let join = HashJoinExec::try_new( - left, - right, - on, - None, - &JoinType::LeftMark, - None, - PartitionMode::CollectLeft, - NullEquality::NullEqualsNothing, - true, // null_aware = true - )?; - - let stream = join.execute(0, task_ctx)?; - let batches = common::collect(stream).await?; + let batches = + null_aware_mark_join_collect(batch_size, left, right, &[("c1", "c2")]) + .await?; allow_duplicates! { assert_snapshot!(batches_to_sort_string(&batches), @r" @@ -6888,8 +6891,6 @@ mod tests { #[apply(hash_join_exec_configs)] #[tokio::test] async fn test_null_aware_left_mark_correlated_scope(batch_size: usize) -> Result<()> { - let task_ctx = prepare_task_ctx(batch_size, false); - let left = build_table_two_cols( ("id", &vec![Some(1), Some(2), Some(3), None, None, Some(5)]), ( @@ -6903,31 +6904,13 @@ mod tests { ("grp", &vec![Some(1), Some(1), Some(2)]), ); - let on = vec![ - ( - Arc::new(Column::new_with_schema("id", &left.schema())?) as _, - Arc::new(Column::new_with_schema("id", &right.schema())?) as _, - ), - ( - Arc::new(Column::new_with_schema("grp", &left.schema())?) as _, - Arc::new(Column::new_with_schema("grp", &right.schema())?) as _, - ), - ]; - - let join = HashJoinExec::try_new( + let batches = null_aware_mark_join_collect( + batch_size, left, right, - on, - None, - &JoinType::LeftMark, - None, - PartitionMode::CollectLeft, - NullEquality::NullEqualsNothing, - true, - )?; - - let stream = join.execute(0, task_ctx)?; - let batches = common::collect(stream).await?; + &[("id", "id"), ("grp", "grp")], + ) + .await?; allow_duplicates! { assert_snapshot!(batches_to_sort_string(&batches), @r" @@ -6960,8 +6943,6 @@ mod tests { async fn test_null_aware_left_mark_correlated_no_value_nulls( batch_size: usize, ) -> Result<()> { - let task_ctx = prepare_task_ctx(batch_size, false); - let left = build_table_two_cols( ("id", &vec![Some(1), Some(2), Some(3), Some(4)]), ("grp", &vec![Some(1), Some(1), Some(2), Some(3)]), @@ -6972,31 +6953,13 @@ mod tests { ("grp", &vec![Some(1), Some(2)]), ); - let on = vec![ - ( - Arc::new(Column::new_with_schema("id", &left.schema())?) as _, - Arc::new(Column::new_with_schema("id", &right.schema())?) as _, - ), - ( - Arc::new(Column::new_with_schema("grp", &left.schema())?) as _, - Arc::new(Column::new_with_schema("grp", &right.schema())?) as _, - ), - ]; - - let join = HashJoinExec::try_new( + let batches = null_aware_mark_join_collect( + batch_size, left, right, - on, - None, - &JoinType::LeftMark, - None, - PartitionMode::CollectLeft, - NullEquality::NullEqualsNothing, - true, - )?; - - let stream = join.execute(0, task_ctx)?; - let batches = common::collect(stream).await?; + &[("id", "id"), ("grp", "grp")], + ) + .await?; // Every mark is true/false; no UNKNOWN is produced when value keys are // non-null. `id=2` matches within `grp=1` (true); `id=4`'s `grp=3` has @@ -7029,8 +6992,6 @@ mod tests { async fn test_null_aware_left_mark_correlated_build_null_only( batch_size: usize, ) -> Result<()> { - let task_ctx = prepare_task_ctx(batch_size, false); - let left = build_table_two_cols( ("id", &vec![Some(1), None, None, Some(3)]), ("grp", &vec![Some(1), Some(1), Some(2), Some(3)]), @@ -7041,31 +7002,13 @@ mod tests { ("grp", &vec![Some(1), Some(2)]), ); - let on = vec![ - ( - Arc::new(Column::new_with_schema("id", &left.schema())?) as _, - Arc::new(Column::new_with_schema("id", &right.schema())?) as _, - ), - ( - Arc::new(Column::new_with_schema("grp", &left.schema())?) as _, - Arc::new(Column::new_with_schema("grp", &right.schema())?) as _, - ), - ]; - - let join = HashJoinExec::try_new( + let batches = null_aware_mark_join_collect( + batch_size, left, right, - on, - None, - &JoinType::LeftMark, - None, - PartitionMode::CollectLeft, - NullEquality::NullEqualsNothing, - true, - )?; - - let stream = join.execute(0, task_ctx)?; - let batches = common::collect(stream).await?; + &[("id", "id"), ("grp", "grp")], + ) + .await?; // `(NULL, 1)` and `(NULL, 2)` have non-empty correlated scopes, so // their marks are UNKNOWN. `(1, 1)` finds no equal value and no NULL in @@ -7097,8 +7040,6 @@ mod tests { async fn test_null_aware_left_mark_correlated_probe_null_only( batch_size: usize, ) -> Result<()> { - let task_ctx = prepare_task_ctx(batch_size, false); - let left = build_table_two_cols( ("id", &vec![Some(1), Some(2), Some(5), Some(7)]), ("grp", &vec![Some(1), Some(1), Some(2), Some(3)]), @@ -7109,31 +7050,13 @@ mod tests { ("grp", &vec![Some(1), Some(1), Some(2)]), ); - let on = vec![ - ( - Arc::new(Column::new_with_schema("id", &left.schema())?) as _, - Arc::new(Column::new_with_schema("id", &right.schema())?) as _, - ), - ( - Arc::new(Column::new_with_schema("grp", &left.schema())?) as _, - Arc::new(Column::new_with_schema("grp", &right.schema())?) as _, - ), - ]; - - let join = HashJoinExec::try_new( + let batches = null_aware_mark_join_collect( + batch_size, left, right, - on, - None, - &JoinType::LeftMark, - None, - PartitionMode::CollectLeft, - NullEquality::NullEqualsNothing, - true, - )?; - - let stream = join.execute(0, task_ctx)?; - let batches = common::collect(stream).await?; + &[("id", "id"), ("grp", "grp")], + ) + .await?; // `(1, 1)` is unmatched and its scope contains a NULL value (UNKNOWN); // `(2, 1)` matches (true); `(5, 2)`'s scope holds only `4` (false); diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index 857ebd687f3db..dfe395c8384de 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -1064,7 +1064,7 @@ impl HashJoinStream { .load(Ordering::Relaxed); let build_key_column = &build_side.left_data.values()[0]; let null_indices_bitmap = - if build_side.left_data.null_aware_mark_scope_map().is_some() { + if build_side.left_data.null_aware_state().is_some() { Some(build_side.left_data.null_indices_bitmap().lock()) } else { None @@ -1112,8 +1112,8 @@ impl HashJoinStream { /// Records which build rows of a correlated null-aware `LeftMark` join are /// UNKNOWN candidates for this probe batch. /// -/// Key layout (consumed positionally): index 0 is the `NOT IN` value key; -/// indices 1..N are the correlation scope keys. A build row's mark must be +/// Key layout: `on[0]` is the `NOT IN` value key, `on[1..]` the correlation +/// scope keys (see `HashJoinExec::null_aware`). A build row's mark must be /// NULL (SQL UNKNOWN) instead of FALSE when it is unmatched and either: /// 1. its value key is NULL and any probe row shares its correlation scope, or /// 2. some probe row in its correlation scope has a NULL value key. @@ -1133,7 +1133,7 @@ fn mark_null_candidates_for_probe_batch( probe_indices_buffer: &mut Vec, build_indices_buffer: &mut Vec, ) -> Result<()> { - let Some(scope_map) = build_side.left_data.null_aware_mark_scope_map() else { + let Some(null_aware_state) = build_side.left_data.null_aware_state() else { return Ok(()); }; @@ -1155,7 +1155,7 @@ fn mark_null_candidates_for_probe_batch( let build_scope_values = &build_side.left_data.values()[1..]; let probe_scope_values = &state.values[1..]; - let null_value_scope_map = build_side.left_data.null_value_scope_map(); + let null_value_scope_map = null_aware_state.null_value_scope_map.as_ref(); let probe_has_null_values = probe_value_key.null_count() > 0; if null_value_scope_map.is_none() && !probe_has_null_values { return Ok(()); @@ -1200,7 +1200,7 @@ fn mark_null_candidates_for_probe_batch( create_hashes(&probe_null_scope_values, random_state, hashes_buffer)?; scan_scope_matches_into_bitmap( - scope_map, + null_aware_state.scope_map.as_ref(), build_scope_values, &probe_null_scope_values, hashes_buffer, diff --git a/datafusion/sqllogictest/test_files/null_aware_mark_join.slt b/datafusion/sqllogictest/test_files/null_aware_mark_join.slt index 72b2f0eecf14b..c23c221ccc83b 100644 --- a/datafusion/sqllogictest/test_files/null_aware_mark_join.slt +++ b/datafusion/sqllogictest/test_files/null_aware_mark_join.slt @@ -201,13 +201,7 @@ e # 0 0 # These queries would fall back to a sort-merge, but we force them -# into hash joins to make sure we null awareness. - -statement ok -set datafusion.execution.target_partitions = 2; - -statement ok -set datafusion.optimizer.repartition_joins = true; +# into hash joins to make sure we preserve null awareness. statement ok set datafusion.optimizer.prefer_hash_join = false; From 451e8987e55f4018b7d3f47d681b981b5dc5548b Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 11 Jun 2026 20:33:40 +0100 Subject: [PATCH 5/9] comments Signed-off-by: Adam Gutglick --- datafusion/core/src/physical_planner.rs | 3 +++ .../optimizer/src/decorrelate_predicate_subquery.rs | 4 ++++ datafusion/physical-plan/src/joins/hash_join/exec.rs | 6 ++++++ datafusion/physical-plan/src/joins/hash_join/stream.rs | 8 ++++++++ 4 files changed, 21 insertions(+) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 174748026ec56..992fad3cee1dd 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1626,6 +1626,9 @@ impl DefaultPhysicalPlanner { let prefer_hash_join = session_state.config_options().optimizer.prefer_hash_join; + // Null-aware joins are pinned to CollectLeft hash joins (see + // `HashJoinExec::null_aware`): never repartition them, and + // never route them to the sort-merge path below. let can_repartition_join = session_state.config().target_partitions() > 1 && session_state.config().repartition_joins() && !*null_aware; diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 15914bcc7a955..522cbb9d95569 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -427,6 +427,10 @@ fn build_join( }; let join_filter = match (join_filter_opt, &aliased_in_predicate) { + // Keep the in-predicate as the first conjunct: equijoin extraction + // converts conjuncts into join keys in order, and null-aware mark + // execution requires `on[0]` to be the scalar NOT IN value key (see + // `HashJoinExec::null_aware`). (Some(join_filter), Some(in_predicate)) => in_predicate.clone().and(join_filter), (Some(join_filter), None) => join_filter, (None, Some(in_predicate)) => in_predicate.clone(), diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index 6c4734d3d8164..4e21f4706445f 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -1461,6 +1461,9 @@ impl ExecutionPlan for HashJoinExec { Arc::clone(context.session_config().options()), self.null_equality, array_map_created_count, + // with_null_aware_mark_state: the extra scope maps + null + // bitmap are only built for correlated null-aware LeftMark + // (`on[1..]` are correlation scope keys). self.null_aware && self.join_type == JoinType::LeftMark && on_left.len() > 1, @@ -1484,6 +1487,9 @@ impl ExecutionPlan for HashJoinExec { Arc::clone(context.session_config().options()), self.null_equality, array_map_created_count, + // with_null_aware_mark_state: the extra scope maps + null + // bitmap are only built for correlated null-aware LeftMark + // (`on[1..]` are correlation scope keys). self.null_aware && self.join_type == JoinType::LeftMark && on_left.len() > 1, diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index dfe395c8384de..9176c246b83ff 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -326,8 +326,13 @@ pub(super) struct HashJoinStream { /// Scratch space for build indices during hash lookup build_indices_buffer: Vec, + /// Scratch space for scope-key hashes in the null-aware mark pass, reused + /// across probe batches. Separate from `hashes_buffer`, which still holds + /// the probe batch's full-key hashes during the chunked lookup loop. null_mark_hashes_buffer: Vec, + /// Scratch space for probe indices during null-aware scope lookups null_mark_probe_indices_buffer: Vec, + /// Scratch space for build indices during null-aware scope lookups null_mark_build_indices_buffer: Vec, /// Specifies whether the right side has an ordering to potentially preserve @@ -805,6 +810,9 @@ impl HashJoinStream { return Ok(StatefulStreamResult::Continue); } + // For correlated null-aware LeftMark (`on[1..]` scope keys, hence + // values len > 1), record this batch's UNKNOWN candidates once, before + // the first chunked lookup (offset == (0, None)). if self.null_aware && self.join_type == JoinType::LeftMark && state.values.len() > 1 From 91405da0042457be274e4e669383d2813076c655 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 11 Jun 2026 20:39:30 +0100 Subject: [PATCH 6/9] More tests Signed-off-by: Adam Gutglick --- .../test_files/null_aware_mark_join.slt | 276 +++++++++++++++++- 1 file changed, 266 insertions(+), 10 deletions(-) diff --git a/datafusion/sqllogictest/test_files/null_aware_mark_join.slt b/datafusion/sqllogictest/test_files/null_aware_mark_join.slt index c23c221ccc83b..270e7a354ddeb 100644 --- a/datafusion/sqllogictest/test_files/null_aware_mark_join.slt +++ b/datafusion/sqllogictest/test_files/null_aware_mark_join.slt @@ -81,8 +81,7 @@ statement ok set datafusion.optimizer.prefer_hash_join = true; query TT -EXPLAIN -SELECT id, value +EXPLAIN SELECT id, value FROM outer_table WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; ---- @@ -111,8 +110,7 @@ d e query TT -EXPLAIN -SELECT value +EXPLAIN SELECT value FROM outer_table WHERE (id NOT IN (SELECT id FROM inner_table_no_null)) IS NULL; ---- @@ -175,6 +173,267 @@ WHERE (id NOT IN ( ---- e +# WHERE EXISTS in a disjunction, the motivating mark join example from +# "The Complete Story of Joins (in HyPer)" (Neumann, Leis, Kemper; BTW 2017), +# Section 3.3. The OR prevents the semi join rewrite, so the EXISTS must run +# as a mark join. Unlike the null-aware NOT IN marks above, an EXISTS mark is +# two-valued: NULLs on either side of the comparison produce FALSE, not NULL. + +query TT +EXPLAIN SELECT value +FROM outer_table +WHERE EXISTS ( + SELECT * + FROM inner_table_with_null + WHERE inner_table_with_null.id = outer_table.id +) OR value = 'e'; +---- +logical_plan +01)Projection: outer_table.value +02)--Filter: __correlated_sq_1.mark OR outer_table.value = Utf8View("e") +03)----Projection: outer_table.value, __correlated_sq_1.mark +04)------LeftMark Join: outer_table.id = __correlated_sq_1.id +05)--------TableScan: outer_table projection=[id, value] +06)--------SubqueryAlias: __correlated_sq_1 +07)----------TableScan: inner_table_with_null projection=[id] +physical_plan +01)FilterExec: mark@1 OR value@0 = e, projection=[value@0] +02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +03)----HashJoinExec: mode=CollectLeft, join_type=RightMark, on=[(id@0, id@0)], projection=[value@1, mark@2] +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)------DataSourceExec: partitions=1, partition_sizes=[1] + +query T rowsort +SELECT value +FROM outer_table +WHERE EXISTS ( + SELECT * + FROM inner_table_with_null + WHERE inner_table_with_null.id = outer_table.id +) OR value = 'e'; +---- +b +e + +# If the EXISTS mark were wrongly null-aware, every unmatched outer row would +# get a NULL mark (the build side contains a NULL), NOT mark would stay NULL, +# and all rows except 'b' would disappear. +query TT +EXPLAIN SELECT value +FROM outer_table +WHERE NOT EXISTS ( + SELECT * + FROM inner_table_with_null + WHERE inner_table_with_null.id = outer_table.id +) OR value = 'b'; +---- +logical_plan +01)Projection: outer_table.value +02)--Filter: NOT __correlated_sq_1.mark OR outer_table.value = Utf8View("b") +03)----Projection: outer_table.value, __correlated_sq_1.mark +04)------LeftMark Join: outer_table.id = __correlated_sq_1.id +05)--------TableScan: outer_table projection=[id, value] +06)--------SubqueryAlias: __correlated_sq_1 +07)----------TableScan: inner_table_with_null projection=[id] +physical_plan +01)FilterExec: NOT mark@1 OR value@0 = b, projection=[value@0] +02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +03)----HashJoinExec: mode=CollectLeft, join_type=RightMark, on=[(id@0, id@0)], projection=[value@1, mark@2] +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)------DataSourceExec: partitions=1, partition_sizes=[1] + +query T rowsort +SELECT value +FROM outer_table +WHERE NOT EXISTS ( + SELECT * + FROM inner_table_with_null + WHERE inner_table_with_null.id = outer_table.id +) OR value = 'b'; +---- +a +b +c +d +e + +# Section 3.3 also gives the genuinely three-valued mark example: +# select Title, ECTS = any (select ECTS from Courses c2 +# where Lecturer = 123) someEqual +# from Courses c1 +# where the mark itself is projected and must surface TRUE, FALSE, and NULL. +# DataFusion cannot decorrelate subqueries in a projection yet, so the +# projected forms are documented errors for now. Expected results once that +# lands (verified against DuckDB): +# +# SELECT value, id = ANY (SELECT id FROM inner_table_no_null) AS some_equal +# FROM outer_table; +# ---- +# a false +# b true +# c false +# d true +# e NULL + +query error This feature is not implemented: Physical plan does not support logical expression Exists +SELECT value, id = ANY (SELECT id FROM inner_table_no_null) AS some_equal +FROM outer_table; + +query error This feature is not implemented: Physical plan does not support logical expression InSubquery +SELECT value, id IN (SELECT id FROM inner_table_no_null) AS some_equal +FROM outer_table; + +# In a filter the paper's = ANY form works today: rewrite_set_comparison +# expands it to a CASE over EXISTS marks, and IS NULL / IS TRUE then observe +# all three mark values. + +query TT +EXPLAIN SELECT value +FROM outer_table +WHERE (id = ANY (SELECT id FROM inner_table_with_null)) IS NULL; +---- +logical_plan +01)Projection: outer_table.value +02)--Filter: __correlated_sq_1.mark OR __correlated_sq_2.mark AND NOT __correlated_sq_3.mark AND Boolean(NULL) IS NULL +03)----Projection: outer_table.value, __correlated_sq_1.mark, __correlated_sq_2.mark, __correlated_sq_3.mark +04)------LeftMark Join: Filter: outer_table.id = __correlated_sq_3.id IS TRUE +05)--------LeftMark Join: Filter: outer_table.id = __correlated_sq_2.id IS NULL +06)----------LeftMark Join: Filter: outer_table.id = __correlated_sq_1.id IS TRUE +07)------------TableScan: outer_table projection=[id, value] +08)------------SubqueryAlias: __correlated_sq_1 +09)--------------TableScan: inner_table_with_null projection=[id] +10)----------SubqueryAlias: __correlated_sq_2 +11)------------TableScan: inner_table_with_null projection=[id] +12)--------SubqueryAlias: __correlated_sq_3 +13)----------TableScan: inner_table_with_null projection=[id] +physical_plan +01)FilterExec: mark@1 OR mark@2 AND NOT mark@3 AND NULL IS NULL, projection=[value@0] +02)--NestedLoopJoinExec: join_type=RightMark, filter=(id@0 = id@1) IS NOT DISTINCT FROM true, projection=[value@1, mark@2, mark@3, mark@4] +03)----DataSourceExec: partitions=1, partition_sizes=[1] +04)----NestedLoopJoinExec: join_type=RightMark, filter=id@0 = id@1 IS NULL +05)------DataSourceExec: partitions=1, partition_sizes=[1] +06)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +07)--------NestedLoopJoinExec: join_type=RightMark, filter=(id@0 = id@1) IS NOT DISTINCT FROM true +08)----------DataSourceExec: partitions=1, partition_sizes=[1] +09)----------DataSourceExec: partitions=1, partition_sizes=[1] + +query T rowsort +SELECT value +FROM outer_table +WHERE (id = ANY (SELECT id FROM inner_table_with_null)) IS NULL; +---- +a +c +d +e + +query TT +EXPLAIN SELECT value +FROM outer_table +WHERE (id = ANY (SELECT id FROM inner_table_no_null)) IS TRUE; +---- +logical_plan +01)Projection: outer_table.value +02)--Filter: __correlated_sq_1.mark OR __correlated_sq_2.mark AND NOT __correlated_sq_3.mark AND Boolean(NULL) IS TRUE +03)----Projection: outer_table.value, __correlated_sq_1.mark, __correlated_sq_2.mark, __correlated_sq_3.mark +04)------LeftMark Join: Filter: outer_table.id = __correlated_sq_3.id IS TRUE +05)--------LeftMark Join: Filter: outer_table.id = __correlated_sq_2.id IS NULL +06)----------LeftMark Join: Filter: outer_table.id = __correlated_sq_1.id IS TRUE +07)------------TableScan: outer_table projection=[id, value] +08)------------SubqueryAlias: __correlated_sq_1 +09)--------------TableScan: inner_table_no_null projection=[id] +10)----------SubqueryAlias: __correlated_sq_2 +11)------------TableScan: inner_table_no_null projection=[id] +12)--------SubqueryAlias: __correlated_sq_3 +13)----------TableScan: inner_table_no_null projection=[id] +physical_plan +01)FilterExec: (mark@1 OR mark@2 AND NOT mark@3 AND NULL) IS NOT DISTINCT FROM true, projection=[value@0] +02)--NestedLoopJoinExec: join_type=RightMark, filter=(id@0 = id@1) IS NOT DISTINCT FROM true, projection=[value@1, mark@2, mark@3, mark@4] +03)----DataSourceExec: partitions=1, partition_sizes=[1] +04)----NestedLoopJoinExec: join_type=RightMark, filter=id@0 = id@1 IS NULL +05)------DataSourceExec: partitions=1, partition_sizes=[1] +06)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +07)--------NestedLoopJoinExec: join_type=RightMark, filter=(id@0 = id@1) IS NOT DISTINCT FROM true +08)----------DataSourceExec: partitions=1, partition_sizes=[1] +09)----------DataSourceExec: partitions=1, partition_sizes=[1] + +query T rowsort +SELECT value +FROM outer_table +WHERE (id = ANY (SELECT id FROM inner_table_no_null)) IS TRUE; +---- +b +d + +query TT +EXPLAIN SELECT value +FROM outer_table +WHERE (id = ANY (SELECT id FROM inner_table_no_null)) IS NULL; +---- +logical_plan +01)Projection: outer_table.value +02)--Filter: __correlated_sq_1.mark OR __correlated_sq_2.mark AND NOT __correlated_sq_3.mark AND Boolean(NULL) IS NULL +03)----Projection: outer_table.value, __correlated_sq_1.mark, __correlated_sq_2.mark, __correlated_sq_3.mark +04)------LeftMark Join: Filter: outer_table.id = __correlated_sq_3.id IS TRUE +05)--------LeftMark Join: Filter: outer_table.id = __correlated_sq_2.id IS NULL +06)----------LeftMark Join: Filter: outer_table.id = __correlated_sq_1.id IS TRUE +07)------------TableScan: outer_table projection=[id, value] +08)------------SubqueryAlias: __correlated_sq_1 +09)--------------TableScan: inner_table_no_null projection=[id] +10)----------SubqueryAlias: __correlated_sq_2 +11)------------TableScan: inner_table_no_null projection=[id] +12)--------SubqueryAlias: __correlated_sq_3 +13)----------TableScan: inner_table_no_null projection=[id] +physical_plan +01)FilterExec: mark@1 OR mark@2 AND NOT mark@3 AND NULL IS NULL, projection=[value@0] +02)--NestedLoopJoinExec: join_type=RightMark, filter=(id@0 = id@1) IS NOT DISTINCT FROM true, projection=[value@1, mark@2, mark@3, mark@4] +03)----DataSourceExec: partitions=1, partition_sizes=[1] +04)----NestedLoopJoinExec: join_type=RightMark, filter=id@0 = id@1 IS NULL +05)------DataSourceExec: partitions=1, partition_sizes=[1] +06)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +07)--------NestedLoopJoinExec: join_type=RightMark, filter=(id@0 = id@1) IS NOT DISTINCT FROM true +08)----------DataSourceExec: partitions=1, partition_sizes=[1] +09)----------DataSourceExec: partitions=1, partition_sizes=[1] + +query T rowsort +SELECT value +FROM outer_table +WHERE (id = ANY (SELECT id FROM inner_table_no_null)) IS NULL; +---- +e + +# Positive IN goes through InSubquery decorrelation rather than the = ANY +# CASE rewrite; make sure its mark is null-aware too. +query TT +EXPLAIN SELECT value +FROM outer_table +WHERE (id IN (SELECT id FROM inner_table_with_null)) IS NULL; +---- +logical_plan +01)Projection: outer_table.value +02)--Filter: __correlated_sq_1.mark IS NULL +03)----Projection: outer_table.value, __correlated_sq_1.mark +04)------LeftMark Join: outer_table.id = __correlated_sq_1.id +05)--------TableScan: outer_table projection=[id, value] +06)--------SubqueryAlias: __correlated_sq_1 +07)----------TableScan: inner_table_with_null projection=[id] +physical_plan +01)FilterExec: mark@1 IS NULL, projection=[value@0] +02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 +03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2] +04)------DataSourceExec: partitions=1, partition_sizes=[1] +05)------DataSourceExec: partitions=1, partition_sizes=[1] + +query T rowsort +SELECT value +FROM outer_table +WHERE (id IN (SELECT id FROM inner_table_with_null)) IS NULL; +---- +a +c +d +e + # Regression test for row-valued NOT IN with a partially NULL tuple. # This matches DuckDB issue #19182 and the Postgres-style behavior from # https://github.com/duckdb/duckdb/pull/22683. It is commented out because @@ -207,8 +466,7 @@ statement ok set datafusion.optimizer.prefer_hash_join = false; query TT -EXPLAIN -SELECT id, value +EXPLAIN SELECT id, value FROM outer_table WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; ---- @@ -227,8 +485,7 @@ physical_plan 05)------DataSourceExec: partitions=1, partition_sizes=[1] query TT -EXPLAIN -SELECT value +EXPLAIN SELECT value FROM outer_table WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; ---- @@ -275,8 +532,7 @@ statement ok set datafusion.optimizer.prefer_hash_join = true; query TT -EXPLAIN -SELECT value +EXPLAIN SELECT value FROM outer_table WHERE (EXISTS ( SELECT 1 From 3fe77d7f50b23544168eae7f588ff7a625893314 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 11 Jun 2026 21:00:45 +0100 Subject: [PATCH 7/9] simplify build_null_aware_left_mark_column --- datafusion/physical-plan/src/joins/utils.rs | 59 ++++++++++----------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 683c92c1950d2..6999af1c0c7e6 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -1428,38 +1428,33 @@ pub(crate) fn build_null_aware_left_mark_column( probe_side_has_null: bool, probe_side_non_empty: bool, ) -> ArrayRef { - Arc::new( - build_indices - .iter() - .enumerate() - .map(|(output_idx, build_idx)| { - if probe_indices.is_valid(output_idx) { - Some(true) - } else { - let build_idx = build_idx.expect( - "LeftMark final indices should always contain build-side rows", - ) as usize; - if let Some(null_indices_bitmap) = null_indices_bitmap { - if null_indices_bitmap.get_bit(build_idx) { - None - } else { - Some(false) - } - } else if build_key_column.is_null(build_idx) { - if probe_side_non_empty { - None - } else { - Some(false) - } - } else if probe_side_has_null { - None - } else { - Some(false) - } - } - }) - .collect::(), - ) as ArrayRef + // Whether an unmatched build row's mark is NULL (UNKNOWN) instead of FALSE: + // correlated joins precomputed this per row in `null_indices_bitmap`; the + // uncorrelated rules are cases 1-4 in the doc above. + let unmatched_mark_is_null = |build_idx: usize| match null_indices_bitmap { + Some(bitmap) => bitmap.get_bit(build_idx), + None if build_key_column.is_null(build_idx) => probe_side_non_empty, + None => probe_side_has_null, + }; + + let marks: BooleanArray = build_indices + .iter() + .zip(probe_indices.iter()) + .map(|(build_idx, probe_idx)| { + if probe_idx.is_some() { + return Some(true); + } + let build_idx = build_idx + .expect("LeftMark final indices should always contain build-side rows") + as usize; + if unmatched_mark_is_null(build_idx) { + None + } else { + Some(false) + } + }) + .collect(); + Arc::new(marks) } /// Returns a new [RecordBatch] for a probe batch when no probe row can find a From 592c3ed0ed873421dd5d44474b1424f4e939afe9 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 23 Jun 2026 12:50:41 +0100 Subject: [PATCH 8/9] fixes Signed-off-by: Adam Gutglick --- .../physical-plan/src/joins/hash_join/exec.rs | 1 + .../src/joins/hash_join/stream.rs | 1 + .../test_files/null_aware_mark_join.slt | 20 +++++++++---------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index 4e21f4706445f..1ea96561e9775 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -2225,6 +2225,7 @@ async fn collect_left_input( &mut hashes_buffer, 0, true, + NullEquality::NullEqualsNothing, )?; // Build rows whose value key is NULL get a dedicated scope map: any diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index 9176c246b83ff..c685a72779db1 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -1247,6 +1247,7 @@ fn scan_scope_matches_into_bitmap( probe_scope_values, NullEquality::NullEqualsNothing, hashes_buffer, + None, batch_size, offset, probe_indices_buffer, diff --git a/datafusion/sqllogictest/test_files/null_aware_mark_join.slt b/datafusion/sqllogictest/test_files/null_aware_mark_join.slt index 270e7a354ddeb..35f54562b6998 100644 --- a/datafusion/sqllogictest/test_files/null_aware_mark_join.slt +++ b/datafusion/sqllogictest/test_files/null_aware_mark_join.slt @@ -88,14 +88,14 @@ WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; logical_plan 01)Projection: outer_table.id, outer_table.value 02)--Filter: NOT __correlated_sq_1.mark IS NULL -03)----LeftMark Join: outer_table.id = __correlated_sq_1.id +03)----LeftMark Join: outer_table.id = __correlated_sq_1.id null_aware 04)------TableScan: outer_table projection=[id, value] 05)------SubqueryAlias: __correlated_sq_1 06)--------TableScan: inner_table_with_null projection=[id] physical_plan 01)FilterExec: NOT mark@2 IS NULL, projection=[id@0, value@1] 02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)] +03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], null_aware 04)------DataSourceExec: partitions=1, partition_sizes=[1] 05)------DataSourceExec: partitions=1, partition_sizes=[1] @@ -118,14 +118,14 @@ logical_plan 01)Projection: outer_table.value 02)--Filter: NOT __correlated_sq_1.mark IS NULL 03)----Projection: outer_table.value, __correlated_sq_1.mark -04)------LeftMark Join: outer_table.id = __correlated_sq_1.id +04)------LeftMark Join: outer_table.id = __correlated_sq_1.id null_aware 05)--------TableScan: outer_table projection=[id, value] 06)--------SubqueryAlias: __correlated_sq_1 07)----------TableScan: inner_table_no_null projection=[id] physical_plan 01)FilterExec: NOT mark@1 IS NULL, projection=[value@0] 02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2] +03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2], null_aware 04)------DataSourceExec: partitions=1, partition_sizes=[1] 05)------DataSourceExec: partitions=1, partition_sizes=[1] @@ -413,14 +413,14 @@ logical_plan 01)Projection: outer_table.value 02)--Filter: __correlated_sq_1.mark IS NULL 03)----Projection: outer_table.value, __correlated_sq_1.mark -04)------LeftMark Join: outer_table.id = __correlated_sq_1.id +04)------LeftMark Join: outer_table.id = __correlated_sq_1.id null_aware 05)--------TableScan: outer_table projection=[id, value] 06)--------SubqueryAlias: __correlated_sq_1 07)----------TableScan: inner_table_with_null projection=[id] physical_plan 01)FilterExec: mark@1 IS NULL, projection=[value@0] 02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2] +03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2], null_aware 04)------DataSourceExec: partitions=1, partition_sizes=[1] 05)------DataSourceExec: partitions=1, partition_sizes=[1] @@ -473,14 +473,14 @@ WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; logical_plan 01)Projection: outer_table.id, outer_table.value 02)--Filter: NOT __correlated_sq_1.mark IS NULL -03)----LeftMark Join: outer_table.id = __correlated_sq_1.id +03)----LeftMark Join: outer_table.id = __correlated_sq_1.id null_aware 04)------TableScan: outer_table projection=[id, value] 05)------SubqueryAlias: __correlated_sq_1 06)--------TableScan: inner_table_with_null projection=[id] physical_plan 01)FilterExec: NOT mark@2 IS NULL, projection=[id@0, value@1] 02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)] +03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], null_aware 04)------DataSourceExec: partitions=1, partition_sizes=[1] 05)------DataSourceExec: partitions=1, partition_sizes=[1] @@ -493,14 +493,14 @@ logical_plan 01)Projection: outer_table.value 02)--Filter: NOT __correlated_sq_1.mark IS NULL 03)----Projection: outer_table.value, __correlated_sq_1.mark -04)------LeftMark Join: outer_table.id = __correlated_sq_1.id +04)------LeftMark Join: outer_table.id = __correlated_sq_1.id null_aware 05)--------TableScan: outer_table projection=[id, value] 06)--------SubqueryAlias: __correlated_sq_1 07)----------TableScan: inner_table_with_null projection=[id] physical_plan 01)FilterExec: NOT mark@1 IS NULL, projection=[value@0] 02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2] +03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2], null_aware 04)------DataSourceExec: partitions=1, partition_sizes=[1] 05)------DataSourceExec: partitions=1, partition_sizes=[1] From 7dceb8af62ed0cd5e97c00b1067ef2cbeef2f7e1 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 23 Jun 2026 13:29:21 +0100 Subject: [PATCH 9/9] less code --- datafusion/core/src/physical_planner.rs | 21 +--- datafusion/expr/src/logical_plan/builder.rs | 24 +--- datafusion/expr/src/logical_plan/mod.rs | 4 +- datafusion/expr/src/logical_plan/plan.rs | 17 +-- .../src/decorrelate_predicate_subquery.rs | 111 ++++++----------- .../optimizer/src/eliminate_cross_join.rs | 3 + .../optimizer/src/optimize_projections/mod.rs | 3 +- .../physical-plan/src/joins/hash_join/exec.rs | 44 +++---- .../src/joins/hash_join/stream.rs | 17 +-- .../src/joins/nested_loop_join.rs | 2 +- .../src/joins/piecewise_merge_join/exec.rs | 5 +- .../src/joins/sort_merge_join/exec.rs | 2 +- .../src/joins/symmetric_hash_join.rs | 2 +- datafusion/physical-plan/src/joins/utils.rs | 24 +--- .../test_files/null_aware_mark_join.slt | 116 +----------------- 15 files changed, 92 insertions(+), 303 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 992fad3cee1dd..c654c9bda6779 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -3460,27 +3460,11 @@ mod tests { ctx.sql(query).await?.create_physical_plan().await } - async fn plan_sql_with_config( - query: &str, - config: SessionConfig, - ) -> Result> { - let ctx = SessionContext::new_with_config(config); - ctx.sql(query).await?.create_physical_plan().await - } - async fn collect_sql(query: &str) -> Result> { let ctx = SessionContext::new(); ctx.sql(query).await?.collect().await } - async fn collect_sql_with_config( - query: &str, - config: SessionConfig, - ) -> Result> { - let ctx = SessionContext::new_with_config(config); - ctx.sql(query).await?.collect().await - } - #[tokio::test] async fn test_all_operators() -> Result<()> { let logical_plan = test_csv_scan() @@ -3644,8 +3628,9 @@ mod tests { let config = SessionConfig::new() .with_target_partitions(4) .set_bool("datafusion.optimizer.prefer_hash_join", false); + let ctx = SessionContext::new_with_config(config); - let plan = plan_sql_with_config(query, config.clone()).await?; + let plan = ctx.sql(query).await?.create_physical_plan().await?; let formatted = displayable(plan.as_ref()).indent(true).to_string(); assert_contains!( &formatted, @@ -3653,7 +3638,7 @@ mod tests { ); assert!(!formatted.contains("SortMergeJoinExec"), "{formatted}"); - let batches = collect_sql_with_config(query, config).await?; + let batches = ctx.sql(query).await?.collect().await?; assert_batches_eq!( &[ "+-------+", diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 5707efa5b0a13..d937e8827a825 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1154,7 +1154,7 @@ impl LogicalPlanBuilder { .zip(right_keys) .map(|(l, r)| (Expr::Column(l), Expr::Column(r))) .collect(); - let join_schema = build_join_schema_with_null_aware( + let join_schema = build_join_schema( self.plan.schema(), right.schema(), &join_type, @@ -1677,25 +1677,13 @@ fn mark_field(schema: &DFSchema, nullable: bool) -> (Option, Arc /// Creates a schema for a join operation. /// The fields from the left side are first. /// -/// The `mark` column of a mark join is non-nullable. Use -/// [`build_join_schema_with_null_aware`] for null-aware mark joins, whose mark -/// column can be `NULL` (SQL UNKNOWN). +/// When `null_aware` is set, the `LeftMark`/`RightMark` `mark` column is made +/// nullable so it can represent SQL UNKNOWN for null-aware `NOT IN` semantics. +/// `null_aware` has no effect on non-mark join types. pub fn build_join_schema( left: &DFSchema, right: &DFSchema, join_type: &JoinType, -) -> Result { - build_join_schema_with_null_aware(left, right, join_type, false) -} - -/// Like [`build_join_schema`], but makes the `LeftMark`/`RightMark` `mark` -/// column nullable when `null_aware` is set, so it can represent SQL UNKNOWN -/// for null-aware `NOT IN` semantics. `null_aware` has no effect on non-mark -/// join types. -pub fn build_join_schema_with_null_aware( - left: &DFSchema, - right: &DFSchema, - join_type: &JoinType, null_aware: bool, ) -> Result { fn nullify_fields<'a>( @@ -2933,13 +2921,13 @@ mod tests { )?; let join_schema = - build_join_schema(&left_schema, &right_schema, &JoinType::Left)?; + build_join_schema(&left_schema, &right_schema, &JoinType::Left, false)?; assert_eq!( join_schema.metadata(), &HashMap::from([("key".to_string(), "left".to_string())]) ); let join_schema = - build_join_schema(&left_schema, &right_schema, &JoinType::Right)?; + build_join_schema(&left_schema, &right_schema, &JoinType::Right, false)?; assert_eq!( join_schema.metadata(), &HashMap::from([("key".to_string(), "right".to_string())]) diff --git a/datafusion/expr/src/logical_plan/mod.rs b/datafusion/expr/src/logical_plan/mod.rs index b84e9b23d3c5c..4766c3f33379f 100644 --- a/datafusion/expr/src/logical_plan/mod.rs +++ b/datafusion/expr/src/logical_plan/mod.rs @@ -28,8 +28,8 @@ pub mod tree_node; pub use builder::{ LogicalPlanBuilder, LogicalPlanBuilderOptions, LogicalTableSource, UNNAMED_TABLE, - build_join_schema, build_join_schema_with_null_aware, requalify_sides_if_needed, - table_scan, union, wrap_projection_for_join_if_necessary, + build_join_schema, requalify_sides_if_needed, table_scan, union, + wrap_projection_for_join_if_necessary, }; pub use ddl::{ CreateCatalog, CreateCatalogSchema, CreateExternalTable, CreateFunction, diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index c0324f6f096ba..4aad4ce96b24d 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -47,8 +47,7 @@ use crate::utils::{ use crate::{ BinaryExpr, CreateMemoryTable, CreateView, Execute, Expr, ExprSchemable, GroupingSet, LogicalPlanBuilder, Operator, Prepare, TableProviderFilterPushDown, TableSource, - WindowFunctionDefinition, build_join_schema_with_null_aware, expr_vec_fmt, - requalify_sides_if_needed, + WindowFunctionDefinition, build_join_schema, expr_vec_fmt, requalify_sides_if_needed, }; use crate::statistics::StatisticsRequest; @@ -668,7 +667,7 @@ impl LogicalPlan { null_equality, null_aware, }) => { - let schema = build_join_schema_with_null_aware( + let schema = build_join_schema( left.schema(), right.schema(), &join_type, @@ -949,7 +948,7 @@ impl LogicalPlan { .. }) => { let (left, right) = self.only_two_inputs(inputs)?; - let schema = build_join_schema_with_null_aware( + let schema = build_join_schema( left.schema(), right.schema(), join_type, @@ -4279,12 +4278,8 @@ impl Join { null_equality: NullEquality, null_aware: bool, ) -> Result { - let join_schema = build_join_schema_with_null_aware( - left.schema(), - right.schema(), - &join_type, - null_aware, - )?; + let join_schema = + build_join_schema(left.schema(), right.schema(), &join_type, null_aware)?; Ok(Join { left, @@ -4335,7 +4330,7 @@ impl Join { .map(|(l, r)| (Expr::Column(l), Expr::Column(r))) .collect(); - let join_schema = build_join_schema_with_null_aware( + let join_schema = build_join_schema( left_sch.schema(), right_sch.schema(), &original_join.join_type, diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 522cbb9d95569..e954598c80f06 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -627,14 +627,6 @@ mod tests { )) } - fn nullable_scalar_mark_scan(name: &str) -> Result { - let schema = Schema::new(vec![ - Field::new("id", DataType::Int32, true), - Field::new("grp", DataType::Int32, true), - ]); - table_scan(Some(name), &schema, None)?.build() - } - fn has_left_mark_join_with_null_aware( plan: &LogicalPlan, expected_null_aware: bool, @@ -1287,81 +1279,33 @@ mod tests { ) } - #[test] - fn correlated_not_in_mark_join_is_null_aware_for_hashable_filter() -> Result<()> { - let outer_scan = nullable_scalar_mark_scan("outer_t")?; - let inner_scan = nullable_scalar_mark_scan("inner_t")?; - - let subquery = Arc::new( - LogicalPlanBuilder::from(inner_scan) - .filter( - out_ref_col(DataType::Int32, "outer_t.grp").eq(col("inner_t.grp")), - )? - .project(vec![col("inner_t.id")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(outer_scan) - .filter(not_in_subquery(col("outer_t.id"), subquery).is_null())? - .build()?; - - let optimized = optimize_with_decorrelate(plan)?; - assert!( - has_left_mark_join_with_null_aware(&optimized, true), - "{}", - optimized.display_indent_schema() - ); - - Ok(()) - } - - #[test] - fn correlated_not_in_mark_join_is_not_null_aware_for_residual_filter() -> Result<()> { - let outer_scan = nullable_scalar_mark_scan("outer_t")?; - let inner_scan = nullable_scalar_mark_scan("inner_t")?; - - let subquery = Arc::new( - LogicalPlanBuilder::from(inner_scan) - .filter( - out_ref_col(DataType::Int32, "outer_t.grp").lt(col("inner_t.grp")), - )? - .project(vec![col("inner_t.id")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(outer_scan) - .filter(not_in_subquery(col("outer_t.id"), subquery).is_null())? - .build()?; - - let optimized = optimize_with_decorrelate(plan)?; - assert!( - has_left_mark_join_with_null_aware(&optimized, false), - "{}", - optimized.display_indent_schema() - ); - - Ok(()) - } - - #[test] - fn correlated_not_in_mark_join_not_null_aware_for_non_nullable_value_key() - -> Result<()> { - // The value key `id` is non-nullable on both sides; only the - // correlation key `grp` is nullable. A NULL correlation key just makes - // the correlated subquery empty, so NOT IN can never be UNKNOWN and - // the mark join does not need null-aware semantics. + /// Builds `... WHERE (id NOT IN (SELECT id FROM inner WHERE )) + /// IS NULL` over scans whose value key `id` has the given nullability (the + /// correlation key `grp` is always nullable), optimizes it, and asserts the + /// resulting `LeftMark` join's `null_aware` flag. + fn assert_correlated_not_in_mark_join_null_aware( + value_key_nullable: bool, + hashable_filter: bool, + expected_null_aware: bool, + ) -> Result<()> { let schema = Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::Int32, value_key_nullable), Field::new("grp", DataType::Int32, true), ]); let outer_scan = table_scan(Some("outer_t"), &schema, None)?.build()?; let inner_scan = table_scan(Some("inner_t"), &schema, None)?.build()?; + let outer_grp = out_ref_col(DataType::Int32, "outer_t.grp"); + let inner_grp = col("inner_t.grp"); + let grp_filter = if hashable_filter { + outer_grp.eq(inner_grp) + } else { + outer_grp.lt(inner_grp) + }; + let subquery = Arc::new( LogicalPlanBuilder::from(inner_scan) - .filter( - out_ref_col(DataType::Int32, "outer_t.grp").eq(col("inner_t.grp")), - )? + .filter(grp_filter)? .project(vec![col("inner_t.id")])? .build()?, ); @@ -1372,7 +1316,7 @@ mod tests { let optimized = optimize_with_decorrelate(plan)?; assert!( - has_left_mark_join_with_null_aware(&optimized, false), + has_left_mark_join_with_null_aware(&optimized, expected_null_aware), "{}", optimized.display_indent_schema() ); @@ -1380,6 +1324,21 @@ mod tests { Ok(()) } + #[test] + fn correlated_not_in_mark_join_null_awareness() -> Result<()> { + // Hashable (`=`) correlation filter on a nullable value key: NOT IN can + // be UNKNOWN, so the mark join is null-aware. + assert_correlated_not_in_mark_join_null_aware(true, true, true)?; + // Residual (`<`) correlation filter: the equijoin predicate can't be + // extracted, so the mark join is not null-aware. + assert_correlated_not_in_mark_join_null_aware(true, false, false)?; + // Non-nullable value key `id`: a NULL correlation key just makes the + // subquery empty, so NOT IN can never be UNKNOWN and the join is not + // null-aware. + assert_correlated_not_in_mark_join_null_aware(false, true, false)?; + Ok(()) + } + #[test] fn in_subquery_both_side_expr() -> Result<()> { let table_scan = test_table_scan()?; diff --git a/datafusion/optimizer/src/eliminate_cross_join.rs b/datafusion/optimizer/src/eliminate_cross_join.rs index 95b70da443d88..6bd6356db4609 100644 --- a/datafusion/optimizer/src/eliminate_cross_join.rs +++ b/datafusion/optimizer/src/eliminate_cross_join.rs @@ -374,6 +374,7 @@ fn find_inner_join( left_input.schema(), right_input.schema(), &JoinType::Inner, + false, )?); return Ok(LogicalPlan::Join(Join { @@ -397,6 +398,7 @@ fn find_inner_join( left_input.schema(), right.schema(), &JoinType::Inner, + false, )?); Ok(LogicalPlan::Join(Join { @@ -1402,6 +1404,7 @@ mod tests { t1.schema(), t2.schema(), &JoinType::Inner, + false, )?); let inner_join = LogicalPlan::Join(Join { diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index acdbf71d05d5c..51ccf65f5fbc8 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -1024,7 +1024,8 @@ mod tests { let left_schema = left_child.schema(); let right_schema = right_child.schema(); let schema = Arc::new( - build_join_schema(left_schema, right_schema, &JoinType::Inner).unwrap(), + build_join_schema(left_schema, right_schema, &JoinType::Inner, false) + .unwrap(), ); Self { exprs: vec![], diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs index 1ea96561e9775..2ff16ab92b65c 100644 --- a/datafusion/physical-plan/src/joins/hash_join/exec.rs +++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs @@ -59,7 +59,7 @@ use crate::{ common::can_project, joins::utils::{ BuildProbeJoinMetrics, ColumnIndex, JoinFilter, JoinHashMapType, - build_join_schema_with_null_aware, check_join_is_valid, estimate_join_statistics, + build_join_schema, check_join_is_valid, estimate_join_statistics, need_produce_result_in_final, symmetric_join_output_partitioning, }, metrics::{ExecutionPlanMetricsSet, MetricsSet}, @@ -523,12 +523,8 @@ impl HashJoinExecBuilder { } check_join_is_valid(&left_schema, &right_schema, &on)?; - let (join_schema, column_indices) = build_join_schema_with_null_aware( - &left_schema, - &right_schema, - &join_type, - null_aware, - ); + let (join_schema, column_indices) = + build_join_schema(&left_schema, &right_schema, &join_type, null_aware); let join_schema = Arc::new(join_schema); @@ -1442,6 +1438,11 @@ impl ExecutionPlan for HashJoinExec { .flatten() .flatten(); + // The extra scope maps + null bitmap are only built for correlated + // null-aware LeftMark joins (`on[1..]` are correlation scope keys). + let with_null_aware_mark_state = + self.null_aware && self.join_type == JoinType::LeftMark && on_left.len() > 1; + let left_fut = match self.mode { PartitionMode::CollectLeft => self.left_fut.try_once(|| { let left_stream = self.left.execute(0, Arc::clone(&context))?; @@ -1461,12 +1462,7 @@ impl ExecutionPlan for HashJoinExec { Arc::clone(context.session_config().options()), self.null_equality, array_map_created_count, - // with_null_aware_mark_state: the extra scope maps + null - // bitmap are only built for correlated null-aware LeftMark - // (`on[1..]` are correlation scope keys). - self.null_aware - && self.join_type == JoinType::LeftMark - && on_left.len() > 1, + with_null_aware_mark_state, )) })?, PartitionMode::Partitioned => { @@ -1487,12 +1483,9 @@ impl ExecutionPlan for HashJoinExec { Arc::clone(context.session_config().options()), self.null_equality, array_map_created_count, - // with_null_aware_mark_state: the extra scope maps + null - // bitmap are only built for correlated null-aware LeftMark - // (`on[1..]` are correlation scope keys). - self.null_aware - && self.join_type == JoinType::LeftMark - && on_left.len() > 1, + // Partitioned mode is rejected for null-aware joins (see + // `try_new`), so the extra null-aware state is never built. + false, )) } PartitionMode::Auto => { @@ -2208,11 +2201,8 @@ async fn collect_left_input( "null-aware LeftMark needs on_left[0]=value, on_left[1..]=scope, got {} key(s)", on_left.len() ); - // This secondary map is keyed only by correlation scope keys. The - // primary join map may still use ArrayMap for full-key TRUE matches, - // but scope-only NULL marking uses a HashMap for arbitrary key shapes. - // It is probed only with the NULL-valued probe rows, so the lookup work - // is proportional to the probe-side NULLs. + // Scope-only NULL marking uses a HashMap (the primary join map may use + // ArrayMap for full-key matches, but scope keys have arbitrary shape). let mut scope_map = new_join_hashmap(num_rows, &mut reservation, &metrics)?; let mut hashes_buffer = vec![0; batch.num_rows()]; @@ -2228,10 +2218,8 @@ async fn collect_left_input( NullEquality::NullEqualsNothing, )?; - // Build rows whose value key is NULL get a dedicated scope map: any - // probe row sharing their scope makes their mark UNKNOWN, so they are - // probed with *every* probe row. Restricting the map to the NULL-valued - // rows keeps that lookup proportional to the build-side NULLs. + // Build the dedicated scope map over the NULL-valued build rows (see + // `NullValueScopeMap`). let value_key = &left_values[0]; let null_value_scope_map = if value_key.null_count() > 0 { let null_mask = arrow::compute::is_null(value_key.as_ref())?; diff --git a/datafusion/physical-plan/src/joins/hash_join/stream.rs b/datafusion/physical-plan/src/joins/hash_join/stream.rs index c685a72779db1..c71f915da3f14 100644 --- a/datafusion/physical-plan/src/joins/hash_join/stream.rs +++ b/datafusion/physical-plan/src/joins/hash_join/stream.rs @@ -1126,12 +1126,8 @@ impl HashJoinStream { /// 1. its value key is NULL and any probe row shares its correlation scope, or /// 2. some probe row in its correlation scope has a NULL value key. /// -/// Each case only does work proportional to the rows that actually involve a -/// NULL value key: case 1 probes the build-side NULL-value scope map (which -/// contains only NULL-valued build rows) with all probe rows, and case 2 -/// probes the full scope map with only the NULL-valued probe rows. Enumerating -/// every scope match of every probe row instead would degrade to -/// O(build * probe) for low-cardinality correlation keys. +/// Case 1 probes the build-side NULL-value scope map with all probe rows; +/// case 2 probes the full scope map with only the NULL-valued probe rows. fn mark_null_candidates_for_probe_batch( build_side: &BuildSideReadyState, state: &ProcessProbeBatchState, @@ -1145,19 +1141,12 @@ fn mark_null_candidates_for_probe_batch( return Ok(()); }; - debug_assert!( - build_side.left_data.values().len() > 1, - "build keys must be [value, scope..]" - ); - debug_assert!( - state.values.len() > 1, - "probe keys must be [value, scope..]" - ); debug_assert_eq!( build_side.left_data.values().len(), state.values.len(), "build/probe key counts must match" ); + debug_assert!(state.values.len() > 1, "keys must be [value, scope..]"); let probe_value_key = &state.values[0]; let build_scope_values = &build_side.left_data.values()[1..]; diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index db552fed96724..1d2937ccc60ab 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -284,7 +284,7 @@ impl NestedLoopJoinExecBuilder { let right_schema = right.schema(); check_join_is_valid(&left_schema, &right_schema, &[])?; let (join_schema, column_indices) = - build_join_schema(&left_schema, &right_schema, &join_type); + build_join_schema(&left_schema, &right_schema, &join_type, false); let join_schema = Arc::new(join_schema); let cache = NestedLoopJoinExec::compute_properties( &left, diff --git a/datafusion/physical-plan/src/joins/piecewise_merge_join/exec.rs b/datafusion/physical-plan/src/joins/piecewise_merge_join/exec.rs index 50e9252a21131..faabe6a7b2ba2 100644 --- a/datafusion/physical-plan/src/joins/piecewise_merge_join/exec.rs +++ b/datafusion/physical-plan/src/joins/piecewise_merge_join/exec.rs @@ -352,8 +352,9 @@ impl PiecewiseMergeJoinExec { let streamed_schema = streamed.schema(); // Create output schema for the join - let schema = - Arc::new(build_join_schema(&buffered_schema, &streamed_schema, &join_type).0); + let schema = Arc::new( + build_join_schema(&buffered_schema, &streamed_schema, &join_type, false).0, + ); let cache = Self::compute_properties( &buffered, &streamed, diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs b/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs index a8d25fd002b76..3be6a3b22cade 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs @@ -185,7 +185,7 @@ impl SortMergeJoinExec { }; let schema = - Arc::new(build_join_schema(&left_schema, &right_schema, &join_type).0); + Arc::new(build_join_schema(&left_schema, &right_schema, &join_type, false).0); let cache = Self::compute_properties(&left, &right, Arc::clone(&schema), join_type, &on)?; Ok(Self { diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 321d7e686c823..3ebe0d7e49b6a 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -234,7 +234,7 @@ impl SymmetricHashJoinExec { // Build the join schema from the left and right schemas: let (schema, column_indices) = - build_join_schema(&left_schema, &right_schema, join_type); + build_join_schema(&left_schema, &right_schema, join_type, false); // Initialize the random state for the join operation: let random_state = RandomState::with_seed(0); diff --git a/datafusion/physical-plan/src/joins/utils.rs b/datafusion/physical-plan/src/joins/utils.rs index 6999af1c0c7e6..776bd8ed2e80b 100644 --- a/datafusion/physical-plan/src/joins/utils.rs +++ b/datafusion/physical-plan/src/joins/utils.rs @@ -259,25 +259,13 @@ fn output_join_field(old_field: &Field, join_type: &JoinType, is_left: bool) -> /// Creates a schema for a join operation. /// The fields from the left side are first. /// -/// The `mark` column of a mark join is non-nullable. Use -/// [`build_join_schema_with_null_aware`] for null-aware mark joins, whose mark -/// column can be `NULL` (SQL UNKNOWN). +/// When `null_aware` is set, the `LeftMark`/`RightMark` `mark` column is made +/// nullable so it can represent SQL UNKNOWN for null-aware `NOT IN` semantics. +/// `null_aware` has no effect on non-mark join types. pub fn build_join_schema( left: &Schema, right: &Schema, join_type: &JoinType, -) -> (Schema, Vec) { - build_join_schema_with_null_aware(left, right, join_type, false) -} - -/// Like [`build_join_schema`], but makes the `LeftMark`/`RightMark` `mark` -/// column nullable when `null_aware` is set, so it can represent SQL UNKNOWN -/// for null-aware `NOT IN` semantics. `null_aware` has no effect on non-mark -/// join types. -pub fn build_join_schema_with_null_aware( - left: &Schema, - right: &Schema, - join_type: &JoinType, null_aware: bool, ) -> (Schema, Vec) { let left_fields = || { @@ -2854,7 +2842,7 @@ mod tests { ]; for (left_in, right_in, join_type, left_out, right_out) in cases { - let (schema, _) = build_join_schema(left_in, right_in, &join_type); + let (schema, _) = build_join_schema(left_in, right_in, &join_type, false); let expected_fields = left_out .fields() @@ -4358,13 +4346,13 @@ mod tests { .with_metadata(HashMap::from([("key".to_string(), "right".to_string())])); let (join_schema, _) = - build_join_schema(&left_schema, &right_schema, &JoinType::Left); + build_join_schema(&left_schema, &right_schema, &JoinType::Left, false); assert_eq!( join_schema.metadata(), &HashMap::from([("key".to_string(), "left".to_string())]) ); let (join_schema, _) = - build_join_schema(&left_schema, &right_schema, &JoinType::Right); + build_join_schema(&left_schema, &right_schema, &JoinType::Right, false); assert_eq!( join_schema.metadata(), &HashMap::from([("key".to_string(), "right".to_string())]) diff --git a/datafusion/sqllogictest/test_files/null_aware_mark_join.slt b/datafusion/sqllogictest/test_files/null_aware_mark_join.slt index 35f54562b6998..d0e24eed87e7b 100644 --- a/datafusion/sqllogictest/test_files/null_aware_mark_join.slt +++ b/datafusion/sqllogictest/test_files/null_aware_mark_join.slt @@ -49,18 +49,6 @@ CREATE TABLE inner_corr_table(id INT, grp INT) AS VALUES (NULL, 1), (1, 2); -# Out-of-scope row-valued NOT IN repro kept for future work. -# DuckDB issue #19182 covers partially NULL tuple comparison semantics; this PR -# only targets scalar null-aware mark joins. -# statement ok -# CREATE TABLE outer_tuple_table(f1 INT, f2 INT) AS VALUES -# (0, 0), -# (1, 0); -# -# statement ok -# CREATE TABLE inner_tuple_table(c1 BIGINT, c2 BIGINT) AS VALUES -# (0, NULL); - statement ok CREATE TABLE empty_table(id INT) AS SELECT * @@ -365,36 +353,8 @@ WHERE (id = ANY (SELECT id FROM inner_table_no_null)) IS TRUE; b d -query TT -EXPLAIN SELECT value -FROM outer_table -WHERE (id = ANY (SELECT id FROM inner_table_no_null)) IS NULL; ----- -logical_plan -01)Projection: outer_table.value -02)--Filter: __correlated_sq_1.mark OR __correlated_sq_2.mark AND NOT __correlated_sq_3.mark AND Boolean(NULL) IS NULL -03)----Projection: outer_table.value, __correlated_sq_1.mark, __correlated_sq_2.mark, __correlated_sq_3.mark -04)------LeftMark Join: Filter: outer_table.id = __correlated_sq_3.id IS TRUE -05)--------LeftMark Join: Filter: outer_table.id = __correlated_sq_2.id IS NULL -06)----------LeftMark Join: Filter: outer_table.id = __correlated_sq_1.id IS TRUE -07)------------TableScan: outer_table projection=[id, value] -08)------------SubqueryAlias: __correlated_sq_1 -09)--------------TableScan: inner_table_no_null projection=[id] -10)----------SubqueryAlias: __correlated_sq_2 -11)------------TableScan: inner_table_no_null projection=[id] -12)--------SubqueryAlias: __correlated_sq_3 -13)----------TableScan: inner_table_no_null projection=[id] -physical_plan -01)FilterExec: mark@1 OR mark@2 AND NOT mark@3 AND NULL IS NULL, projection=[value@0] -02)--NestedLoopJoinExec: join_type=RightMark, filter=(id@0 = id@1) IS NOT DISTINCT FROM true, projection=[value@1, mark@2, mark@3, mark@4] -03)----DataSourceExec: partitions=1, partition_sizes=[1] -04)----NestedLoopJoinExec: join_type=RightMark, filter=id@0 = id@1 IS NULL -05)------DataSourceExec: partitions=1, partition_sizes=[1] -06)------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -07)--------NestedLoopJoinExec: join_type=RightMark, filter=(id@0 = id@1) IS NOT DISTINCT FROM true -08)----------DataSourceExec: partitions=1, partition_sizes=[1] -09)----------DataSourceExec: partitions=1, partition_sizes=[1] - +# Same plan shape as the `= ANY (... with_null) IS NULL` case above; only the +# result differs, so assert just the result here. query T rowsort SELECT value FROM outer_table @@ -434,33 +394,8 @@ c d e -# Regression test for row-valued NOT IN with a partially NULL tuple. -# This matches DuckDB issue #19182 and the Postgres-style behavior from -# https://github.com/duckdb/duckdb/pull/22683. It is commented out because -# tuple-valued NOT IN needs row-comparison NULL semantics that are outside this -# scalar null-aware mark join PR. -# query II rowsort -# SELECT f1, f2 -# FROM outer_tuple_table -# WHERE (f1, f2) NOT IN ( -# SELECT c1, c2 -# FROM inner_tuple_table -# ); -# ---- -# 1 0 -# -# query II rowsort -# SELECT f1, f2 -# FROM outer_tuple_table -# WHERE ((f1, f2) NOT IN ( -# SELECT c1, c2 -# FROM inner_tuple_table -# )) IS NULL; -# ---- -# 0 0 - -# These queries would fall back to a sort-merge, but we force them -# into hash joins to make sure we preserve null awareness. +# Null-aware mark joins are pinned to CollectLeft hash joins (mark joins have no +# sort-merge path); flipping prefer_hash_join off must not drop null awareness. statement ok set datafusion.optimizer.prefer_hash_join = false; @@ -484,43 +419,6 @@ physical_plan 04)------DataSourceExec: partitions=1, partition_sizes=[1] 05)------DataSourceExec: partitions=1, partition_sizes=[1] -query TT -EXPLAIN SELECT value -FROM outer_table -WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; ----- -logical_plan -01)Projection: outer_table.value -02)--Filter: NOT __correlated_sq_1.mark IS NULL -03)----Projection: outer_table.value, __correlated_sq_1.mark -04)------LeftMark Join: outer_table.id = __correlated_sq_1.id null_aware -05)--------TableScan: outer_table projection=[id, value] -06)--------SubqueryAlias: __correlated_sq_1 -07)----------TableScan: inner_table_with_null projection=[id] -physical_plan -01)FilterExec: NOT mark@1 IS NULL, projection=[value@0] -02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2], null_aware -04)------DataSourceExec: partitions=1, partition_sizes=[1] -05)------DataSourceExec: partitions=1, partition_sizes=[1] - -query T rowsort -SELECT value -FROM outer_table -WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL; ----- -a -c -d -e - -query T rowsort -SELECT value -FROM outer_table -WHERE (id NOT IN (SELECT id FROM inner_table_no_null)) IS NULL; ----- -e - #################################### ## Nested loop mark join with NULLs #################################### @@ -591,11 +489,5 @@ DROP TABLE inner_corr_table; statement ok DROP TABLE outer_corr_table; -# statement ok -# DROP TABLE inner_tuple_table; -# -# statement ok -# DROP TABLE outer_tuple_table; - statement ok DROP TABLE outer_table;