25
25
#include " exec/hash_joiner.h"
26
26
#include " exec/pipeline/chunk_accumulate_operator.h"
27
27
#include " exec/pipeline/exchange/exchange_source_operator.h"
28
+ #include " exec/pipeline/group_execution/execution_group_builder.h"
29
+ #include " exec/pipeline/group_execution/execution_group_fwd.h"
28
30
#include " exec/pipeline/hashjoin/hash_join_build_operator.h"
29
31
#include " exec/pipeline/hashjoin/hash_join_probe_operator.h"
30
32
#include " exec/pipeline/hashjoin/hash_joiner_factory.h"
38
40
#include " exprs/expr.h"
39
41
#include " exprs/in_const_predicate.hpp"
40
42
#include " exprs/runtime_filter_bank.h"
43
+ #include " gen_cpp/PlanNodes_types.h"
44
+ #include " gen_cpp/RuntimeFilter_types.h"
41
45
#include " gutil/strings/substitute.h"
42
46
#include " runtime/current_thread.h"
43
47
#include " runtime/runtime_filter_worker.h"
@@ -425,30 +429,27 @@ void HashJoinNode::close(RuntimeState* state) {
425
429
template <class HashJoinerFactory , class HashJoinBuilderFactory , class HashJoinProbeFactory >
426
430
pipeline::OpFactories HashJoinNode::_decompose_to_pipeline (pipeline::PipelineBuilderContext* context) {
427
431
using namespace pipeline ;
428
-
429
432
auto rhs_operators = child (1 )->decompose_to_pipeline (context);
433
+ // "col NOT IN (NULL, val1, val2)" always returns false, so hash join should
434
+ // return empty result in this case. Hash join cannot be divided into multiple
435
+ // partitions in this case. Otherwise, NULL value in right table will only occur
436
+ // in some partition hash table, and other partition hash table can output chunk.
437
+ // TODO: support nullaware left anti join with shuffle join
438
+ DCHECK (_join_type != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || _distribution_mode == TJoinDistributionMode::BROADCAST);
430
439
if (_distribution_mode == TJoinDistributionMode::BROADCAST) {
431
440
// A broadcast join only needs to create one hash table, because all the HashJoinProbeOperators
432
441
// use the same hash table with their own different probe states.
433
442
rhs_operators = context->maybe_interpolate_local_passthrough_exchange (runtime_state (), id (), rhs_operators);
434
443
} else {
435
- // "col NOT IN (NULL, val1, val2)" always returns false, so hash join should
436
- // return empty result in this case. Hash join cannot be divided into multiple
437
- // partitions in this case. Otherwise, NULL value in right table will only occur
438
- // in some partition hash table, and other partition hash table can output chunk.
439
- if (_join_type == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) {
440
- rhs_operators = context->maybe_interpolate_local_passthrough_exchange (runtime_state (), id (), rhs_operators);
441
- } else {
442
- // Both HashJoin{Build, Probe}Operator are parallelized
443
- // There are two ways of shuffle
444
- // 1. If previous op is ExchangeSourceOperator and its partition type is HASH_PARTITIONED or BUCKET_SHUFFLE_HASH_PARTITIONED
445
- // then pipeline level shuffle will be performed at sender side (ExchangeSinkOperator), so
446
- // there is no need to perform local shuffle again at receiver side
447
- // 2. Otherwise, add LocalExchangeOperator
448
- // to shuffle multi-stream into #degree_of_parallelism# streams each of that pipes into HashJoin{Build, Probe}Operator.
449
- rhs_operators = context->maybe_interpolate_local_shuffle_exchange (runtime_state (), id (), rhs_operators,
450
- _build_equivalence_partition_expr_ctxs);
451
- }
444
+ // Both HashJoin{Build, Probe}Operator are parallelized
445
+ // There are two ways of shuffle
446
+ // 1. If previous op is ExchangeSourceOperator and its partition type is HASH_PARTITIONED or BUCKET_SHUFFLE_HASH_PARTITIONED
447
+ // then pipeline level shuffle will be performed at sender side (ExchangeSinkOperator), so
448
+ // there is no need to perform local shuffle again at receiver side
449
+ // 2. Otherwise, add LocalExchangeOperator
450
+ // to shuffle multi-stream into #degree_of_parallelism# streams, each of which pipes into HashJoin{Build, Probe}Operator.
451
+ rhs_operators = context->maybe_interpolate_local_shuffle_exchange (runtime_state (), id (), rhs_operators,
452
+ _build_equivalence_partition_expr_ctxs);
452
453
}
453
454
454
455
size_t num_right_partitions = context->source_operator (rhs_operators)->degree_of_parallelism ();
@@ -468,12 +469,8 @@ pipeline::OpFactories HashJoinNode::_decompose_to_pipeline(pipeline::PipelineBui
468
469
_build_runtime_filters, _output_slots, _output_slots, _distribution_mode, false );
469
470
auto hash_joiner_factory = std::make_shared<starrocks::pipeline::HashJoinerFactory>(param);
470
471
471
- // add placeholder into RuntimeFilterHub, HashJoinBuildOperator will generate runtime filters and fill it,
472
- // Operators consuming the runtime filters will inspect this placeholder.
473
- context->fragment_context ()->runtime_filter_hub ()->add_holder (_id);
474
-
475
472
// Create a shared RefCountedRuntimeFilterCollector
476
- auto && rc_rf_probe_collector = std::make_shared<RcRfProbeCollector>(2 , std::move (this ->runtime_filter_collector ()));
473
+ auto rc_rf_probe_collector = std::make_shared<RcRfProbeCollector>(2 , std::move (this ->runtime_filter_collector ()));
477
474
// In default query engine, we only build one hash table for join right child.
478
475
// But for pipeline query engine, we will build `num_right_partitions` hash tables, so we need to enlarge the limit
479
476
@@ -508,12 +505,20 @@ pipeline::OpFactories HashJoinNode::_decompose_to_pipeline(pipeline::PipelineBui
508
505
DeferOp pop_dependent_pipeline ([context]() { context->pop_dependent_pipeline (); });
509
506
510
507
auto lhs_operators = child (0 )->decompose_to_pipeline (context);
511
- if (_distribution_mode == TJoinDistributionMode::BROADCAST) {
512
- lhs_operators = context->maybe_interpolate_local_passthrough_exchange (runtime_state (), id (), lhs_operators,
513
- context->degree_of_parallelism ());
508
+ auto join_colocate_group = context->find_exec_group_by_plan_node_id (_id);
509
+ if (join_colocate_group->type () == ExecutionGroupType::COLOCATE) {
510
+ DCHECK (context->current_execution_group ()->is_colocate_exec_group ());
511
+ DCHECK_EQ (context->current_execution_group (), join_colocate_group);
512
+ context->set_current_execution_group (join_colocate_group);
514
513
} else {
515
- if (_join_type == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) {
516
- lhs_operators = context->maybe_interpolate_local_passthrough_exchange (runtime_state (), id (), lhs_operators);
514
+ // The left child is in a colocate group, but the current join is not in a colocate group.
515
+ if (context->current_execution_group ()->is_colocate_exec_group ()) {
516
+ lhs_operators = context->interpolate_grouped_exchange (_id, lhs_operators);
517
+ }
518
+
519
+ if (_distribution_mode == TJoinDistributionMode::BROADCAST) {
520
+ lhs_operators = context->maybe_interpolate_local_passthrough_exchange (runtime_state (), id (), lhs_operators,
521
+ context->degree_of_parallelism ());
517
522
} else {
518
523
auto * rhs_source_op = context->source_operator (rhs_operators);
519
524
auto * lhs_source_op = context->source_operator (lhs_operators);
@@ -522,13 +527,27 @@ pipeline::OpFactories HashJoinNode::_decompose_to_pipeline(pipeline::PipelineBui
522
527
_probe_equivalence_partition_expr_ctxs);
523
528
}
524
529
}
530
+
525
531
lhs_operators.emplace_back (std::move (probe_op));
532
+ // add placeholder into RuntimeFilterHub, HashJoinBuildOperator will generate runtime filters and fill it,
533
+ // Operators consuming the runtime filters will inspect this placeholder.
534
+ if (context->is_colocate_group () && _distribution_mode == TJoinDistributionMode::COLOCATE) {
535
+ for (auto runtime_filter_build_desc : _build_runtime_filters) {
536
+ // local colocate won't generate global runtime filter
537
+ DCHECK (!runtime_filter_build_desc->has_remote_targets ());
538
+ runtime_filter_build_desc->set_num_colocate_partition (num_right_partitions);
539
+ }
540
+ context->fragment_context ()->runtime_filter_hub ()->add_holder (_id, num_right_partitions);
541
+ } else {
542
+ context->fragment_context ()->runtime_filter_hub ()->add_holder (_id);
543
+ }
526
544
527
545
if (limit () != -1 ) {
528
546
lhs_operators.emplace_back (std::make_shared<LimitOperatorFactory>(context->next_operator_id (), id (), limit ()));
529
547
}
530
548
531
- if (_hash_join_node.__isset .interpolate_passthrough && _hash_join_node.interpolate_passthrough ) {
549
+ if (_hash_join_node.__isset .interpolate_passthrough && _hash_join_node.interpolate_passthrough &&
550
+ !context->is_colocate_group ()) {
532
551
lhs_operators = context->maybe_interpolate_local_passthrough_exchange (runtime_state (), id (), lhs_operators,
533
552
context->degree_of_parallelism (), true );
534
553
}
0 commit comments