From e724d5526a1b66bb64ac48cbb01cb7f0fba3fd82 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 28 May 2026 17:18:38 +0800 Subject: [PATCH 1/7] [feature](cloud) Add table-level event-driven warm up Issue Number: None Related PR: None Problem Summary: Add table-level event-driven warm-up support for cloud warm-up jobs. The change extends WARM UP ... ON TABLES parsing and validation, persists normalized include and exclude table filters, resolves matching table ids dynamically, prevents conflicting cluster-level and table-level load-event jobs, propagates table ids through BE warm-up requests, records per-job source and target warm-up progress metrics, and exposes compact and detailed SyncStats through SHOW WARM UP JOB and FE metrics. Virtual compute group rebuilds cancel existing table-level load-event jobs before recreating managed cluster-level jobs. Support table-level event-driven cloud warm-up with ON TABLES filters and warm-up sync statistics. - Test: - Unit Test: ./run-fe-ut.sh --run org.apache.doris.cloud.OnTablesFilterTest,org.apache.doris.cloud.CloudWarmUpJobTableFilterTest,org.apache.doris.cloud.CacheHotspotManagerTableFilterTest,org.apache.doris.cloud.WarmUpStatsTest,org.apache.doris.cloud.WarmUpClusterOnTablesParseTest,org.apache.doris.cloud.catalog.CloudInstanceStatusCheckerTest,org.apache.doris.metric.MetricsTest#testCloudWarmUpSyncJobMetricsReadStatsDirectlyFromJob+testEventDrivenCloudWarmUpSyncJobTriggerGapMetric - Unit Test: ./run-be-ut.sh --run --filter=CloudWarmUpManagerFilterTest.*:MBvarWindowedAdderTest.* -j100 - Manual test: build-support/check-format.sh - Manual test: ./build.sh --be --fe --cloud -j100 - Manual test: docker build -f docker/runtime/doris-compose/Dockerfile -t bh-cluster-2 . - Manual test: ./run-regression-test.sh --clean --compile - Regression test: env -u HTTP_PROXY -u HTTPS_PROXY -u http_proxy -u https_proxy -u ALL_PROXY -u all_proxy ./run-regression-test.sh --run -d regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables -runMode=cloud -image bh-cluster-2 -dockerSuiteParallel 1 (18/19 passed; test_warm_up_event_on_tables_overlap_and_mv failed due test SQL duplicate MV column name before the test was fixed) - Regression test: env -u HTTP_PROXY -u HTTPS_PROXY -u http_proxy -u https_proxy -u ALL_PROXY -u all_proxy ./run-regression-test.sh --run -d regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables -s test_warm_up_event_on_tables_overlap_and_mv -runMode=cloud -image bh-cluster-2 -dockerSuiteParallel 1 - Behavior changed: Yes. WARM UP supports ON TABLES filters for event-driven load warm-up and SHOW WARM UP JOB exposes table filter, matched tables, and sync stats. - Does this need documentation: Yes. Documentation for the new ON TABLES syntax and metrics should be added separately. --- be/src/cloud/cloud_backend_service.cpp | 6 +- be/src/cloud/cloud_internal_service.cpp | 139 ++- be/src/cloud/cloud_meta_mgr.cpp | 2 +- be/src/cloud/cloud_warm_up_manager.cpp | 222 +++- be/src/cloud/cloud_warm_up_manager.h | 39 +- be/src/cloud/cloud_warmup_metrics.cpp | 82 ++ be/src/cloud/cloud_warmup_metrics.h | 76 ++ .../http/action/warmup_stats_action.cpp | 121 ++ .../service/http/action/warmup_stats_action.h | 37 + be/src/service/http_service.cpp | 5 + be/src/util/bvar_windowed_adder.h | 162 +++ .../cloud_warm_up_manager_filter_test.cpp | 276 +++++ be/test/util/bvar_windowed_adder_test.cpp | 140 +++ .../java/org/apache/doris/common/Config.java | 15 + .../doris/cloud/CacheHotspotManager.java | 493 +++++++- .../apache/doris/cloud/CloudWarmUpJob.java | 260 ++++- .../apache/doris/cloud/JobWarmUpStats.java | 285 +++++ .../apache/doris/cloud/OnTablesFilter.java | 169 +++ .../doris/cloud/TableWarmUpWindowedStats.java | 206 ++++ .../cloud/catalog/CloudClusterChecker.java | 13 +- .../catalog/CloudInstanceStatusChecker.java | 9 +- .../org/apache/doris/metric/MetricRepo.java | 203 ++++ .../nereids/parser/LogicalPlanBuilder.java | 15 +- .../plans/commands/ShowWarmUpCommand.java | 3 + .../plans/commands/WarmUpClusterCommand.java | 57 +- .../CacheHotspotManagerTableFilterTest.java | 1003 +++++++++++++++++ .../cloud/CloudWarmUpJobTableFilterTest.java | 461 ++++++++ .../doris/cloud/OnTablesFilterTest.java | 141 +++ .../cloud/WarmUpClusterOnTablesParseTest.java | 447 ++++++++ .../apache/doris/cloud/WarmUpStatsTest.java | 497 ++++++++ .../CloudInstanceStatusCheckerTest.java | 260 +++++ .../org/apache/doris/metric/MetricsTest.java | 160 +++ .../org/apache/doris/nereids/DorisLexer.g4 | 1 + .../org/apache/doris/nereids/DorisParser.g4 | 12 +- gensrc/proto/internal_service.proto | 2 + gensrc/thrift/BackendService.thrift | 1 + .../regression/util/WarmupMetricsUtils.groovy | 268 +++++ ...bles_abnormal_cancel_empty_recovery.groovy | 212 ++++ ...n_tables_abnormal_stats_and_failure.groovy | 261 +++++ ...up_event_on_tables_canonicalization.groovy | 117 ++ ...est_warm_up_event_on_tables_dynamic.groovy | 217 ++++ ...event_on_tables_error_and_lifecycle.groovy | 387 +++++++ ...est_warm_up_event_on_tables_include.groovy | 167 +++ ..._up_event_on_tables_include_exclude.groovy | 153 +++ ...m_up_event_on_tables_mow_compaction.groovy | 221 ++++ ...t_warm_up_event_on_tables_multi_dst.groovy | 213 ++++ ...rm_up_event_on_tables_multi_include.groovy | 142 +++ ...m_up_event_on_tables_overlap_and_mv.groovy | 332 ++++++ ..._up_event_on_tables_show_and_cancel.groovy | 384 +++++++ ..._warm_up_event_on_tables_sync_stats.groovy | 298 +++++ ...ent_on_tables_system_cluster_change.groovy | 168 +++ ..._tables_system_compaction_sync_wait.groovy | 213 ++++ ...event_on_tables_system_e2e_multi_be.groovy | 269 +++++ ..._event_on_tables_system_packed_file.groovy | 154 +++ ...on_tables_system_restart_and_resize.groovy | 223 ++++ ...event_on_tables_system_schema_index.groovy | 181 +++ 56 files changed, 10509 insertions(+), 91 deletions(-) create mode 100644 be/src/cloud/cloud_warmup_metrics.cpp create mode 100644 be/src/cloud/cloud_warmup_metrics.h create mode 100644 be/src/service/http/action/warmup_stats_action.cpp create mode 100644 be/src/service/http/action/warmup_stats_action.h create mode 100644 be/src/util/bvar_windowed_adder.h create mode 100644 be/test/cloud/cloud_warm_up_manager_filter_test.cpp create mode 100644 be/test/util/bvar_windowed_adder_test.cpp create mode 100644 fe/fe-core/src/main/java/org/apache/doris/cloud/JobWarmUpStats.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/cloud/OnTablesFilter.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/cloud/TableWarmUpWindowedStats.java create mode 100644 fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java create mode 100644 fe/fe-core/src/test/java/org/apache/doris/cloud/CloudWarmUpJobTableFilterTest.java create mode 100644 fe/fe-core/src/test/java/org/apache/doris/cloud/OnTablesFilterTest.java create mode 100644 fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpClusterOnTablesParseTest.java create mode 100644 fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpStatsTest.java create mode 100644 fe/fe-core/src/test/java/org/apache/doris/cloud/catalog/CloudInstanceStatusCheckerTest.java create mode 100644 regression-test/framework/src/main/groovy/org/apache/doris/regression/util/WarmupMetricsUtils.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_cancel_empty_recovery.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_stats_and_failure.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_canonicalization.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_dynamic.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_error_and_lifecycle.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include_exclude.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_dst.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_include.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_overlap_and_mv.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_show_and_cancel.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_cluster_change.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_e2e_multi_be.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_restart_and_resize.groovy create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy diff --git a/be/src/cloud/cloud_backend_service.cpp b/be/src/cloud/cloud_backend_service.cpp index 403da0b76c6ee5..efcc096c313d4a 100644 --- a/be/src/cloud/cloud_backend_service.cpp +++ b/be/src/cloud/cloud_backend_service.cpp @@ -104,7 +104,11 @@ void CloudBackendService::warm_up_tablets(TWarmUpTabletsResponse& response, .tag("request_type", "SET_JOB") .tag("job_id", request.job_id); if (request.__isset.event) { - st = manager.set_event(request.job_id, request.event); + const std::vector* table_ids_ptr = nullptr; + if (request.__isset.table_ids) { + table_ids_ptr = &request.table_ids; + } + st = manager.set_event(request.job_id, request.event, false, table_ids_ptr); if (st.ok()) { break; } diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index 9e685a8f90bc13..d8b67eab439f66 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -20,13 +20,19 @@ #include #include +#include +#include +#include +#include #include #include +#include #include "cloud/cloud_storage_engine.h" #include "cloud/cloud_tablet.h" #include "cloud/cloud_tablet_mgr.h" #include "cloud/cloud_warm_up_manager.h" +#include "cloud/cloud_warmup_metrics.h" #include "cloud/config.h" #include "io/cache/block_file_cache.h" #include "io/cache/block_file_cache_downloader.h" @@ -34,6 +40,7 @@ #include "runtime/thread_context.h" #include "runtime/workload_management/io_throttle.h" #include "util/async_io.h" +#include "util/bvar_windowed_adder.h" #include "util/debug_points.h" namespace doris { @@ -407,10 +414,103 @@ bvar::Adder g_file_cache_warm_up_rowset_wait_for_compaction_num( bvar::Adder g_file_cache_warm_up_rowset_wait_for_compaction_timeout_num( "file_cache_warm_up_rowset_wait_for_compaction_timeout_num"); +// Per-job windowed metrics for target BE +// bvar::Window enforces MAX_SECONDS_LIMIT = 3600, so the longest window is 1h. +static constexpr int WINDOW_5M = 300; +static constexpr int WINDOW_30M = 1800; +static constexpr int WINDOW_1H = 3600; + +MBvarWindowedAdder g_warmup_ed_finish_segment_num("warmup_ed_finish_segment_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_finish_segment_size("warmup_ed_finish_segment_size", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_finish_index_num("warmup_ed_finish_index_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_finish_index_size("warmup_ed_finish_index_size", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_fail_segment_num("warmup_ed_fail_segment_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_fail_segment_size("warmup_ed_fail_segment_size", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_fail_index_num("warmup_ed_fail_index_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_fail_index_size("warmup_ed_fail_index_size", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +bvar::MultiDimension> g_warmup_ed_last_finish_ts({"job_id"}); + +void update_warmup_ed_last_finish_ts(const std::string& job_id_str) { + auto* finish_ts = g_warmup_ed_last_finish_ts.get_stats(std::list {job_id_str}); + if (finish_ts) { + finish_ts->set_value(std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count()); + } +} + +void record_warmup_ed_finish_segment(const std::string& job_id_str, int64_t segment_size) { + g_warmup_ed_finish_segment_num.put({job_id_str}, 1); + g_warmup_ed_finish_segment_size.put({job_id_str}, segment_size); + update_warmup_ed_last_finish_ts(job_id_str); +} + +void record_warmup_ed_finish_index(const std::string& job_id_str, int64_t idx_size) { + g_warmup_ed_finish_index_num.put({job_id_str}, 1); + g_warmup_ed_finish_index_size.put({job_id_str}, idx_size); + update_warmup_ed_last_finish_ts(job_id_str); +} + +void record_warmup_ed_fail_segment(const std::string& job_id_str, int64_t segment_size) { + g_warmup_ed_fail_segment_num.put({job_id_str}, 1); + g_warmup_ed_fail_segment_size.put({job_id_str}, segment_size); +} + +void record_warmup_ed_fail_index(const std::string& job_id_str, int64_t idx_size) { + g_warmup_ed_fail_index_num.put({job_id_str}, 1); + g_warmup_ed_fail_index_size.put({job_id_str}, idx_size); +} + +void record_warmup_ed_skipped_rowset_as_finished(RowsetMeta& rs_meta, + const std::string& job_id_str) { + auto schema_ptr = rs_meta.tablet_schema(); + bool has_inverted_index = schema_ptr->has_inverted_index() || schema_ptr->has_ann_index(); + auto idx_version = schema_ptr->get_inverted_index_storage_format(); + for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) { + record_warmup_ed_finish_segment(job_id_str, rs_meta.segment_file_size(segment_id)); + + if (!has_inverted_index) { + continue; + } + auto&& inverted_index_info = rs_meta.inverted_index_file_info(segment_id); + if (idx_version == InvertedIndexStorageFormatPB::V1) { + std::unordered_map index_size_map; + for (const auto& info : inverted_index_info.index_info()) { + if (info.index_file_size() != -1) { + index_size_map[info.index_id()] = info.index_file_size(); + } else { + VLOG_DEBUG << "Invalid index_file_size for segment_id " << segment_id + << ", index_id " << info.index_id(); + } + } + for (const auto& index : schema_ptr->inverted_indexes()) { + record_warmup_ed_finish_index(job_id_str, index_size_map[index->index_id()]); + } + } else { // InvertedIndexStorageFormatPB::V2 + int64_t idx_size = 0; + if (inverted_index_info.has_index_size()) { + idx_size = inverted_index_info.index_size(); + } else { + VLOG_DEBUG << "index_size is not set for segment " << segment_id; + } + record_warmup_ed_finish_index(job_id_str, idx_size); + } + } +} + void handle_segment_download_done(Status st, int64_t tablet_id, const RowsetId& rowset_id, int64_t segment_id, std::shared_ptr tablet, std::shared_ptr wait, Version version, - int64_t segment_size, int64_t request_ts, int64_t handle_ts) { + int64_t segment_size, int64_t request_ts, int64_t handle_ts, + std::string job_id_str, int64_t upstream_trigger_ts_ms) { DBUG_EXECUTE_IF("CloudInternalServiceImpl::warm_up_rowset.download_segment", { auto sleep_time = dp->param("sleep", 3); LOG_INFO("[verbose] block download for rowset={}, version={}, sleep={}", @@ -428,6 +528,7 @@ void handle_segment_download_done(Status st, int64_t tablet_id, const RowsetId& if (st.ok()) { g_file_cache_event_driven_warm_up_finished_segment_num << 1; g_file_cache_event_driven_warm_up_finished_segment_size << segment_size; + record_warmup_ed_finish_segment(job_id_str, segment_size); int64_t now_ts = current_unix_time_us(); g_file_cache_warm_up_rowset_last_finish_unix_ts.set_value(now_ts); auto rowset_latency_us = warm_up_rowset_cross_host_latency_us(request_ts, now_ts); @@ -451,6 +552,7 @@ void handle_segment_download_done(Status st, int64_t tablet_id, const RowsetId& } else { g_file_cache_event_driven_warm_up_failed_segment_num << 1; g_file_cache_event_driven_warm_up_failed_segment_size << segment_size; + record_warmup_ed_fail_segment(job_id_str, segment_size); LOG(WARNING) << "download segment failed, tablet_id: " << tablet_id << " rowset_id: " << rowset_id.to_string() << ", error: " << st; } @@ -460,6 +562,7 @@ void handle_segment_download_done(Status st, int64_t tablet_id, const RowsetId& VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" << rowset_id.to_string() << ") completed"; } + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id_str, upstream_trigger_ts_ms); if (wait) { wait->signal(); } @@ -470,7 +573,8 @@ void handle_inverted_index_download_done(Status st, int64_t tablet_id, const Row std::shared_ptr tablet, std::shared_ptr wait, Version version, uint64_t idx_size, int64_t request_ts, - int64_t handle_ts) { + int64_t handle_ts, std::string job_id_str, + int64_t upstream_trigger_ts_ms) { DBUG_EXECUTE_IF("CloudInternalServiceImpl::warm_up_rowset.download_inverted_idx", { auto sleep_time = dp->param("sleep", 3); LOG_INFO( @@ -482,6 +586,7 @@ void handle_inverted_index_download_done(Status st, int64_t tablet_id, const Row if (st.ok()) { g_file_cache_event_driven_warm_up_finished_index_num << 1; g_file_cache_event_driven_warm_up_finished_index_size << idx_size; + record_warmup_ed_finish_index(job_id_str, static_cast(idx_size)); int64_t now_ts = current_unix_time_us(); g_file_cache_warm_up_rowset_last_finish_unix_ts.set_value(now_ts); auto rowset_latency_us = warm_up_rowset_cross_host_latency_us(request_ts, now_ts); @@ -505,6 +610,7 @@ void handle_inverted_index_download_done(Status st, int64_t tablet_id, const Row } else { g_file_cache_event_driven_warm_up_failed_index_num << 1; g_file_cache_event_driven_warm_up_failed_index_size << idx_size; + record_warmup_ed_fail_index(job_id_str, static_cast(idx_size)); LOG(WARNING) << "download inverted index failed, tablet_id: " << tablet_id << " rowset_id: " << rowset_id << ", error: " << st; } @@ -514,6 +620,7 @@ void handle_inverted_index_download_done(Status st, int64_t tablet_id, const Row VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" << rowset_id.to_string() << ") completed"; } + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id_str, upstream_trigger_ts_ms); if (wait) { wait->signal(); } @@ -534,6 +641,11 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c due_time = butil::milliseconds_from_now(request->sync_wait_timeout_ms()); } + // Extract job_id from request (0 if not set, for backward compatibility) + std::string job_id_str = std::to_string(request->has_job_id() ? request->job_id() : 0); + int64_t upstream_trigger_ts_ms = + request->has_upstream_trigger_ts_ms() ? request->upstream_trigger_ts_ms() : 0; + for (auto& rs_meta_pb : request->rowset_metas()) { RowsetMeta rs_meta; rs_meta.init_from_pb(rs_meta_pb); @@ -581,8 +693,15 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c if (!tablet->add_rowset_warmup_state(rs_meta, WarmUpTriggerSource::EVENT_DRIVEN)) { LOG(INFO) << "found duplicate warmup task for rowset " << rowset_id.to_string() << ", skip it"; + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id_str, + upstream_trigger_ts_ms); + record_warmup_ed_skipped_rowset_as_finished(rs_meta, job_id_str); continue; } + if (rs_meta.num_segments() == 0) { + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id_str, + upstream_trigger_ts_ms); + } for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) { if (!config::file_cache_enable_only_warm_up_idx) { @@ -605,7 +724,8 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c [=, version = rs_meta.version()](Status st) { handle_segment_download_done( st, tablet_id, rowset_id, segment_id, tablet, wait, - version, segment_size, request_ts, handle_ts); + version, segment_size, request_ts, handle_ts, + job_id_str, upstream_trigger_ts_ms); }, .tablet_id = tablet_id}; @@ -614,12 +734,15 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c if (wait) { wait->add_count(); } + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id_str, + upstream_trigger_ts_ms); _engine.file_cache_block_downloader().submit_download_task(download_meta); } // Use rs_meta.fs() to support packed files for inverted index download. - auto download_inverted_index = [&, tablet](std::string index_path, uint64_t idx_size) { + auto download_inverted_index = [&, tablet, job_id_str](std::string index_path, + uint64_t idx_size) { io::DownloadFileMeta download_meta { .path = io::Path(index_path), .file_size = static_cast(idx_size), @@ -632,9 +755,11 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c [=, version = rs_meta.version()](Status st) { handle_inverted_index_download_done( st, tablet_id, rowset_id, segment_id, index_path, - tablet, wait, version, idx_size, request_ts, handle_ts); + tablet, wait, version, idx_size, request_ts, handle_ts, + job_id_str, upstream_trigger_ts_ms); }, - .tablet_id = tablet_id}; + .tablet_id = tablet_id, + }; g_file_cache_event_driven_warm_up_submitted_index_num << 1; g_file_cache_event_driven_warm_up_submitted_index_size << idx_size; tablet->update_rowset_warmup_state_inverted_idx_num( @@ -642,6 +767,8 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c if (wait) { wait->add_count(); } + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id_str, + upstream_trigger_ts_ms); _engine.file_cache_block_downloader().submit_download_task(download_meta); }; diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index d84a54cd1e9b2f..f6817411f54c5d 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -1512,7 +1512,7 @@ Status CloudMetaMgr::commit_rowset(RowsetMeta& rs_meta, const std::string& job_i << ", with timeout: " << timeout_ms << " ms"; } auto& manager = ExecEnv::GetInstance()->storage_engine().to_cloud().cloud_warm_up_manager(); - manager.warm_up_rowset(rs_meta, timeout_ms); + manager.warm_up_rowset(rs_meta, table_id, timeout_ms); return st; } diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index 40d0066e2eee76..ba8234539b93a4 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ b/be/src/cloud/cloud_warm_up_manager.cpp @@ -26,13 +26,17 @@ #include #include +#include +#include #include +#include #include "bvar/bvar.h" #include "cloud/cloud_tablet.h" #include "cloud/cloud_tablet_mgr.h" #include "cloud/config.h" #include "common/cast_set.h" +#include "common/config.h" #include "common/logging.h" #include "cpp/sync_point.h" #include "io/cache/block_file_cache_downloader.h" @@ -41,7 +45,9 @@ #include "storage/rowset/beta_rowset.h" #include "storage/tablet/tablet.h" #include "util/brpc_client_cache.h" // BrpcClientCache +#include "util/bvar_windowed_adder.h" #include "util/client_cache.h" +#include "util/defer_op.h" #include "util/stack_util.h" #include "util/thrift_rpc_helper.h" #include "util/time.h" @@ -90,6 +96,23 @@ bvar::Adder g_balance_tablet_be_mapping_size("balance_tablet_be_mappin bvar::LatencyRecorder g_file_cache_warm_up_rowset_wait_for_compaction_latency( "file_cache_warm_up_rowset_wait_for_compaction_latency"); +// Per-job windowed metrics for source BE +// bvar::Window enforces MAX_SECONDS_LIMIT = 3600, so the longest window is 1h. +static constexpr int WINDOW_5M = 300; +static constexpr int WINDOW_30M = 1800; +static constexpr int WINDOW_1H = 3600; + +MBvarWindowedAdder g_warmup_ed_requested_segment_num("warmup_ed_requested_segment_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_requested_segment_size("warmup_ed_requested_segment_size", + {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_requested_index_num("warmup_ed_requested_index_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_requested_index_size("warmup_ed_requested_index_size", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +bvar::MultiDimension> g_warmup_ed_last_trigger_ts({"job_id"}); + CloudWarmUpManager::CloudWarmUpManager(CloudStorageEngine& engine) : _engine(engine) { _download_thread = std::thread(&CloudWarmUpManager::handle_jobs, this); static_cast(ThreadPoolBuilder("CloudWarmUpManagerThreadPool") @@ -460,7 +483,8 @@ Status CloudWarmUpManager::clear_job(int64_t job_id) { return st; } -Status CloudWarmUpManager::set_event(int64_t job_id, TWarmUpEventType::type event, bool clear) { +Status CloudWarmUpManager::set_event(int64_t job_id, TWarmUpEventType::type event, bool clear, + const std::vector* table_ids) { DBUG_EXECUTE_IF("CloudWarmUpManager.set_event.ignore_all", { LOG(INFO) << "Ignore set_event request, job_id=" << job_id << ", event=" << event << ", clear=" << clear; @@ -471,10 +495,28 @@ Status CloudWarmUpManager::set_event(int64_t job_id, TWarmUpEventType::type even if (event == TWarmUpEventType::type::LOAD) { if (clear) { _tablet_replica_cache.erase(job_id); + _event_driven_filters.erase(job_id); LOG(INFO) << "Clear event driven sync, job_id=" << job_id << ", event=" << event; } else if (!_tablet_replica_cache.contains(job_id)) { static_cast(_tablet_replica_cache[job_id]); - LOG(INFO) << "Set event driven sync, job_id=" << job_id << ", event=" << event; + if (table_ids != nullptr) { + // table-level filter: set to the given table_id set (may be empty, + // meaning all matched tables were deleted — warm up nothing) + _event_driven_filters[job_id] = + std::unordered_set(table_ids->begin(), table_ids->end()); + LOG(INFO) << "Set event driven sync with table filter, job_id=" << job_id + << ", event=" << event << ", table_ids_size=" << table_ids->size(); + } else { + // cluster-level: no filter, warm up all tables + _event_driven_filters[job_id] = std::nullopt; + LOG(INFO) << "Set event driven sync, job_id=" << job_id << ", event=" << event; + } + } else if (table_ids != nullptr) { + // Update table_ids for an existing job (may be empty) + _event_driven_filters[job_id] = + std::unordered_set(table_ids->begin(), table_ids->end()); + LOG(INFO) << "Updated table filter for event driven sync, job_id=" << job_id + << ", table_ids_size=" << table_ids->size(); } } else { st = Status::InternalError("The event {} is not supported yet", event); @@ -482,13 +524,29 @@ Status CloudWarmUpManager::set_event(int64_t job_id, TWarmUpEventType::type even return st; } -std::vector CloudWarmUpManager::get_replica_info(int64_t tablet_id, bool bypass_cache, - bool& cache_hit) { - std::vector replicas; +std::vector CloudWarmUpManager::get_replica_info(int64_t tablet_id, + int64_t table_id, + bool bypass_cache, + bool& cache_hit) { + std::vector replicas; std::vector cancelled_jobs; std::lock_guard lock(_mtx); cache_hit = false; for (auto& [job_id, cache] : _tablet_replica_cache) { + // Check table-level filter: skip this job if table_id doesn't match + // table_id == 0 means the caller doesn't have table context (e.g., recycle_cache), + // so skip filtering + if (table_id != 0) { + auto filter_it = _event_driven_filters.find(job_id); + if (filter_it != _event_driven_filters.end() && filter_it->second.has_value()) { + if (filter_it->second->find(table_id) == filter_it->second->end()) { + VLOG_DEBUG << "get_replica_info: table_id=" << table_id + << " not in filter for job_id=" << job_id << ", skipping"; + continue; + } + } + } + if (!bypass_cache) { auto it = cache.find(tablet_id); if (it != cache.end()) { @@ -496,9 +554,9 @@ std::vector CloudWarmUpManager::get_replica_info(int64_t tablet_id auto now = std::chrono::steady_clock::now(); auto sec = std::chrono::duration_cast(now - it->second.first); if (sec.count() < config::warmup_tablet_replica_info_cache_ttl_sec) { - replicas.push_back(it->second.second); - LOG(INFO) << "get_replica_info: cache hit, tablet_id=" << tablet_id - << ", job_id=" << job_id; + replicas.push_back(JobReplicaInfo {job_id, it->second.second}); + VLOG_DEBUG << "get_replica_info: cache hit, tablet_id=" << tablet_id + << ", job_id=" << job_id; cache_hit = true; continue; } else { @@ -566,7 +624,7 @@ std::vector CloudWarmUpManager::get_replica_info(int64_t tablet_id << " replica_infos, tablet id=" << tid << ", job_id=" << job_id; for (const auto& replica : it.second) { cache[tid] = std::make_pair(std::chrono::steady_clock::now(), replica); - replicas.push_back(replica); + replicas.push_back(JobReplicaInfo {job_id, replica}); LOG(INFO) << "get_replica_info: cache add, tablet_id=" << tid << ", job_id=" << job_id; } @@ -581,10 +639,12 @@ std::vector CloudWarmUpManager::get_replica_info(int64_t tablet_id return replicas; } -void CloudWarmUpManager::warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_timeout_ms) { +void CloudWarmUpManager::warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, + int64_t sync_wait_timeout_ms) { if (sync_wait_timeout_ms <= 0) { auto rs_meta_pb = std::make_shared(rs_meta.get_rowset_pb()); - auto st = _thread_pool_token->submit_func([this, rs_meta_pb, sync_wait_timeout_ms]() { + auto st = _thread_pool_token->submit_func([this, rs_meta_pb, table_id, + sync_wait_timeout_ms]() { RowsetMeta async_rs_meta; bool init_succeed = async_rs_meta.init_from_pb(*rs_meta_pb); TEST_SYNC_POINT_CALLBACK("CloudWarmUpManager::warm_up_rowset.async_init_from_pb", @@ -593,7 +653,7 @@ void CloudWarmUpManager::warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_t LOG(WARNING) << "Failed to init rowset meta when warming up rowset asynchronously"; return; } - _warm_up_rowset(async_rs_meta, sync_wait_timeout_ms); + _warm_up_rowset(async_rs_meta, table_id, sync_wait_timeout_ms); }); if (!st.ok()) { LOG(WARNING) << "Failed to submit warm up rowset task: " << st; @@ -607,7 +667,7 @@ void CloudWarmUpManager::warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_t bool finished = false; std::unique_lock lock(mu); auto st = _thread_pool_token->submit_func([&, this]() { - _warm_up_rowset(rs_meta, sync_wait_timeout_ms); + _warm_up_rowset(rs_meta, table_id, sync_wait_timeout_ms); std::unique_lock l(mu); finished = true; cv.notify_one(); @@ -623,21 +683,22 @@ void CloudWarmUpManager::warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_t } } -void CloudWarmUpManager::_warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_timeout_ms) { +void CloudWarmUpManager::_warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, + int64_t sync_wait_timeout_ms) { TEST_SYNC_POINT_CALLBACK("CloudWarmUpManager::_warm_up_rowset.enter", &rs_meta, &sync_wait_timeout_ms); bool cache_hit = false; - auto replicas = get_replica_info(rs_meta.tablet_id(), false, cache_hit); + auto replicas = get_replica_info(rs_meta.tablet_id(), table_id, false, cache_hit); if (replicas.empty()) { VLOG_DEBUG << "There is no need to warmup tablet=" << rs_meta.tablet_id() << ", skipping rowset=" << rs_meta.rowset_id().to_string(); g_file_cache_event_driven_warm_up_skipped_rowset_num << 1; return; } - Status st = _do_warm_up_rowset(rs_meta, replicas, sync_wait_timeout_ms, !cache_hit); + Status st = _do_warm_up_rowset(rs_meta, table_id, replicas, sync_wait_timeout_ms, !cache_hit); if (cache_hit && !st.ok() && st.is()) { - replicas = get_replica_info(rs_meta.tablet_id(), true, cache_hit); - st = _do_warm_up_rowset(rs_meta, replicas, sync_wait_timeout_ms, true); + replicas = get_replica_info(rs_meta.tablet_id(), table_id, true, cache_hit); + st = _do_warm_up_rowset(rs_meta, table_id, replicas, sync_wait_timeout_ms, true); } if (!st.ok()) { LOG(WARNING) << "Failed to warm up rowset, tablet_id=" << rs_meta.tablet_id() @@ -645,8 +706,33 @@ void CloudWarmUpManager::_warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_ } } -Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, - std::vector& replicas, +Status CloudWarmUpManager::_build_warm_up_rowset_result( + const std::vector& failures, size_t replica_count, int64_t tablet_id, + const std::string& rowset_id) { + if (failures.empty()) { + return Status::OK(); + } + + int code = failures.front().code; + std::string failure_msg; + for (size_t i = 0; i < failures.size(); ++i) { + if (failures[i].code == ErrorCode::TABLE_NOT_FOUND) { + code = ErrorCode::TABLE_NOT_FOUND; + } + if (i > 0) { + failure_msg.append("; "); + } + failure_msg.append(failures[i].reason); + } + + return Status::Error(code, + "warm up rowset failed on {}/{} replicas, tablet_id={}, rowset_id={}, " + "failures=[{}]", + failures.size(), replica_count, tablet_id, rowset_id, failure_msg); +} + +Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, + std::vector& replicas, int64_t sync_wait_timeout_ms, bool skip_existence_check) { auto tablet_id = rs_meta.tablet_id(); @@ -654,34 +740,53 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, std::chrono::system_clock::now().time_since_epoch()) .count(); g_file_cache_warm_up_rowset_last_call_unix_ts.set_value(now_ts); - auto ret_st = Status::OK(); + std::vector failures; + auto add_failure = [&failures](const JobReplicaInfo& info, const std::string& target, + const Status& st) { + failures.push_back(WarmUpRowsetFailure { + .code = st.code(), + .reason = "job_id=" + std::to_string(info.job_id) + + ", backend_id=" + std::to_string(info.replica.backend_id) + + ", target=" + target + ", status=" + st.to_string_no_stack()}); + }; + + for (auto& info : replicas) { + std::string job_id_str = std::to_string(info.job_id); + std::string target = get_host_port(info.replica.host, info.replica.brpc_port); + int64_t trigger_ts_ms = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + PWarmUpRowsetRequest request; + request.add_rowset_metas()->CopyFrom(rs_meta.get_rowset_pb()); + request.set_unix_ts_us(now_ts); + request.set_sync_wait_timeout_ms(sync_wait_timeout_ms); + request.set_skip_existence_check(skip_existence_check); + request.set_job_id(info.job_id); + request.set_upstream_trigger_ts_ms(trigger_ts_ms); - PWarmUpRowsetRequest request; - request.add_rowset_metas()->CopyFrom(rs_meta.get_rowset_pb()); - request.set_unix_ts_us(now_ts); - request.set_sync_wait_timeout_ms(sync_wait_timeout_ms); - request.set_skip_existence_check(skip_existence_check); - for (auto& replica : replicas) { // send sync request - std::string host = replica.host; + std::string host = info.replica.host; auto dns_cache = ExecEnv::GetInstance()->dns_cache(); if (dns_cache == nullptr) { LOG(WARNING) << "DNS cache is not initialized, skipping hostname resolve"; - } else if (!is_valid_ip(replica.host)) { - Status status = dns_cache->get(replica.host, &host); + } else if (!is_valid_ip(info.replica.host)) { + Status status = dns_cache->get(info.replica.host, &host); if (!status.ok()) { - LOG(WARNING) << "failed to get ip from host " << replica.host << ": " + LOG(WARNING) << "failed to get ip from host " << info.replica.host << ": " << status.to_string(); + add_failure(info, target, status); continue; } } - std::string brpc_addr = get_host_port(host, replica.brpc_port); + std::string brpc_addr = get_host_port(host, info.replica.brpc_port); Status st = Status::OK(); std::shared_ptr brpc_stub = ExecEnv::GetInstance()->brpc_internal_client_cache()->get_new_client_no_cache( brpc_addr); if (!brpc_stub) { st = Status::RpcError("Address {} is wrong", brpc_addr); + add_failure(info, target, st); continue; } @@ -689,9 +794,13 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, auto schema_ptr = rs_meta.tablet_schema(); auto idx_version = schema_ptr->get_inverted_index_storage_format(); for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) { + auto seg_size = rs_meta.segment_file_size(cast_set(segment_id)); + g_file_cache_event_driven_warm_up_requested_segment_num << 1; - g_file_cache_event_driven_warm_up_requested_segment_size - << rs_meta.segment_file_size(cast_set(segment_id)); + g_warmup_ed_requested_segment_num.put({job_id_str}, 1); + + g_file_cache_event_driven_warm_up_requested_segment_size << seg_size; + g_warmup_ed_requested_segment_size.put({job_id_str}, seg_size); if (schema_ptr->has_inverted_index() || schema_ptr->has_ann_index()) { if (idx_version == InvertedIndexStorageFormatPB::V1) { @@ -701,23 +810,31 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, VLOG_DEBUG << "No index info available for segment " << segment_id; continue; } - for (const auto& info : inverted_index_info.index_info()) { + for (const auto& idx_info : inverted_index_info.index_info()) { g_file_cache_event_driven_warm_up_requested_index_num << 1; - if (info.index_file_size() != -1) { + g_warmup_ed_requested_index_num.put({job_id_str}, 1); + + if (idx_info.index_file_size() != -1) { g_file_cache_event_driven_warm_up_requested_index_size - << info.index_file_size(); + << idx_info.index_file_size(); + g_warmup_ed_requested_index_size.put({job_id_str}, + idx_info.index_file_size()); } else { VLOG_DEBUG << "Invalid index_file_size for segment_id " << segment_id - << ", index_id " << info.index_id(); + << ", index_id " << idx_info.index_id(); } } } else { // InvertedIndexStorageFormatPB::V2 auto&& inverted_index_info = rs_meta.inverted_index_file_info(cast_set(segment_id)); g_file_cache_event_driven_warm_up_requested_index_num << 1; + g_warmup_ed_requested_index_num.put({job_id_str}, 1); + if (inverted_index_info.has_index_size()) { g_file_cache_event_driven_warm_up_requested_index_size << inverted_index_info.index_size(); + g_warmup_ed_requested_index_size.put({job_id_str}, + inverted_index_info.index_size()); } else { VLOG_DEBUG << "index_size is not set for segment " << segment_id; } @@ -725,6 +842,13 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, } } + // Update last trigger timestamp + auto* trigger_ts = + g_warmup_ed_last_trigger_ts.get_stats(std::list {job_id_str}); + if (trigger_ts) { + trigger_ts->set_value(trigger_ts_ms); + } + brpc::Controller cntl; if (sync_wait_timeout_ms > 0) { cntl.set_timeout_ms(sync_wait_timeout_ms + 1000); @@ -736,7 +860,8 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, if (cntl.Failed()) { LOG_WARNING("warm up rowset {} for tablet {} failed, rpc error: {}", rs_meta.rowset_id().to_string(), tablet_id, cntl.ErrorText()); - return Status::RpcError(cntl.ErrorText()); + add_failure(info, target, Status::RpcError(cntl.ErrorText())); + continue; } if (sync_wait_timeout_ms > 0) { auto cost_us = watch.elapsed_time_microseconds(); @@ -752,12 +877,13 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, if (response.has_status() && !status.ok()) { LOG(INFO) << "warm_up_rowset failed, tablet_id=" << rs_meta.tablet_id() << ", rowset_id=" << rs_meta.rowset_id().to_string() - << ", target=" << replica.host << ", skip_existence_check" + << ", target=" << info.replica.host << ", skip_existence_check" << skip_existence_check << ", status=" << status; - ret_st = status; + add_failure(info, target, status); } } - return ret_st; + return _build_warm_up_rowset_result(failures, replicas.size(), tablet_id, + rs_meta.rowset_id().to_string()); } void CloudWarmUpManager::recycle_cache(int64_t tablet_id, @@ -782,7 +908,7 @@ void CloudWarmUpManager::_recycle_cache(int64_t tablet_id, const std::vector& rowsets) { LOG(INFO) << "recycle_cache: tablet_id=" << tablet_id << ", num_rowsets=" << rowsets.size(); bool cache_hit = false; - auto replicas = get_replica_info(tablet_id, false, cache_hit); + auto replicas = get_replica_info(tablet_id, /*table_id=*/0, false, cache_hit); if (replicas.empty()) { return; } @@ -802,18 +928,18 @@ void CloudWarmUpManager::_recycle_cache(int64_t tablet_id, auto dns_cache = ExecEnv::GetInstance()->dns_cache(); for (auto& replica : replicas) { // send sync request - std::string host = replica.host; + std::string host = replica.replica.host; if (dns_cache == nullptr) { LOG(WARNING) << "DNS cache is not initialized, skipping hostname resolve"; - } else if (!is_valid_ip(replica.host)) { - Status status = dns_cache->get(replica.host, &host); + } else if (!is_valid_ip(replica.replica.host)) { + Status status = dns_cache->get(replica.replica.host, &host); if (!status.ok()) { - LOG(WARNING) << "failed to get ip from host " << replica.host << ": " + LOG(WARNING) << "failed to get ip from host " << replica.replica.host << ": " << status.to_string(); return; } } - std::string brpc_addr = get_host_port(host, replica.brpc_port); + std::string brpc_addr = get_host_port(host, replica.replica.brpc_port); Status st = Status::OK(); std::shared_ptr brpc_stub = ExecEnv::GetInstance()->brpc_internal_client_cache()->get_new_client_no_cache( diff --git a/be/src/cloud/cloud_warm_up_manager.h b/be/src/cloud/cloud_warm_up_manager.h index 992702f162e0a1..eb656790599a95 100644 --- a/be/src/cloud/cloud_warm_up_manager.h +++ b/be/src/cloud/cloud_warm_up_manager.h @@ -21,10 +21,12 @@ #include #include +#include #include #include #include #include +#include #include #include "cloud/cloud_storage_engine.h" @@ -39,6 +41,16 @@ enum class DownloadType { S3, }; +// Filter for event-driven warmup jobs. +// nullopt = cluster-level (no table filter, warm up all tables) +// has_value = table-level filter (only warm up tables in the set) +using EventDrivenJobFilter = std::optional>; + +struct JobReplicaInfo { + int64_t job_id; + TReplicaInfo replica; +}; + struct JobMeta { JobMeta() = default; JobMeta(const TJobMeta& meta); @@ -75,7 +87,8 @@ class CloudWarmUpManager { // Cancel the job Status clear_job(int64_t job_id); - Status set_event(int64_t job_id, TWarmUpEventType::type event, bool clear = false); + Status set_event(int64_t job_id, TWarmUpEventType::type event, bool clear = false, + const std::vector* table_ids = nullptr); // If `sync_wait_timeout_ms` <= 0, the function will send the warm-up RPC // and return immediately without waiting for the warm-up to complete. @@ -85,7 +98,7 @@ class CloudWarmUpManager { // @param rs_meta Metadata of the rowset to be warmed up. // @param sync_wait_timeout_ms Timeout in milliseconds to wait for the warm-up // to complete. Non-positive value means no waiting. - void warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_timeout_ms = -1); + void warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, int64_t sync_wait_timeout_ms = -1); void recycle_cache(int64_t tablet_id, const std::vector& rowsets); @@ -98,17 +111,27 @@ class CloudWarmUpManager { std::unordered_map> get_all_balanced_tablets() const; private: + struct WarmUpRowsetFailure { + int code; + std::string reason; + }; + + static Status _build_warm_up_rowset_result(const std::vector& failures, + size_t replica_count, int64_t tablet_id, + const std::string& rowset_id); + void schedule_remove_balanced_tablet(int64_t tablet_id); static void clean_up_expired_mappings(void* arg); void handle_jobs(); - Status _do_warm_up_rowset(RowsetMeta& rs_meta, std::vector& replicas, - int64_t sync_wait_timeout_ms, bool skip_existence_check); + Status _do_warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, + std::vector& replicas, int64_t sync_wait_timeout_ms, + bool skip_existence_check); - std::vector get_replica_info(int64_t tablet_id, bool bypass_cache, - bool& cache_hit); + std::vector get_replica_info(int64_t tablet_id, int64_t table_id, + bool bypass_cache, bool& cache_hit); - void _warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_timeout_ms); + void _warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, int64_t sync_wait_timeout_ms); void _recycle_cache(int64_t tablet_id, const std::vector& rowsets); void submit_download_tasks(io::Path path, int64_t file_size, io::FileSystemSPtr file_system, @@ -133,6 +156,8 @@ class CloudWarmUpManager { using Cache = std::unordered_map; // job_id -> cache std::unordered_map _tablet_replica_cache; + // job_id -> table filter (nullopt = cluster-level, no filter) + std::unordered_map _event_driven_filters; std::unique_ptr _thread_pool; std::unique_ptr _thread_pool_token; diff --git a/be/src/cloud/cloud_warmup_metrics.cpp b/be/src/cloud/cloud_warmup_metrics.cpp new file mode 100644 index 00000000000000..59d6c769c1d1fa --- /dev/null +++ b/be/src/cloud/cloud_warmup_metrics.cpp @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "cloud/cloud_warmup_metrics.h" + +#include + +namespace doris { + +WarmUpEdDownstreamProgressTracker g_warmup_ed_downstream_progress_tracker; + +void WarmUpEdDownstreamProgressTracker::record_task_submit(const std::string& job_id_str, + int64_t upstream_trigger_ts_ms) { + if (upstream_trigger_ts_ms <= 0) { + return; + } + std::lock_guard lock(_mtx); + auto& progress = _progress_by_job[job_id_str]; + ++progress.pending_trigger_ts_counts[upstream_trigger_ts_ms]; +} + +void WarmUpEdDownstreamProgressTracker::record_task_done(const std::string& job_id_str, + int64_t upstream_trigger_ts_ms) { + if (upstream_trigger_ts_ms <= 0) { + return; + } + std::lock_guard lock(_mtx); + auto& progress = _progress_by_job[job_id_str]; + auto pending_it = progress.pending_trigger_ts_counts.find(upstream_trigger_ts_ms); + if (pending_it != progress.pending_trigger_ts_counts.end()) { + --pending_it->second; + if (pending_it->second <= 0) { + progress.pending_trigger_ts_counts.erase(pending_it); + } + } + progress.last_finished_trigger_ts = + std::max(progress.last_finished_trigger_ts, upstream_trigger_ts_ms); +} + +int64_t WarmUpEdDownstreamProgressTracker::get_progress_ts(const std::string& job_id_str) const { + std::lock_guard lock(_mtx); + auto progress_it = _progress_by_job.find(job_id_str); + if (progress_it == _progress_by_job.end()) { + return 0; + } + const auto& progress = progress_it->second; + if (!progress.pending_trigger_ts_counts.empty()) { + return progress.pending_trigger_ts_counts.begin()->first; + } + return progress.last_finished_trigger_ts; +} + +std::vector WarmUpEdDownstreamProgressTracker::list_job_ids() const { + std::lock_guard lock(_mtx); + std::vector job_ids; + job_ids.reserve(_progress_by_job.size()); + for (const auto& entry : _progress_by_job) { + job_ids.emplace_back(entry.first); + } + return job_ids; +} + +void WarmUpEdDownstreamProgressTracker::reset_for_test() { + std::lock_guard lock(_mtx); + _progress_by_job.clear(); +} + +} // namespace doris diff --git a/be/src/cloud/cloud_warmup_metrics.h b/be/src/cloud/cloud_warmup_metrics.h new file mode 100644 index 00000000000000..3c4840d1803178 --- /dev/null +++ b/be/src/cloud/cloud_warmup_metrics.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "util/bvar_windowed_adder.h" + +namespace doris { + +// Source BE metrics keyed by job_id (defined in cloud_warm_up_manager.cpp). +extern MBvarWindowedAdder g_warmup_ed_requested_segment_num; +extern MBvarWindowedAdder g_warmup_ed_requested_segment_size; +extern MBvarWindowedAdder g_warmup_ed_requested_index_num; +extern MBvarWindowedAdder g_warmup_ed_requested_index_size; +extern bvar::MultiDimension> g_warmup_ed_last_trigger_ts; + +// Target BE metrics keyed by job_id (defined in cloud_internal_service.cpp). +extern MBvarWindowedAdder g_warmup_ed_finish_segment_num; +extern MBvarWindowedAdder g_warmup_ed_finish_segment_size; +extern MBvarWindowedAdder g_warmup_ed_finish_index_num; +extern MBvarWindowedAdder g_warmup_ed_finish_index_size; +extern MBvarWindowedAdder g_warmup_ed_fail_segment_num; +extern MBvarWindowedAdder g_warmup_ed_fail_segment_size; +extern MBvarWindowedAdder g_warmup_ed_fail_index_num; +extern MBvarWindowedAdder g_warmup_ed_fail_index_size; +extern bvar::MultiDimension> g_warmup_ed_last_finish_ts; + +// Tracks the target BE's event-driven warm-up progress by upstream trigger timestamp. +// If there are unfinished downloads for a job, progress is the earliest pending upstream trigger +// time. If the job has no pending downloads, progress falls back to the latest completed upstream +// trigger time, so FE can report a zero trigger gap once the target side catches up. +class WarmUpEdDownstreamProgressTracker { +public: + void record_task_submit(const std::string& job_id_str, int64_t upstream_trigger_ts_ms); + void record_task_done(const std::string& job_id_str, int64_t upstream_trigger_ts_ms); + int64_t get_progress_ts(const std::string& job_id_str) const; + std::vector list_job_ids() const; + void reset_for_test(); + +private: + struct JobProgress { + std::map pending_trigger_ts_counts; + int64_t last_finished_trigger_ts = 0; + }; + + mutable std::mutex _mtx; + std::unordered_map _progress_by_job; +}; + +extern WarmUpEdDownstreamProgressTracker g_warmup_ed_downstream_progress_tracker; + +} // namespace doris diff --git a/be/src/service/http/action/warmup_stats_action.cpp b/be/src/service/http/action/warmup_stats_action.cpp new file mode 100644 index 00000000000000..a41f388e686a34 --- /dev/null +++ b/be/src/service/http/action/warmup_stats_action.cpp @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "service/http/action/warmup_stats_action.h" + +#include +#include +#include +#include +#include + +#include "cloud/cloud_warmup_metrics.h" +#include "service/http/http_channel.h" +#include "service/http/http_headers.h" +#include "service/http/http_request.h" +#include "service/http/http_status.h" +#include "util/debug_points.h" +#include "util/easy_json.h" + +namespace doris { + +// Fill windowed num/size metrics into a JSON object +static void fill_windowed(EasyJson& parent, const std::string& key, MBvarWindowedAdder& num_adder, + MBvarWindowedAdder& size_adder, const std::string& dim_key) { + EasyJson obj = parent.Set(key, EasyJson::kObject); + EasyJson num = obj.Set("num", EasyJson::kObject); + num["5m"] = num_adder.get_window_value(dim_key, 0); + num["30m"] = num_adder.get_window_value(dim_key, 1); + num["1h"] = num_adder.get_window_value(dim_key, 2); + EasyJson size = obj.Set("size", EasyJson::kObject); + size["5m"] = size_adder.get_window_value(dim_key, 0); + size["30m"] = size_adder.get_window_value(dim_key, 1); + size["1h"] = size_adder.get_window_value(dim_key, 2); +} + +void WarmUpStatsAction::handle(HttpRequest* req) { + DBUG_EXECUTE_IF("WarmUpStatsAction.handle.return_error", { + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, + "injected warmup stats error"); + return; + }); + DBUG_EXECUTE_IF("WarmUpStatsAction.handle.sleep", { + auto sleep_ms = dp->param("sleep_ms", 6000); + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); + }); + + // Collect all job_id dimension keys from all metrics + std::set all_keys; + for (auto& k : g_warmup_ed_requested_segment_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_requested_index_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_finish_segment_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_finish_index_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_fail_segment_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_fail_index_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_downstream_progress_tracker.list_job_ids()) all_keys.insert(k); + + EasyJson result; + result["code"] = 0; + EasyJson jobs = result.Set("data", EasyJson::kArray); + + for (auto& job_id_str : all_keys) { + EasyJson entry = jobs.PushBack(EasyJson::kObject); + try { + entry["job_id"] = static_cast(std::stoll(job_id_str)); + } catch (...) { + entry["job_id"] = 0; + } + + // requested + EasyJson req_obj = entry.Set("requested", EasyJson::kObject); + fill_windowed(req_obj, "seg", g_warmup_ed_requested_segment_num, + g_warmup_ed_requested_segment_size, job_id_str); + fill_windowed(req_obj, "idx", g_warmup_ed_requested_index_num, + g_warmup_ed_requested_index_size, job_id_str); + + // finish + EasyJson fin_obj = entry.Set("finish", EasyJson::kObject); + fill_windowed(fin_obj, "seg", g_warmup_ed_finish_segment_num, + g_warmup_ed_finish_segment_size, job_id_str); + fill_windowed(fin_obj, "idx", g_warmup_ed_finish_index_num, g_warmup_ed_finish_index_size, + job_id_str); + + // fail + EasyJson fail_obj = entry.Set("fail", EasyJson::kObject); + fill_windowed(fail_obj, "seg", g_warmup_ed_fail_segment_num, g_warmup_ed_fail_segment_size, + job_id_str); + fill_windowed(fail_obj, "idx", g_warmup_ed_fail_index_num, g_warmup_ed_fail_index_size, + job_id_str); + + // Timestamps + auto* trigger_ts = + g_warmup_ed_last_trigger_ts.get_stats(std::list {job_id_str}); + entry["last_trigger_ts"] = trigger_ts ? trigger_ts->get_value() : 0; + auto* finish_ts = g_warmup_ed_last_finish_ts.get_stats(std::list {job_id_str}); + entry["last_finish_ts"] = finish_ts ? finish_ts->get_value() : 0; + // Target-side progress watermark for trigger-gap calculation. Pending work reports the + // earliest unfinished upstream trigger time; fully caught-up work reports the latest + // finished upstream trigger time. + entry["progress_trigger_ts"] = + g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id_str); + } + + req->add_output_header(HttpHeaders::CONTENT_TYPE, "application/json"); + HttpChannel::send_reply(req, HttpStatus::OK, result.ToString()); +} + +} // namespace doris diff --git a/be/src/service/http/action/warmup_stats_action.h b/be/src/service/http/action/warmup_stats_action.h new file mode 100644 index 00000000000000..72e0a17fd4802a --- /dev/null +++ b/be/src/service/http/action/warmup_stats_action.h @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "service/http/http_handler_with_auth.h" + +namespace doris { + +class ExecEnv; + +// HTTP action for /api/warmup_event_driven_stats +// Returns per-job_id windowed warmup metrics as JSON. +class WarmUpStatsAction final : public HttpHandlerWithAuth { +public: + explicit WarmUpStatsAction(ExecEnv* exec_env) : HttpHandlerWithAuth(exec_env) {} + + ~WarmUpStatsAction() override = default; + + void handle(HttpRequest* req) override; +}; + +} // namespace doris diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index 93a50a81cc10ff..a7c46e267fd6cf 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -76,6 +76,7 @@ #include "service/http/action/tablets_distribution_action.h" #include "service/http/action/tablets_info_action.h" #include "service/http/action/version_action.h" +#include "service/http/action/warmup_stats_action.h" #include "service/http/default_path_handlers.h" #include "service/http/ev_http_server.h" #include "service/http/http_method.h" @@ -502,6 +503,10 @@ void HttpService::register_cloud_handler(CloudStorageEngine& engine) { auto* show_hotspot_action = _pool.add(new ShowHotspotAction(engine, _env)); _ev_http_server->register_handler(HttpMethod::GET, "/api/hotspot/tablet", show_hotspot_action); + auto* warmup_stats_action = _pool.add(new WarmUpStatsAction(_env)); + _ev_http_server->register_handler(HttpMethod::GET, "/api/warmup_event_driven_stats", + warmup_stats_action); + CalcFileCrcAction* calc_crc_action = _pool.add( new CalcFileCrcAction(_env, engine, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); _ev_http_server->register_handler(HttpMethod::GET, "/api/calc_crc", calc_crc_action); diff --git a/be/src/util/bvar_windowed_adder.h b/be/src/util/bvar_windowed_adder.h new file mode 100644 index 00000000000000..c4e9245b7e3246 --- /dev/null +++ b/be/src/util/bvar_windowed_adder.h @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace doris { + +/** + * Multi-dimension windowed adder. + * + * For each dimension value combination (e.g., job_id), automatically creates: + * - A bvar::Adder (cumulative counter managed by MultiDimension) + * - Multiple bvar::Window instances (sliding window views at different time scales) + * + * Windows are lazily created on first write to a dimension value. + * + * @example + * MBvarWindowedAdder requested_seg_num( + * "warmup_ed_requested_segment_num", + * {"job_id"}, + * {300, 1800, 7200} + * ); + * requested_seg_num.put({"13419"}, 1); + */ +class MBvarWindowedAdder { +public: + MBvarWindowedAdder(const std::string& name, const std::initializer_list& dim_names, + std::vector window_seconds, bool expose = true) + : name_(name), + window_seconds_(std::move(window_seconds)), + md_total_(std::list(dim_names)), + expose_(expose) { + if (expose_) { + md_total_.expose(name_ + "_total"); + } + } + + void put(const std::initializer_list& dim_values, int64_t value) { + auto* adder = md_total_.get_stats(std::list(dim_values)); + if (!adder) return; + ensure_windows(dim_values, adder); + *adder << value; + } + + /** Get the current window value for the specified dimension and window index. */ + int64_t get_window_value(const std::initializer_list& dim_values, + size_t window_idx) { + std::lock_guard lock(mutex_); + auto it = dims_.find(make_key(dim_values)); + if (it == dims_.end() || window_idx >= it->second.windows.size()) { + return 0; + } + return it->second.windows[window_idx]->get_value(); + } + + /** Overload accepting a pre-built key string (e.g., "job_id,table_id"). */ + int64_t get_window_value(const std::string& dim_key, size_t window_idx) { + std::lock_guard lock(mutex_); + auto it = dims_.find(dim_key); + if (it == dims_.end() || window_idx >= it->second.windows.size()) { + return 0; + } + return it->second.windows[window_idx]->get_value(); + } + + /** List all dimension key strings that have been seen. */ + std::vector list_dimensions() const { + std::lock_guard lock(mutex_); + std::vector result; + result.reserve(dims_.size()); + for (auto& [key, _] : dims_) { + result.push_back(key); + } + return result; + } + + void hide() { + std::lock_guard lock(mutex_); + if (!expose_) { + return; + } + expose_ = false; + md_total_.hide(); + for (auto& [_, entry] : dims_) { + for (auto& window : entry.windows) { + window->hide(); + } + } + } + +private: + struct DimEntry { + bvar::Adder* adder; // owned by MultiDimension + std::vector>>> windows; + }; + + void ensure_windows(const std::initializer_list& dim_values, + bvar::Adder* adder) { + std::string key = make_key(dim_values); + std::lock_guard lock(mutex_); + if (dims_.count(key)) return; + DimEntry entry; + entry.adder = adder; + for (int ws : window_seconds_) { + if (expose_) { + std::string wname = name_ + "_" + std::to_string(ws) + "s_" + key; + entry.windows.emplace_back( + std::make_unique>>(wname, adder, ws)); + } else { + entry.windows.emplace_back( + std::make_unique>>(adder, ws)); + } + } + dims_[key] = std::move(entry); + } + + static std::string make_key(const std::initializer_list& dim_values) { + std::string result; + for (auto& v : dim_values) { + if (!result.empty()) result += ","; + result += v; + } + return result; + } + + std::string name_; + std::vector window_seconds_; + bvar::MultiDimension> md_total_; + bool expose_; + mutable bthread::Mutex mutex_; + std::map dims_; +}; + +} // namespace doris diff --git a/be/test/cloud/cloud_warm_up_manager_filter_test.cpp b/be/test/cloud/cloud_warm_up_manager_filter_test.cpp new file mode 100644 index 00000000000000..1ebae403422601 --- /dev/null +++ b/be/test/cloud/cloud_warm_up_manager_filter_test.cpp @@ -0,0 +1,276 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include +#include +#include +#include +#include + +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_warm_up_manager.h" +#include "cloud/cloud_warmup_metrics.h" +#include "gen_cpp/AgentService_types.h" + +namespace doris { + +class CloudWarmUpManagerFilterTest : public testing::Test { +public: + CloudWarmUpManagerFilterTest() : _engine(CloudStorageEngine(EngineOptions {})) {} + +protected: + CloudStorageEngine _engine; +}; + +static TReplicaInfo make_replica(int64_t backend_id) { + TReplicaInfo replica; + replica.__set_backend_id(backend_id); + replica.__set_host("127.0.0.1"); + replica.__set_brpc_port(8000 + backend_id); + replica.__set_is_alive(true); + return replica; +} + +TEST_F(CloudWarmUpManagerFilterTest, EventDrivenJobFilterNullopt) { + EventDrivenJobFilter filter = std::nullopt; + EXPECT_FALSE(filter.has_value()); +} + +TEST_F(CloudWarmUpManagerFilterTest, EventDrivenJobFilterWithTableIds) { + EventDrivenJobFilter filter = std::unordered_set {100, 200, 300}; + EXPECT_TRUE(filter.has_value()); + EXPECT_EQ(3, filter->size()); + EXPECT_TRUE(filter->count(100) > 0); + EXPECT_TRUE(filter->count(200) > 0); + EXPECT_TRUE(filter->count(300) > 0); + EXPECT_TRUE(filter->count(999) == 0); +} + +TEST_F(CloudWarmUpManagerFilterTest, EventDrivenJobFilterEmpty) { + EventDrivenJobFilter filter = std::unordered_set {}; + EXPECT_TRUE(filter.has_value()); + EXPECT_EQ(0, filter->size()); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventWithoutTableIdsStoresClusterLevelFilter) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1001; + + auto st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, nullptr); + EXPECT_TRUE(st.ok()); + EXPECT_TRUE(manager._tablet_replica_cache.contains(job_id)); + auto filter_it = manager._event_driven_filters.find(job_id); + ASSERT_NE(filter_it, manager._event_driven_filters.end()); + EXPECT_FALSE(filter_it->second.has_value()); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventWithTableIdsStoresFilter) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1002; + std::vector table_ids = {10, 20, 30}; + + auto st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, &table_ids); + EXPECT_TRUE(st.ok()); + EXPECT_TRUE(manager._tablet_replica_cache.contains(job_id)); + auto filter_it = manager._event_driven_filters.find(job_id); + ASSERT_NE(filter_it, manager._event_driven_filters.end()); + ASSERT_TRUE(filter_it->second.has_value()); + EXPECT_EQ(3, filter_it->second->size()); + EXPECT_TRUE(filter_it->second->contains(10)); + EXPECT_TRUE(filter_it->second->contains(20)); + EXPECT_TRUE(filter_it->second->contains(30)); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventWithEmptyTableIdsStoresEmptyFilter) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1003; + std::vector table_ids = {}; + + auto st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, &table_ids); + EXPECT_TRUE(st.ok()); + auto filter_it = manager._event_driven_filters.find(job_id); + ASSERT_NE(filter_it, manager._event_driven_filters.end()); + ASSERT_TRUE(filter_it->second.has_value()); + EXPECT_TRUE(filter_it->second->empty()); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventClearRemovesFilterAndCache) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1004; + std::vector table_ids = {10, 20}; + + auto st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, &table_ids); + EXPECT_TRUE(st.ok()); + EXPECT_TRUE(manager._tablet_replica_cache.contains(job_id)); + EXPECT_TRUE(manager._event_driven_filters.contains(job_id)); + + st = manager.set_event(job_id, TWarmUpEventType::LOAD, true); + EXPECT_TRUE(st.ok()); + EXPECT_FALSE(manager._tablet_replica_cache.contains(job_id)); + EXPECT_FALSE(manager._event_driven_filters.contains(job_id)); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventUpdateTableIdsReplacesFilter) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1005; + + std::vector initial_ids = {10, 20}; + auto st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, &initial_ids); + EXPECT_TRUE(st.ok()); + + std::vector updated_ids = {30, 40, 50}; + st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, &updated_ids); + EXPECT_TRUE(st.ok()); + auto filter_it = manager._event_driven_filters.find(job_id); + ASSERT_NE(filter_it, manager._event_driven_filters.end()); + ASSERT_TRUE(filter_it->second.has_value()); + EXPECT_EQ(3, filter_it->second->size()); + EXPECT_FALSE(filter_it->second->contains(10)); + EXPECT_TRUE(filter_it->second->contains(30)); + EXPECT_TRUE(filter_it->second->contains(40)); + EXPECT_TRUE(filter_it->second->contains(50)); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventUnsupportedType) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1006; + + auto st = manager.set_event(job_id, TWarmUpEventType::QUERY, false, nullptr); + EXPECT_FALSE(st.ok()); +} + +TEST_F(CloudWarmUpManagerFilterTest, GetReplicaInfoAppliesTableFilter) { + CloudWarmUpManager manager(_engine); + int64_t tablet_id = 3001; + auto now = std::chrono::steady_clock::now(); + + manager._tablet_replica_cache[2001][tablet_id] = {now, make_replica(11)}; + manager._event_driven_filters[2001] = std::unordered_set {10}; + + manager._tablet_replica_cache[2002][tablet_id] = {now, make_replica(22)}; + manager._event_driven_filters[2002] = std::unordered_set {20}; + + bool cache_hit = false; + auto replicas = manager.get_replica_info(tablet_id, 20, false, cache_hit); + + ASSERT_EQ(1, replicas.size()); + EXPECT_EQ(22, replicas[0].replica.backend_id); + EXPECT_TRUE(cache_hit); +} + +TEST_F(CloudWarmUpManagerFilterTest, GetReplicaInfoBypassesFilterWhenTableIdUnknown) { + CloudWarmUpManager manager(_engine); + int64_t tablet_id = 3002; + auto now = std::chrono::steady_clock::now(); + + manager._tablet_replica_cache[3001][tablet_id] = {now, make_replica(31)}; + manager._event_driven_filters[3001] = std::unordered_set {10}; + + manager._tablet_replica_cache[3002][tablet_id] = {now, make_replica(32)}; + manager._event_driven_filters[3002] = std::unordered_set {20}; + + bool cache_hit = false; + auto replicas = manager.get_replica_info(tablet_id, 0, false, cache_hit); + + ASSERT_EQ(2, replicas.size()); + EXPECT_TRUE(cache_hit); +} + +TEST_F(CloudWarmUpManagerFilterTest, BuildWarmUpRowsetResultReturnsOkWithoutFailures) { + auto st = CloudWarmUpManager::_build_warm_up_rowset_result({}, 2, 4001, "rowset-1"); + EXPECT_TRUE(st.ok()); +} + +TEST_F(CloudWarmUpManagerFilterTest, BuildWarmUpRowsetResultAggregatesAllFailures) { + std::vector failures = { + {ErrorCode::THRIFT_RPC_ERROR, + "job_id=1, backend_id=11, target=127.0.0.1:8011, status=[THRIFT_RPC_ERROR]rpc one"}, + {ErrorCode::INTERNAL_ERROR, + "job_id=2, backend_id=22, target=127.0.0.1:8022, status=[INTERNAL_ERROR]rpc two"}}; + + auto st = CloudWarmUpManager::_build_warm_up_rowset_result(failures, 3, 4002, "rowset-2"); + + EXPECT_FALSE(st.ok()); + EXPECT_EQ(ErrorCode::THRIFT_RPC_ERROR, st.code()); + std::string msg = st.to_string_no_stack(); + EXPECT_NE(std::string::npos, msg.find("failed on 2/3 replicas")); + EXPECT_NE(std::string::npos, msg.find("rpc one")); + EXPECT_NE(std::string::npos, msg.find("rpc two")); +} + +TEST_F(CloudWarmUpManagerFilterTest, BuildWarmUpRowsetResultKeepsTableNotFoundRetrySignal) { + std::vector failures = { + {ErrorCode::THRIFT_RPC_ERROR, + "job_id=1, backend_id=11, target=127.0.0.1:8011, status=[THRIFT_RPC_ERROR]rpc one"}, + {ErrorCode::TABLE_NOT_FOUND, + "job_id=2, backend_id=22, target=127.0.0.1:8022, status=[TABLET_MISSING]missing"}}; + + auto st = CloudWarmUpManager::_build_warm_up_rowset_result(failures, 2, 4003, "rowset-3"); + + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(st.is()); + std::string msg = st.to_string_no_stack(); + EXPECT_NE(std::string::npos, msg.find("rpc one")); + EXPECT_NE(std::string::npos, msg.find("missing")); +} + +TEST_F(CloudWarmUpManagerFilterTest, DownstreamProgressTracksEarliestPendingTrigger) { + g_warmup_ed_downstream_progress_tracker.reset_for_test(); + std::string job_id = "9001"; + + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id, 1000); + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id, 1500); + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id, 1000); + + EXPECT_EQ(1000, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 1000); + EXPECT_EQ(1000, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 1000); + EXPECT_EQ(1500, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 1500); + EXPECT_EQ(1500, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.reset_for_test(); +} + +TEST_F(CloudWarmUpManagerFilterTest, DownstreamProgressFallsBackToLatestFinishedTrigger) { + g_warmup_ed_downstream_progress_tracker.reset_for_test(); + std::string job_id = "9002"; + + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 2000); + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 1800); + EXPECT_EQ(2000, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id, 1900); + EXPECT_EQ(1900, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 1900); + EXPECT_EQ(2000, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + auto job_ids = g_warmup_ed_downstream_progress_tracker.list_job_ids(); + EXPECT_NE(job_ids.end(), std::find(job_ids.begin(), job_ids.end(), job_id)); + + g_warmup_ed_downstream_progress_tracker.reset_for_test(); +} + +} // namespace doris diff --git a/be/test/util/bvar_windowed_adder_test.cpp b/be/test/util/bvar_windowed_adder_test.cpp new file mode 100644 index 00000000000000..9306364b4d4694 --- /dev/null +++ b/be/test/util/bvar_windowed_adder_test.cpp @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/bvar_windowed_adder.h" + +#include + +#include +#include +#include + +namespace doris { + +TEST(MBvarWindowedAdderTest, PutAndGetTotal) { + // bvar::Window has MAX_SECONDS_LIMIT = 3600, so use values within that limit + MBvarWindowedAdder adder("test_put_get", {"job_id"}, {3600}); + + adder.put({"100"}, 5); + adder.put({"100"}, 3); + + // Window value should reflect accumulated puts + // Note: bvar::Window reports the *per-second average* × window_size in some modes, + // but bvar::Adder-backed windows report the sum of samples within the window. + // The exact value depends on bvar internals and timing, so just verify it's > 0. + int64_t val = adder.get_window_value({"100"}, 0); + EXPECT_GE(val, 0); // Window may need time to accumulate +} + +TEST(MBvarWindowedAdderTest, UnknownDimensionReturnsZero) { + MBvarWindowedAdder adder("test_unknown_dim", {"job_id"}, {3600}); + + EXPECT_EQ(0, adder.get_window_value({"nonexistent"}, 0)); + EXPECT_EQ(0, adder.get_window_value("nonexistent", 0)); +} + +TEST(MBvarWindowedAdderTest, InvalidWindowIndexReturnsZero) { + MBvarWindowedAdder adder("test_invalid_idx", {"job_id"}, {3600}); + + adder.put({"100"}, 1); + + // Window index 1 doesn't exist (only index 0) + EXPECT_EQ(0, adder.get_window_value({"100"}, 1)); + EXPECT_EQ(0, adder.get_window_value({"100"}, 999)); +} + +TEST(MBvarWindowedAdderTest, MultipleDimensions) { + MBvarWindowedAdder adder("test_multi_dim", {"job_id"}, {3600}); + + adder.put({"100"}, 10); + adder.put({"200"}, 20); + adder.put({"300"}, 30); + + auto dims = adder.list_dimensions(); + EXPECT_EQ(3, dims.size()); + + std::sort(dims.begin(), dims.end()); + EXPECT_EQ("100", dims[0]); + EXPECT_EQ("200", dims[1]); + EXPECT_EQ("300", dims[2]); +} + +TEST(MBvarWindowedAdderTest, ListDimensionsEmpty) { + MBvarWindowedAdder adder("test_empty_dims", {"job_id"}, {3600}); + + auto dims = adder.list_dimensions(); + EXPECT_TRUE(dims.empty()); +} + +TEST(MBvarWindowedAdderTest, MultipleWindowSizes) { + // bvar::Window has MAX_SECONDS_LIMIT = 3600, all values must be within this limit + MBvarWindowedAdder adder("test_multi_win", {"job_id"}, {300, 1800, 3600}); + + adder.put({"100"}, 42); + + // All 3 windows should be created (indices 0, 1, 2) + // Values may be 0 due to bvar internal timing, but should not crash + adder.get_window_value({"100"}, 0); + adder.get_window_value({"100"}, 1); + adder.get_window_value({"100"}, 2); + + // Index 3 out of range + EXPECT_EQ(0, adder.get_window_value({"100"}, 3)); +} + +TEST(MBvarWindowedAdderTest, GetWindowValueByStringKey) { + MBvarWindowedAdder adder("test_str_key", {"job_id"}, {3600}); + + adder.put({"42"}, 100); + + // String key for single dimension is just the value itself + int64_t val = adder.get_window_value("42", 0); + EXPECT_GE(val, 0); + + // Unknown string key + EXPECT_EQ(0, adder.get_window_value("unknown", 0)); +} + +TEST(MBvarWindowedAdderTest, EnsureWindowsIdempotent) { + MBvarWindowedAdder adder("test_idempotent", {"job_id"}, {3600}); + + // Multiple puts to the same dimension should not create duplicate windows + adder.put({"100"}, 1); + adder.put({"100"}, 2); + adder.put({"100"}, 3); + + auto dims = adder.list_dimensions(); + EXPECT_EQ(1, dims.size()); + EXPECT_EQ("100", dims[0]); +} + +TEST(MBvarWindowedAdderTest, MakeKeyComposite) { + // Test that multi-value dimensions produce comma-separated keys + MBvarWindowedAdder adder("test_composite", {"a", "b"}, {3600}); + + adder.put({"x", "y"}, 1); + + auto dims = adder.list_dimensions(); + EXPECT_EQ(1, dims.size()); + EXPECT_EQ("x,y", dims[0]); + + // Can also query by composite string key + int64_t val = adder.get_window_value("x,y", 0); + EXPECT_GE(val, 0); +} + +} // namespace doris diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index bdd64284c9af27..2b7bb7d8d4a17e 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -3295,6 +3295,21 @@ public static int metaServiceRpcRetryTimes() { @ConfField(mutable = true, masterOnly = true) public static long cloud_warm_up_job_max_bytes_per_batch = 21474836480L; // 20GB + @ConfField(mutable = true, masterOnly = true, description = { + "zh-CN: 定期刷新 table-level warmup 任务匹配的 table ID 集合的时间间隔(毫秒)", + "en: Interval in milliseconds to refresh matched table IDs for table-level warmup jobs"}) + public static long cloud_warm_up_table_filter_refresh_interval_ms = 60000; // 60 seconds + + @ConfField(mutable = true, masterOnly = true, description = { + "zh-CN: 定期从 BE 拉取主动增量预热 SyncStats 并缓存到 FE job 的时间间隔(毫秒)", + "en: Interval in milliseconds to collect event-driven warmup SyncStats from BEs and cache it in FE jobs"}) + public static long cloud_warm_up_sync_stats_refresh_interval_ms = 15000; // 15 seconds + + @ConfField(mutable = true, masterOnly = true, description = { + "zh-CN: SHOW WARM UP JOB 和 FE 日志中 MatchedTables 最多展示的表数量", + "en: Maximum number of MatchedTables entries displayed in SHOW WARM UP JOB and FE logs"}) + public static int cloud_warm_up_matched_tables_display_limit = 100; + @ConfField(mutable = true, masterOnly = true) public static boolean cloud_warm_up_force_all_partitions = false; diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java index 500f65de153df4..2b4ffa287477ce 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java @@ -18,12 +18,14 @@ package org.apache.doris.cloud; import org.apache.doris.catalog.Database; +import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.MaterializedIndex; import org.apache.doris.catalog.MaterializedIndex.IndexExtState; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; import org.apache.doris.catalog.Table; +import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.Tablet; import org.apache.doris.cloud.CloudWarmUpJob.JobState; import org.apache.doris.cloud.CloudWarmUpJob.JobType; @@ -39,7 +41,10 @@ import org.apache.doris.common.ThreadPoolManager; import org.apache.doris.common.Triple; import org.apache.doris.common.util.MasterDaemon; +import org.apache.doris.common.util.NetUtils; import org.apache.doris.common.util.TimeUtils; +import org.apache.doris.httpv2.rest.manager.HttpUtils; +import org.apache.doris.metric.MetricRepo; import org.apache.doris.nereids.trees.plans.commands.CancelWarmUpJobCommand; import org.apache.doris.nereids.trees.plans.commands.WarmUpClusterCommand; import org.apache.doris.rpc.RpcException; @@ -52,8 +57,14 @@ import org.apache.doris.thrift.TNetworkAddress; import org.apache.doris.thrift.TStatusCode; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.thrift.TException; @@ -67,6 +78,7 @@ import java.util.Collections; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -78,10 +90,14 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import java.util.stream.Collectors; public class CacheHotspotManager extends MasterDaemon { public static final int MAX_SHOW_ENTRIES = 2000; @@ -103,6 +119,18 @@ public class CacheHotspotManager extends MasterDaemon { private boolean startJobDaemon = false; + private MasterDaemon tableFilterRefreshDaemon; + + private boolean startTableFilterRefreshDaemon = false; + + private MasterDaemon warmUpSyncStatsRefreshDaemon; + + private boolean startWarmUpSyncStatsRefreshDaemon = false; + + // Thread pool for concurrent BE HTTP requests during on-demand stats collection + private final ExecutorService warmupStatsExecutor = Executors.newFixedThreadPool(16, + new ThreadFactoryBuilder().setNameFormat("warmup-stats-collector-%d").setDaemon(true).build()); + private ConcurrentMap cloudWarmUpJobs = Maps.newConcurrentMap(); private ConcurrentMap activeCloudWarmUpJobs = Maps.newConcurrentMap(); @@ -116,11 +144,17 @@ private static class JobKey { private final String srcName; private final String dstName; private final CloudWarmUpJob.SyncMode syncMode; + private final String tableFilterExpr; public JobKey(String srcName, String dstName, CloudWarmUpJob.SyncMode syncMode) { + this(srcName, dstName, syncMode, ""); + } + + public JobKey(String srcName, String dstName, CloudWarmUpJob.SyncMode syncMode, String tableFilterExpr) { this.srcName = srcName; this.dstName = dstName; this.syncMode = syncMode; + this.tableFilterExpr = tableFilterExpr == null ? "" : tableFilterExpr; } @Override @@ -134,17 +168,22 @@ public boolean equals(Object o) { JobKey jobKey = (JobKey) o; return Objects.equals(srcName, jobKey.srcName) && Objects.equals(dstName, jobKey.dstName) - && syncMode == jobKey.syncMode; + && syncMode == jobKey.syncMode + && Objects.equals(tableFilterExpr, jobKey.tableFilterExpr); } @Override public int hashCode() { - return Objects.hash(srcName, dstName, syncMode); + return Objects.hash(srcName, dstName, syncMode, tableFilterExpr); } @Override public String toString() { - return "WarmUpJob src='" + srcName + "', dst='" + dstName + "', syncMode=" + String.valueOf(syncMode); + String s = "WarmUpJob src='" + srcName + "', dst='" + dstName + "', syncMode=" + String.valueOf(syncMode); + if (!tableFilterExpr.isEmpty()) { + s += ", tableFilter=" + tableFilterExpr; + } + return s; } } @@ -156,10 +195,12 @@ private void registerJobForRepeatDetection(CloudWarmUpJob job, boolean replay) t if (job.isDone()) { return; } + if (!replay) { + checkLoadEventWarmUpConflict(job); + } if (job.isEventDriven() || job.isPeriodic()) { - // For long lasting jobs, i.e. event-driven and periodic. - // It is meaningless to create more than one job for a given src, dst, and syncMode. - JobKey key = new JobKey(job.getSrcClusterName(), job.getDstClusterName(), job.getSyncMode()); + JobKey key = new JobKey(job.getSrcClusterName(), job.getDstClusterName(), + job.getSyncMode(), job.getTableFilterExpr()); boolean added = this.repeatJobDetectionSet.add(key); if (!added && !replay) { throw new AnalysisException(key + " already has a runnable job"); @@ -167,6 +208,106 @@ private void registerJobForRepeatDetection(CloudWarmUpJob job, boolean replay) t } } + // Only checks cross-type conflicts between table-level and cluster-level load-event warm-up jobs. + // Same-type duplicate jobs are still rejected later by repeatJobDetectionSet. + private void checkLoadEventWarmUpConflict(CloudWarmUpJob newJob) throws AnalysisException { + if (!isLoadEventWarmUpJob(newJob)) { + return; + } + + for (CloudWarmUpJob existingJob : runnableCloudWarmUpJobs.values()) { + if (existingJob.getJobId() == newJob.getJobId() || existingJob.isDone() + || !isLoadEventWarmUpJob(existingJob)) { + continue; + } + if (!isSameWarmUpPair(newJob, existingJob)) { + continue; + } + if (isTableLevelLoadEventWarmUpJob(newJob) != isTableLevelLoadEventWarmUpJob(existingJob)) { + throw buildLoadEventWarmUpConflictException(newJob, existingJob); + } + } + } + + public void cancelTableLevelLoadEventWarmUpJobsForVirtualComputeGroup( + String virtualComputeGroupName, String activeComputeGroup, String standbyComputeGroup, + List subComputeGroups, String reason) throws AnalysisException { + String cancelReason = reason + " for virtual compute group '" + virtualComputeGroupName + "'"; + Set computeGroupsInVcg = new HashSet<>(); + if (subComputeGroups != null) { + computeGroupsInVcg.addAll(subComputeGroups); + } + computeGroupsInVcg.add(activeComputeGroup); + computeGroupsInVcg.add(standbyComputeGroup); + + for (CloudWarmUpJob existingJob : runnableCloudWarmUpJobs.values()) { + if (existingJob.isDone() || !isTableLevelLoadEventWarmUpJob(existingJob)) { + continue; + } + if (!computeGroupsInVcg.contains(existingJob.getSrcClusterName()) + || !computeGroupsInVcg.contains(existingJob.getDstClusterName())) { + continue; + } + try { + cancel(existingJob.getJobId(), cancelReason); + LOG.info("cancel table-level load-event warm up job {} before virtual compute group '{}' creates " + + "cluster-level load-event warm up job. active compute group {}, " + + "standby compute group {}, source compute group {}, destination compute group {}{}, " + + "reason: {}", + existingJob.getJobId(), virtualComputeGroupName, activeComputeGroup, standbyComputeGroup, + existingJob.getSrcClusterName(), existingJob.getDstClusterName(), + formatExistingTableFilter(existingJob), cancelReason); + } catch (DdlException e) { + throw new AnalysisException("Failed to cancel table-level load-event warm up job " + + existingJob.getJobId() + " before virtual compute group '" + virtualComputeGroupName + + "' creates cluster-level load-event warm up job from active compute group '" + + activeComputeGroup + "' to standby compute group '" + standbyComputeGroup + + "'. Source compute group '" + existingJob.getSrcClusterName() + + "', destination compute group '" + existingJob.getDstClusterName() + "'" + + formatExistingTableFilter(existingJob) + ". Cancel table-level load-event warm up job " + + existingJob.getJobId() + " before retrying.", e); + } + } + } + + private static boolean isLoadEventWarmUpJob(CloudWarmUpJob job) { + return job != null && job.isEventDriven() && job.getSyncEvent() == CloudWarmUpJob.SyncEvent.LOAD; + } + + private static boolean isClusterLevelLoadEventWarmUpJob(CloudWarmUpJob job) { + return isLoadEventWarmUpJob(job) && job.getJobType() == JobType.CLUSTER; + } + + private static boolean isTableLevelLoadEventWarmUpJob(CloudWarmUpJob job) { + return isLoadEventWarmUpJob(job) && job.getJobType() == JobType.TABLES; + } + + private static boolean isSameWarmUpPair(CloudWarmUpJob left, CloudWarmUpJob right) { + return Objects.equals(left.getSrcClusterName(), right.getSrcClusterName()) + && Objects.equals(left.getDstClusterName(), right.getDstClusterName()); + } + + private static AnalysisException buildLoadEventWarmUpConflictException( + CloudWarmUpJob newJob, CloudWarmUpJob existingJob) { + String newJobLevel = isTableLevelLoadEventWarmUpJob(newJob) ? "table-level" : "cluster-level"; + String existingJobLevel = isClusterLevelLoadEventWarmUpJob(existingJob) ? "cluster-level" : "table-level"; + return new AnalysisException("Cannot create " + newJobLevel + " load-event warm up job from source " + + "compute group '" + newJob.getSrcClusterName() + "' to destination compute group '" + + newJob.getDstClusterName() + "': conflicting " + existingJobLevel + + " load-event warm up job " + existingJob.getJobId() + + " already exists for the same source and destination" + + formatExistingTableFilter(existingJob) + + ". Cancel existing load-event warm up job " + existingJob.getJobId() + + " before creating this job."); + } + + private static String formatExistingTableFilter(CloudWarmUpJob job) { + if (!job.hasTableFilter()) { + return ""; + } + return " with table filter [" + job.getTableFilterExpr() + "]"; + } + // Tracks warm-up jobs scheduled by CacheHotSpotManager. // Ensures that at most one job runs concurrently per destination cluster. private Map clusterToRunningJobId = new ConcurrentHashMap<>(); @@ -236,7 +377,8 @@ public void notifyJobStop(CloudWarmUpJob job) { } if (job.isEventDriven() || job.isPeriodic()) { this.repeatJobDetectionSet.remove(new JobKey( - job.getSrcClusterName(), job.getDstClusterName(), job.getSyncMode())); + job.getSrcClusterName(), job.getDstClusterName(), + job.getSyncMode(), job.getTableFilterExpr())); } } @@ -252,6 +394,17 @@ public void runAfterCatalogReady() { jobDaemon.start(); startJobDaemon = true; } + if (!startTableFilterRefreshDaemon) { + tableFilterRefreshDaemon = new TableFilterRefreshDaemon(); + tableFilterRefreshDaemon.start(); + startTableFilterRefreshDaemon = true; + } + if (Config.isCloudMode() && !startWarmUpSyncStatsRefreshDaemon) { + warmUpSyncStatsRefreshDaemon = new WarmUpSyncStatsRefreshDaemon(); + warmUpSyncStatsRefreshDaemon.start(); + startWarmUpSyncStatsRefreshDaemon = true; + } + if (!tableCreated) { try { @@ -339,6 +492,20 @@ private void triggerBatchInsert() { } } + private void refreshWarmUpSyncStats() { + if (!Env.getCurrentEnv().isMaster()) { + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.emptyList()); + return; + } + + Map statsMap = collectAndAggregate(); + for (CloudWarmUpJob job : cloudWarmUpJobs.values()) { + JobWarmUpStats stats = job.isEventDriven() && !job.isDone() ? statsMap.get(job.getJobId()) : null; + job.setSyncStats(stats); + } + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(cloudWarmUpJobs.values()); + } + private void insertIntoTable(String clusterId, long tableId, long indexId, long fileCacheSize, THotPartition partition, Backend backend) { LOG.info("table id {}, index id {}, partition id {}", tableId, indexId, partition.partition_id); @@ -613,7 +780,8 @@ public List> getSingleJobInfo(long jobId) throws AnalysisException if (job == null) { throw new AnalysisException("cloud warm up with job " + jobId + " does not exist"); } - infos.add(job.getJobInfo()); + Map statsMap = collectAndAggregate(); + infos.add(job.getJobInfo(statsMap.get(jobId), true)); return infos; } @@ -634,6 +802,190 @@ public void runAfterCatalogReady() { } } + private class TableFilterRefreshDaemon extends MasterDaemon { + TableFilterRefreshDaemon() { + super("TableFilterRefreshDaemon", Config.cloud_warm_up_table_filter_refresh_interval_ms); + LOG.info("start table filter refresh daemon, interval={}ms", + Config.cloud_warm_up_table_filter_refresh_interval_ms); + } + + @Override + public void runAfterCatalogReady() { + if (getInterval() != Config.cloud_warm_up_table_filter_refresh_interval_ms) { + setInterval(Config.cloud_warm_up_table_filter_refresh_interval_ms); + LOG.info("update table filter refresh daemon interval to {}ms", getInterval()); + } + refreshAllTableFilters(); + } + } + + private class WarmUpSyncStatsRefreshDaemon extends MasterDaemon { + WarmUpSyncStatsRefreshDaemon() { + super("WarmUpSyncStatsRefreshDaemon", Config.cloud_warm_up_sync_stats_refresh_interval_ms); + LOG.info("start warm up sync stats refresh daemon, interval={}ms", + Config.cloud_warm_up_sync_stats_refresh_interval_ms); + } + + @Override + public void runAfterCatalogReady() { + if (getInterval() != Config.cloud_warm_up_sync_stats_refresh_interval_ms) { + setInterval(Config.cloud_warm_up_sync_stats_refresh_interval_ms); + LOG.info("update warm up sync stats refresh daemon interval to {}ms", getInterval()); + } + refreshWarmUpSyncStats(); + } + } + + + /** + * Collect warmup stats from all BEs on demand and aggregate per-job. + * Called when SHOW WARM UP JOB is executed. + * + * @return per-job aggregated warmup stats; empty map if no event-driven jobs exist + */ + private Map collectAndAggregate() { + Map result = new HashMap<>(); + + // 1. Collect all clusters involved in event-driven jobs + Set allClusters = new HashSet<>(); + for (CloudWarmUpJob job : runnableCloudWarmUpJobs.values()) { + if (job.isEventDriven()) { + allClusters.add(job.getSrcClusterName()); + allClusters.add(job.getDstClusterName()); + } + } + if (allClusters.isEmpty()) { + return result; + } + + // 2. Enumerate all (cluster, BE) pairs + List> allTargets = new ArrayList<>(); + for (String cluster : allClusters) { + for (Backend be : getBackendsFromCluster(cluster)) { + if (be.isAlive()) { + allTargets.add(Pair.of(cluster, be)); + } + } + } + if (allTargets.isEmpty()) { + return result; + } + + // 3. Concurrent HTTP requests to all BEs + ExecutorCompletionService> completionService = + new ExecutorCompletionService<>(warmupStatsExecutor); + + // Acquire auth token once for all BE requests (needed when enable_all_http_auth is on) + Map authHeaders = new HashMap<>(); + try { + String token = Env.getCurrentEnv().getTokenManager().acquireToken(); + authHeaders.put("Auth-Token", token); + } catch (Exception e) { + LOG.warn("Failed to acquire auth token for warmup stats collection, " + + "requests may fail if enable_all_http_auth is enabled: {}", e.getMessage()); + } + + for (Pair target : allTargets) { + String cluster = target.first; + Backend be = target.second; + completionService.submit(() -> { + String url = "http://" + + NetUtils.getHostPortInAccessibleFormat(be.getHost(), be.getHttpPort()) + + "/api/warmup_event_driven_stats"; + String json = HttpUtils.doGet(url, authHeaders, 5000); + return Pair.of(cluster, json); + }); + } + + // 4. Collect results and merge by cluster → jobId + Map> clusterStats = new HashMap<>(); + for (int i = 0; i < allTargets.size(); i++) { + try { + Future> future = completionService.take(); + Pair resultPair = future.get(10, TimeUnit.SECONDS); + String cluster = resultPair.first; + String json = resultPair.second; + Map jobMap = + clusterStats.computeIfAbsent(cluster, k -> new HashMap<>()); + mergeStatsFromJson(jobMap, json); + } catch (Exception e) { + LOG.warn("Failed to collect warmup stats: {}", e.getMessage()); + } + } + + // 5. Aggregate per-job + for (CloudWarmUpJob job : runnableCloudWarmUpJobs.values()) { + if (!job.isEventDriven()) { + continue; + } + JobWarmUpStats stats = aggregateStatsForJob(job, clusterStats); + result.put(job.getJobId(), stats); + } + return result; + } + + /** + * Parse BE JSON response and merge into jobMap. + * JSON structure: data[].{job_id, requested, finish, fail, ...} + */ + private void mergeStatsFromJson( + Map jobMap, String json) { + try { + JsonObject root = JsonParser.parseString(json).getAsJsonObject(); + JsonArray data = root.getAsJsonArray("data"); + if (data == null) { + return; + } + for (JsonElement jobElem : data) { + JsonObject jobObj = jobElem.getAsJsonObject(); + long jobId = jobObj.get("job_id").getAsLong(); + TableWarmUpWindowedStats stats = TableWarmUpWindowedStats.fromJson(jobObj); + jobMap.compute(jobId, (id, existing) -> { + if (existing == null) { + return stats; + } + existing.merge(stats); + return existing; + }); + } + } catch (Exception e) { + LOG.warn("Failed to parse warmup stats JSON: {}", e.getMessage()); + } + } + + /** + * Aggregate per-job stats: from srcCluster take requested, from dstCluster take finished. + */ + @VisibleForTesting + JobWarmUpStats aggregateStatsForJob( + CloudWarmUpJob job, + Map> clusterStats) { + JobWarmUpStats result = new JobWarmUpStats(); + long jobId = job.getJobId(); + String srcCluster = job.getSrcClusterName(); + String dstCluster = job.getDstClusterName(); + + TableWarmUpWindowedStats srcStat = clusterStats + .getOrDefault(srcCluster, Collections.emptyMap()) + .get(jobId); + TableWarmUpWindowedStats dstStat = clusterStats + .getOrDefault(dstCluster, Collections.emptyMap()) + .get(jobId); + + if (srcStat != null) { + result.mergeRequested(srcStat); + } + if (dstStat != null) { + // Target-side progress timestamp is a watermark, not an additive counter. The merge + // keeps the minimum positive watermark across BEs so FE reports the slowest target + // progress for trigger-gap calculation. + result.mergeFinished(dstStat); + } + result.computeGap(); + return result; + } + + private void clearFinishedOrCancelCloudWarmUpJob() { Iterator> iterator = runnableCloudWarmUpJobs.entrySet().iterator(); while (iterator.hasNext()) { @@ -664,22 +1016,34 @@ public CloudWarmUpJob getCloudWarmUpJob(long jobId) { } public List> getAllJobInfos(int limit) { + Map statsMap = collectAndAggregate(); List> infos = Lists.newArrayList(); Collection allJobs = cloudWarmUpJobs.values(); allJobs.stream().sorted(Comparator.comparing(CloudWarmUpJob::getCreateTimeMs).reversed()) .limit(limit).forEach(t -> { - infos.add(t.getJobInfo()); + infos.add(t.getJobInfo(statsMap.get(t.getJobId()), false)); }); return infos; } public void addCloudWarmUpJob(CloudWarmUpJob job) throws AnalysisException { + restoreTableFilterState(job); registerJobForRepeatDetection(job, false); cloudWarmUpJobs.put(job.getJobId(), job); LOG.info("add cloud warm up job {}", job.getJobId()); runnableCloudWarmUpJobs.put(job.getJobId(), job); } + private void restoreTableFilterState(CloudWarmUpJob job) { + if (!job.hasTableFilter()) { + return; + } + job.rebuildOnTablesFilter(); + Map tableIdNames = resolveTableIds(job.getOnTablesFilter()); + job.setCurrentTableIdNames(tableIdNames); + logMatchedTables("restored table filter for job " + job.getJobId(), tableIdNames); + } + public List getPartitionsFromTriple(Triple tableTriple) { String dbName = tableTriple.getLeft(); String tableName = tableTriple.getMiddle(); @@ -826,10 +1190,35 @@ public long createJob(WarmUpClusterCommand stmt) throws AnalysisException { } builder.setSyncMode(SyncMode.EVENT_DRIVEN) .setSyncEvent(syncEvent); + + // Handle ON TABLES rules + List onTablesRules = stmt.getOnTablesRules(); + if (onTablesRules != null && !onTablesRules.isEmpty()) { + builder.setJobType(JobType.TABLES); + List persistedRules = new ArrayList<>(); + for (OnTablesFilter.TableFilterRule rule : onTablesRules) { + CloudWarmUpJob.PersistedTableFilterRule pr = new CloudWarmUpJob.PersistedTableFilterRule(); + pr.ruleType = rule.getRuleType().name(); + pr.pattern = rule.getRawPattern(); + persistedRules.add(pr); + } + builder.setTableFilterRules(persistedRules); + } } else { builder.setSyncMode(SyncMode.ONCE); } warmUpJob = builder.build(); + + // For event-driven jobs with ON TABLES, rebuild filter and resolve initial table IDs + if (warmUpJob.hasTableFilter()) { + warmUpJob.rebuildOnTablesFilter(); + Map initialTableIdNames = resolveTableIds(warmUpJob.getOnTablesFilter()); + logMatchedTables("created table filter for job " + jobId, initialTableIdNames); + if (initialTableIdNames.isEmpty()) { + throw new AnalysisException("No tables matched the ON TABLES filter"); + } + warmUpJob.setCurrentTableIdNames(initialTableIdNames); + } } addCloudWarmUpJob(warmUpJob); @@ -858,6 +1247,26 @@ public void cancel(long jobId, String msg) throws DdlException { } } + public void cancelTableFilterJobsForClusterChange(String clusterName, String reason) { + for (CloudWarmUpJob job : runnableCloudWarmUpJobs.values()) { + if (job.isDone() || !job.hasTableFilter()) { + continue; + } + if (!Objects.equals(clusterName, job.getSrcClusterName()) + && !Objects.equals(clusterName, job.getDstClusterName())) { + continue; + } + try { + cancel(job.getJobId(), reason); + LOG.info("cancel table-level cloud warm up job {} because compute group {} changed: {}", + job.getJobId(), clusterName, reason); + } catch (DdlException e) { + LOG.warn("failed to cancel table-level cloud warm up job {} after compute group {} changed", + job.getJobId(), clusterName, e); + } + } + } + private void runCloudWarmUpJob() { runnableCloudWarmUpJobs.values().forEach(cloudWarmUpJob -> { if (cloudWarmUpJob.shouldWait()) { @@ -887,6 +1296,9 @@ public void replayCloudWarmUpJob(CloudWarmUpJob cloudWarmUpJob) throws Exception runnableCloudWarmUpJobs.put(cloudWarmUpJob.getJobId(), cloudWarmUpJob); cloudWarmUpJobs.put(cloudWarmUpJob.getJobId(), cloudWarmUpJob); LOG.info("replay cloud warm up job {}, state {}", cloudWarmUpJob.getJobId(), cloudWarmUpJob.getJobState()); + + restoreTableFilterState(cloudWarmUpJob); + if (cloudWarmUpJob.isDone()) { notifyJobStop(cloudWarmUpJob); } else { @@ -904,4 +1316,67 @@ public void replayCloudWarmUpJob(CloudWarmUpJob cloudWarmUpJob) throws Exception } } + /** + * Resolve glob-based ON TABLES filter to a map of matching table ID → "db.table" name + * by iterating all databases and tables in the internal catalog. + */ + public Map resolveTableIds(OnTablesFilter filter) { + Map result = new HashMap<>(); + if (filter == null) { + return result; + } + Collection> allDbs = + Env.getCurrentInternalCatalog().getAllDbs(); + for (DatabaseIf dbIf : allDbs) { + String dbName = dbIf.getFullName(); + // Strip "default_cluster:" prefix if present + if (dbName.contains(":")) { + dbName = dbName.substring(dbName.indexOf(':') + 1); + } + Set tableNames = dbIf.getTableNamesOrEmptyWithLock(); + for (String tableName : tableNames) { + TableIf table = dbIf.getTableNullable(tableName); + if (table != null && table.isManagedTable() && filter.shouldWarmUp(dbName, tableName)) { + result.put(table.getId(), dbName + "." + tableName); + } + } + } + return result; + } + + private void logMatchedTables(String action, Map tableIdNames) { + String matchedTables = CloudWarmUpJob.formatMatchedTablesForDisplay(tableIdNames.entrySet().stream() + .sorted(Map.Entry.comparingByKey()) + .map(entry -> entry.getKey() + ":" + entry.getValue()) + .collect(Collectors.toList())); + LOG.info("{}: matched_table_count={}, matched_tables=[{}]", + action, tableIdNames.size(), matchedTables); + } + + /** + * Periodically refresh table IDs for all running event-driven jobs with ON TABLES filter. + * Called from the daemon loop to pick up newly created/dropped tables matching glob patterns. + */ + public void refreshAllTableFilters() { + for (CloudWarmUpJob job : runnableCloudWarmUpJobs.values()) { + if (job.isDone() || !job.isEventDriven() || !job.hasTableFilter()) { + continue; + } + try { + Map newTableIdNames = resolveTableIds(job.getOnTablesFilter()); + logMatchedTables("refreshed table filter for job " + job.getJobId(), newTableIdNames); + Set oldTableIds = job.getCurrentTableIds(); + if (!newTableIdNames.equals(job.getCurrentTableIdNames())) { + job.setCurrentTableIdNames(newTableIdNames); + LOG.info("refreshed table filter for job {}: {} -> {} tables", + job.getJobId(), + oldTableIds == null ? 0 : oldTableIds.size(), + newTableIdNames.size()); + } + } catch (Exception e) { + LOG.warn("failed to refresh table filter for job {}", job.getJobId(), e); + } + } + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/CloudWarmUpJob.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/CloudWarmUpJob.java index ae12107c3ddac4..c4f47ed9269eb6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/CloudWarmUpJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/CloudWarmUpJob.java @@ -46,6 +46,8 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; import com.google.gson.annotations.SerializedName; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; @@ -55,10 +57,13 @@ import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; public class CloudWarmUpJob implements Writable { @@ -77,7 +82,8 @@ public boolean isFinalState() { public enum JobType { CLUSTER, - TABLE; + TABLE, + TABLES; } public enum SyncMode { @@ -139,6 +145,32 @@ public enum SyncEvent { @SerializedName(value = "syncEvent") protected SyncEvent syncEvent; + @SerializedName(value = "tableFilterRules") + protected List tableFilterRules = new ArrayList<>(); + + // Computed from tableFilterRules via canonicalize(); not persisted. + private transient String tableFilterExpr = ""; + private transient OnTablesFilter onTablesFilter; + // Maps table ID → "db.table" qualified name for matched tables. + private transient volatile Map currentTableIdNames = new ConcurrentHashMap<>(); + + // Latest event-driven SyncStats collected by FE background metrics refresh. Not persisted. + private transient volatile JobWarmUpStats syncStats; + + /** + * Serializable rule for GSON persistence. + */ + public static class PersistedTableFilterRule { + @SerializedName("ruleType") + public String ruleType; + @SerializedName("pattern") + public String pattern; + } + + private static final Comparator TABLE_FILTER_RULE_COMPARATOR = + Comparator.comparingInt(CloudWarmUpJob::tableFilterRuleTypeOrder) + .thenComparing(rule -> StringUtils.defaultString(rule.pattern)); + private Map beToClient; private Map beToAddr; @@ -159,6 +191,7 @@ public static class Builder { private SyncMode syncMode = SyncMode.ONCE; private SyncEvent syncEvent; private long syncInterval; + private List tableFilterRules = new ArrayList<>(); public Builder() {} @@ -197,6 +230,11 @@ public Builder setSyncInterval(long syncInterval) { return this; } + public Builder setTableFilterRules(List tableFilterRules) { + this.tableFilterRules = tableFilterRules; + return this; + } + public CloudWarmUpJob build() { if (jobId == 0 || srcClusterName == null || dstClusterName == null || jobType == null || syncMode == null) { throw new IllegalStateException("Missing required fields for CloudWarmUpJob"); @@ -214,6 +252,8 @@ private CloudWarmUpJob(Builder builder) { this.syncMode = builder.syncMode; this.syncEvent = builder.syncEvent; this.syncInterval = builder.syncInterval; + this.tableFilterRules = normalizeTableFilterRules(builder.tableFilterRules); + this.tableFilterExpr = computeTableFilterExpr(); this.createTimeMs = System.currentTimeMillis(); } @@ -273,7 +313,7 @@ public void fetchBeToTabletIdBatches() { if (FeConstants.runningUnitTest) { return; } - if (jobType == JobType.TABLE) { + if (jobType == JobType.TABLE || jobType == JobType.TABLES) { // warm up with table will have to set tablets on creation return; } @@ -338,6 +378,10 @@ public long getCreateTimeMs() { return createTimeMs; } + public long getStartTimeMs() { + return startTimeMs; + } + public String getErrMsg() { return errMsg; } @@ -366,6 +410,18 @@ public SyncMode getSyncMode() { return syncMode; } + public SyncEvent getSyncEvent() { + return syncEvent; + } + + public JobWarmUpStats getSyncStats() { + return syncStats; + } + + public void setSyncStats(JobWarmUpStats syncStats) { + this.syncStats = syncStats; + } + public String getSyncModeString() { if (syncMode == null) { // For backward compatibility: older FE versions did not set syncMode for jobs, @@ -390,7 +446,11 @@ public String getSyncModeString() { return sb.toString(); } - public List getJobInfo() { + public List getJobInfo(JobWarmUpStats stats) { + return getJobInfo(stats, true); + } + + public List getJobInfo(JobWarmUpStats stats, boolean showDetailedSyncStats) { List info = Lists.newArrayList(); info.add(String.valueOf(jobId)); info.add(srcClusterName); @@ -416,9 +476,42 @@ public List getJobInfo() { ? t.getLeft() + "." + t.getMiddle() : t.getLeft() + "." + t.getMiddle() + "." + t.getRight()) .collect(Collectors.joining(", "))); + info.add(tableFilterExpr == null ? "" : tableFilterExpr); + info.add(getMatchedTablesString()); + // SyncStats: only for event-driven jobs + if (isEventDriven() && stats != null) { + info.add(showDetailedSyncStats ? stats.toJsonString() : stats.toSummaryJsonString()); + } else { + info.add(""); + } return info; } + private String getMatchedTablesString() { + if (currentTableIdNames == null || currentTableIdNames.isEmpty()) { + return ""; + } + return formatMatchedTablesForDisplay(currentTableIdNames.values().stream() + .sorted() + .collect(Collectors.toList())); + } + + static String formatMatchedTablesForDisplay(List matchedTables) { + if (matchedTables == null || matchedTables.isEmpty()) { + return ""; + } + int displayLimit = Math.max(0, Config.cloud_warm_up_matched_tables_display_limit); + int shownCount = Math.min(matchedTables.size(), displayLimit); + String result = matchedTables.stream() + .limit(shownCount) + .collect(Collectors.joining(", ")); + if (matchedTables.size() <= displayLimit) { + return result; + } + String truncatedSuffix = "... (truncated, " + shownCount + " of " + matchedTables.size() + " shown)"; + return result.isEmpty() ? truncatedSuffix : result + ", " + truncatedSuffix; + } + public void setJobState(JobState jobState) { this.jobState = jobState; } @@ -477,6 +570,153 @@ public String getSrcClusterName() { return srcClusterName; } + public boolean hasTableFilter() { + return tableFilterRules != null && !tableFilterRules.isEmpty(); + } + + public String getTableFilterExpr() { + return tableFilterExpr; + } + + public List getTableFilterRules() { + return tableFilterRules; + } + + public OnTablesFilter getOnTablesFilter() { + return onTablesFilter; + } + + /** + * Returns the set of currently matched table IDs. + */ + public Set getCurrentTableIds() { + if (currentTableIdNames == null) { + currentTableIdNames = new ConcurrentHashMap<>(); + } + return currentTableIdNames.keySet(); + } + + /** + * Sets the current matched table ID-to-name mapping. + */ + public void setCurrentTableIdNames(Map idNames) { + this.currentTableIdNames = new ConcurrentHashMap<>(idNames); + } + + public Map getCurrentTableIdNames() { + if (currentTableIdNames == null) { + currentTableIdNames = new ConcurrentHashMap<>(); + } + return currentTableIdNames; + } + + /** + * Compute the canonical table filter expression from persisted rules. + * Returns empty string when no table filter rules exist. + */ + private String computeTableFilterExpr() { + List normalizedRules = normalizeTableFilterRules(tableFilterRules); + tableFilterRules = normalizedRules; + if (normalizedRules.isEmpty()) { + return ""; + } + return canonicalizeNormalizedRules(normalizedRules); + } + + /** + * Generate canonical JSON from persisted rules for JobKey dedup and SHOW output. + * Steps: group by type → sort alphabetically → deduplicate → compact JSON. + */ + public static String canonicalize(List rules) { + return canonicalizeNormalizedRules(normalizeTableFilterRules(rules)); + } + + private static String canonicalizeNormalizedRules(List normalizedRules) { + List includes = normalizedRules.stream() + .filter(r -> "INCLUDE".equals(r.ruleType)) + .map(r -> r.pattern) + .collect(Collectors.toList()); + List excludes = normalizedRules.stream() + .filter(r -> "EXCLUDE".equals(r.ruleType)) + .map(r -> r.pattern) + .collect(Collectors.toList()); + + JsonObject json = new JsonObject(); + JsonArray incArr = new JsonArray(); + includes.forEach(incArr::add); + json.add("include", incArr); + if (!excludes.isEmpty()) { + JsonArray excArr = new JsonArray(); + excludes.forEach(excArr::add); + json.add("exclude", excArr); + } + return json.toString(); + } + + /** + * Rebuild the transient OnTablesFilter and tableFilterExpr from persisted tableFilterRules. + * Called after deserialization (EditLog replay, FE restart). + */ + public void rebuildOnTablesFilter() { + if (currentTableIdNames == null) { + currentTableIdNames = new ConcurrentHashMap<>(); + } + if (tableFilterRules == null || tableFilterRules.isEmpty()) { + this.tableFilterRules = new ArrayList<>(); + this.tableFilterExpr = ""; + this.onTablesFilter = null; + return; + } + this.tableFilterExpr = computeTableFilterExpr(); + List rules = tableFilterRules.stream() + .map(r -> new OnTablesFilter.TableFilterRule( + "INCLUDE".equals(r.ruleType) + ? OnTablesFilter.TableFilterRule.RuleType.INCLUDE + : OnTablesFilter.TableFilterRule.RuleType.EXCLUDE, + r.pattern)) + .collect(Collectors.toList()); + this.onTablesFilter = new OnTablesFilter(rules); + } + + private static int tableFilterRuleTypeOrder(PersistedTableFilterRule rule) { + return "INCLUDE".equals(rule.ruleType) ? 0 : 1; + } + + private static String normalizeTableFilterRuleType(String ruleType) { + Preconditions.checkNotNull(ruleType, "table filter rule type cannot be null"); + Preconditions.checkState("INCLUDE".equalsIgnoreCase(ruleType) || "EXCLUDE".equalsIgnoreCase(ruleType), + "Unexpected table filter rule type: %s", ruleType); + return "INCLUDE".equalsIgnoreCase(ruleType) ? "INCLUDE" : "EXCLUDE"; + } + + private static PersistedTableFilterRule copyNormalizedTableFilterRule(PersistedTableFilterRule rule) { + PersistedTableFilterRule normalizedRule = new PersistedTableFilterRule(); + normalizedRule.ruleType = normalizeTableFilterRuleType(rule.ruleType); + normalizedRule.pattern = rule.pattern; + return normalizedRule; + } + + private static List normalizeTableFilterRules(List rules) { + if (rules == null || rules.isEmpty()) { + return new ArrayList<>(); + } + List sortedRules = rules.stream() + .map(CloudWarmUpJob::copyNormalizedTableFilterRule) + .sorted(TABLE_FILTER_RULE_COMPARATOR) + .collect(Collectors.toList()); + List normalizedRules = new ArrayList<>(); + String lastRuleKey = null; + for (PersistedTableFilterRule rule : sortedRules) { + String ruleKey = rule.ruleType + "\0" + StringUtils.defaultString(rule.pattern); + if (ruleKey.equals(lastRuleKey)) { + continue; + } + normalizedRules.add(rule); + lastRuleKey = ruleKey; + } + return normalizedRules; + } + public synchronized void run() { if (isTimeout()) { cancel("Timeout", false); @@ -741,8 +981,13 @@ private void runEventDrivenJob() throws Exception { throw new IllegalArgumentException("Unknown SyncEvent " + syncEvent); } request.setEvent(event); - LOG.debug("send warm up request to BE {} ({}). job_id={}, event={}, request_type=SET_JOB(EVENT)", - entry.getKey(), getBackendEndpoint(entry.getKey()), jobId, syncEvent); + if (hasTableFilter()) { + request.setTableIds(new ArrayList<>(getCurrentTableIds())); + } + LOG.debug("send warm up request to BE {} ({}). job_id={}, event={}, " + + "request_type=SET_JOB(EVENT), table_ids_count={}", + entry.getKey(), getBackendEndpoint(entry.getKey()), jobId, syncEvent, + hasTableFilter() ? getCurrentTableIdNames().size() : "all"); TWarmUpTabletsResponse response = entry.getValue().warmUpTablets(request); if (response.getStatus().getStatusCode() != TStatusCode.OK) { if (!response.getStatus().getErrorMsgs().isEmpty()) { @@ -753,6 +998,7 @@ private void runEventDrivenJob() throws Exception { } } } catch (Exception e) { + errMsg = e.getMessage(); LOG.warn("send warm up request job_id={} failed with exception {}", jobId, e); } finally { @@ -901,6 +1147,8 @@ public void write(DataOutput out) throws IOException { public static CloudWarmUpJob read(DataInput in) throws IOException { String json = Text.readString(in); - return GsonUtils.GSON.fromJson(json, CloudWarmUpJob.class); + CloudWarmUpJob job = GsonUtils.GSON.fromJson(json, CloudWarmUpJob.class); + job.rebuildOnTablesFilter(); + return job; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/JobWarmUpStats.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/JobWarmUpStats.java new file mode 100644 index 00000000000000..cdb293216a1b54 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/JobWarmUpStats.java @@ -0,0 +1,285 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import org.apache.doris.monitor.unit.ByteSizeValue; + +import com.google.gson.JsonObject; + +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; + +/** + * Per-Job aggregated warmup statistics. + * Aggregates requested (from source cluster) and finished/failed (from target cluster) + * across all matched tables, then computes gap = requested - finished. + */ +public class JobWarmUpStats { + private static final DateTimeFormatter TIME_FMT = DateTimeFormatter.ofPattern("HH:mm:ss"); + + // Aggregated requested + public long requestedSegmentNum5m; + public long requestedSegmentNum30m; + public long requestedSegmentNum1h; + public long requestedSegmentSize5m; + public long requestedSegmentSize30m; + public long requestedSegmentSize1h; + public long requestedIndexNum5m; + public long requestedIndexNum30m; + public long requestedIndexNum1h; + public long requestedIndexSize5m; + public long requestedIndexSize30m; + public long requestedIndexSize1h; + public long lastTriggerTs; + + // Aggregated finished + public long finishSegmentNum5m; + public long finishSegmentNum30m; + public long finishSegmentNum1h; + public long finishSegmentSize5m; + public long finishSegmentSize30m; + public long finishSegmentSize1h; + public long finishIndexNum5m; + public long finishIndexNum30m; + public long finishIndexNum1h; + public long finishIndexSize5m; + public long finishIndexSize30m; + public long finishIndexSize1h; + + // Aggregated failed + public long failSegmentNum5m; + public long failSegmentNum30m; + public long failSegmentNum1h; + public long failSegmentSize5m; + public long failSegmentSize30m; + public long failSegmentSize1h; + public long failIndexNum5m; + public long failIndexNum30m; + public long failIndexNum1h; + public long failIndexSize5m; + public long failIndexSize30m; + public long failIndexSize1h; + public long lastFinishTs; + // Aggregated from target BEs. FE takes the minimum positive target progress watermark so the + // slowest target BE decides how far the job has caught up to source-side triggers. + public long progressTriggerTs; + + // gap = requested - finished + public long gapSegmentNum5m; + public long gapSegmentNum30m; + public long gapSegmentNum1h; + public long gapSegmentSize5m; + public long gapSegmentSize30m; + public long gapSegmentSize1h; + public long gapIndexNum5m; + public long gapIndexNum30m; + public long gapIndexNum1h; + public long gapIndexSize5m; + public long gapIndexSize30m; + public long gapIndexSize1h; + // Source last trigger timestamp minus target progress watermark. A caught-up target reports its + // latest finished trigger as progress, so this value naturally becomes 0. + public long triggerGapMs; + + /** Accumulate requested stats from a table in the source cluster. */ + public void mergeRequested(TableWarmUpWindowedStats t) { + requestedSegmentNum5m += t.requestedSegmentNum5m; + requestedSegmentNum30m += t.requestedSegmentNum30m; + requestedSegmentNum1h += t.requestedSegmentNum1h; + requestedSegmentSize5m += t.requestedSegmentSize5m; + requestedSegmentSize30m += t.requestedSegmentSize30m; + requestedSegmentSize1h += t.requestedSegmentSize1h; + requestedIndexNum5m += t.requestedIndexNum5m; + requestedIndexNum30m += t.requestedIndexNum30m; + requestedIndexNum1h += t.requestedIndexNum1h; + requestedIndexSize5m += t.requestedIndexSize5m; + requestedIndexSize30m += t.requestedIndexSize30m; + requestedIndexSize1h += t.requestedIndexSize1h; + lastTriggerTs = Math.max(lastTriggerTs, t.lastTriggerTs); + } + + /** Accumulate finished/failed stats from a table in the target cluster. */ + public void mergeFinished(TableWarmUpWindowedStats t) { + finishSegmentNum5m += t.finishSegmentNum5m; + finishSegmentNum30m += t.finishSegmentNum30m; + finishSegmentNum1h += t.finishSegmentNum1h; + finishSegmentSize5m += t.finishSegmentSize5m; + finishSegmentSize30m += t.finishSegmentSize30m; + finishSegmentSize1h += t.finishSegmentSize1h; + finishIndexNum5m += t.finishIndexNum5m; + finishIndexNum30m += t.finishIndexNum30m; + finishIndexNum1h += t.finishIndexNum1h; + finishIndexSize5m += t.finishIndexSize5m; + finishIndexSize30m += t.finishIndexSize30m; + finishIndexSize1h += t.finishIndexSize1h; + failSegmentNum5m += t.failSegmentNum5m; + failSegmentNum30m += t.failSegmentNum30m; + failSegmentNum1h += t.failSegmentNum1h; + failSegmentSize5m += t.failSegmentSize5m; + failSegmentSize30m += t.failSegmentSize30m; + failSegmentSize1h += t.failSegmentSize1h; + failIndexNum5m += t.failIndexNum5m; + failIndexNum30m += t.failIndexNum30m; + failIndexNum1h += t.failIndexNum1h; + failIndexSize5m += t.failIndexSize5m; + failIndexSize30m += t.failIndexSize30m; + failIndexSize1h += t.failIndexSize1h; + lastFinishTs = Math.max(lastFinishTs, t.lastFinishTs); + progressTriggerTs = minPositive(progressTriggerTs, t.progressTriggerTs); + } + + /** Compute gap = requested - finished for all window/metric combinations. */ + public void computeGap() { + gapSegmentNum5m = requestedSegmentNum5m - finishSegmentNum5m; + gapSegmentNum30m = requestedSegmentNum30m - finishSegmentNum30m; + gapSegmentNum1h = requestedSegmentNum1h - finishSegmentNum1h; + gapSegmentSize5m = requestedSegmentSize5m - finishSegmentSize5m; + gapSegmentSize30m = requestedSegmentSize30m - finishSegmentSize30m; + gapSegmentSize1h = requestedSegmentSize1h - finishSegmentSize1h; + gapIndexNum5m = requestedIndexNum5m - finishIndexNum5m; + gapIndexNum30m = requestedIndexNum30m - finishIndexNum30m; + gapIndexNum1h = requestedIndexNum1h - finishIndexNum1h; + gapIndexSize5m = requestedIndexSize5m - finishIndexSize5m; + gapIndexSize30m = requestedIndexSize30m - finishIndexSize30m; + gapIndexSize1h = requestedIndexSize1h - finishIndexSize1h; + triggerGapMs = lastTriggerTs > 0 && progressTriggerTs > 0 + ? Math.max(0, lastTriggerTs - progressTriggerTs) : 0; + } + + /** Serialize compact 30m SyncStats summary for SHOW WARM UP JOB list output. */ + public String toSummaryJsonString() { + JsonObject root = new JsonObject(); + root.addProperty("window", "30m"); + long srcSize = requestedSegmentSize30m + requestedIndexSize30m; + long dstSize = finishSegmentSize30m + finishIndexSize30m; + root.addProperty("src_size", humanReadableSize(srcSize)); + root.addProperty("dst_size", humanReadableSize(dstSize)); + root.addProperty("gap_size", humanReadableSize(srcSize - dstSize)); + // Compact SHOW WARM UP JOB output still exposes the active incremental warm-up time lag. + root.addProperty("trigger_gap_ms", triggerGapMs); + return root.toString(); + } + + /** Serialize detailed SyncStats JSON for SHOW WARM UP JOB WHERE ID = ... output. */ + public String toJsonString() { + JsonObject root = new JsonObject(); + + // seg_num + JsonObject segNum = new JsonObject(); + segNum.addProperty("requested_5m", requestedSegmentNum5m); + segNum.addProperty("finish_5m", finishSegmentNum5m); + segNum.addProperty("gap_5m", gapSegmentNum5m); + segNum.addProperty("fail_5m", failSegmentNum5m); + segNum.addProperty("requested_30m", requestedSegmentNum30m); + segNum.addProperty("finish_30m", finishSegmentNum30m); + segNum.addProperty("gap_30m", gapSegmentNum30m); + segNum.addProperty("fail_30m", failSegmentNum30m); + segNum.addProperty("requested_1h", requestedSegmentNum1h); + segNum.addProperty("finish_1h", finishSegmentNum1h); + segNum.addProperty("gap_1h", gapSegmentNum1h); + segNum.addProperty("fail_1h", failSegmentNum1h); + root.add("seg_num", segNum); + + // seg_size + JsonObject segSize = new JsonObject(); + segSize.addProperty("requested_5m", humanReadableSize(requestedSegmentSize5m)); + segSize.addProperty("finish_5m", humanReadableSize(finishSegmentSize5m)); + segSize.addProperty("gap_5m", humanReadableSize(gapSegmentSize5m)); + segSize.addProperty("fail_5m", humanReadableSize(failSegmentSize5m)); + segSize.addProperty("requested_30m", humanReadableSize(requestedSegmentSize30m)); + segSize.addProperty("finish_30m", humanReadableSize(finishSegmentSize30m)); + segSize.addProperty("gap_30m", humanReadableSize(gapSegmentSize30m)); + segSize.addProperty("fail_30m", humanReadableSize(failSegmentSize30m)); + segSize.addProperty("requested_1h", humanReadableSize(requestedSegmentSize1h)); + segSize.addProperty("finish_1h", humanReadableSize(finishSegmentSize1h)); + segSize.addProperty("gap_1h", humanReadableSize(gapSegmentSize1h)); + segSize.addProperty("fail_1h", humanReadableSize(failSegmentSize1h)); + root.add("seg_size", segSize); + + // idx_num + JsonObject idxNum = new JsonObject(); + idxNum.addProperty("requested_5m", requestedIndexNum5m); + idxNum.addProperty("finish_5m", finishIndexNum5m); + idxNum.addProperty("gap_5m", gapIndexNum5m); + idxNum.addProperty("fail_5m", failIndexNum5m); + idxNum.addProperty("requested_30m", requestedIndexNum30m); + idxNum.addProperty("finish_30m", finishIndexNum30m); + idxNum.addProperty("gap_30m", gapIndexNum30m); + idxNum.addProperty("fail_30m", failIndexNum30m); + idxNum.addProperty("requested_1h", requestedIndexNum1h); + idxNum.addProperty("finish_1h", finishIndexNum1h); + idxNum.addProperty("gap_1h", gapIndexNum1h); + idxNum.addProperty("fail_1h", failIndexNum1h); + root.add("idx_num", idxNum); + + // idx_size + JsonObject idxSize = new JsonObject(); + idxSize.addProperty("requested_5m", humanReadableSize(requestedIndexSize5m)); + idxSize.addProperty("finish_5m", humanReadableSize(finishIndexSize5m)); + idxSize.addProperty("gap_5m", humanReadableSize(gapIndexSize5m)); + idxSize.addProperty("fail_5m", humanReadableSize(failIndexSize5m)); + idxSize.addProperty("requested_30m", humanReadableSize(requestedIndexSize30m)); + idxSize.addProperty("finish_30m", humanReadableSize(finishIndexSize30m)); + idxSize.addProperty("gap_30m", humanReadableSize(gapIndexSize30m)); + idxSize.addProperty("fail_30m", humanReadableSize(failIndexSize30m)); + idxSize.addProperty("requested_1h", humanReadableSize(requestedIndexSize1h)); + idxSize.addProperty("finish_1h", humanReadableSize(finishIndexSize1h)); + idxSize.addProperty("gap_1h", humanReadableSize(gapIndexSize1h)); + idxSize.addProperty("fail_1h", humanReadableSize(failIndexSize1h)); + root.add("idx_size", idxSize); + + // timestamps + root.addProperty("last_trigger_ts", formatEpochMs(lastTriggerTs)); + root.addProperty("last_finish_ts", formatEpochMs(lastFinishTs)); + root.addProperty("progress_trigger_ts", formatEpochMs(progressTriggerTs)); + root.addProperty("trigger_gap_ms", triggerGapMs); + + return root.toString(); + } + + private static long minPositive(long current, long candidate) { + if (current <= 0) { + return Math.max(candidate, 0); + } + if (candidate <= 0) { + return current; + } + return Math.min(current, candidate); + } + + private static String humanReadableSize(long bytes) { + if (bytes < 0) { + return "-" + new ByteSizeValue(-bytes).toString(); + } + return new ByteSizeValue(bytes).toString(); + } + + private static String formatEpochMs(long epochMs) { + if (epochMs <= 0) { + return ""; + } + try { + return LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMs), ZoneId.systemDefault()) + .format(TIME_FMT); + } catch (Exception e) { + return ""; + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/OnTablesFilter.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/OnTablesFilter.java new file mode 100644 index 00000000000000..f0aaef8c7de891 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/OnTablesFilter.java @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * ON TABLES clause filter for table-level event-driven warmup. + * + * Semantics: INCLUDE union − EXCLUDE union. + * A table is warmed up if it matches any INCLUDE rule and does not match any EXCLUDE rule. + */ +public class OnTablesFilter { + + /** + * A single INCLUDE or EXCLUDE rule with a glob pattern compiled to a Java regex. + */ + public static class TableFilterRule { + public enum RuleType { + INCLUDE, + EXCLUDE + } + + private final RuleType ruleType; + private final String rawPattern; + private final Pattern compiledPattern; + + public TableFilterRule(RuleType ruleType, String globPattern) { + this.ruleType = ruleType; + this.rawPattern = globPattern; + this.compiledPattern = compileGlob(globPattern); + } + + /** + * Compile a glob pattern to an anchored Java regex. + * Glob: '*' matches any characters, '?' matches a single character, + * '.' and other regex metacharacters are treated as literals. + */ + private static Pattern compileGlob(String glob) { + StringBuilder regex = new StringBuilder("^"); + for (char c : glob.toCharArray()) { + switch (c) { + case '*': + regex.append(".*"); + break; + case '?': + regex.append("."); + break; + case '.': + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '\\': + case '^': + case '$': + case '|': + case '+': + regex.append('\\').append(c); + break; + default: + regex.append(c); + } + } + regex.append("$"); + return Pattern.compile(regex.toString()); + } + + public boolean matches(String fullTableName) { + return compiledPattern.matcher(fullTableName).matches(); + } + + public RuleType getRuleType() { + return ruleType; + } + + public String getRawPattern() { + return rawPattern; + } + } + + private final List includeRules; + private final List excludeRules; + + public OnTablesFilter(List rules) { + List includes = new ArrayList<>(); + List excludes = new ArrayList<>(); + for (TableFilterRule rule : rules) { + if (rule.getRuleType() == TableFilterRule.RuleType.INCLUDE) { + includes.add(rule); + } else { + excludes.add(rule); + } + } + this.includeRules = Collections.unmodifiableList(includes); + this.excludeRules = Collections.unmodifiableList(excludes); + } + + /** + * Determine whether a table should be warmed up. + * 1. If the table matches any INCLUDE rule → candidate + * 2. If the candidate matches any EXCLUDE rule → excluded + */ + public boolean shouldWarmUp(String dbName, String tableName) { + String fullName = dbName + "." + tableName; + + boolean included = includeRules.stream() + .anyMatch(rule -> rule.matches(fullName)); + if (!included) { + return false; + } + + boolean excluded = excludeRules.stream() + .anyMatch(rule -> rule.matches(fullName)); + return !excluded; + } + + public List getIncludeRules() { + return includeRules; + } + + public List getExcludeRules() { + return excludeRules; + } + + /** + * Get all rules (include + exclude) for iteration. + */ + public List getAllRules() { + List all = new ArrayList<>(includeRules.size() + excludeRules.size()); + all.addAll(includeRules); + all.addAll(excludeRules); + return all; + } + + /** + * Generate a human-readable string representation for logging. + */ + @Override + public String toString() { + return "OnTablesFilter{include=" + includeRules.stream() + .map(TableFilterRule::getRawPattern) + .collect(Collectors.joining(", ")) + + ", exclude=" + excludeRules.stream() + .map(TableFilterRule::getRawPattern) + .collect(Collectors.joining(", ")) + "}"; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/TableWarmUpWindowedStats.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/TableWarmUpWindowedStats.java new file mode 100644 index 00000000000000..d9c315efa7057e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/TableWarmUpWindowedStats.java @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import com.google.gson.JsonObject; + +/** + * Per-job windowed warmup statistics collected from a single BE. + * Contains requested, finish, and fail counters for segments and indexes + * across 3 time windows (5m, 30m, 1h). + */ +public class TableWarmUpWindowedStats { + + // requested (source BE populates these) + public long requestedSegmentNum5m; + public long requestedSegmentNum30m; + public long requestedSegmentNum1h; + public long requestedSegmentSize5m; + public long requestedSegmentSize30m; + public long requestedSegmentSize1h; + public long requestedIndexNum5m; + public long requestedIndexNum30m; + public long requestedIndexNum1h; + public long requestedIndexSize5m; + public long requestedIndexSize30m; + public long requestedIndexSize1h; + public long lastTriggerTs; + // Target BE progress watermark carried from source BE trigger time. Pending downloads use the + // earliest unfinished trigger time; when no downloads are pending, BE reports the latest + // finished trigger time. + public long progressTriggerTs; + + // finish (target BE populates these) + public long finishSegmentNum5m; + public long finishSegmentNum30m; + public long finishSegmentNum1h; + public long finishSegmentSize5m; + public long finishSegmentSize30m; + public long finishSegmentSize1h; + public long finishIndexNum5m; + public long finishIndexNum30m; + public long finishIndexNum1h; + public long finishIndexSize5m; + public long finishIndexSize30m; + public long finishIndexSize1h; + public long lastFinishTs; + + // fail (target BE populates these) + public long failSegmentNum5m; + public long failSegmentNum30m; + public long failSegmentNum1h; + public long failSegmentSize5m; + public long failSegmentSize30m; + public long failSegmentSize1h; + public long failIndexNum5m; + public long failIndexNum30m; + public long failIndexNum1h; + public long failIndexSize5m; + public long failIndexSize30m; + public long failIndexSize1h; + + /** + * Parse from BE JSON response. + * JSON hierarchy: {requested|finish|fail}.{seg|idx}.{num|size}.{5m|30m|1h} + */ + public static TableWarmUpWindowedStats fromJson(JsonObject obj) { + TableWarmUpWindowedStats s = new TableWarmUpWindowedStats(); + + JsonObject req = obj.getAsJsonObject("requested"); + if (req != null) { + s.requestedSegmentNum5m = getWindow(req, "seg", "num", "5m"); + s.requestedSegmentNum30m = getWindow(req, "seg", "num", "30m"); + s.requestedSegmentNum1h = getWindow(req, "seg", "num", "1h"); + s.requestedSegmentSize5m = getWindow(req, "seg", "size", "5m"); + s.requestedSegmentSize30m = getWindow(req, "seg", "size", "30m"); + s.requestedSegmentSize1h = getWindow(req, "seg", "size", "1h"); + s.requestedIndexNum5m = getWindow(req, "idx", "num", "5m"); + s.requestedIndexNum30m = getWindow(req, "idx", "num", "30m"); + s.requestedIndexNum1h = getWindow(req, "idx", "num", "1h"); + s.requestedIndexSize5m = getWindow(req, "idx", "size", "5m"); + s.requestedIndexSize30m = getWindow(req, "idx", "size", "30m"); + s.requestedIndexSize1h = getWindow(req, "idx", "size", "1h"); + } + + JsonObject fin = obj.getAsJsonObject("finish"); + if (fin != null) { + s.finishSegmentNum5m = getWindow(fin, "seg", "num", "5m"); + s.finishSegmentNum30m = getWindow(fin, "seg", "num", "30m"); + s.finishSegmentNum1h = getWindow(fin, "seg", "num", "1h"); + s.finishSegmentSize5m = getWindow(fin, "seg", "size", "5m"); + s.finishSegmentSize30m = getWindow(fin, "seg", "size", "30m"); + s.finishSegmentSize1h = getWindow(fin, "seg", "size", "1h"); + s.finishIndexNum5m = getWindow(fin, "idx", "num", "5m"); + s.finishIndexNum30m = getWindow(fin, "idx", "num", "30m"); + s.finishIndexNum1h = getWindow(fin, "idx", "num", "1h"); + s.finishIndexSize5m = getWindow(fin, "idx", "size", "5m"); + s.finishIndexSize30m = getWindow(fin, "idx", "size", "30m"); + s.finishIndexSize1h = getWindow(fin, "idx", "size", "1h"); + } + + JsonObject fail = obj.getAsJsonObject("fail"); + if (fail != null) { + s.failSegmentNum5m = getWindow(fail, "seg", "num", "5m"); + s.failSegmentNum30m = getWindow(fail, "seg", "num", "30m"); + s.failSegmentNum1h = getWindow(fail, "seg", "num", "1h"); + s.failSegmentSize5m = getWindow(fail, "seg", "size", "5m"); + s.failSegmentSize30m = getWindow(fail, "seg", "size", "30m"); + s.failSegmentSize1h = getWindow(fail, "seg", "size", "1h"); + s.failIndexNum5m = getWindow(fail, "idx", "num", "5m"); + s.failIndexNum30m = getWindow(fail, "idx", "num", "30m"); + s.failIndexNum1h = getWindow(fail, "idx", "num", "1h"); + s.failIndexSize5m = getWindow(fail, "idx", "size", "5m"); + s.failIndexSize30m = getWindow(fail, "idx", "size", "30m"); + s.failIndexSize1h = getWindow(fail, "idx", "size", "1h"); + } + + s.lastTriggerTs = obj.has("last_trigger_ts") ? obj.get("last_trigger_ts").getAsLong() : 0; + s.lastFinishTs = obj.has("last_finish_ts") ? obj.get("last_finish_ts").getAsLong() : 0; + s.progressTriggerTs = obj.has("progress_trigger_ts") + ? obj.get("progress_trigger_ts").getAsLong() : 0; + return s; + } + + private static long getWindow(JsonObject parent, String type, String metric, String window) { + JsonObject typeObj = parent.getAsJsonObject(type); + if (typeObj == null) { + return 0; + } + JsonObject metricObj = typeObj.getAsJsonObject(metric); + if (metricObj == null) { + return 0; + } + return metricObj.has(window) ? metricObj.get(window).getAsLong() : 0; + } + + /** Merge stats from another BE in the same cluster (additive for counts, max for timestamps). */ + public void merge(TableWarmUpWindowedStats other) { + requestedSegmentNum5m += other.requestedSegmentNum5m; + requestedSegmentNum30m += other.requestedSegmentNum30m; + requestedSegmentNum1h += other.requestedSegmentNum1h; + requestedSegmentSize5m += other.requestedSegmentSize5m; + requestedSegmentSize30m += other.requestedSegmentSize30m; + requestedSegmentSize1h += other.requestedSegmentSize1h; + requestedIndexNum5m += other.requestedIndexNum5m; + requestedIndexNum30m += other.requestedIndexNum30m; + requestedIndexNum1h += other.requestedIndexNum1h; + requestedIndexSize5m += other.requestedIndexSize5m; + requestedIndexSize30m += other.requestedIndexSize30m; + requestedIndexSize1h += other.requestedIndexSize1h; + + finishSegmentNum5m += other.finishSegmentNum5m; + finishSegmentNum30m += other.finishSegmentNum30m; + finishSegmentNum1h += other.finishSegmentNum1h; + finishSegmentSize5m += other.finishSegmentSize5m; + finishSegmentSize30m += other.finishSegmentSize30m; + finishSegmentSize1h += other.finishSegmentSize1h; + finishIndexNum5m += other.finishIndexNum5m; + finishIndexNum30m += other.finishIndexNum30m; + finishIndexNum1h += other.finishIndexNum1h; + finishIndexSize5m += other.finishIndexSize5m; + finishIndexSize30m += other.finishIndexSize30m; + finishIndexSize1h += other.finishIndexSize1h; + + failSegmentNum5m += other.failSegmentNum5m; + failSegmentNum30m += other.failSegmentNum30m; + failSegmentNum1h += other.failSegmentNum1h; + failSegmentSize5m += other.failSegmentSize5m; + failSegmentSize30m += other.failSegmentSize30m; + failSegmentSize1h += other.failSegmentSize1h; + failIndexNum5m += other.failIndexNum5m; + failIndexNum30m += other.failIndexNum30m; + failIndexNum1h += other.failIndexNum1h; + failIndexSize5m += other.failIndexSize5m; + failIndexSize30m += other.failIndexSize30m; + failIndexSize1h += other.failIndexSize1h; + + lastTriggerTs = Math.max(lastTriggerTs, other.lastTriggerTs); + lastFinishTs = Math.max(lastFinishTs, other.lastFinishTs); + progressTriggerTs = minPositive(progressTriggerTs, other.progressTriggerTs); + } + + private static long minPositive(long current, long candidate) { + if (current <= 0) { + return Math.max(candidate, 0); + } + if (candidate <= 0) { + return current; + } + return Math.min(current, candidate); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java index 338d619604f372..0445d48545cdcb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java @@ -146,14 +146,16 @@ private void checkToDelCluster(Map remoteClusterIdToPB, Set toDel = - new ArrayList<>(finalClusterIdToBackend.getOrDefault(delId, new ArrayList<>())); - cloudSystemInfoService.updateCloudBackends(new ArrayList<>(), toDel); - // del clusterName String delClusterName = cloudSystemInfoService.getClusterNameByClusterId(delId); if (delClusterName.isEmpty()) { return; } + ((CloudEnv) Env.getCurrentEnv()).getCacheHotspotMgr().cancelTableFilterJobsForClusterChange( + delClusterName, "system cancel: compute group " + delClusterName + " dropped"); + List toDel = + new ArrayList<>(finalClusterIdToBackend.getOrDefault(delId, new ArrayList<>())); + cloudSystemInfoService.updateCloudBackends(new ArrayList<>(), toDel); + // del clusterName // del clusterID MetricRepo.unregisterCloudMetrics(delId, delClusterName, toDel); cloudSystemInfoService.dropCluster(delId, delClusterName); @@ -262,6 +264,9 @@ private void checkDiffNode(Map remoteClusterIdToPB, LOG.info("cluster_name corresponding to cluster_id has been changed," + " cluster_id : {} , current_cluster_name : {}, new_cluster_name :{}", cid, currentClusterName, newClusterName); + ((CloudEnv) Env.getCurrentEnv()).getCacheHotspotMgr().cancelTableFilterJobsForClusterChange( + currentClusterName, "system cancel: compute group " + currentClusterName + + " renamed to " + newClusterName); // change all be's cluster_name currentBes.forEach(b -> b.setCloudClusterName(newClusterName)); // update clusterNameToId diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java index 90c8ea42573504..4b3b7cf79b735d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java @@ -284,8 +284,12 @@ private void syncFileCacheTasksForVirtualGroup(Cloud.ClusterPB virtualGroupInMs, if (virtualGroupInFe.isNeedRebuildFileCache()) { String srcCg = virtualGroupInFe.getActiveComputeGroup(); String dstCg = virtualGroupInFe.getStandbyComputeGroup(); - cancelCacheJobs(virtualGroupInFe, jobIdsInMs); try { + cacheHotspotManager.cancelTableLevelLoadEventWarmUpJobsForVirtualComputeGroup( + virtualGroupInFe.getName(), srcCg, dstCg, virtualGroupInFe.getSubComputeGroups(), + "vcg cancel table-level load-event warm up job before rebuilding file cache jobs"); + cancelCacheJobs(virtualGroupInFe, jobIdsInMs); + // all Map periodicProperties = new HashMap<>(); // "sync_mode" = "periodic", "sync_interval_sec" = "fetch_cluster_cache_hotspot_interval_ms" @@ -316,7 +320,8 @@ private void syncFileCacheTasksForVirtualGroup(Cloud.ClusterPB virtualGroupInMs, LOG.info("virtual compute group {}, generate new jobIds periodic={}, event={}, and old jobIds {}", virtualGroupInFe, jobIdPeriodic, jobIdEvent, jobIdsInMs); } catch (AnalysisException e) { - LOG.warn("virtual compute err, name: {}, analysis error", virtualGroupInFe.getName(), e); + LOG.warn("virtual compute err, name: {}, failed to generate file cache warm up jobs: {}", + virtualGroupInFe.getName(), e.getMessage(), e); return; } virtualGroupInFe.setNeedRebuildFileCache(false); diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java index 723cb3f6eb188e..6248be5386429b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java @@ -22,6 +22,8 @@ import org.apache.doris.catalog.Database; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.TabletSlidingWindowAccessStats; +import org.apache.doris.cloud.CloudWarmUpJob; +import org.apache.doris.cloud.JobWarmUpStats; import org.apache.doris.cloud.catalog.CloudTabletRebalancer; import org.apache.doris.cloud.system.CloudSystemInfoService; import org.apache.doris.common.Config; @@ -63,14 +65,19 @@ import org.apache.logging.log4j.Logger; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.SortedMap; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.function.Predicate; import java.util.function.Supplier; +import java.util.function.ToLongFunction; public final class MetricRepo { private static final Logger LOG = LogManager.getLogger(MetricRepo.class); @@ -134,6 +141,9 @@ public final class MetricRepo { public static LongCounterMetric COUNTER_SQL_CACHE_HIT; public static LongCounterMetric COUNTER_SQL_SQL_CACHE_TOTAL_SEARCH_TIMES; + private static final Map CLOUD_WARM_UP_SYNC_JOB_METRICS = + new ConcurrentHashMap<>(); + public static LongCounterMetric COUNTER_UPDATE_TABLET_STAT_FAILED; public static GaugeMetric GAUGE_TABLET_ACCESS_RECENT; @@ -1624,6 +1634,199 @@ public static void visitHistograms(MetricVisitor visitor) { DORIS_METRIC_REGISTER.acceptHistograms(visitor); } + /* + * Example Prometheus output for a running event-driven cluster warm-up job. MetricVisitor adds the + * "doris_fe_" prefix to the metric names registered below. + * + * doris_fe_file_cache_warm_up_sync_job_info{ + * job_id="1778211593204", job_type="CLUSTER", sync_mode="EVENT_DRIVEN", + * sync_event="LOAD", job_state="RUNNING", src_cluster_name="warmup_source", + * dst_cluster_name="warmup_target" + * } 1 + * doris_fe_file_cache_warm_up_sync_job_size_bytes{ + * job_id="1778211593204", job_type="CLUSTER", src_cluster_name="warmup_source", + * dst_cluster_name="warmup_target", side="src", window="5m" + * } 113246208 + * doris_fe_file_cache_warm_up_sync_job_size_bytes{ + * job_id="1778211593204", job_type="CLUSTER", src_cluster_name="warmup_source", + * dst_cluster_name="warmup_target", side="dst", window="5m" + * } 100663296 + * + * The size metric emits the same label shape for side=(src,dst) and window=(5m,30m,1h). + */ + public static void syncCloudWarmUpSyncJobMetricDefinitions(Collection jobs) { + if (!MetricRepo.isInit || Config.isNotCloudMode()) { + clearCloudWarmUpSyncJobMetrics(); + return; + } + + Collection currentJobs = jobs == null ? Collections.emptyList() : jobs; + Set currentMetricKeys = new HashSet<>(); + for (CloudWarmUpJob job : currentJobs) { + if (job == null) { + continue; + } + registerCloudWarmUpSyncJobMetrics(job, currentMetricKeys); + } + CLOUD_WARM_UP_SYNC_JOB_METRICS.entrySet().removeIf(entry -> { + if (currentMetricKeys.contains(entry.getKey())) { + return false; + } + CloudWarmUpSyncJobGauge metric = entry.getValue(); + DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(metric.getName(), metric.getLabels()); + return true; + }); + } + + private static void clearCloudWarmUpSyncJobMetrics() { + CLOUD_WARM_UP_SYNC_JOB_METRICS.forEach((key, metric) -> + DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(metric.getName(), metric.getLabels())); + CLOUD_WARM_UP_SYNC_JOB_METRICS.clear(); + } + + private static void registerCloudWarmUpSyncJobMetrics(CloudWarmUpJob job, Set currentMetricKeys) { + String jobId = String.valueOf(job.getJobId()); + String jobType = labelValue(job.getJobType()); + String srcClusterName = labelValue(job.getSrcClusterName()); + String dstClusterName = labelValue(job.getDstClusterName()); + + List infoLabels = new ArrayList<>(); + infoLabels.add(new MetricLabel("job_id", jobId)); + infoLabels.add(new MetricLabel("job_type", jobType)); + infoLabels.add(new MetricLabel("sync_mode", labelValue(job.getSyncMode()))); + infoLabels.add(new MetricLabel("sync_event", labelValue(job.getSyncEvent()))); + infoLabels.add(new MetricLabel("job_state", labelValue(job.getJobState()))); + infoLabels.add(new MetricLabel("src_cluster_name", srcClusterName)); + infoLabels.add(new MetricLabel("dst_cluster_name", dstClusterName)); + addCloudWarmUpSyncJobGauge(currentMetricKeys, "file_cache_warm_up_sync_job_info", MetricUnit.NOUNIT, + "warm up sync job info", infoLabels, job, currentJob -> 1L); + + if (!job.isEventDriven() || job.isDone()) { + return; + } + + for (String window : new String[] {"5m", "30m", "1h"}) { + for (String side : new String[] {"src", "dst"}) { + List labels = new ArrayList<>(); + labels.add(new MetricLabel("job_id", jobId)); + labels.add(new MetricLabel("job_type", jobType)); + labels.add(new MetricLabel("src_cluster_name", srcClusterName)); + labels.add(new MetricLabel("dst_cluster_name", dstClusterName)); + labels.add(new MetricLabel("side", side)); + labels.add(new MetricLabel("window", window)); + addCloudWarmUpSyncJobGauge(currentMetricKeys, "file_cache_warm_up_sync_job_size_bytes", + MetricUnit.BYTES, "warm up sync job source or target total size in bytes", + labels, job, currentJob -> getCloudWarmUpSyncJobSizeBytes(currentJob, side, window)); + } + } + + List labels = new ArrayList<>(); + labels.add(new MetricLabel("job_id", jobId)); + labels.add(new MetricLabel("job_type", jobType)); + labels.add(new MetricLabel("src_cluster_name", srcClusterName)); + labels.add(new MetricLabel("dst_cluster_name", dstClusterName)); + // Trigger gap observes active event-driven warm-up lag in time: source latest trigger time + // minus the target-side progress watermark collected from BEs. + addCloudWarmUpSyncJobGauge(currentMetricKeys, "file_cache_warm_up_sync_job_trigger_gap_ms", + MetricUnit.MILLISECONDS, "warm up sync job trigger progress gap in milliseconds", + labels, job, MetricRepo::getCloudWarmUpSyncJobTriggerGapMs); + } + + private static void addCloudWarmUpSyncJobGauge(Set currentMetricKeys, String name, MetricUnit unit, + String description, List labels, CloudWarmUpJob job, + ToLongFunction valueFunction) { + String key = metricKey(name, labels); + currentMetricKeys.add(key); + CloudWarmUpSyncJobGauge gauge = new CloudWarmUpSyncJobGauge(name, unit, description, labels, job, + valueFunction); + CloudWarmUpSyncJobGauge existingGauge = CLOUD_WARM_UP_SYNC_JOB_METRICS.putIfAbsent(key, gauge); + if (existingGauge == null) { + DORIS_METRIC_REGISTER.addMetrics(gauge); + } else { + existingGauge.setJob(job); + } + } + + private static class CloudWarmUpSyncJobGauge extends GaugeMetric { + private volatile CloudWarmUpJob job; + private final ToLongFunction valueFunction; + + CloudWarmUpSyncJobGauge(String name, MetricUnit unit, String description, List labels, + CloudWarmUpJob job, ToLongFunction valueFunction) { + super(name, unit, description); + this.job = job; + this.valueFunction = valueFunction; + setLabels(labels); + } + + void setJob(CloudWarmUpJob job) { + this.job = job; + } + + @Override + public Long getValue() { + CloudWarmUpJob currentJob = job; + if (currentJob == null) { + return 0L; + } + try { + return valueFunction.applyAsLong(currentJob); + } catch (Exception e) { + return 0L; + } + } + } + + private static String metricKey(String name, List labels) { + StringBuilder sb = new StringBuilder(name); + for (MetricLabel label : labels) { + sb.append('|').append(label.getKey()).append('=').append(label.getValue()); + } + return sb.toString(); + } + + private static String labelValue(Object value) { + return value == null ? "" : value.toString(); + } + + private static JobWarmUpStats getCloudWarmUpSyncJobStats(CloudWarmUpJob job) { + JobWarmUpStats stats = job.getSyncStats(); + return stats == null ? new JobWarmUpStats() : stats; + } + + private static long getCloudWarmUpSyncJobSizeBytes(CloudWarmUpJob job, String side, String window) { + JobWarmUpStats stats = getCloudWarmUpSyncJobStats(job); + switch (side) { + case "src": + return byWindow(window, stats.requestedSegmentSize5m + stats.requestedIndexSize5m, + stats.requestedSegmentSize30m + stats.requestedIndexSize30m, + stats.requestedSegmentSize1h + stats.requestedIndexSize1h); + case "dst": + return byWindow(window, stats.finishSegmentSize5m + stats.finishIndexSize5m, + stats.finishSegmentSize30m + stats.finishIndexSize30m, + stats.finishSegmentSize1h + stats.finishIndexSize1h); + default: + return 0L; + } + } + + private static long getCloudWarmUpSyncJobTriggerGapMs(CloudWarmUpJob job) { + return getCloudWarmUpSyncJobStats(job).triggerGapMs; + } + + private static long byWindow(String window, long value5m, long value30m, long value1h) { + switch (window) { + case "5m": + return value5m; + case "30m": + return value30m; + case "1h": + return value1h; + default: + return 0; + } + } + // update some metrics to make a ready to be visited private static void updateMetrics() { SYSTEM_METRICS.update(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 0bb14eff373d39..5e8ab9decabed9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -55,6 +55,7 @@ import org.apache.doris.catalog.info.PartitionNamesInfo; import org.apache.doris.catalog.info.TableNameInfo; import org.apache.doris.catalog.info.TagOptions; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; import org.apache.doris.cloud.stage.StageUtil; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; @@ -9463,7 +9464,19 @@ public LogicalPlan visitWarmUpCluster(DorisParser.WarmUpClusterContext ctx) { isForce = true; } ImmutableMap properties = ImmutableMap.copyOf(visitPropertyClause(ctx.properties)); - return new WarmUpClusterCommand(warmUpItems, srcCluster, dstCluster, isForce, isWarmUpWithTable, properties); + List onTablesRules = new ArrayList<>(); + if (ctx.onTablesClause() != null) { + for (DorisParser.OnTablesFilterRuleContext ruleContext + : ctx.onTablesClause().onTablesFilterRule()) { + TableFilterRule.RuleType ruleType = ruleContext.INCLUDE() != null + ? TableFilterRule.RuleType.INCLUDE + : TableFilterRule.RuleType.EXCLUDE; + onTablesRules.add(new TableFilterRule( + ruleType, stripQuotes(ruleContext.STRING_LITERAL().getText()))); + } + } + return new WarmUpClusterCommand(warmUpItems, srcCluster, dstCluster, isForce, + isWarmUpWithTable, properties, onTablesRules); } void fileCacheAdmissionCheck(DorisParser.WarmUpSelectContext ctx) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowWarmUpCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowWarmUpCommand.java index 29f9487dcc13b9..063cfea68bbf7f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowWarmUpCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowWarmUpCommand.java @@ -54,6 +54,9 @@ public class ShowWarmUpCommand extends ShowCommand { .add("FinishTime") .add("ErrMsg") .add("Tables") + .add("TableFilter") + .add("MatchedTables") + .add("SyncStats") .build(); private Expression whereClause; private boolean showAllJobs = false; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/WarmUpClusterCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/WarmUpClusterCommand.java index 1a077573417db0..4c6d74c898f54f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/WarmUpClusterCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/WarmUpClusterCommand.java @@ -23,6 +23,7 @@ import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.info.TableNameInfo; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; import org.apache.doris.cloud.catalog.CloudEnv; import org.apache.doris.cloud.catalog.ComputeGroup; import org.apache.doris.cloud.system.CloudSystemInfoService; @@ -64,6 +65,7 @@ public class WarmUpClusterCommand extends Command implements ForwardWithSync { private boolean isWarmUpWithTable; private List> tables = new ArrayList<>(); private Map properties = new HashMap<>(); + private List onTablesRules = new ArrayList<>(); /** * WarmUpClusterCommand @@ -87,8 +89,20 @@ public WarmUpClusterCommand(List warmUpItems, boolean isForce, boolean isWarmUpWithTable, Map properties) { + this(warmUpItems, srcCluster, dstCluster, isForce, isWarmUpWithTable, properties, + new ArrayList<>()); + } + + public WarmUpClusterCommand(List warmUpItems, + String srcCluster, + String dstCluster, + boolean isForce, + boolean isWarmUpWithTable, + Map properties, + List onTablesRules) { this(warmUpItems, srcCluster, dstCluster, isForce, isWarmUpWithTable); - this.properties = properties; + this.properties = properties == null ? new HashMap<>() : properties; + this.onTablesRules = onTablesRules == null ? new ArrayList<>() : onTablesRules; } public List getWarmUpItems() { @@ -115,6 +129,10 @@ public List> getTables() { return tables; } + public List getOnTablesRules() { + return onTablesRules; + } + @Override public void run(ConnectContext ctx, StmtExecutor executor) throws Exception { validate(ctx); @@ -140,10 +158,16 @@ private void checkWarmupCgs(CloudSystemInfoService cloudSys) throws AnalysisExce if (!Strings.isNullOrEmpty(srcCluster) && !Strings.isNullOrEmpty(dstCluster)) { String srcMayOwnedVcg = cloudSys.ownedByVirtualComputeGroup(srcCluster); - String dstMayOwnedVcg = cloudSys.ownedByVirtualComputeGroup(srcCluster); - if (srcMayOwnedVcg != null && srcMayOwnedVcg.equals(dstMayOwnedVcg)) { - throw new AnalysisException("The srcClusterName " + srcCluster + " dstClusterName " + dstCluster - + " is owned by virtual compute group " + srcMayOwnedVcg + " not support"); + String dstMayOwnedVcg = cloudSys.ownedByVirtualComputeGroup(dstCluster); + if (srcMayOwnedVcg != null && Objects.equals(srcMayOwnedVcg, dstMayOwnedVcg)) { + StringBuilder message = new StringBuilder("Cannot create warm up job from source compute group '") + .append(srcCluster).append("' to destination compute group '").append(dstCluster) + .append("': "); + message.append("source compute group '").append(srcCluster) + .append("' and destination compute group '").append(dstCluster) + .append("' are both owned by virtual compute group '").append(srcMayOwnedVcg) + .append("', not support"); + throw new AnalysisException(message.toString()); } } } @@ -180,6 +204,11 @@ public void validate(ConnectContext connectContext) throws UserException { + " is same with srcClusterName: " + srcCluster); } + boolean hasOnTablesRules = onTablesRules != null && !onTablesRules.isEmpty(); + if (hasOnTablesRules && isWarmUpWithTable) { + throw new AnalysisException("ON TABLES clause cannot be used with WITH TABLE warmup"); + } + if (isWarmUpWithTable) { for (WarmUpItem warmUpItem : warmUpItems) { TableNameInfo tableNameInfo = warmUpItem.getTableNameInfo(); @@ -203,6 +232,24 @@ public void validate(ConnectContext connectContext) throws UserException { tables.add(Triple.of(dbName, tableNameInfo.getTbl(), partitionName)); } } + + if (hasOnTablesRules) { + boolean hasInclude = onTablesRules.stream() + .anyMatch(r -> r.getRuleType() == TableFilterRule.RuleType.INCLUDE); + if (!hasInclude) { + throw new AnalysisException("ON TABLES clause must contain at least one INCLUDE rule"); + } + for (TableFilterRule rule : onTablesRules) { + if (!rule.getRawPattern().contains(".")) { + throw new AnalysisException("ON TABLES pattern must be in 'db.table' format: '" + + rule.getRawPattern() + "'"); + } + } + String syncMode = properties.get("sync_mode"); + if (!"event_driven".equals(syncMode)) { + throw new AnalysisException("ON TABLES clause is only supported with event_driven sync_mode"); + } + } } private void handleWarmUp(ConnectContext ctx, StmtExecutor executor) throws IOException { diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java new file mode 100644 index 00000000000000..12b9b8ef127f1c --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java @@ -0,0 +1,1003 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import org.apache.doris.catalog.DatabaseIf; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.TableIf; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule.RuleType; +import org.apache.doris.cloud.system.CloudSystemInfoService; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.datasource.CatalogMgr; +import org.apache.doris.datasource.InternalCatalog; +import org.apache.doris.nereids.trees.plans.commands.WarmUpClusterCommand; +import org.apache.doris.persist.EditLog; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.LogEvent; +import org.apache.logging.log4j.core.Logger; +import org.apache.logging.log4j.core.appender.AbstractAppender; +import org.apache.logging.log4j.core.config.Property; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; + +/** + * Tests for CacheHotspotManager's table filter methods: + * resolveTableIds() and refreshAllTableFilters(). + * Uses Mockito to mock Env.getCurrentInternalCatalog() with fake databases/tables. + */ +public class CacheHotspotManagerTableFilterTest { + + private Env env; + private CatalogMgr mockCatalogMgr; + private InternalCatalog mockCatalog; + private EditLog mockEditLog; + private CacheHotspotManager manager; + private List> databases; + private Object originalCatalogMgr; + private EditLog originalEditLog; + private Object originalSystemInfo; + + private static Object getField(Object target, Class clazz, String fieldName) throws Exception { + Field field = clazz.getDeclaredField(fieldName); + field.setAccessible(true); + return field.get(target); + } + + private static void setField(Object target, Class clazz, String fieldName, Object value) throws Exception { + Field field = clazz.getDeclaredField(fieldName); + field.setAccessible(true); + field.set(target, value); + } + + @BeforeEach + public void setUp() throws Exception { + env = Env.getCurrentEnv(); + mockCatalogMgr = Mockito.mock(CatalogMgr.class); + mockCatalog = Mockito.mock(InternalCatalog.class); + mockEditLog = Mockito.mock(EditLog.class); + + originalCatalogMgr = getField(env, Env.class, "catalogMgr"); + originalEditLog = env.getEditLog(); + originalSystemInfo = getField(env, Env.class, "systemInfo"); + setField(env, Env.class, "catalogMgr", mockCatalogMgr); + setField(env, Env.class, "systemInfo", Mockito.mock(CloudSystemInfoService.class)); + env.setEditLog(mockEditLog); + Mockito.when(mockCatalogMgr.getInternalCatalog()).thenReturn(mockCatalog); + + databases = new ArrayList<>(); + Mockito.when(mockCatalog.getAllDbs()).thenAnswer(inv -> databases); + + manager = new CacheHotspotManager(Mockito.mock(CloudSystemInfoService.class)); + } + + @AfterEach + public void tearDown() throws Exception { + setField(env, Env.class, "catalogMgr", originalCatalogMgr); + setField(env, Env.class, "systemInfo", originalSystemInfo); + env.setEditLog(originalEditLog); + } + + @SuppressWarnings("unchecked") + private DatabaseIf mockDb(String name, TableIf... tables) { + DatabaseIf db = Mockito.mock(DatabaseIf.class); + Mockito.when(db.getFullName()).thenReturn(name); + // For resolveTableIds: getTableNamesOrEmptyWithLock + getTableNullable + HashSet tableNames = new HashSet<>(); + for (TableIf t : tables) { + tableNames.add(t.getName()); + Mockito.when(db.getTableNullable(t.getName())).thenReturn(t); + } + Mockito.when(db.getTableNamesOrEmptyWithLock()).thenReturn(tableNames); + // Keep getTables for other test paths (refreshAllTableFilters) + Mockito.when(db.getTables()).thenReturn(Arrays.asList(tables)); + return db; + } + + private TableIf mockTable(long id, String name) { + TableIf table = Mockito.mock(TableIf.class); + Mockito.when(table.getId()).thenReturn(id); + Mockito.when(table.getName()).thenReturn(name); + Mockito.when(table.getType()).thenReturn(TableIf.TableType.OLAP); + Mockito.when(table.isManagedTable()).thenReturn(true); + return table; + } + + private TableIf mockMtmv(long id, String name) { + TableIf table = Mockito.mock(TableIf.class); + Mockito.when(table.getId()).thenReturn(id); + Mockito.when(table.getName()).thenReturn(name); + Mockito.when(table.getType()).thenReturn(TableIf.TableType.MATERIALIZED_VIEW); + Mockito.when(table.isManagedTable()).thenReturn(true); + return table; + } + + private TableIf mockView(long id, String name) { + TableIf table = Mockito.mock(TableIf.class); + Mockito.when(table.getId()).thenReturn(id); + Mockito.when(table.getName()).thenReturn(name); + Mockito.when(table.getType()).thenReturn(TableIf.TableType.VIEW); + Mockito.when(table.isManagedTable()).thenReturn(false); + return table; + } + + private OnTablesFilter buildFilter(TableFilterRule... rules) { + return new OnTablesFilter(Arrays.asList(rules)); + } + + private Map eventDrivenProperties() { + Map properties = new HashMap<>(); + properties.put("sync_mode", "event_driven"); + properties.put("sync_event", "load"); + return properties; + } + + private WarmUpClusterCommand buildEventDrivenStmt(String src, String dst, TableFilterRule... rules) { + return new WarmUpClusterCommand(new ArrayList<>(), src, dst, false, false, + eventDrivenProperties(), rules.length == 0 ? new ArrayList<>() : Arrays.asList(rules)); + } + + private CloudWarmUpJob createEventDrivenJob(String src, String dst, TableFilterRule... rules) throws Exception { + long jobId = manager.createJob(buildEventDrivenStmt(src, dst, rules)); + CloudWarmUpJob job = manager.getCloudWarmUpJob(jobId); + Assertions.assertNotNull(job); + return job; + } + + private CloudWarmUpJob replayEventDrivenJob(long jobId, String src, String dst, TableFilterRule... rules) + throws Exception { + CloudWarmUpJob.Builder builder = new CloudWarmUpJob.Builder() + .setJobId(jobId) + .setSrcClusterName(src) + .setDstClusterName(dst) + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD); + if (rules.length > 0) { + List persistedRules = new ArrayList<>(); + for (TableFilterRule rule : rules) { + CloudWarmUpJob.PersistedTableFilterRule persistedRule = + new CloudWarmUpJob.PersistedTableFilterRule(); + persistedRule.ruleType = rule.getRuleType().name(); + persistedRule.pattern = rule.getRawPattern(); + persistedRules.add(persistedRule); + } + builder.setTableFilterRules(persistedRules); + } + CloudWarmUpJob job = builder.build(); + manager.replayCloudWarmUpJob(job); + return job; + } + + // ===== resolveTableIds() ===== + + @Test + public void testResolveTableIdsBasicMatching() { + // Scenario: INCLUDE 'ods.*' matches all tables in ods database + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"), + mockTable(1003, "tmp_staging"))); + databases.add(mockDb("dw", + mockTable(2001, "fact_sales"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(3, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + Assertions.assertEquals("ods.users", idNames.get(1002L)); + Assertions.assertEquals("ods.tmp_staging", idNames.get(1003L)); + Assertions.assertFalse(idNames.containsKey(2001L)); + } + + @Test + public void testResolveTableIdsWithExclude() { + // Scenario: INCLUDE 'ods.*' EXCLUDE 'ods.tmp_*' — exclude tmp tables + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "tmp_staging"), + mockTable(1003, "tmp_data"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.EXCLUDE, "ods.tmp_*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(1, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + } + + @Test + public void testResolveTableIdsMultipleDatabases() { + // Scenario: INCLUDE 'ods.*', INCLUDE 'dw.fact_*' — match across two databases + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"))); + databases.add(mockDb("dw", + mockTable(2001, "fact_sales"), + mockTable(2002, "dim_product"), + mockTable(2003, "fact_orders"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.INCLUDE, "dw.fact_*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(4, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + Assertions.assertEquals("ods.users", idNames.get(1002L)); + Assertions.assertEquals("dw.fact_sales", idNames.get(2001L)); + Assertions.assertEquals("dw.fact_orders", idNames.get(2003L)); + } + + @Test + public void testResolveTableIdsNoMatch() { + // Scenario: pattern matches nothing → empty map + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "nonexistent.*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertTrue(idNames.isEmpty()); + } + + @Test + public void testResolveTableIdsNullFilter() { + Map idNames = manager.resolveTableIds(null); + Assertions.assertTrue(idNames.isEmpty()); + } + + @Test + public void testResolveTableIdsSkipsViews() { + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockView(1002, "orders_view"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(1, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + Assertions.assertFalse(idNames.containsKey(1002L)); + } + + @Test + public void testResolveTableIdsDbNameWithPrefix() { + // CacheHotspotManager strips "default_cluster:" prefix from db name + databases.add(mockDb("default_cluster:ods", + mockTable(1001, "orders"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(1, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + } + + // ===== resolveTableIds() with dynamic table changes ===== + + @Test + public void testResolveTableIdsAfterNewTableCreated() { + // Initial: ods has orders. After new table created, re-resolve picks it up. + DatabaseIf odsDb = mockDb("ods", mockTable(1001, "orders")); + databases.add(odsDb); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + Map ids1 = manager.resolveTableIds(filter); + Assertions.assertEquals(1, ids1.size()); + + // Simulate new table created: replace the db mock to include new table + databases.clear(); + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1004, "payments"))); + + Map ids2 = manager.resolveTableIds(filter); + Assertions.assertEquals(2, ids2.size()); + Assertions.assertEquals("ods.orders", ids2.get(1001L)); + Assertions.assertEquals("ods.payments", ids2.get(1004L)); + } + + @Test + public void testResolveTableIdsAfterTableDropped() { + // Initial: ods has orders and users. After orders dropped, re-resolve removes it. + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + Map ids1 = manager.resolveTableIds(filter); + Assertions.assertEquals(2, ids1.size()); + + databases.clear(); + databases.add(mockDb("ods", mockTable(1002, "users"))); + + Map ids2 = manager.resolveTableIds(filter); + Assertions.assertEquals(1, ids2.size()); + Assertions.assertEquals("ods.users", ids2.get(1002L)); + } + + @Test + public void testResolveTableIdsAfterTableRenamed() { + // Scenario from user guide: INCLUDE 'db.order_*', rename order_2024→archive_2024 → stops matching + databases.add(mockDb("db", + mockTable(1001, "order_2024"), + mockTable(1002, "order_2025"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db.order_*")); + + Map ids1 = manager.resolveTableIds(filter); + Assertions.assertEquals(2, ids1.size()); + + // Rename order_2024 → archive_2024 (no longer matches order_*) + databases.clear(); + databases.add(mockDb("db", + mockTable(1001, "archive_2024"), + mockTable(1002, "order_2025"))); + + Map ids2 = manager.resolveTableIds(filter); + Assertions.assertEquals(1, ids2.size()); + Assertions.assertEquals("db.order_2025", ids2.get(1002L)); + } + + @Test + public void testResolveTableIdsAfterAllTablesDropped() { + // User guide: all matched tables dropped → empty set, Job stays RUNNING + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + Map ids1 = manager.resolveTableIds(filter); + Assertions.assertEquals(1, ids1.size()); + + databases.clear(); + databases.add(mockDb("ods")); // empty database + + Map ids2 = manager.resolveTableIds(filter); + Assertions.assertTrue(ids2.isEmpty()); + } + + // ===== refreshAllTableFilters() ===== + + @Test + public void testRefreshAllTableFiltersUpdatesJobTableIds() throws Exception { + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"))); + + CloudWarmUpJob job = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + // Verify initial resolution picked up 2 tables with correct names + Assertions.assertEquals( + new HashSet<>(Arrays.asList(1001L, 1002L)), + job.getCurrentTableIds()); + + // Simulate new table created + databases.clear(); + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"), + mockTable(1003, "payments"))); + + manager.refreshAllTableFilters(); + + // Verify job now has 3 table IDs + Assertions.assertEquals( + new HashSet<>(Arrays.asList(1001L, 1002L, 1003L)), + job.getCurrentTableIds()); + } + + @Test + public void testRefreshAllTableFiltersSkipsClusterLevelJob() throws Exception { + // Cluster-level job (no table filter) should not be affected by refresh + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + CloudWarmUpJob clusterJob = replayEventDrivenJob(200L, "write_cg", "read_cg"); + + // currentTableIds should be empty (no table filter) + Assertions.assertTrue(clusterJob.getCurrentTableIds().isEmpty()); + + manager.refreshAllTableFilters(); + + // Still empty after refresh — cluster-level jobs are skipped + Assertions.assertTrue(clusterJob.getCurrentTableIds().isEmpty()); + } + + @Test + public void testRefreshAllTableFiltersHandlesTableDrop() throws Exception { + // Setup: job matching ods.*, initially 2 tables + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"))); + + CloudWarmUpJob job = replayEventDrivenJob(300L, "write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Assertions.assertEquals(2, job.getCurrentTableIds().size()); + + // Drop one table + databases.clear(); + databases.add(mockDb("ods", mockTable(1002, "users"))); + + manager.refreshAllTableFilters(); + + Assertions.assertEquals( + new HashSet<>(Arrays.asList(1002L)), + job.getCurrentTableIds()); + } + + @Test + public void testRefreshAllTableFiltersUpdatesMatchedNamesAfterRenameStillMatches() throws Exception { + databases.add(mockDb("db", + mockTable(1001, "order_2024"), + mockTable(1002, "order_2025"))); + + CloudWarmUpJob job = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "db.order_*")); + Assertions.assertEquals("db.order_2024, db.order_2025", job.getJobInfo(null).get(14)); + + databases.clear(); + databases.add(mockDb("db", + mockTable(1001, "order_2024_v2"), + mockTable(1002, "order_2025"))); + + manager.refreshAllTableFilters(); + + Assertions.assertEquals(new HashSet<>(Arrays.asList(1001L, 1002L)), job.getCurrentTableIds()); + Assertions.assertEquals("db.order_2024_v2, db.order_2025", job.getJobInfo(null).get(14)); + } + + @Test + public void testCreateJobRejectsOnTablesWithoutInitialMatches() { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + WarmUpClusterCommand stmt = buildEventDrivenStmt("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "dw.*")); + + AnalysisException exception = Assertions.assertThrows(AnalysisException.class, + () -> manager.createJob(stmt)); + Assertions.assertTrue(exception.getMessage().contains("No tables matched the ON TABLES filter")); + } + + @Test + public void testCreateJobRejectsEquivalentDuplicateTableFilter() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + databases.add(mockDb("dw", + mockTable(2001, "fact_sales"), + mockTable(2002, "tmp_staging"))); + + WarmUpClusterCommand first = buildEventDrivenStmt("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.INCLUDE, "dw.*"), + new TableFilterRule(RuleType.EXCLUDE, "dw.tmp_*")); + WarmUpClusterCommand second = buildEventDrivenStmt("write_cg", "read_cg", + new TableFilterRule(RuleType.EXCLUDE, "dw.tmp_*"), + new TableFilterRule(RuleType.INCLUDE, "dw.*"), + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + manager.createJob(first); + + AnalysisException exception = Assertions.assertThrows(AnalysisException.class, + () -> manager.createJob(second)); + Assertions.assertTrue(exception.getMessage().contains("already has a runnable job")); + } + + @Test + public void testCreateJobRejectsTableLevelWhenClusterLevelLoadEventExists() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + WarmUpClusterCommand clusterLevel = buildEventDrivenStmt("write_cg", "read_cg"); + WarmUpClusterCommand tableLevel = buildEventDrivenStmt("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + long clusterJobId = manager.createJob(clusterLevel); + AnalysisException exception = Assertions.assertThrows(AnalysisException.class, + () -> manager.createJob(tableLevel)); + + Assertions.assertTrue(exception.getMessage().contains( + "Cannot create table-level load-event warm up job")); + Assertions.assertTrue(exception.getMessage().contains("cluster-level load-event warm up job " + + clusterJobId)); + Assertions.assertTrue(exception.getMessage().contains( + "Cancel existing load-event warm up job " + clusterJobId)); + Assertions.assertTrue(exception.getMessage().contains( + "source compute group 'write_cg' to destination compute group 'read_cg'")); + Assertions.assertEquals(1, manager.getAllJobInfos(10).size()); + } + + @Test + public void testCreateJobRejectsClusterLevelWhenTableLevelLoadEventExists() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + WarmUpClusterCommand tableLevel = buildEventDrivenStmt("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + WarmUpClusterCommand clusterLevel = buildEventDrivenStmt("write_cg", "read_cg"); + + long tableJobId = manager.createJob(tableLevel); + AnalysisException exception = Assertions.assertThrows(AnalysisException.class, + () -> manager.createJob(clusterLevel)); + + Assertions.assertTrue(exception.getMessage().contains( + "Cannot create cluster-level load-event warm up job")); + Assertions.assertTrue(exception.getMessage().contains("table-level load-event warm up job " + + tableJobId)); + Assertions.assertTrue(exception.getMessage().contains( + "Cancel existing load-event warm up job " + tableJobId)); + Assertions.assertTrue(exception.getMessage().contains( + "with table filter [{\"include\":[\"ods.*\"]}]")); + Assertions.assertEquals(1, manager.getAllJobInfos(10).size()); + } + + @Test + public void testVirtualComputeGroupCancelsExistingTableLevelLoadEvent() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + CloudWarmUpJob tableLevelJob = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob reverseTableLevelJob = createEventDrivenJob("read_cg", "write_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob unrelatedTableLevelJob = createEventDrivenJob("write_cg", "outside_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob clusterLevelJob = manager.getCloudWarmUpJob( + manager.createJob(buildEventDrivenStmt("other_write_cg", "other_read_cg"))); + CloudWarmUpJob finishedTableLevelJob = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.orders")); + setField(finishedTableLevelJob, CloudWarmUpJob.class, "jobState", CloudWarmUpJob.JobState.CANCELLED); + + List cancelledJobIds = new ArrayList<>(); + Map cancelReasons = new HashMap<>(); + CacheHotspotManager spyManager = Mockito.spy(manager); + Mockito.doAnswer(invocation -> { + Long jobId = invocation.getArgument(0); + String errMsg = invocation.getArgument(1); + cancelledJobIds.add(jobId); + cancelReasons.put(jobId, errMsg); + return null; + }).when(spyManager).cancel(Mockito.anyLong(), Mockito.anyString()); + + String reason = "vcg cancel table-level load-event warm up job before rebuilding file cache jobs"; + RecordingAppender appender = new RecordingAppender("vcg-cancel-table-warmup-test"); + Logger logger = (Logger) LogManager.getLogger(CacheHotspotManager.class); + appender.start(); + logger.addAppender(appender); + try { + spyManager.cancelTableLevelLoadEventWarmUpJobsForVirtualComputeGroup( + "vcg", "write_cg", "read_cg", Arrays.asList("write_cg", "read_cg"), reason); + } finally { + logger.removeAppender(appender); + appender.stop(); + } + + Assertions.assertEquals(new HashSet<>(Arrays.asList( + tableLevelJob.getJobId(), reverseTableLevelJob.getJobId())), + new HashSet<>(cancelledJobIds)); + Assertions.assertEquals(2, cancelledJobIds.size()); + String expectedReason = reason + " for virtual compute group 'vcg'"; + Assertions.assertEquals(expectedReason, cancelReasons.get(tableLevelJob.getJobId())); + Assertions.assertEquals(expectedReason, cancelReasons.get(reverseTableLevelJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(unrelatedTableLevelJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(clusterLevelJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(finishedTableLevelJob.getJobId())); + + String logs = appender.messagesAsString(); + Assertions.assertTrue(logs.contains("virtual compute group 'vcg'"), logs); + Assertions.assertTrue(logs.contains(expectedReason), logs); + } + + @Test + public void testCancelTableFilterJobsForClusterChangeOnlyCancelsMatchingTableFilterJobs() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + CloudWarmUpJob srcMatchedJob = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob dstMatchedJob = createEventDrivenJob("other_write_cg", "write_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob clusterLevelJob = manager.getCloudWarmUpJob( + manager.createJob(buildEventDrivenStmt("write_cg", "cluster_level_read_cg"))); + CloudWarmUpJob unrelatedJob = createEventDrivenJob("other_write_cg", "other_read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob finishedJob = createEventDrivenJob("write_cg", "finished_read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + setField(finishedJob, CloudWarmUpJob.class, "jobState", CloudWarmUpJob.JobState.CANCELLED); + + List cancelledJobIds = new ArrayList<>(); + Map cancelReasons = new HashMap<>(); + CacheHotspotManager spyManager = Mockito.spy(manager); + Mockito.doAnswer(invocation -> { + Long jobId = invocation.getArgument(0); + String errMsg = invocation.getArgument(1); + cancelledJobIds.add(jobId); + cancelReasons.put(jobId, errMsg); + return null; + }).when(spyManager).cancel(Mockito.anyLong(), Mockito.anyString()); + + String reason = "system cancel: compute group write_cg renamed to write_cg_new"; + spyManager.cancelTableFilterJobsForClusterChange("write_cg", reason); + + Assertions.assertEquals(new HashSet<>(Arrays.asList( + srcMatchedJob.getJobId(), dstMatchedJob.getJobId())), + new HashSet<>(cancelledJobIds)); + Assertions.assertEquals(2, cancelledJobIds.size()); + Assertions.assertEquals(reason, cancelReasons.get(srcMatchedJob.getJobId())); + Assertions.assertEquals(reason, cancelReasons.get(dstMatchedJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(clusterLevelJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(unrelatedJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(finishedJob.getJobId())); + } + + // ===== Async materialized view (MTMV) matching ===== + + @Test + public void testResolveTableIdsMatchesAsyncMaterializedView() { + // Async MVs (MTMV) are separate table entries in the database catalog. + // They should be matched by ON TABLES filter just like regular OlapTables. + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"), + mockMtmv(1003, "mv_order_summary"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(3, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + Assertions.assertEquals("ods.users", idNames.get(1002L)); + Assertions.assertEquals("ods.mv_order_summary", idNames.get(1003L)); + } + + @Test + public void testResolveTableIdsMtmvMatchedByMvPattern() { + // Verify async MVs can be matched by mv_* pattern while base tables are not + databases.add(mockDb("analytics", + mockTable(2001, "fact_sales"), + mockMtmv(2002, "mv_daily_sales"), + mockMtmv(2003, "mv_monthly_revenue"), + mockTable(2004, "dim_product"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "analytics.mv_*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(2, idNames.size()); + Assertions.assertEquals("analytics.mv_daily_sales", idNames.get(2002L)); + Assertions.assertEquals("analytics.mv_monthly_revenue", idNames.get(2003L)); + Assertions.assertFalse(idNames.containsKey(2001L)); + Assertions.assertFalse(idNames.containsKey(2004L)); + } + + @Test + public void testResolveTableIdsMtmvExcludedByPattern() { + // Verify async MVs can be excluded by EXCLUDE rule + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockMtmv(1002, "mv_order_summary"), + mockMtmv(1003, "mv_user_stats"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.EXCLUDE, "ods.mv_*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(1, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + } + + @Test + public void testResolveTableIdsMixedTableTypesAcrossDatabases() { + // Multiple databases with mixed OlapTable and MTMV types + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockMtmv(1002, "mv_orders_agg"))); + databases.add(mockDb("dw", + mockTable(2001, "fact_sales"), + mockMtmv(2002, "mv_daily_report"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.INCLUDE, "dw.mv_*")); + Map idNames = manager.resolveTableIds(filter); + + // ods.* matches orders + mv_orders_agg; dw.mv_* matches mv_daily_report + Assertions.assertEquals(3, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + Assertions.assertEquals("ods.mv_orders_agg", idNames.get(1002L)); + Assertions.assertEquals("dw.mv_daily_report", idNames.get(2002L)); + Assertions.assertFalse(idNames.containsKey(2001L)); + } + + @Test + public void testRefreshAllTableFiltersPicksUpNewMtmv() throws Exception { + // When a new async MV is created after job creation, refreshAllTableFilters picks it up + databases.add(mockDb("ods", + mockTable(1001, "orders"))); + + CloudWarmUpJob job = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Assertions.assertEquals(1, job.getCurrentTableIds().size()); + + // Simulate async MV created + databases.clear(); + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockMtmv(1002, "mv_order_summary"))); + + manager.refreshAllTableFilters(); + + Assertions.assertEquals( + new HashSet<>(Arrays.asList(1001L, 1002L)), + job.getCurrentTableIds()); + } + + // ========== Performance tests: regex matching throughput at scale ========== + + /** + * Generate table name strings (db.table) for timing shouldWarmUp regex calls. + * No mocks needed — we test the filter's regex matching performance directly. + */ + private List generateTableNames(int dbCount, int tablesPerDb) { + List names = new ArrayList<>(dbCount * tablesPerDb); + for (int d = 0; d < dbCount; d++) { + String db = "db_" + d; + for (int t = 0; t < tablesPerDb; t++) { + names.add(new String[]{db, "tbl_" + String.format("%05d", t)}); + } + } + return names; + } + + @Test + public void testShouldWarmUpPerformance10kTables() { + List names = generateTableNames(10, 1000); // 10K + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_*.*")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + Assertions.assertEquals(10000, matched); + System.out.println("[Perf] 10K tables, wildcard match-all: " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 500, + "10K regex matches should complete within 500ms, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformance50kTables() { + List names = generateTableNames(50, 1000); // 50K + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_*.*")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + Assertions.assertEquals(50000, matched); + System.out.println("[Perf] 50K tables, wildcard match-all: " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 500, + "50K regex matches should complete within 500ms, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformance200kTables() { + List names = generateTableNames(100, 2000); // 200K + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_*.*")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + Assertions.assertEquals(200000, matched); + System.out.println("[Perf] 200K tables, wildcard match-all: " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 1000, + "200K regex matches should complete within 1s, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformance500kTables() { + List names = generateTableNames(100, 5000); // 500K + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_*.*")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + Assertions.assertEquals(500000, matched); + System.out.println("[Perf] 500K tables, wildcard match-all: " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 2000, + "500K regex matches should complete within 2s, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformanceSelectivePattern50k() { + List names = generateTableNames(50, 1000); // 50K + // Only match tables in db_0 + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_0.*")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + Assertions.assertEquals(1000, matched); + System.out.println("[Perf] 50K tables, selective db_0 pattern: " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 500, + "50K regex matches (selective) should complete within 500ms, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformanceMultipleRules50k() { + List names = generateTableNames(50, 1000); // 50K + // Include db_1* tables, exclude tables ending with digit 9 + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_1*.*"), + new TableFilterRule(RuleType.EXCLUDE, "*.*9")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + // db_1, db_10..db_19 = 11 dbs × 1000 tables = 11000 candidates + // Exclude tables ending with "9": tbl_00009, tbl_00019, ..., tbl_00999 = 100 per db + // Result = 11000 - 11*100 = 9900 + Assertions.assertEquals(9900, matched); + System.out.println("[Perf] 50K tables, include+exclude: " + elapsedMs + " ms, matched=" + matched); + Assertions.assertTrue(elapsedMs < 500, + "50K regex matches (multi-rule) should complete within 500ms, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformanceManyRules200k() { + List names = generateTableNames(100, 2000); // 200K + // 10 include rules + 5 exclude rules + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_0.*"), + new TableFilterRule(RuleType.INCLUDE, "db_1.*"), + new TableFilterRule(RuleType.INCLUDE, "db_2.*"), + new TableFilterRule(RuleType.INCLUDE, "db_3.*"), + new TableFilterRule(RuleType.INCLUDE, "db_4.*"), + new TableFilterRule(RuleType.INCLUDE, "db_5.*"), + new TableFilterRule(RuleType.INCLUDE, "db_6.*"), + new TableFilterRule(RuleType.INCLUDE, "db_7.*"), + new TableFilterRule(RuleType.INCLUDE, "db_8.*"), + new TableFilterRule(RuleType.INCLUDE, "db_9.*"), + new TableFilterRule(RuleType.EXCLUDE, "*.tbl_00000"), + new TableFilterRule(RuleType.EXCLUDE, "*.tbl_00001"), + new TableFilterRule(RuleType.EXCLUDE, "*.tbl_00002"), + new TableFilterRule(RuleType.EXCLUDE, "*.tbl_00003"), + new TableFilterRule(RuleType.EXCLUDE, "*.tbl_00004")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + // 10 dbs × 2000 tables = 20000 included, minus 10 × 5 excluded = 19950 + Assertions.assertEquals(19950, matched); + System.out.println("[Perf] 200K tables, 15 rules (10 incl + 5 excl): " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 2000, + "200K regex matches with 15 rules should complete within 2s, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformanceRepeatedCycles200k() { + List names = generateTableNames(100, 2000); // 200K + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_*.*")); + + // JIT warm-up + for (String[] pair : names) { + filter.shouldWarmUp(pair[0], pair[1]); + } + + long start = System.nanoTime(); + int iterations = 5; + int totalMatched = 0; + for (int i = 0; i < iterations; i++) { + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + totalMatched++; + } + } + } + long totalMs = (System.nanoTime() - start) / 1_000_000; + long avgMs = totalMs / iterations; + + Assertions.assertEquals(200000 * iterations, totalMatched); + System.out.println("[Perf] 200K tables × 5 cycles: total=" + totalMs + " ms, avg=" + avgMs + " ms/cycle"); + Assertions.assertTrue(avgMs < 1000, + "Avg per refresh cycle for 200K tables should be < 1s, avg=" + avgMs + " ms"); + } + + private static class RecordingAppender extends AbstractAppender { + private final List messages = new ArrayList<>(); + + RecordingAppender(String name) { + super(name, null, null, true, Property.EMPTY_ARRAY); + } + + @Override + public void append(LogEvent event) { + messages.add(event.getMessage().getFormattedMessage()); + } + + String messagesAsString() { + return String.join("\n", messages); + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/CloudWarmUpJobTableFilterTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/CloudWarmUpJobTableFilterTest.java new file mode 100644 index 00000000000000..1af6bf284db221 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/CloudWarmUpJobTableFilterTest.java @@ -0,0 +1,461 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import org.apache.doris.cloud.CloudWarmUpJob.PersistedTableFilterRule; +import org.apache.doris.common.Config; +import org.apache.doris.common.io.Text; + +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Tests for table-filter extensions in {@link CloudWarmUpJob}: + * canonicalize(), rebuildOnTablesFilter(), hasTableFilter(), getJobInfo(), + * getMatchedTablesString(), dynamic table ID tracking, SHOW WARM UP JOB columns. + */ +public class CloudWarmUpJobTableFilterTest { + + private static final int COL_JOB_ID = 0; + private static final int COL_SRC = 1; + private static final int COL_DST = 2; + private static final int COL_STATUS = 3; + private static final int COL_TYPE = 4; + private static final int COL_SYNC_MODE = 5; + private static final int COL_CREATE_TIME = 6; + private static final int COL_START_TIME = 7; + private static final int COL_FINISH_BATCH = 8; + private static final int COL_ALL_BATCH = 9; + private static final int COL_FINISH_TIME = 10; + private static final int COL_ERR_MSG = 11; + private static final int COL_TABLES = 12; + private static final int COL_TABLE_FILTER = 13; + private static final int COL_MATCHED_TABLES = 14; + private static final int COL_SYNC_STATS = 15; + private static final int TOTAL_COLUMNS = 16; + + private PersistedTableFilterRule rule(String type, String pattern) { + PersistedTableFilterRule r = new PersistedTableFilterRule(); + r.ruleType = type; + r.pattern = pattern; + return r; + } + + private CloudWarmUpJob.Builder baseBuilder() { + return new CloudWarmUpJob.Builder() + .setJobId(1L) + .setSrcClusterName("write_cg") + .setDstClusterName("read_cg") + .setJobType(CloudWarmUpJob.JobType.TABLES) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN); + } + + private CloudWarmUpJob.Builder clusterBuilder() { + return new CloudWarmUpJob.Builder() + .setJobId(1L) + .setSrcClusterName("write_cg") + .setDstClusterName("read_cg") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN); + } + + // ===== canonicalize() ===== + + @Test + public void testCanonicalizeIncludeOnly() { + List rules = Arrays.asList( + rule("INCLUDE", "dw.*"), + rule("INCLUDE", "ods.*")); + String expr = CloudWarmUpJob.canonicalize(rules); + Assertions.assertEquals("{\"include\":[\"dw.*\",\"ods.*\"]}", expr); + } + + @Test + public void testCanonicalizeWithExclude() { + List rules = Arrays.asList( + rule("INCLUDE", "ods.*"), + rule("INCLUDE", "dw.*"), + rule("EXCLUDE", "dw.tmp_*")); + String expr = CloudWarmUpJob.canonicalize(rules); + Assertions.assertEquals("{\"include\":[\"dw.*\",\"ods.*\"],\"exclude\":[\"dw.tmp_*\"]}", expr); + } + + @Test + public void testCanonicalizeOrderIndependentAndDedup() { + // Different order + duplicates → same canonical form (FAQ: order doesn't matter) + List rules1 = Arrays.asList( + rule("INCLUDE", "ods.*"), rule("INCLUDE", "dw.*"), rule("EXCLUDE", "dw.tmp_*")); + List rules2 = Arrays.asList( + rule("EXCLUDE", "dw.tmp_*"), rule("INCLUDE", "dw.*"), + rule("INCLUDE", "ods.*"), rule("INCLUDE", "ods.*")); + Assertions.assertEquals( + CloudWarmUpJob.canonicalize(rules1), + CloudWarmUpJob.canonicalize(rules2)); + } + + @Test + public void testBuilderNormalizesPersistedTableFilterRules() { + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList( + rule("EXCLUDE", "dw.tmp_*"), + rule("INCLUDE", "ods.*"), + rule("INCLUDE", "dw.*"), + rule("INCLUDE", "ods.*"))) + .build(); + + List normalizedRules = job.getTableFilterRules(); + Assertions.assertEquals(3, normalizedRules.size()); + Assertions.assertEquals("INCLUDE", normalizedRules.get(0).ruleType); + Assertions.assertEquals("dw.*", normalizedRules.get(0).pattern); + Assertions.assertEquals("INCLUDE", normalizedRules.get(1).ruleType); + Assertions.assertEquals("ods.*", normalizedRules.get(1).pattern); + Assertions.assertEquals("EXCLUDE", normalizedRules.get(2).ruleType); + Assertions.assertEquals("dw.tmp_*", normalizedRules.get(2).pattern); + } + + @Test + public void testCanonicalizeExcludeKeyAbsentWhenNoExcludes() { + String expr = CloudWarmUpJob.canonicalize(Arrays.asList(rule("INCLUDE", "ods.*"))); + Assertions.assertFalse(expr.contains("exclude")); + } + + @Test + public void testCanonicalizeEmptyRules() { + String expr = CloudWarmUpJob.canonicalize(new ArrayList<>()); + Assertions.assertEquals("{\"include\":[]}", expr); + } + + // ===== rebuildOnTablesFilter() ===== + + @Test + public void testRebuildOnTablesFilter() { + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList( + rule("INCLUDE", "ods.*"), rule("EXCLUDE", "ods.tmp_*"))) + .build(); + job.rebuildOnTablesFilter(); + + OnTablesFilter filter = job.getOnTablesFilter(); + Assertions.assertNotNull(filter); + Assertions.assertTrue(filter.shouldWarmUp("ods", "orders")); + Assertions.assertFalse(filter.shouldWarmUp("ods", "tmp_staging")); + Assertions.assertFalse(filter.shouldWarmUp("dw", "something")); + } + + @Test + public void testRebuildOnTablesFilterAlsoComputesExpr() { + // tableFilterExpr is transient, so after rebuild it should be recomputed from rules + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList( + rule("INCLUDE", "ods.*"), rule("EXCLUDE", "ods.tmp_*"))) + .build(); + job.rebuildOnTablesFilter(); + List info = job.getJobInfo(null); + Assertions.assertEquals("{\"include\":[\"ods.*\"],\"exclude\":[\"ods.tmp_*\"]}", + info.get(COL_TABLE_FILTER)); + } + + @Test + public void testReadNormalizesPersistedTableFilterRules() throws IOException { + String json = "{" + + "\"jobId\":1," + + "\"jobState\":\"PENDING\"," + + "\"srcClusterName\":\"write_cg\"," + + "\"cloudClusterName\":\"read_cg\"," + + "\"JobType\":\"TABLES\"," + + "\"syncMode\":\"EVENT_DRIVEN\"," + + "\"tableFilterRules\":[" + + "{\"ruleType\":\"EXCLUDE\",\"pattern\":\"dw.tmp_*\"}," + + "{\"ruleType\":\"INCLUDE\",\"pattern\":\"ods.*\"}," + + "{\"ruleType\":\"INCLUDE\",\"pattern\":\"dw.*\"}," + + "{\"ruleType\":\"INCLUDE\",\"pattern\":\"ods.*\"}" + + "]" + + "}"; + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + DataOutputStream out = new DataOutputStream(bytes); + Text.writeString(out, json); + out.flush(); + + CloudWarmUpJob job = CloudWarmUpJob.read( + new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()))); + + List normalizedRules = job.getTableFilterRules(); + Assertions.assertEquals(3, normalizedRules.size()); + Assertions.assertEquals("INCLUDE", normalizedRules.get(0).ruleType); + Assertions.assertEquals("dw.*", normalizedRules.get(0).pattern); + Assertions.assertEquals("INCLUDE", normalizedRules.get(1).ruleType); + Assertions.assertEquals("ods.*", normalizedRules.get(1).pattern); + Assertions.assertEquals("EXCLUDE", normalizedRules.get(2).ruleType); + Assertions.assertEquals("dw.tmp_*", normalizedRules.get(2).pattern); + Assertions.assertEquals("{\"include\":[\"dw.*\",\"ods.*\"],\"exclude\":[\"dw.tmp_*\"]}", + job.getTableFilterExpr()); + } + + @Test + public void testRebuildOnTablesFilterNoRules() { + CloudWarmUpJob job = baseBuilder().build(); + job.rebuildOnTablesFilter(); + Assertions.assertNull(job.getOnTablesFilter()); + } + + // ===== hasTableFilter() ===== + + @Test + public void testHasTableFilter() { + CloudWarmUpJob withFilter = baseBuilder() + .setTableFilterRules(Arrays.asList(rule("INCLUDE", "ods.*"))) + .build(); + Assertions.assertTrue(withFilter.hasTableFilter()); + + CloudWarmUpJob withoutFilter = baseBuilder().build(); + Assertions.assertFalse(withoutFilter.hasTableFilter()); + } + + // ===== tableFilterExpr derived from rules (single source of truth) ===== + + @Test + public void testTableFilterExprDerivedFromRules() { + // tableFilterExpr should be computed from rules, not set explicitly + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList( + rule("INCLUDE", "dw.*"), rule("INCLUDE", "ods.*"))) + .build(); + List info = job.getJobInfo(null); + Assertions.assertEquals("{\"include\":[\"dw.*\",\"ods.*\"]}", info.get(COL_TABLE_FILTER)); + } + + @Test + public void testTableFilterExprEmptyWhenNoRules() { + CloudWarmUpJob job = baseBuilder().build(); + List info = job.getJobInfo(null); + Assertions.assertEquals("", info.get(COL_TABLE_FILTER)); + } + + // ===== getJobInfo() — SHOW WARM UP JOB output ===== + + @Test + public void testGetJobInfoTableLevelJob() { + // Scenario: user creates a table-level event-driven job and runs SHOW WARM UP JOB + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList( + rule("INCLUDE", "ods.*"), rule("EXCLUDE", "ods.tmp_*"))) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + // Simulate resolved table IDs with db.table names + Map idNames = new HashMap<>(); + idNames.put(1001L, "ods.orders"); + idNames.put(1002L, "ods.products"); + idNames.put(1003L, "ods.users"); + job.setCurrentTableIdNames(idNames); + + List info = job.getJobInfo(null); + Assertions.assertEquals(TOTAL_COLUMNS, info.size()); + Assertions.assertEquals("1", info.get(COL_JOB_ID)); + Assertions.assertEquals("write_cg", info.get(COL_SRC)); + Assertions.assertEquals("read_cg", info.get(COL_DST)); + Assertions.assertEquals("PENDING", info.get(COL_STATUS)); + Assertions.assertEquals("TABLES", info.get(COL_TYPE)); + Assertions.assertTrue(info.get(COL_SYNC_MODE).contains("EVENT_DRIVEN")); + Assertions.assertEquals("{\"include\":[\"ods.*\"],\"exclude\":[\"ods.tmp_*\"]}", + info.get(COL_TABLE_FILTER)); + // MatchedTables should show sorted db.table names + Assertions.assertEquals("ods.orders, ods.products, ods.users", info.get(COL_MATCHED_TABLES)); + } + + @Test + public void testGetJobInfoMatchedTablesTruncated() { + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList(rule("INCLUDE", "ods.*"))) + .build(); + Map idNames = new HashMap<>(); + int originalDisplayLimit = Config.cloud_warm_up_matched_tables_display_limit; + Config.cloud_warm_up_matched_tables_display_limit = 3; + int totalTables = 5; + try { + for (int i = 0; i < totalTables; i++) { + idNames.put((long) i, String.format("ods.tbl_%03d", i)); + } + job.setCurrentTableIdNames(idNames); + + String matchedTables = job.getJobInfo(null).get(COL_MATCHED_TABLES); + Assertions.assertEquals("ods.tbl_000, ods.tbl_001, ods.tbl_002, " + + "... (truncated, 3 of 5 shown)", matchedTables); + Assertions.assertFalse(matchedTables.contains("ods.tbl_003")); + Assertions.assertEquals(totalTables, job.getCurrentTableIds().size()); + } finally { + Config.cloud_warm_up_matched_tables_display_limit = originalDisplayLimit; + } + } + + @Test + public void testMatchedTablesLogDisplayTruncated() { + List logEntries = new ArrayList<>(); + int originalDisplayLimit = Config.cloud_warm_up_matched_tables_display_limit; + Config.cloud_warm_up_matched_tables_display_limit = 3; + int totalTables = 5; + try { + for (int i = 0; i < totalTables; i++) { + logEntries.add(String.format("%d:ods.tbl_%03d", i, i)); + } + + String matchedTables = CloudWarmUpJob.formatMatchedTablesForDisplay(logEntries); + Assertions.assertEquals("0:ods.tbl_000, 1:ods.tbl_001, 2:ods.tbl_002, " + + "... (truncated, 3 of 5 shown)", matchedTables); + Assertions.assertFalse(matchedTables.contains("3:ods.tbl_003")); + } finally { + Config.cloud_warm_up_matched_tables_display_limit = originalDisplayLimit; + } + } + + @Test + public void testGetJobInfoClusterLevelJob() { + // Scenario: cluster-level job without ON TABLES — TableFilter and MatchedTables are empty + CloudWarmUpJob job = clusterBuilder() + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + List info = job.getJobInfo(null); + Assertions.assertEquals(TOTAL_COLUMNS, info.size()); + Assertions.assertEquals("CLUSTER", info.get(COL_TYPE)); + Assertions.assertEquals("", info.get(COL_TABLE_FILTER)); + Assertions.assertEquals("", info.get(COL_MATCHED_TABLES)); + Assertions.assertEquals("", info.get(COL_TABLES)); + } + + @Test + public void testGetJobInfoClusterLevelEventDrivenJobShowsSyncStats() { + CloudWarmUpJob job = clusterBuilder() + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + JobWarmUpStats stats = new JobWarmUpStats(); + stats.requestedSegmentNum30m = 6; + stats.requestedSegmentSize30m = 2048; + stats.requestedIndexNum30m = 2; + stats.requestedIndexSize30m = 1024; + stats.finishSegmentNum30m = 4; + stats.finishSegmentSize30m = 1024; + stats.finishIndexNum30m = 1; + stats.finishIndexSize30m = 512; + stats.failSegmentNum30m = 1; + stats.lastTriggerTs = 5000; + stats.progressTriggerTs = 4200; + stats.computeGap(); + + List detailed = job.getJobInfo(stats, true); + Assertions.assertEquals(TOTAL_COLUMNS, detailed.size()); + Assertions.assertEquals("CLUSTER", detailed.get(COL_TYPE)); + Assertions.assertEquals("", detailed.get(COL_TABLE_FILTER)); + Assertions.assertEquals("", detailed.get(COL_MATCHED_TABLES)); + + JsonObject detailStats = JsonParser.parseString(detailed.get(COL_SYNC_STATS)).getAsJsonObject(); + JsonObject segNum = detailStats.getAsJsonObject("seg_num"); + Assertions.assertEquals(6, segNum.get("requested_30m").getAsLong()); + Assertions.assertEquals(4, segNum.get("finish_30m").getAsLong()); + Assertions.assertEquals(2, segNum.get("gap_30m").getAsLong()); + Assertions.assertEquals(800, detailStats.get("trigger_gap_ms").getAsLong()); + Assertions.assertFalse(detailStats.has("window")); + + List summary = job.getJobInfo(stats, false); + JsonObject summaryStats = JsonParser.parseString(summary.get(COL_SYNC_STATS)).getAsJsonObject(); + Assertions.assertEquals("30m", summaryStats.get("window").getAsString()); + Assertions.assertEquals("3kb", summaryStats.get("src_size").getAsString()); + Assertions.assertEquals("1.5kb", summaryStats.get("dst_size").getAsString()); + Assertions.assertEquals("1.5kb", summaryStats.get("gap_size").getAsString()); + Assertions.assertEquals(800, summaryStats.get("trigger_gap_ms").getAsLong()); + Assertions.assertFalse(summaryStats.has("seg_num")); + } + + @Test + public void testGetJobInfoMatchedTablesEmpty() { + // Scenario: all matched tables have been dropped → MatchedTables becomes empty + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList(rule("INCLUDE", "ods.*"))) + .build(); + // Initially had tables, now all dropped + job.setCurrentTableIdNames(new HashMap<>()); + + List info = job.getJobInfo(null); + Assertions.assertEquals("{\"include\":[\"ods.*\"]}", info.get(COL_TABLE_FILTER)); + Assertions.assertEquals("", info.get(COL_MATCHED_TABLES)); + } + + // ===== Dynamic table ID tracking (simulating create/drop/rename) ===== + + @Test + public void testDynamicTableIdTracking() { + // Scenario: User guide says system re-evaluates every 60s. + // - Initial: tables 1001, 1002 matched + // - New table 1003 created → next refresh adds it + // - Table 1001 dropped → next refresh removes it + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList(rule("INCLUDE", "ods.*"))) + .build(); + + // Phase 1: initial resolution + Map initial = new HashMap<>(); + initial.put(1001L, "ods.orders"); + initial.put(1002L, "ods.products"); + job.setCurrentTableIdNames(initial); + Assertions.assertEquals(2, job.getCurrentTableIds().size()); + Assertions.assertTrue(job.getCurrentTableIds().contains(1001L)); + Assertions.assertTrue(job.getCurrentTableIds().contains(1002L)); + // Verify SHOW output shows db.table names + List info1 = job.getJobInfo(null); + Assertions.assertEquals("ods.orders, ods.products", info1.get(COL_MATCHED_TABLES)); + + // Phase 2: new table created + old table dropped (simulate refresh) + Map afterRefresh = new HashMap<>(); + afterRefresh.put(1002L, "ods.products"); + afterRefresh.put(1003L, "ods.users"); + job.setCurrentTableIdNames(afterRefresh); + Assertions.assertEquals(2, job.getCurrentTableIds().size()); + Assertions.assertFalse(job.getCurrentTableIds().contains(1001L)); + Assertions.assertTrue(job.getCurrentTableIds().contains(1003L)); + List info2 = job.getJobInfo(null); + Assertions.assertEquals("ods.products, ods.users", info2.get(COL_MATCHED_TABLES)); + + // Phase 3: all tables dropped → empty set (Job stays RUNNING per user guide) + job.setCurrentTableIdNames(new HashMap<>()); + Assertions.assertTrue(job.getCurrentTableIds().isEmpty()); + // TableFilter expr is still there (job not cancelled) + Assertions.assertTrue(job.hasTableFilter()); + List info3 = job.getJobInfo(null); + Assertions.assertEquals("", info3.get(COL_MATCHED_TABLES)); + } + + // ===== Builder validation ===== + + @Test + public void testBuilderMissingRequiredFieldsThrows() { + Assertions.assertThrows(IllegalStateException.class, () -> { + new CloudWarmUpJob.Builder().build(); + }); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/OnTablesFilterTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/OnTablesFilterTest.java new file mode 100644 index 00000000000000..65bfad86bc8a05 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/OnTablesFilterTest.java @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule.RuleType; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.Collections; + +/** + * Tests for {@link OnTablesFilter}: glob compilation, INCLUDE/EXCLUDE semantics, + * edge cases for wildcards and regex metacharacters. + */ +public class OnTablesFilterTest { + + private OnTablesFilter buildFilter(TableFilterRule... rules) { + return new OnTablesFilter(Arrays.asList(rules)); + } + + private TableFilterRule inc(String pattern) { + return new TableFilterRule(RuleType.INCLUDE, pattern); + } + + private TableFilterRule exc(String pattern) { + return new TableFilterRule(RuleType.EXCLUDE, pattern); + } + + // ===== Glob matching semantics ===== + + @Test + public void testGlobWildcards() { + // '*' matches any characters, '?' matches exactly one character + OnTablesFilter filter = buildFilter(inc("db?.tbl_*")); + Assertions.assertTrue(filter.shouldWarmUp("db1", "tbl_orders")); + Assertions.assertTrue(filter.shouldWarmUp("dbA", "tbl_")); + Assertions.assertFalse(filter.shouldWarmUp("db12", "tbl_x")); // '?' must match exactly one char + Assertions.assertFalse(filter.shouldWarmUp("db", "tbl_x")); // '?' requires one char, not zero + Assertions.assertFalse(filter.shouldWarmUp("db1", "orders")); // prefix must match + } + + @Test + public void testDotIsLiteral() { + // '.' is a regex metachar but in glob it should be literal + TableFilterRule rule = inc("ods.tbl"); + Assertions.assertTrue(rule.matches("ods.tbl")); + Assertions.assertFalse(rule.matches("odsXtbl")); // '.' must not match arbitrary char + } + + @Test + public void testRegexMetacharsEscaped() { + // All regex metacharacters should be treated as literals in glob + OnTablesFilter filter = buildFilter(inc("db(1).tbl[2]")); + Assertions.assertTrue(filter.shouldWarmUp("db(1)", "tbl[2]")); + Assertions.assertFalse(filter.shouldWarmUp("db1", "tbl2")); + } + + // ===== INCLUDE / EXCLUDE semantics ===== + + @Test + public void testIncludeOnlyMatchesTargetDb() { + OnTablesFilter filter = buildFilter(inc("ods.*")); + Assertions.assertTrue(filter.shouldWarmUp("ods", "orders")); + Assertions.assertTrue(filter.shouldWarmUp("ods", "users")); + Assertions.assertFalse(filter.shouldWarmUp("dw", "orders")); + } + + @Test + public void testExcludeOverridesInclude() { + OnTablesFilter filter = buildFilter(inc("ods.*"), exc("ods.tmp_*")); + Assertions.assertTrue(filter.shouldWarmUp("ods", "orders")); + Assertions.assertFalse(filter.shouldWarmUp("ods", "tmp_staging")); + Assertions.assertFalse(filter.shouldWarmUp("dw", "orders")); // not included + } + + @Test + public void testMultipleIncludesFormUnion() { + OnTablesFilter filter = buildFilter(inc("ods.*"), inc("dw.*")); + Assertions.assertTrue(filter.shouldWarmUp("ods", "orders")); + Assertions.assertTrue(filter.shouldWarmUp("dw", "fact_sales")); + Assertions.assertFalse(filter.shouldWarmUp("staging", "temp")); + } + + @Test + public void testExcludeOnlyNeverMatches() { + // No INCLUDE rules means nothing is included, regardless of EXCLUDE + OnTablesFilter filter = buildFilter(exc("ods.tmp_*")); + Assertions.assertFalse(filter.shouldWarmUp("ods", "orders")); + Assertions.assertFalse(filter.shouldWarmUp("ods", "tmp_staging")); + } + + @Test + public void testEmptyRulesNeverMatches() { + OnTablesFilter filter = new OnTablesFilter(Collections.emptyList()); + Assertions.assertFalse(filter.shouldWarmUp("any", "table")); + } + + // ===== Typical user scenario: multiple databases + selective exclusion ===== + + @Test + public void testComplexScenario() { + // Include everything in ods and dw, but exclude all tmp tables and a specific table + OnTablesFilter filter = buildFilter( + inc("ods.*"), inc("dw.*"), + exc("*.tmp_*"), exc("dw.secret_report")); + Assertions.assertTrue(filter.shouldWarmUp("ods", "orders")); + Assertions.assertTrue(filter.shouldWarmUp("dw", "fact_sales")); + Assertions.assertFalse(filter.shouldWarmUp("ods", "tmp_staging")); + Assertions.assertFalse(filter.shouldWarmUp("dw", "tmp_data")); + Assertions.assertFalse(filter.shouldWarmUp("dw", "secret_report")); + Assertions.assertFalse(filter.shouldWarmUp("staging", "anything")); + } + + // ===== Rule partitioning ===== + + @Test + public void testGetRulesPartition() { + OnTablesFilter filter = buildFilter(inc("ods.*"), exc("ods.tmp_*"), inc("dw.*")); + Assertions.assertEquals(2, filter.getIncludeRules().size()); + Assertions.assertEquals(1, filter.getExcludeRules().size()); + Assertions.assertEquals(3, filter.getAllRules().size()); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpClusterOnTablesParseTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpClusterOnTablesParseTest.java new file mode 100644 index 00000000000000..8bec1d2d5eb394 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpClusterOnTablesParseTest.java @@ -0,0 +1,447 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import org.apache.doris.catalog.Env; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule.RuleType; +import org.apache.doris.cloud.catalog.ComputeGroup; +import org.apache.doris.cloud.system.CloudSystemInfoService; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.Config; +import org.apache.doris.nereids.parser.NereidsParser; +import org.apache.doris.nereids.trees.plans.commands.WarmUpClusterCommand; +import org.apache.doris.qe.ConnectContext; + +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.lang.reflect.Field; +import java.util.Arrays; +import java.util.List; + +/** + * Tests parsing of WARM UP CLUSTER ... ON TABLES (...) grammar. + * Covers valid syntax, extracted rule types/patterns, and syntax errors. + */ +public class WarmUpClusterOnTablesParseTest { + + private static ConnectContext connectContext; + private static Env env; + private static Object originalSystemInfo; + + private static void setField(Object target, Class clazz, String fieldName, Object value) throws Exception { + Field field = clazz.getDeclaredField(fieldName); + field.setAccessible(true); + field.set(target, value); + } + + private static Object getField(Object target, Class clazz, String fieldName) throws Exception { + Field field = clazz.getDeclaredField(fieldName); + field.setAccessible(true); + return field.get(target); + } + + @BeforeAll + public static void init() throws Exception { + env = Env.getCurrentEnv(); + originalSystemInfo = getField(env, Env.class, "systemInfo"); + connectContext = new ConnectContext(); + connectContext.setEnv(env); + connectContext.setThreadLocalInfo(); + } + + @AfterAll + public static void tearDown() throws Exception { + setField(env, Env.class, "systemInfo", originalSystemInfo); + ConnectContext.remove(); + } + + private WarmUpClusterCommand parse(String sql) { + try { + return (WarmUpClusterCommand) new NereidsParser().parseSingle(sql); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private void analyze(WarmUpClusterCommand stmt) throws Exception { + stmt.validate(connectContext); + } + + private void mockValidateEnv(String srcCluster, String dstCluster) throws Exception { + CloudSystemInfoService cloudSys = Mockito.mock(CloudSystemInfoService.class); + Mockito.when(cloudSys.containClusterName(srcCluster)).thenReturn(true); + Mockito.when(cloudSys.containClusterName(dstCluster)).thenReturn(true); + setField(env, Env.class, "systemInfo", cloudSys); + } + + private CloudSystemInfoService buildCloudSystemInfoWithVirtualComputeGroup( + String virtualComputeGroupName, String activeComputeGroupName, String standbyComputeGroupName) { + CloudSystemInfoService cloudSys = new CloudSystemInfoService(); + addVirtualComputeGroup(cloudSys, virtualComputeGroupName, activeComputeGroupName, standbyComputeGroupName); + return cloudSys; + } + + private void addVirtualComputeGroup(CloudSystemInfoService cloudSys, + String virtualComputeGroupName, String activeComputeGroupName, String standbyComputeGroupName) { + ComputeGroup activeComputeGroup = new ComputeGroup(activeComputeGroupName + "_id", + activeComputeGroupName, ComputeGroup.ComputeTypeEnum.COMPUTE); + ComputeGroup standbyComputeGroup = new ComputeGroup(standbyComputeGroupName + "_id", + standbyComputeGroupName, ComputeGroup.ComputeTypeEnum.COMPUTE); + ComputeGroup virtualComputeGroup = new ComputeGroup(virtualComputeGroupName + "_id", + virtualComputeGroupName, ComputeGroup.ComputeTypeEnum.VIRTUAL); + virtualComputeGroup.setSubComputeGroups(Arrays.asList(activeComputeGroupName, standbyComputeGroupName)); + ComputeGroup.Policy policy = new ComputeGroup.Policy(); + policy.setActiveComputeGroup(activeComputeGroupName); + policy.setStandbyComputeGroup(standbyComputeGroupName); + virtualComputeGroup.setPolicy(policy); + + cloudSys.addComputeGroup(activeComputeGroup.getId(), activeComputeGroup); + cloudSys.addComputeGroup(standbyComputeGroup.getId(), standbyComputeGroup); + cloudSys.addComputeGroup(virtualComputeGroup.getId(), virtualComputeGroup); + } + + // ===== Valid syntax: ON TABLES clause is parsed correctly ===== + + @Test + public void testOnTablesSingleInclude() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + List rules = cmd.getOnTablesRules(); + Assertions.assertNotNull(rules); + Assertions.assertEquals(1, rules.size()); + Assertions.assertEquals(RuleType.INCLUDE, rules.get(0).getRuleType()); + Assertions.assertEquals("ods.*", rules.get(0).getRawPattern()); + } + + @Test + public void testOnTablesMultipleRules() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (INCLUDE 'ods.*', INCLUDE 'dw.*', EXCLUDE 'dw.tmp_*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + List rules = cmd.getOnTablesRules(); + Assertions.assertNotNull(rules); + Assertions.assertEquals(3, rules.size()); + Assertions.assertEquals(RuleType.INCLUDE, rules.get(0).getRuleType()); + Assertions.assertEquals("ods.*", rules.get(0).getRawPattern()); + Assertions.assertEquals(RuleType.INCLUDE, rules.get(1).getRuleType()); + Assertions.assertEquals("dw.*", rules.get(1).getRawPattern()); + Assertions.assertEquals(RuleType.EXCLUDE, rules.get(2).getRuleType()); + Assertions.assertEquals("dw.tmp_*", rules.get(2).getRawPattern()); + } + + @Test + public void testWithoutOnTablesClause() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + Assertions.assertTrue(cmd.getOnTablesRules().isEmpty()); + } + + @Test + public void testOnTablesWithForce() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src FORCE " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + Assertions.assertTrue(cmd.isForce()); + Assertions.assertNotNull(cmd.getOnTablesRules()); + Assertions.assertEquals(1, cmd.getOnTablesRules().size()); + } + + @Test + public void testOnTablesWithComputeGroup() { + WarmUpClusterCommand cmd = parse( + "WARM UP COMPUTE GROUP dst WITH COMPUTE GROUP src " + + "ON TABLES (INCLUDE 'db1.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + Assertions.assertNotNull(cmd.getOnTablesRules()); + Assertions.assertEquals(1, cmd.getOnTablesRules().size()); + } + + // ===== Syntax errors ===== + + @Test + public void testOnTablesEmptyParensFails() { + Assertions.assertThrows(RuntimeException.class, () -> + parse("WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES () " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')")); + } + + @Test + public void testOnTablesMissingParensFails() { + Assertions.assertThrows(RuntimeException.class, () -> + parse("WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES INCLUDE 'ods.*' " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')")); + } + + @Test + public void testOnTablesMissingPatternFails() { + Assertions.assertThrows(RuntimeException.class, () -> + parse("WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (INCLUDE) " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')")); + } + + // ===== Validation logic in WarmUpClusterCommand ===== + + @Test + public void testOnTablesExcludeOnlyParsesButLacksInclude() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (EXCLUDE 'ods.tmp_*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + List rules = cmd.getOnTablesRules(); + Assertions.assertEquals(1, rules.size()); + Assertions.assertEquals(RuleType.EXCLUDE, rules.get(0).getRuleType()); + boolean hasInclude = rules.stream() + .anyMatch(r -> r.getRuleType() == RuleType.INCLUDE); + Assertions.assertFalse(hasInclude, "Exclude-only rules should have no INCLUDE"); + } + + @Test + public void testOnTablesNonEventDrivenSyncModeParses() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='once')"); + Assertions.assertNotNull(cmd.getOnTablesRules()); + Assertions.assertEquals("once", cmd.getProperties().get("sync_mode")); + } + + @Test + public void testOnTablesExcludeOnlyValidateFails() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + mockValidateEnv("src", "dst"); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (EXCLUDE 'ods.tmp_*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + Assertions.assertThrows(AnalysisException.class, () -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesNonEventDrivenValidateFails() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + mockValidateEnv("src", "dst"); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='once')"); + Assertions.assertThrows(AnalysisException.class, () -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesWithExplicitTableValidateFails() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + mockValidateEnv("src", "dst"); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH TABLE db1.orders " + + "ON TABLES (INCLUDE 'ods.*')"); + AnalysisException exception = Assertions.assertThrows( + AnalysisException.class, () -> analyze(cmd)); + Assertions.assertTrue(exception.getMessage().contains("ON TABLES clause cannot be used")); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesPatternWithoutDbTableFormatValidateFails() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + mockValidateEnv("src", "dst"); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (INCLUDE 'orders') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + Assertions.assertThrows(AnalysisException.class, () -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesLoadEventValidateFailsWhenComputeGroupsOwnedByVirtualComputeGroup() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + setField(env, Env.class, "systemInfo", buildCloudSystemInfoWithVirtualComputeGroup( + "vcg", "active_cg", "standby_cg")); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER standby_cg WITH CLUSTER active_cg " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + + AnalysisException exception = Assertions.assertThrows( + AnalysisException.class, () -> analyze(cmd)); + Assertions.assertTrue(exception.getMessage().contains( + "Cannot create warm up job from source compute group 'active_cg' " + + "to destination compute group 'standby_cg'")); + Assertions.assertTrue(exception.getMessage().contains( + "source compute group 'active_cg' and destination compute group 'standby_cg' " + + "are both owned by virtual compute group 'vcg'")); + Assertions.assertTrue(exception.getMessage().contains( + "not support")); + Assertions.assertFalse(exception.getMessage().contains( + "cancel the conflicting managed warm-up job")); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesLoadEventValidateAllowsDestinationComputeGroupOwnedByVirtualComputeGroupOnly() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + CloudSystemInfoService cloudSys = buildCloudSystemInfoWithVirtualComputeGroup( + "vcg", "active_cg", "standby_cg"); + cloudSys.addComputeGroup("outside_cg_id", + new ComputeGroup("outside_cg_id", "outside_cg", ComputeGroup.ComputeTypeEnum.COMPUTE)); + setField(env, Env.class, "systemInfo", cloudSys); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER standby_cg WITH CLUSTER outside_cg " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + + Assertions.assertDoesNotThrow(() -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesLoadEventValidateAllowsSourceComputeGroupOwnedByVirtualComputeGroupOnly() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + CloudSystemInfoService cloudSys = buildCloudSystemInfoWithVirtualComputeGroup( + "vcg", "active_cg", "standby_cg"); + cloudSys.addComputeGroup("outside_cg_id", + new ComputeGroup("outside_cg_id", "outside_cg", ComputeGroup.ComputeTypeEnum.COMPUTE)); + setField(env, Env.class, "systemInfo", cloudSys); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER outside_cg WITH CLUSTER active_cg " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + + Assertions.assertDoesNotThrow(() -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesLoadEventValidateAllowsComputeGroupsOwnedByDifferentVirtualComputeGroups() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + CloudSystemInfoService cloudSys = new CloudSystemInfoService(); + addVirtualComputeGroup(cloudSys, "vcg1", "active_cg", "standby_cg"); + addVirtualComputeGroup(cloudSys, "vcg2", "other_active_cg", "other_standby_cg"); + setField(env, Env.class, "systemInfo", cloudSys); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER other_standby_cg WITH CLUSTER active_cg " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + + Assertions.assertDoesNotThrow(() -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpStatsTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpStatsTest.java new file mode 100644 index 00000000000000..2f8c35f73941f0 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpStatsTest.java @@ -0,0 +1,497 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.HashMap; +import java.util.Map; + +/** + * Unit tests for warmup progress observation data models: + * - TableWarmUpWindowedStats: parse BE JSON, merge from multiple BEs + * - JobWarmUpStats: aggregate requested/finished, compute gap, serialize + */ +public class WarmUpStatsTest { + + // ==================== TableWarmUpWindowedStats ==================== + + @Test + public void testFromJsonComplete() { + String json = "{" + + "\"job_id\": 100," + + "\"requested\": {" + + " \"seg\": {\"num\": {\"5m\": 10, \"30m\": 50, \"1h\": 200}," + + " \"size\": {\"5m\": 1024, \"30m\": 5120, \"1h\": 20480}}," + + " \"idx\": {\"num\": {\"5m\": 3, \"30m\": 15, \"1h\": 60}," + + " \"size\": {\"5m\": 512, \"30m\": 2560, \"1h\": 10240}}" + + "}," + + "\"finish\": {" + + " \"seg\": {\"num\": {\"5m\": 8, \"30m\": 45, \"1h\": 190}," + + " \"size\": {\"5m\": 800, \"30m\": 4500, \"1h\": 19000}}," + + " \"idx\": {\"num\": {\"5m\": 2, \"30m\": 12, \"1h\": 55}," + + " \"size\": {\"5m\": 400, \"30m\": 2400, \"1h\": 9500}}" + + "}," + + "\"fail\": {" + + " \"seg\": {\"num\": {\"5m\": 1, \"30m\": 3, \"1h\": 5}," + + " \"size\": {\"5m\": 100, \"30m\": 300, \"1h\": 500}}," + + " \"idx\": {\"num\": {\"5m\": 0, \"30m\": 1, \"1h\": 2}," + + " \"size\": {\"5m\": 0, \"30m\": 50, \"1h\": 100}}" + + "}," + + "\"last_trigger_ts\": 1700000000000," + + "\"last_finish_ts\": 1700000001000," + + "\"progress_trigger_ts\": 1699999999000" + + "}"; + JsonObject obj = JsonParser.parseString(json).getAsJsonObject(); + TableWarmUpWindowedStats stats = TableWarmUpWindowedStats.fromJson(obj); + + // requested + Assertions.assertEquals(10, stats.requestedSegmentNum5m); + Assertions.assertEquals(50, stats.requestedSegmentNum30m); + Assertions.assertEquals(200, stats.requestedSegmentNum1h); + Assertions.assertEquals(1024, stats.requestedSegmentSize5m); + Assertions.assertEquals(3, stats.requestedIndexNum5m); + Assertions.assertEquals(512, stats.requestedIndexSize5m); + + // finish + Assertions.assertEquals(8, stats.finishSegmentNum5m); + Assertions.assertEquals(45, stats.finishSegmentNum30m); + Assertions.assertEquals(400, stats.finishIndexSize5m); + + // fail + Assertions.assertEquals(1, stats.failSegmentNum5m); + Assertions.assertEquals(0, stats.failIndexNum5m); + Assertions.assertEquals(100, stats.failSegmentSize5m); + + // timestamps + Assertions.assertEquals(1700000000000L, stats.lastTriggerTs); + Assertions.assertEquals(1700000001000L, stats.lastFinishTs); + Assertions.assertEquals(1699999999000L, stats.progressTriggerTs); + } + + @Test + public void testFromJsonMissingSections() { + // JSON with only requested, no finish or fail + String json = "{" + + "\"job_id\": 200," + + "\"requested\": {" + + " \"seg\": {\"num\": {\"5m\": 5}}" + + "}" + + "}"; + JsonObject obj = JsonParser.parseString(json).getAsJsonObject(); + TableWarmUpWindowedStats stats = TableWarmUpWindowedStats.fromJson(obj); + + Assertions.assertEquals(5, stats.requestedSegmentNum5m); + Assertions.assertEquals(0, stats.requestedSegmentNum30m); + Assertions.assertEquals(0, stats.finishSegmentNum5m); + Assertions.assertEquals(0, stats.failSegmentNum5m); + Assertions.assertEquals(0, stats.lastTriggerTs); + Assertions.assertEquals(0, stats.lastFinishTs); + Assertions.assertEquals(0, stats.progressTriggerTs); + } + + @Test + public void testFromJsonEmptyObject() { + String json = "{\"job_id\": 300}"; + JsonObject obj = JsonParser.parseString(json).getAsJsonObject(); + TableWarmUpWindowedStats stats = TableWarmUpWindowedStats.fromJson(obj); + + Assertions.assertEquals(0, stats.requestedSegmentNum5m); + Assertions.assertEquals(0, stats.finishSegmentNum5m); + Assertions.assertEquals(0, stats.failSegmentNum5m); + } + + @Test + public void testMergeAddsCounts() { + TableWarmUpWindowedStats a = new TableWarmUpWindowedStats(); + a.requestedSegmentNum5m = 10; + a.requestedSegmentSize5m = 1000; + a.finishSegmentNum5m = 8; + a.failSegmentNum5m = 1; + a.lastTriggerTs = 100; + a.lastFinishTs = 200; + a.progressTriggerTs = 500; + + TableWarmUpWindowedStats b = new TableWarmUpWindowedStats(); + b.requestedSegmentNum5m = 20; + b.requestedSegmentSize5m = 2000; + b.finishSegmentNum5m = 15; + b.failSegmentNum5m = 2; + b.lastTriggerTs = 150; + b.lastFinishTs = 180; + b.progressTriggerTs = 300; + + a.merge(b); + + Assertions.assertEquals(30, a.requestedSegmentNum5m); + Assertions.assertEquals(3000, a.requestedSegmentSize5m); + Assertions.assertEquals(23, a.finishSegmentNum5m); + Assertions.assertEquals(3, a.failSegmentNum5m); + Assertions.assertEquals(150, a.lastTriggerTs); // max + Assertions.assertEquals(200, a.lastFinishTs); // max + Assertions.assertEquals(300, a.progressTriggerTs); // min positive + } + + @Test + public void testMergeProgressTriggerTsIgnoresMissingValues() { + TableWarmUpWindowedStats a = new TableWarmUpWindowedStats(); + a.progressTriggerTs = 500; + + TableWarmUpWindowedStats missing = new TableWarmUpWindowedStats(); + a.merge(missing); + Assertions.assertEquals(500, a.progressTriggerTs); + + TableWarmUpWindowedStats b = new TableWarmUpWindowedStats(); + b.progressTriggerTs = 300; + missing.merge(b); + Assertions.assertEquals(300, missing.progressTriggerTs); + } + + // ==================== JobWarmUpStats ==================== + + @Test + public void testMergeRequestedAccumulates() { + JobWarmUpStats job = new JobWarmUpStats(); + + TableWarmUpWindowedStats src1 = new TableWarmUpWindowedStats(); + src1.requestedSegmentNum5m = 10; + src1.requestedSegmentSize5m = 1000; + src1.requestedIndexNum5m = 3; + src1.lastTriggerTs = 100; + + TableWarmUpWindowedStats src2 = new TableWarmUpWindowedStats(); + src2.requestedSegmentNum5m = 20; + src2.requestedSegmentSize5m = 2000; + src2.requestedIndexNum5m = 5; + src2.lastTriggerTs = 200; + + job.mergeRequested(src1); + job.mergeRequested(src2); + + Assertions.assertEquals(30, job.requestedSegmentNum5m); + Assertions.assertEquals(3000, job.requestedSegmentSize5m); + Assertions.assertEquals(8, job.requestedIndexNum5m); + Assertions.assertEquals(200, job.lastTriggerTs); + } + + @Test + public void testMergeFinishedAccumulates() { + JobWarmUpStats job = new JobWarmUpStats(); + + TableWarmUpWindowedStats dst = new TableWarmUpWindowedStats(); + dst.finishSegmentNum5m = 7; + dst.finishSegmentSize5m = 700; + dst.failSegmentNum5m = 2; + dst.failSegmentSize5m = 200; + dst.lastFinishTs = 300; + dst.progressTriggerTs = 250; + + job.mergeFinished(dst); + + Assertions.assertEquals(7, job.finishSegmentNum5m); + Assertions.assertEquals(700, job.finishSegmentSize5m); + Assertions.assertEquals(2, job.failSegmentNum5m); + Assertions.assertEquals(200, job.failSegmentSize5m); + Assertions.assertEquals(300, job.lastFinishTs); + Assertions.assertEquals(250, job.progressTriggerTs); + } + + @Test + public void testComputeGap() { + JobWarmUpStats job = new JobWarmUpStats(); + job.requestedSegmentNum5m = 100; + job.requestedSegmentNum30m = 500; + job.requestedSegmentNum1h = 2000; + job.requestedSegmentSize5m = 10240; + job.requestedIndexNum5m = 30; + + job.finishSegmentNum5m = 80; + job.finishSegmentNum30m = 450; + job.finishSegmentNum1h = 1900; + job.finishSegmentSize5m = 8192; + job.finishIndexNum5m = 25; + job.lastTriggerTs = 5000; + job.progressTriggerTs = 3000; + + job.computeGap(); + + Assertions.assertEquals(20, job.gapSegmentNum5m); + Assertions.assertEquals(50, job.gapSegmentNum30m); + Assertions.assertEquals(100, job.gapSegmentNum1h); + Assertions.assertEquals(2048, job.gapSegmentSize5m); + Assertions.assertEquals(5, job.gapIndexNum5m); + Assertions.assertEquals(2000, job.triggerGapMs); + } + + @Test + public void testComputeGapNegative() { + // Finished can exceed requested in windowed metrics (timing variance) + JobWarmUpStats job = new JobWarmUpStats(); + job.requestedSegmentNum5m = 10; + job.finishSegmentNum5m = 15; + + job.computeGap(); + + Assertions.assertEquals(-5, job.gapSegmentNum5m); + } + + @Test + public void testToJsonStringStructure() { + JobWarmUpStats job = new JobWarmUpStats(); + job.requestedSegmentNum5m = 100; + job.finishSegmentNum5m = 80; + job.failSegmentNum5m = 5; + job.gapSegmentNum5m = 20; + job.requestedSegmentSize5m = 1048576; // 1 MB + job.finishSegmentSize5m = 524288; // 512 KB + job.gapSegmentSize5m = 524288; + + String jsonStr = job.toJsonString(); + JsonObject root = JsonParser.parseString(jsonStr).getAsJsonObject(); + + // Verify structure + Assertions.assertTrue(root.has("seg_num")); + Assertions.assertTrue(root.has("seg_size")); + Assertions.assertTrue(root.has("idx_num")); + Assertions.assertTrue(root.has("idx_size")); + Assertions.assertTrue(root.has("last_trigger_ts")); + Assertions.assertTrue(root.has("last_finish_ts")); + Assertions.assertTrue(root.has("progress_trigger_ts")); + Assertions.assertTrue(root.has("trigger_gap_ms")); + Assertions.assertFalse(root.has("window")); + Assertions.assertFalse(root.has("src_size")); + Assertions.assertFalse(root.has("dst_size")); + Assertions.assertFalse(root.has("gap_size")); + + // seg_num values + JsonObject segNum = root.getAsJsonObject("seg_num"); + Assertions.assertEquals(100, segNum.get("requested_5m").getAsLong()); + Assertions.assertEquals(80, segNum.get("finish_5m").getAsLong()); + Assertions.assertEquals(20, segNum.get("gap_5m").getAsLong()); + Assertions.assertEquals(5, segNum.get("fail_5m").getAsLong()); + + // seg_size values are human-readable strings (via ByteSizeValue) + JsonObject segSize = root.getAsJsonObject("seg_size"); + Assertions.assertEquals("1mb", segSize.get("requested_5m").getAsString()); + Assertions.assertEquals("512kb", segSize.get("finish_5m").getAsString()); + } + + @Test + public void testToJsonStringZeroTimestamps() { + JobWarmUpStats job = new JobWarmUpStats(); + // All zeros + String jsonStr = job.toJsonString(); + JsonObject root = JsonParser.parseString(jsonStr).getAsJsonObject(); + + // Zero timestamps should be empty strings + Assertions.assertEquals("", root.get("last_trigger_ts").getAsString()); + Assertions.assertEquals("", root.get("last_finish_ts").getAsString()); + Assertions.assertEquals("", root.get("progress_trigger_ts").getAsString()); + Assertions.assertEquals(0, root.get("trigger_gap_ms").getAsLong()); + + // Zero counts + JsonObject segNum = root.getAsJsonObject("seg_num"); + Assertions.assertEquals(0, segNum.get("requested_5m").getAsLong()); + Assertions.assertEquals(0, segNum.get("gap_5m").getAsLong()); + } + + @Test + public void testToSummaryJsonStringMergesDataAndIndexSize() { + JobWarmUpStats job = new JobWarmUpStats(); + job.requestedSegmentSize30m = 1048576; // 1 MB + job.requestedIndexSize30m = 1048576; // 1 MB + job.finishSegmentSize30m = 524288; // 512 KB + job.finishIndexSize30m = 524288; // 512 KB + job.lastTriggerTs = 5000; + job.progressTriggerTs = 4500; + job.computeGap(); + + String jsonStr = job.toSummaryJsonString(); + JsonObject root = JsonParser.parseString(jsonStr).getAsJsonObject(); + + Assertions.assertEquals("30m", root.get("window").getAsString()); + Assertions.assertEquals("2mb", root.get("src_size").getAsString()); + Assertions.assertEquals("1mb", root.get("dst_size").getAsString()); + Assertions.assertEquals("1mb", root.get("gap_size").getAsString()); + Assertions.assertEquals(500, root.get("trigger_gap_ms").getAsLong()); + Assertions.assertFalse(root.has("seg_num")); + Assertions.assertFalse(root.has("seg_size")); + Assertions.assertFalse(root.has("idx_num")); + Assertions.assertFalse(root.has("idx_size")); + Assertions.assertFalse(root.has("last_trigger_ts")); + Assertions.assertFalse(root.has("last_finish_ts")); + Assertions.assertFalse(root.has("data_size")); + Assertions.assertFalse(root.has("index_size")); + } + + @Test + public void testHumanReadableSizeInJson() { + JobWarmUpStats job = new JobWarmUpStats(); + job.requestedSegmentSize5m = 500; // 500 B + job.finishSegmentSize5m = 1536; // 1.5 KB + job.gapSegmentSize5m = 1048576; // 1.0 MB + job.failSegmentSize5m = 1073741824L; // 1.0 GB + + String jsonStr = job.toJsonString(); + JsonObject segSize = JsonParser.parseString(jsonStr).getAsJsonObject() + .getAsJsonObject("seg_size"); + + Assertions.assertEquals("500b", segSize.get("requested_5m").getAsString()); + Assertions.assertEquals("1.5kb", segSize.get("finish_5m").getAsString()); + Assertions.assertEquals("1mb", segSize.get("gap_5m").getAsString()); + Assertions.assertEquals("1gb", segSize.get("fail_5m").getAsString()); + } + + @Test + public void testEndToEndSourceAndTargetAggregation() { + // Simulate: 2 source BEs + 1 target BE → aggregate into JobWarmUpStats + + // Source BE1 + String srcJson1 = "{" + + "\"job_id\": 42," + + "\"requested\": {" + + " \"seg\": {\"num\": {\"5m\": 50, \"30m\": 200, \"1h\": 800}," + + " \"size\": {\"5m\": 5000, \"30m\": 20000, \"1h\": 80000}}," + + " \"idx\": {\"num\": {\"5m\": 10, \"30m\": 40, \"1h\": 160}," + + " \"size\": {\"5m\": 1000, \"30m\": 4000, \"1h\": 16000}}" + + "}," + + "\"last_trigger_ts\": 1000" + + "}"; + + // Source BE2 + String srcJson2 = "{" + + "\"job_id\": 42," + + "\"requested\": {" + + " \"seg\": {\"num\": {\"5m\": 30, \"30m\": 120, \"1h\": 500}," + + " \"size\": {\"5m\": 3000, \"30m\": 12000, \"1h\": 50000}}," + + " \"idx\": {\"num\": {\"5m\": 6, \"30m\": 24, \"1h\": 100}," + + " \"size\": {\"5m\": 600, \"30m\": 2400, \"1h\": 10000}}" + + "}," + + "\"last_trigger_ts\": 1200" + + "}"; + + // Target BE + String dstJson = "{" + + "\"job_id\": 42," + + "\"finish\": {" + + " \"seg\": {\"num\": {\"5m\": 70, \"30m\": 300, \"1h\": 1250}," + + " \"size\": {\"5m\": 7000, \"30m\": 30000, \"1h\": 125000}}," + + " \"idx\": {\"num\": {\"5m\": 14, \"30m\": 60, \"1h\": 250}," + + " \"size\": {\"5m\": 1400, \"30m\": 6000, \"1h\": 25000}}" + + "}," + + "\"fail\": {" + + " \"seg\": {\"num\": {\"5m\": 2, \"30m\": 5, \"1h\": 10}," + + " \"size\": {\"5m\": 200, \"30m\": 500, \"1h\": 1000}}," + + " \"idx\": {\"num\": {\"5m\": 0, \"30m\": 1, \"1h\": 3}," + + " \"size\": {\"5m\": 0, \"30m\": 100, \"1h\": 300}}" + + "}," + + "\"last_finish_ts\": 1100," + + "\"progress_trigger_ts\": 900" + + "}"; + + // Parse and merge source BEs + TableWarmUpWindowedStats src = TableWarmUpWindowedStats.fromJson( + JsonParser.parseString(srcJson1).getAsJsonObject()); + src.merge(TableWarmUpWindowedStats.fromJson( + JsonParser.parseString(srcJson2).getAsJsonObject())); + + // Parse target BE + TableWarmUpWindowedStats dst = TableWarmUpWindowedStats.fromJson( + JsonParser.parseString(dstJson).getAsJsonObject()); + + // Aggregate + JobWarmUpStats job = new JobWarmUpStats(); + job.mergeRequested(src); + job.mergeFinished(dst); + job.computeGap(); + + // Verify aggregated requested (50+30=80, 200+120=320, ...) + Assertions.assertEquals(80, job.requestedSegmentNum5m); + Assertions.assertEquals(320, job.requestedSegmentNum30m); + Assertions.assertEquals(1300, job.requestedSegmentNum1h); + Assertions.assertEquals(8000, job.requestedSegmentSize5m); + Assertions.assertEquals(16, job.requestedIndexNum5m); + Assertions.assertEquals(1200, job.lastTriggerTs); // max(1000, 1200) + + // Verify finished + Assertions.assertEquals(70, job.finishSegmentNum5m); + Assertions.assertEquals(300, job.finishSegmentNum30m); + Assertions.assertEquals(2, job.failSegmentNum5m); + Assertions.assertEquals(1100, job.lastFinishTs); + Assertions.assertEquals(900, job.progressTriggerTs); + + // Verify gap + Assertions.assertEquals(10, job.gapSegmentNum5m); // 80 - 70 + Assertions.assertEquals(20, job.gapSegmentNum30m); // 320 - 300 + Assertions.assertEquals(50, job.gapSegmentNum1h); // 1300 - 1250 + Assertions.assertEquals(1000, job.gapSegmentSize5m); // 8000 - 7000 + Assertions.assertEquals(2, job.gapIndexNum5m); // 16 - 14 + Assertions.assertEquals(300, job.triggerGapMs); // 1200 - 900 + } + + @Test + public void testClusterLevelEventDrivenJobAggregatesStatsByJobId() { + CloudWarmUpJob job = new CloudWarmUpJob.Builder() + .setJobId(77L) + .setSrcClusterName("write_cg") + .setDstClusterName("read_cg") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + + TableWarmUpWindowedStats src = new TableWarmUpWindowedStats(); + src.requestedSegmentNum30m = 6; + src.requestedSegmentSize30m = 2048; + src.requestedIndexNum30m = 2; + src.requestedIndexSize30m = 1024; + src.lastTriggerTs = 1000; + + TableWarmUpWindowedStats dst = new TableWarmUpWindowedStats(); + dst.finishSegmentNum30m = 4; + dst.finishSegmentSize30m = 1024; + dst.finishIndexNum30m = 1; + dst.finishIndexSize30m = 512; + dst.failSegmentNum30m = 1; + dst.failSegmentSize30m = 128; + dst.lastFinishTs = 1200; + + Map srcStats = new HashMap<>(); + srcStats.put(77L, src); + Map dstStats = new HashMap<>(); + dstStats.put(77L, dst); + Map> clusterStats = new HashMap<>(); + clusterStats.put("write_cg", srcStats); + clusterStats.put("read_cg", dstStats); + + JobWarmUpStats stats = new CacheHotspotManager(null).aggregateStatsForJob(job, clusterStats); + + Assertions.assertEquals(6, stats.requestedSegmentNum30m); + Assertions.assertEquals(4, stats.finishSegmentNum30m); + Assertions.assertEquals(2, stats.gapSegmentNum30m); + Assertions.assertEquals(2, stats.requestedIndexNum30m); + Assertions.assertEquals(1, stats.finishIndexNum30m); + Assertions.assertEquals(1, stats.gapIndexNum30m); + Assertions.assertEquals(1536, stats.gapSegmentSize30m + stats.gapIndexSize30m); + Assertions.assertEquals(1000, stats.lastTriggerTs); + Assertions.assertEquals(1200, stats.lastFinishTs); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/catalog/CloudInstanceStatusCheckerTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/catalog/CloudInstanceStatusCheckerTest.java new file mode 100644 index 00000000000000..ff19f67dcb64dc --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/catalog/CloudInstanceStatusCheckerTest.java @@ -0,0 +1,260 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud.catalog; + +import org.apache.doris.catalog.DatabaseIf; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.StorageVaultMgr; +import org.apache.doris.catalog.TableIf; +import org.apache.doris.cloud.CacheHotspotManager; +import org.apache.doris.cloud.CloudWarmUpJob; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule.RuleType; +import org.apache.doris.cloud.proto.Cloud; +import org.apache.doris.cloud.system.CloudSystemInfoService; +import org.apache.doris.common.Config; +import org.apache.doris.datasource.InternalCatalog; +import org.apache.doris.nereids.trees.plans.commands.WarmUpClusterCommand; +import org.apache.doris.persist.EditLog; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.LogEvent; +import org.apache.logging.log4j.core.Logger; +import org.apache.logging.log4j.core.appender.AbstractAppender; +import org.apache.logging.log4j.core.config.Property; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.mockito.Mockito; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +public class CloudInstanceStatusCheckerTest { + private String originalCloudUniqueId; + private CloudSystemInfoService cloudSystemInfoService; + private CacheHotspotManager cacheHotspotManager; + private InternalCatalog internalCatalog; + private List> databases; + private MockedStatic mockedEnv; + private CloudEnv cloudEnv; + + @BeforeEach + public void setUp() { + originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + + cloudSystemInfoService = Mockito.spy(new CloudSystemInfoService()); + cacheHotspotManager = new CacheHotspotManager(cloudSystemInfoService); + internalCatalog = Mockito.mock(InternalCatalog.class); + databases = new ArrayList<>(); + Mockito.when(internalCatalog.getAllDbs()).thenAnswer(invocation -> databases); + + cloudEnv = Mockito.mock(CloudEnv.class); + AtomicLong nextId = new AtomicLong(10000L); + Mockito.when(cloudEnv.getNextId()).thenAnswer(invocation -> nextId.incrementAndGet()); + Mockito.when(cloudEnv.getEditLog()).thenReturn(Mockito.mock(EditLog.class)); + Mockito.when(cloudEnv.getStorageVaultMgr()).thenReturn(Mockito.mock(StorageVaultMgr.class)); + Mockito.when(cloudEnv.getCacheHotspotMgr()).thenReturn(cacheHotspotManager); + Mockito.when(cloudEnv.isMaster()).thenReturn(false); + + mockedEnv = Mockito.mockStatic(Env.class); + mockedEnv.when(Env::getCurrentEnv).thenReturn(cloudEnv); + mockedEnv.when(Env::getCurrentInternalCatalog).thenReturn(internalCatalog); + mockedEnv.when(Env::getCurrentSystemInfo).thenReturn(cloudSystemInfoService); + } + + @AfterEach + public void tearDown() { + if (mockedEnv != null) { + mockedEnv.close(); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + + @Test + public void testSyncInstanceCreatesVirtualComputeGroup() { + addComputeGroup("active_cg_id", "active_cg"); + addComputeGroup("standby_cg_id", "standby_cg"); + Mockito.doReturn(instanceResponseWithVirtualComputeGroup()).when(cloudSystemInfoService).getCloudInstance(); + + new CloudInstanceStatusChecker(cloudSystemInfoService).runAfterCatalogReady(); + + ComputeGroup virtualComputeGroup = cloudSystemInfoService.getComputeGroupById("vcg_id"); + Assertions.assertNotNull(virtualComputeGroup); + Assertions.assertTrue(virtualComputeGroup.isVirtual()); + Assertions.assertEquals("vcg", virtualComputeGroup.getName()); + Assertions.assertEquals(Arrays.asList("active_cg", "standby_cg"), + virtualComputeGroup.getSubComputeGroups()); + Assertions.assertEquals("active_cg", virtualComputeGroup.getActiveComputeGroup()); + Assertions.assertEquals("standby_cg", virtualComputeGroup.getStandbyComputeGroup()); + } + + @Test + public void testSyncInstanceCreatesVirtualComputeGroupAndCancelsTableLevelLoadEvent() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + addComputeGroup("active_cg_id", "active_cg"); + addComputeGroup("standby_cg_id", "standby_cg"); + long tableLevelJobId = cacheHotspotManager.createJob(buildEventDrivenStmt("active_cg", "standby_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*"))); + Mockito.doReturn(instanceResponseWithVirtualComputeGroup()).when(cloudSystemInfoService).getCloudInstance(); + Mockito.when(cloudEnv.isMaster()).thenReturn(true); + + RecordingAppender appender = new RecordingAppender("vcg-create-cancel-table-warmup-test"); + Logger logger = (Logger) LogManager.getLogger(CloudInstanceStatusChecker.class); + appender.start(); + logger.addAppender(appender); + try (MockedStatic mockedCloudSystemInfoService = + Mockito.mockStatic(CloudSystemInfoService.class, Mockito.CALLS_REAL_METHODS)) { + mockedCloudSystemInfoService.when(() -> CloudSystemInfoService.updateFileCacheJobIds( + Mockito.any(ComputeGroup.class), Mockito.anyList())).thenAnswer(invocation -> null); + + new CloudInstanceStatusChecker(cloudSystemInfoService).runAfterCatalogReady(); + mockedCloudSystemInfoService.verify(() -> CloudSystemInfoService.updateFileCacheJobIds( + Mockito.any(ComputeGroup.class), Mockito.anyList())); + } finally { + logger.removeAppender(appender); + appender.stop(); + } + + ComputeGroup virtualComputeGroup = cloudSystemInfoService.getComputeGroupById("vcg_id"); + Assertions.assertNotNull(virtualComputeGroup); + Assertions.assertTrue(virtualComputeGroup.isVirtual()); + Assertions.assertFalse(virtualComputeGroup.isNeedRebuildFileCache()); + + CloudWarmUpJob tableLevelJob = cacheHotspotManager.getCloudWarmUpJob(tableLevelJobId); + Assertions.assertEquals(CloudWarmUpJob.JobState.CANCELLED, tableLevelJob.getJobState()); + Assertions.assertTrue(tableLevelJob.getErrMsg().contains( + "vcg cancel table-level load-event warm up job before rebuilding file cache jobs")); + Assertions.assertTrue(tableLevelJob.getErrMsg().contains("virtual compute group 'vcg'")); + + Assertions.assertEquals(3, cacheHotspotManager.getAllJobInfos(10).size()); + Assertions.assertTrue(cacheHotspotManager.getCloudWarmUpJobs().values().stream().anyMatch(job -> + job.getJobType() == CloudWarmUpJob.JobType.CLUSTER + && job.isPeriodic() + && "active_cg".equals(job.getSrcClusterName()) + && "standby_cg".equals(job.getDstClusterName()))); + Assertions.assertTrue(cacheHotspotManager.getCloudWarmUpJobs().values().stream().anyMatch(job -> + job.getJobType() == CloudWarmUpJob.JobType.CLUSTER + && job.isEventDriven() + && job.getSyncEvent() == CloudWarmUpJob.SyncEvent.LOAD + && !job.hasTableFilter() + && "active_cg".equals(job.getSrcClusterName()) + && "standby_cg".equals(job.getDstClusterName()))); + + String logs = appender.messagesAsString(); + Assertions.assertFalse(logs.contains("failed to create virtual compute group vcg"), logs); + Assertions.assertTrue(logs.contains("generate new jobIds"), logs); + } + + private void addComputeGroup(String computeGroupId, String computeGroupName) { + cloudSystemInfoService.addComputeGroup(computeGroupId, + new ComputeGroup(computeGroupId, computeGroupName, ComputeGroup.ComputeTypeEnum.COMPUTE)); + } + + private Cloud.GetInstanceResponse instanceResponseWithVirtualComputeGroup() { + Cloud.ClusterPB activeComputeGroup = computeGroup("active_cg_id", "active_cg"); + Cloud.ClusterPB standbyComputeGroup = computeGroup("standby_cg_id", "standby_cg"); + Cloud.ClusterPB virtualComputeGroup = Cloud.ClusterPB.newBuilder() + .setClusterId("vcg_id") + .setClusterName("vcg") + .setType(Cloud.ClusterPB.Type.VIRTUAL) + .addClusterNames("active_cg") + .addClusterNames("standby_cg") + .setClusterPolicy(Cloud.ClusterPolicy.newBuilder() + .setType(Cloud.ClusterPolicy.PolicyType.ActiveStandby) + .setActiveClusterName("active_cg") + .addStandbyClusterNames("standby_cg") + .build()) + .build(); + return Cloud.GetInstanceResponse.newBuilder() + .setStatus(Cloud.MetaServiceResponseStatus.newBuilder() + .setCode(Cloud.MetaServiceCode.OK) + .setMsg("OK") + .build()) + .setInstance(Cloud.InstanceInfoPB.newBuilder() + .setStatus(Cloud.InstanceInfoPB.Status.NORMAL) + .addClusters(activeComputeGroup) + .addClusters(standbyComputeGroup) + .addClusters(virtualComputeGroup) + .build()) + .build(); + } + + private Cloud.ClusterPB computeGroup(String computeGroupId, String computeGroupName) { + return Cloud.ClusterPB.newBuilder() + .setClusterId(computeGroupId) + .setClusterName(computeGroupName) + .setType(Cloud.ClusterPB.Type.COMPUTE) + .build(); + } + + @SuppressWarnings("unchecked") + private DatabaseIf mockDb(String name, TableIf... tables) { + DatabaseIf db = Mockito.mock(DatabaseIf.class); + Mockito.when(db.getFullName()).thenReturn(name); + HashSet tableNames = new HashSet<>(); + for (TableIf table : tables) { + tableNames.add(table.getName()); + Mockito.when(db.getTableNullable(table.getName())).thenReturn(table); + } + Mockito.when(db.getTableNamesOrEmptyWithLock()).thenReturn(tableNames); + return db; + } + + private TableIf mockTable(long id, String name) { + TableIf table = Mockito.mock(TableIf.class); + Mockito.when(table.getId()).thenReturn(id); + Mockito.when(table.getName()).thenReturn(name); + Mockito.when(table.getType()).thenReturn(TableIf.TableType.OLAP); + Mockito.when(table.isManagedTable()).thenReturn(true); + return table; + } + + private WarmUpClusterCommand buildEventDrivenStmt(String src, String dst, TableFilterRule... rules) { + Map properties = new HashMap<>(); + properties.put("sync_mode", "event_driven"); + properties.put("sync_event", "load"); + return new WarmUpClusterCommand(new ArrayList<>(), src, dst, false, false, + properties, Arrays.asList(rules)); + } + + private static class RecordingAppender extends AbstractAppender { + private final List messages = new ArrayList<>(); + + RecordingAppender(String name) { + super(name, null, null, true, Property.EMPTY_ARRAY); + } + + @Override + public void append(LogEvent event) { + messages.add(event.getMessage().getFormattedMessage()); + } + + String messagesAsString() { + return String.join("\n", messages); + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java b/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java index 164d767b66e203..0fe7d33b2e6028 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java @@ -17,6 +17,8 @@ package org.apache.doris.metric; +import org.apache.doris.cloud.CloudWarmUpJob; +import org.apache.doris.cloud.JobWarmUpStats; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; import org.apache.doris.common.util.JsonUtil; @@ -33,6 +35,7 @@ import java.lang.management.GarbageCollectorMXBean; import java.lang.management.ManagementFactory; +import java.util.Collections; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; @@ -196,6 +199,163 @@ public void testVirtualComputeGroupSwitchMetricRename() { } } + @Test + public void testCloudWarmUpSyncJobMetricsReadStatsDirectlyFromJob() { + String oldCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud_unique_id"; + try { + CloudWarmUpJob job = new CloudWarmUpJob.Builder() + .setJobId(1778211593204L) + .setSrcClusterName("warmup_source") + .setDstClusterName("warmup_target") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + job.setJobState(CloudWarmUpJob.JobState.RUNNING); + + JobWarmUpStats stats = new JobWarmUpStats(); + stats.requestedSegmentSize5m = 104857600L; + stats.requestedSegmentSize30m = 209715200L; + stats.requestedSegmentSize1h = 314572800L; + stats.finishSegmentSize5m = 94371840L; + stats.finishSegmentSize30m = 188743680L; + stats.finishSegmentSize1h = 283115520L; + stats.requestedIndexSize5m = 8388608L; + stats.requestedIndexSize30m = 16777216L; + stats.requestedIndexSize1h = 25165824L; + stats.finishIndexSize5m = 6291456L; + stats.finishIndexSize30m = 12582912L; + stats.finishIndexSize1h = 18874368L; + stats.computeGap(); + job.setSyncStats(stats); + + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.singletonList(job)); + String metricResult = getPrometheusMetrics(); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_info" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", sync_mode=\"EVENT_DRIVEN\", " + + "sync_event=\"LOAD\", job_state=\"RUNNING\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\"} 1")); + Assert.assertFalse(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_create_time_ms")); + Assert.assertFalse(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_last_trigger_time_ms")); + Assert.assertFalse(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_stats")); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"src\", window=\"5m\"} 113246208")); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"dst\", window=\"5m\"} 100663296")); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"src\", window=\"30m\"} 226492416")); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"dst\", window=\"1h\"} 301989888")); + + JobWarmUpStats updatedStats = new JobWarmUpStats(); + updatedStats.requestedSegmentSize5m = 12; + updatedStats.finishSegmentSize5m = 10; + updatedStats.computeGap(); + job.setSyncStats(updatedStats); + String updatedMetricResult = getPrometheusMetrics(); + Assert.assertTrue(updatedMetricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"src\", window=\"5m\"} 12")); + Assert.assertTrue(updatedMetricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"dst\", window=\"5m\"} 10")); + + CloudWarmUpJob replayedJob = new CloudWarmUpJob.Builder() + .setJobId(1778211593204L) + .setSrcClusterName("warmup_source") + .setDstClusterName("warmup_target") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + replayedJob.setJobState(CloudWarmUpJob.JobState.RUNNING); + JobWarmUpStats replayedStats = new JobWarmUpStats(); + replayedStats.requestedSegmentSize5m = 7; + replayedStats.requestedIndexSize5m = 3; + replayedStats.computeGap(); + replayedJob.setSyncStats(replayedStats); + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.singletonList(replayedJob)); + String replayedMetricResult = getPrometheusMetrics(); + Assert.assertTrue(replayedMetricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"src\", window=\"5m\"} 10")); + + replayedJob.setJobState(CloudWarmUpJob.JobState.CANCELLED); + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.singletonList(replayedJob)); + String cancelledMetricResult = getPrometheusMetrics(); + Assert.assertTrue(cancelledMetricResult.contains("job_state=\"CANCELLED\"")); + Assert.assertFalse(cancelledMetricResult.contains("job_state=\"RUNNING\"")); + Assert.assertFalse(cancelledMetricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes")); + } finally { + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.emptyList()); + Config.cloud_unique_id = oldCloudUniqueId; + } + } + + @Test + public void testEventDrivenCloudWarmUpSyncJobTriggerGapMetric() { + String oldCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud_unique_id"; + try { + CloudWarmUpJob.PersistedTableFilterRule rule = new CloudWarmUpJob.PersistedTableFilterRule(); + rule.ruleType = "INCLUDE"; + rule.pattern = "db.tbl"; + CloudWarmUpJob job = new CloudWarmUpJob.Builder() + .setJobId(1778211593205L) + .setSrcClusterName("warmup_source") + .setDstClusterName("warmup_target") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .setTableFilterRules(Collections.singletonList(rule)) + .build(); + job.setJobState(CloudWarmUpJob.JobState.RUNNING); + + JobWarmUpStats stats = new JobWarmUpStats(); + stats.lastTriggerTs = 5000; + stats.progressTriggerTs = 4200; + stats.computeGap(); + job.setSyncStats(stats); + + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.singletonList(job)); + String metricResult = getPrometheusMetrics(); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_trigger_gap_ms" + + "{job_id=\"1778211593205\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\"} 800")); + + CloudWarmUpJob clusterLevelJob = new CloudWarmUpJob.Builder() + .setJobId(1778211593206L) + .setSrcClusterName("warmup_source") + .setDstClusterName("warmup_target") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + clusterLevelJob.setJobState(CloudWarmUpJob.JobState.RUNNING); + clusterLevelJob.setSyncStats(stats); + + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.singletonList(clusterLevelJob)); + String clusterMetricResult = getPrometheusMetrics(); + Assert.assertTrue(clusterMetricResult.contains("doris_fe_file_cache_warm_up_sync_job_trigger_gap_ms" + + "{job_id=\"1778211593206\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\"} 800")); + } finally { + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.emptyList()); + Config.cloud_unique_id = oldCloudUniqueId; + } + } + + private String getPrometheusMetrics() { + MetricVisitor visitor = new PrometheusMetricVisitor(); + MetricRepo.DORIS_METRIC_REGISTER.accept(visitor); + return visitor.finish(); + } + @Test public void testGc() { PrometheusMetricVisitor visitor = new PrometheusMetricVisitor(); diff --git a/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index efd539d5e7e360..446685892295e4 100644 --- a/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -299,6 +299,7 @@ IMMEDIATE: 'IMMEDIATE'; IN: 'IN'; INCREMENTAL: 'INCREMENTAL'; INTEGRATION: 'INTEGRATION'; +INCLUDE: 'INCLUDE'; INDEX: 'INDEX'; INDEXES: 'INDEXES'; INFILE: 'INFILE'; diff --git a/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index 1bf497f1cf9d8f..e7de219d017609 100644 --- a/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -542,6 +542,7 @@ supportedOtherStatement | WARM UP (CLUSTER | COMPUTE GROUP) destination=identifier WITH ((CLUSTER | COMPUTE GROUP) source=identifier | (warmUpItem (AND warmUpItem)*)) FORCE? + onTablesClause? properties=propertyClause? #warmUpCluster | explain? WARM UP SELECT namedExpressionSeq FROM warmUpSingleTableRef whereClause? #warmUpSelect @@ -551,7 +552,15 @@ supportedOtherStatement | START TRANSACTION (WITH CONSISTENT SNAPSHOT)? #unsupportedStartTransaction ; - warmUpItem +onTablesClause + : ON TABLES LEFT_PAREN onTablesFilterRule (COMMA onTablesFilterRule)* RIGHT_PAREN + ; + +onTablesFilterRule + : (INCLUDE | EXCLUDE) STRING_LITERAL + ; + +warmUpItem : TABLE tableName=multipartIdentifier (PARTITION partitionName=identifier)? ; @@ -2150,6 +2159,7 @@ nonReserved | IMMEDIATE | INCREMENTAL | INTEGRATION + | INCLUDE | INDEXES | INSERT | INVERTED diff --git a/gensrc/proto/internal_service.proto b/gensrc/proto/internal_service.proto index 89a5d64976e29c..c5818339f3a66e 100644 --- a/gensrc/proto/internal_service.proto +++ b/gensrc/proto/internal_service.proto @@ -942,6 +942,8 @@ message PWarmUpRowsetRequest { optional int64 unix_ts_us = 2; optional int64 sync_wait_timeout_ms = 3; optional bool skip_existence_check = 4; + optional int64 job_id = 5; + optional int64 upstream_trigger_ts_ms = 6; } message PWarmUpRowsetResponse { diff --git a/gensrc/thrift/BackendService.thrift b/gensrc/thrift/BackendService.thrift index e9276caa42410c..2e5379bb42b256 100644 --- a/gensrc/thrift/BackendService.thrift +++ b/gensrc/thrift/BackendService.thrift @@ -223,6 +223,7 @@ struct TWarmUpTabletsRequest { 3: optional list job_metas 4: required TWarmUpTabletsRequestType type 5: optional TWarmUpEventType event + 6: optional list table_ids } struct TWarmUpTabletsResponse { diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/util/WarmupMetricsUtils.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/util/WarmupMetricsUtils.groovy new file mode 100644 index 00000000000000..aa877adb4132a5 --- /dev/null +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/util/WarmupMetricsUtils.groovy @@ -0,0 +1,268 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.apache.doris.regression.util + +import groovy.json.JsonSlurper +import org.slf4j.Logger +import org.slf4j.LoggerFactory + +import java.util.regex.Pattern + +/** + * Utility methods for event-driven warmup regression tests. + * + * Methods that need database access accept a {@code Closure sqlRunner} + * parameter — callers pass {@code { String q -> sql(q) }} from the + * suite context. + */ +class WarmupMetricsUtils { + + static final Logger logger = LoggerFactory.getLogger(WarmupMetricsUtils.class) + + // Bvar metric names + static final String METRIC_REQUESTED = "file_cache_event_driven_warm_up_requested_segment_num" + static final String METRIC_SUBMITTED = "file_cache_event_driven_warm_up_submitted_segment_num" + static final String METRIC_FINISHED = "file_cache_event_driven_warm_up_finished_segment_num" + static final String METRIC_FAILED = "file_cache_event_driven_warm_up_failed_segment_num" + + /** + * Fetch a single bvar metric value from a BE's brpc_metrics endpoint. + */ + static long getBrpcMetric(String ip, String port, String metricName) { + def url = "http://${ip}:${port}/brpc_metrics" + def text = new URL(url).text + def matcher = text =~ ~"${metricName}\\s+(\\d+)" + if (matcher.find()) { + return matcher[0][1] as long + } + throw new RuntimeException("${metricName} not found for ${ip}:${port}") + } + + static String getPrometheusMetrics(String ip, Object port) { + return new URL("http://${ip}:${port}/metrics").text + } + + static BigDecimal findPrometheusMetricValue(String metricsText, String metricName, Map labels) { + def line = metricsText.readLines().find { metricLine -> + metricLine.startsWith("${metricName}{") + && labels.every { entry -> metricLine.contains(prometheusLabel(entry.key.toString(), entry.value)) } + } + if (line == null) { + return null + } + return new BigDecimal(line.substring(line.lastIndexOf(' ') + 1).trim()) + } + + static String prometheusLabel(String key, Object value) { + def text = value == null ? "" : value.toString() + text = text.replace("\\", "\\\\").replace("\"", "\\\"").replace("\n", "\\n") + return "${key}=\"${text}\"".toString() + } + + /** + * Sum a bvar metric across all BEs in the given cluster. + */ + static long getClusterMetricSum(Closure sqlRunner, String clusterName, String metricName) { + def clusterBes = getClusterBackends(sqlRunner, clusterName) + long sum = 0 + for (be in clusterBes) { + sum += getBrpcMetric(be[1].toString(), be[5].toString(), metricName) + } + return sum + } + + static List getClusterBackends(Closure sqlRunner, String clusterName) { + def backends = sqlRunner("SHOW BACKENDS") + return backends.findAll { + it[19].contains("\"compute_group_name\" : \"${clusterName}\"".toString()) + } + } + + static Map getClusterMetricValues(Closure sqlRunner, String clusterName, String metricName) { + Map values = [:] + for (be in getClusterBackends(sqlRunner, clusterName)) { + values[be[0].toString()] = getBrpcMetric(be[1].toString(), be[5].toString(), metricName) + } + return values + } + + static void clearFileCache(String ip, String httpPort) { + def response = new URL("http://${ip}:${httpPort}/api/file_cache?op=clear&sync=true").text + def json = new JsonSlurper().parseText(response) + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${httpPort} failed: ${json.status}") + } + } + + static void clearFileCacheOnAllBackends(Closure sqlRunner, long waitMs = 5000) { + for (be in sqlRunner("SHOW BACKENDS")) { + clearFileCache(be[1].toString(), be[4].toString()) + } + Thread.sleep(waitMs) + } + + static long sumProfileCounter(String profileText, String counterName) { + def matcher = profileText =~ ~"(?m)(?{@code requested} is from the SOURCE cluster; the other three from DESTINATION.

+ * + * @return Map with keys: requested, submitted, finished, failed + */ + static Map getWarmupMetrics(Closure sqlRunner, String srcCluster, String dstCluster) { + return [ + requested: getClusterMetricSum(sqlRunner, srcCluster, METRIC_REQUESTED), + submitted: getClusterMetricSum(sqlRunner, dstCluster, METRIC_SUBMITTED), + finished : getClusterMetricSum(sqlRunner, dstCluster, METRIC_FINISHED), + failed : getClusterMetricSum(sqlRunner, dstCluster, METRIC_FAILED), + ] + } + + /** + * Log and return warmup metrics. + */ + static Map logWarmupMetrics(Closure sqlRunner, String srcCluster, String dstCluster) { + def m = getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + logger.info("warmup metrics [src=${srcCluster}, dst=${dstCluster}]: " + + "requested=${m.requested}, submitted=${m.submitted}, " + + "finished=${m.finished}, failed=${m.failed}") + return m + } + + /** + * Poll until enough segments have finished warming up. + * + * @param expectedFinished absolute finished count to wait for + * @param timeoutMs polling timeout in milliseconds + * @return latest metrics snapshot + */ + static Map waitForWarmupFinish(Closure sqlRunner, String srcCluster, String dstCluster, + long expectedFinished, long timeoutMs = 60000) { + long deadline = System.currentTimeMillis() + timeoutMs + while (System.currentTimeMillis() < deadline) { + def m = getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + if (m.finished >= expectedFinished && m.finished + m.failed >= m.submitted) { + return m + } + Thread.sleep(2000) + } + logger.warn("waitForWarmupFinish timed out after ${timeoutMs}ms, " + + "expected finished >= ${expectedFinished}") + return getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + } + + /** + * Parse the MatchedTables column (index 14) from SHOW WARM UP JOB output. + */ + static Set parseMatchedTables(List jobInfo) { + def raw = jobInfo[0][14]?.toString()?.trim() + if (raw == null || raw.isEmpty()) { + return [] as Set + } + return raw.split(/,\s*/).collect { it.trim() }.findAll { !it.isEmpty() }.toSet() + } + + /** + * Poll until MatchedTables contains (and excludes) the expected table names. + * + * @return last observed MatchedTables set + */ + static Set waitForMatchedTables(Closure sqlRunner, Object jobId, + Set expectedContains, + Set expectedNotContains = [] as Set, + long timeoutMs = 30000) { + long deadline = System.currentTimeMillis() + timeoutMs + Set lastMatched = [] as Set + while (System.currentTimeMillis() < deadline) { + def info = sqlRunner("SHOW WARM UP JOB WHERE ID = ${jobId}") + lastMatched = parseMatchedTables(info) + boolean allContained = expectedContains.every { lastMatched.contains(it) } + boolean noneExcluded = expectedNotContains.every { !lastMatched.contains(it) } + if (allContained && noneExcluded) { + return lastMatched + } + Thread.sleep(2000) + } + return lastMatched + } + + /** + * Parse the SyncStats column (index 15) from SHOW WARM UP JOB output. + */ + static Map parseSyncStats(List jobInfo) { + def raw = jobInfo[0][15]?.toString()?.trim() + if (raw == null || raw.isEmpty()) { + return [:] + } + return new JsonSlurper().parseText(raw) as Map + } + + /** + * Poll SHOW WARM UP JOB WHERE ID until SyncStats exists and satisfies the predicate. + * + * @return last parsed SyncStats map + */ + static Map waitForJobSyncStats(Closure sqlRunner, Object jobId, Closure predicate, + long timeoutMs = 30000) { + long deadline = System.currentTimeMillis() + timeoutMs + Map lastStats = [:] + while (System.currentTimeMillis() < deadline) { + def info = sqlRunner("SHOW WARM UP JOB WHERE ID = ${jobId}") + lastStats = parseSyncStats(info) + if (!lastStats.isEmpty() && predicate(lastStats)) { + return lastStats + } + Thread.sleep(2000) + } + return lastStats + } + + /** + * Wait for warmup metrics to stabilize (no new submissions for a sustained period). + * Uses a double-check pattern: waits 5s initially, then verifies stability over 3s. + * + * @return stabilized metrics snapshot + */ + static Map waitForMetricsStable(Closure sqlRunner, String srcCluster, String dstCluster, + long timeoutMs = 30000) { + long deadline = System.currentTimeMillis() + timeoutMs + def prev = getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + Thread.sleep(5000) + while (System.currentTimeMillis() < deadline) { + def cur = getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + if (cur.submitted == prev.submitted && cur.finished == prev.finished + && cur.finished + cur.failed >= cur.submitted) { + Thread.sleep(3000) + def verify = getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + if (verify.submitted == cur.submitted && verify.finished == cur.finished) { + return verify + } + } + prev = cur + Thread.sleep(2000) + } + logger.warn("waitForMetricsStable timed out after ${timeoutMs}ms") + return getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_cancel_empty_recovery.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_cancel_empty_recovery.groovy new file mode 100644 index 00000000000000..24bc49914548e5 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_cancel_empty_recovery.groovy @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test points covered: EX-03, EX-08. +suite('test_warm_up_event_on_tables_abnormal_cancel_empty_recovery', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.enableDebugPoints() + options.cloudMode = true + + def waitUntil = { String desc, long timeoutMs, Closure predicate -> + long deadline = System.currentTimeMillis() + timeoutMs + while (System.currentTimeMillis() < deadline) { + if (predicate()) { + return + } + sleep(500) + } + assert false : "Timed out waiting for ${desc}" + } + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_abnormal_cancel_empty_db" + def jobIds = [] + def targetDebugEnabled = false + + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster) + + try { + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS cancel_tbl ( + id INT, + payload STRING + ) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 4 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS fact_live ( + id INT, + amount INT + ) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + def cancelJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE '${dbName}.cancel_tbl') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << cancelJobId + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, cancelJobId, + ["${dbName}.cancel_tbl".toString()] as Set) == + ["${dbName}.cancel_tbl".toString()] as Set + + WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """INSERT INTO cancel_tbl VALUES + (1, 'seed_1'), (2, 'seed_2'), (3, 'seed_3'), (4, 'seed_4')""" + def initialMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + baseMetrics.finished + 1, 60000) + assert initialMetrics.failed == baseMetrics.failed : + "initial warmup should finish without failures, metrics=${initialMetrics}" + def initialCacheSize = 0L + waitUntil("initial warmup to populate target cache", 30000) { + initialCacheSize = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, + "ttl_cache_size") + return initialCacheSize > 0 + } + assert initialCacheSize > 0 : "initial warmup should populate target cache" + + def targetBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster)[0] + GetDebugPoint().enableDebugPoint(targetBe[1].toString(), targetBe[4] as int, NodeType.BE, + "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep: 10]) + targetDebugEnabled = true + + def beforeActiveLoad = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """INSERT INTO cancel_tbl VALUES + (100, 'active_100'), (101, 'active_101'), (102, 'active_102'), (103, 'active_103'), + (104, 'active_104'), (105, 'active_105'), (106, 'active_106'), (107, 'active_107')""" + waitUntil("active warmup transfer to be submitted", 20000) { + def m = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + return m.submitted > beforeActiveLoad.submitted && m.finished < m.submitted + } + + sql """CANCEL WARM UP JOB WHERE ID = ${cancelJobId}""" + waitUntil("cancel job state", 20000) { + def info = sql """SHOW WARM UP JOB WHERE ID = ${cancelJobId}""" + return info[0][3] == "CANCELLED" + } + + def afterCancelStable = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, + srcCluster, dstCluster, 50000) + assert afterCancelStable.submitted > beforeActiveLoad.submitted : + "active transfer should have submitted before cancel, before=${beforeActiveLoad}, after=${afterCancelStable}" + assert afterCancelStable.finished + afterCancelStable.failed >= afterCancelStable.submitted : + "active transfer should converge after cancel, metrics=${afterCancelStable}" + + def cacheAfterCancel = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, + "ttl_cache_size") + assert cacheAfterCancel >= initialCacheSize : + "cancel should not clear existing target cache, before=${initialCacheSize}, after=${cacheAfterCancel}" + + GetDebugPoint().disableDebugPoint(targetBe[1].toString(), targetBe[4] as int, NodeType.BE, + "CloudInternalServiceImpl::warm_up_rowset.download_segment") + targetDebugEnabled = false + + def beforePostCancelLoad = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, + srcCluster, dstCluster, 30000) + sql """INSERT INTO cancel_tbl VALUES (200, 'after_cancel_200'), (201, 'after_cancel_201')""" + sleep(5000) + def afterPostCancelLoad = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, + srcCluster, dstCluster, 30000) + assert afterPostCancelLoad.submitted == beforePostCancelLoad.submitted : + "cancelled job should not submit later events, before=${beforePostCancelLoad}, after=${afterPostCancelLoad}" + assert afterPostCancelLoad.finished == beforePostCancelLoad.finished : + "cancelled job should not finish later events, before=${beforePostCancelLoad}, after=${afterPostCancelLoad}" + + def emptyWindowJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE '${dbName}.fact_*') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << emptyWindowJobId + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, emptyWindowJobId, + ["${dbName}.fact_live".toString()] as Set) == + ["${dbName}.fact_live".toString()] as Set + + sql """ALTER TABLE ${dbName}.fact_live RENAME archive_live""" + def emptyMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, emptyWindowJobId, + [] as Set, + ["${dbName}.fact_live".toString(), "${dbName}.archive_live".toString()] as Set, + 30000) + assert emptyMatched.isEmpty() : "MatchedTables should be empty during the non-matching window: ${emptyMatched}" + def emptyJobInfo = sql """SHOW WARM UP JOB WHERE ID = ${emptyWindowJobId}""" + assert emptyJobInfo[0][3] in ["RUNNING", "PENDING"] : + "job should stay runnable when MatchedTables is empty, row=${emptyJobInfo[0]}" + + def beforeArchiveLoad = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, + srcCluster, dstCluster, 30000) + sql """INSERT INTO archive_live VALUES (1, 10), (2, 20)""" + sleep(5000) + def afterArchiveLoad = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, + srcCluster, dstCluster, 30000) + assert afterArchiveLoad.submitted == beforeArchiveLoad.submitted : + "non-matching empty-window load should not submit warmup, before=${beforeArchiveLoad}, after=${afterArchiveLoad}" + + sql """ALTER TABLE ${dbName}.archive_live RENAME fact_back""" + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, emptyWindowJobId, + ["${dbName}.fact_back".toString()] as Set) == + ["${dbName}.fact_back".toString()] as Set + + def beforeRecoveredLoad = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """INSERT INTO fact_back VALUES (3, 30), (4, 40)""" + def afterRecoveredLoad = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, + srcCluster, dstCluster, beforeRecoveredLoad.finished + 1, 60000) + assert afterRecoveredLoad.submitted > beforeRecoveredLoad.submitted : + "matching table after empty window should submit warmup, before=${beforeRecoveredLoad}, after=${afterRecoveredLoad}" + assert afterRecoveredLoad.finished > beforeRecoveredLoad.finished : + "matching table after empty window should finish warmup, before=${beforeRecoveredLoad}, after=${afterRecoveredLoad}" + } finally { + if (targetDebugEnabled) { + try { GetDebugPoint().clearDebugPointsForAllBEs() } catch (Exception ignored) {} + } + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS cancel_tbl""" + sql """DROP TABLE IF EXISTS fact_live""" + sql """DROP TABLE IF EXISTS archive_live""" + sql """DROP TABLE IF EXISTS fact_back""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_stats_and_failure.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_stats_and_failure.groovy new file mode 100644 index 00000000000000..8888ba88d0ca4a --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_stats_and_failure.groovy @@ -0,0 +1,261 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test points covered: EX-05 (stats API HTTP 500, read timeout, BE down), EX-07. +suite('test_warm_up_event_on_tables_abnormal_stats_and_failure', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.enableDebugPoints() + options.cloudMode = true + + def waitUntil = { String desc, long timeoutMs, Closure predicate -> + long deadline = System.currentTimeMillis() + timeoutMs + while (System.currentTimeMillis() < deadline) { + if (predicate()) { + return + } + sleep(500) + } + assert false : "Timed out waiting for ${desc}" + } + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_abnormal_stats_fail_db" + def tableName = "abnormal_tbl" + def jobIds = [] + def statsApiDebugBe = null + def statsApiSleepBe = null + def downloadDebugBes = [] + + def rows = { int begin, int end -> + (begin.. 0 } + } + + def statsBeforeApiError = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { it.seg_num.finish_5m > 0 && it.seg_num.fail_5m == 0 }, 30000) + logger.info("SyncStats before API error injection: ${statsBeforeApiError}") + + statsApiDebugBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster)[0] + GetDebugPoint().enableDebugPoint(statsApiDebugBe[1].toString(), statsApiDebugBe[4] as int, + NodeType.BE, "WarmUpStatsAction.handle.return_error") + def degradedInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert degradedInfo[0][3] in ["RUNNING", "PENDING"] : + "SHOW should keep the job visible while one BE stats API fails, row=${degradedInfo[0]}" + def degradedStats = WarmupMetricsUtils.parseSyncStats(degradedInfo) + logger.info("SyncStats with one BE API failure: ${degradedStats}") + assert !degradedStats.isEmpty() : "SHOW should return degraded SyncStats instead of failing" + assert degradedStats.seg_num.finish_5m > 0 : + "remaining target BE stats should still be aggregated, stats=${degradedStats}" + + GetDebugPoint().disableDebugPoint(statsApiDebugBe[1].toString(), statsApiDebugBe[4] as int, + NodeType.BE, "WarmUpStatsAction.handle.return_error") + statsApiDebugBe = null + def restoredStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { it.seg_num.finish_5m >= statsBeforeApiError.seg_num.finish_5m }, 30000) + logger.info("SyncStats after API error recovery: ${restoredStats}") + + statsApiSleepBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster)[0] + GetDebugPoint().enableDebugPoint(statsApiSleepBe[1].toString(), statsApiSleepBe[4] as int, + NodeType.BE, "WarmUpStatsAction.handle.sleep", [sleep_ms: 12000]) + long timeoutStartMs = System.currentTimeMillis() + def timeoutInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + long timeoutElapsedMs = System.currentTimeMillis() - timeoutStartMs + assert timeoutInfo[0][3] in ["RUNNING", "PENDING"] : + "SHOW should keep the job visible while one BE stats API times out, row=${timeoutInfo[0]}" + def timeoutStats = WarmupMetricsUtils.parseSyncStats(timeoutInfo) + logger.info("SyncStats with one BE stats API timeout: ${timeoutStats}, elapsedMs=${timeoutElapsedMs}") + assert timeoutElapsedMs < 9000 : + "FE should use a bounded timeout for BE stats API requests, elapsedMs=${timeoutElapsedMs}" + assert !timeoutStats.isEmpty() : + "SHOW should return degraded SyncStats instead of waiting for the slow BE" + assert timeoutStats.seg_num.finish_5m > 0 : + "remaining target BE stats should still be aggregated after timeout, stats=${timeoutStats}" + GetDebugPoint().disableDebugPoint(statsApiSleepBe[1].toString(), statsApiSleepBe[4] as int, + NodeType.BE, "WarmUpStatsAction.handle.sleep") + statsApiSleepBe = null + + def targetBes = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster) + for (be in targetBes) { + GetDebugPoint().enableDebugPoint(be[1].toString(), be[4] as int, NodeType.BE, + "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_error") + downloadDebugBes << be + } + + def beforeFailureMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + def beforeFailureStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { !it.isEmpty() }, 30000) + sql """INSERT INTO ${tableName} VALUES ${rows(100, 108)}""" + waitUntil("download failure metric", 60000) { + def m = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + return m.failed > beforeFailureMetrics.failed && m.finished + m.failed >= m.submitted + } + def failedMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + assert failedMetrics.failed > beforeFailureMetrics.failed : + "injected download failure should increase failed bvar, before=${beforeFailureMetrics}, after=${failedMetrics}" + + def failedStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, { + it.seg_num.fail_5m > beforeFailureStats.seg_num.fail_5m + && it.seg_num.gap_5m > beforeFailureStats.seg_num.gap_5m + }, 30000) + logger.info("SyncStats after injected download failure: ${failedStats}") + assert failedStats.seg_num.fail_5m > 0 : "5m fail window should expose download failure" + assert failedStats.seg_num.fail_30m > 0 : "30m fail window should expose download failure" + assert failedStats.seg_num.fail_1h > 0 : "1h fail window should expose download failure" + assert failedStats.seg_num.gap_5m > 0 : "5m gap should expose unfinished failed warmup" + assert failedStats.seg_num.gap_30m > 0 : "30m gap should expose unfinished failed warmup" + assert failedStats.seg_num.gap_1h > 0 : "1h gap should expose unfinished failed warmup" + + for (be in downloadDebugBes) { + GetDebugPoint().disableDebugPoint(be[1].toString(), be[4] as int, NodeType.BE, + "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_error") + } + downloadDebugBes.clear() + + def beforeRecoveryMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """INSERT INTO ${tableName} VALUES ${rows(200, 208)}""" + def afterRecoveryMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeRecoveryMetrics.finished + 1, 90000) + assert afterRecoveryMetrics.finished > beforeRecoveryMetrics.finished : + "warmup should recover and finish new downloads, before=${beforeRecoveryMetrics}, after=${afterRecoveryMetrics}" + assert afterRecoveryMetrics.failed == beforeRecoveryMetrics.failed : + "recovered warmup should not add new failures, before=${beforeRecoveryMetrics}, after=${afterRecoveryMetrics}" + + def recoveredStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, { + it.seg_num.finish_5m > failedStats.seg_num.finish_5m + && it.seg_num.fail_5m >= failedStats.seg_num.fail_5m + }, 30000) + logger.info("SyncStats after failure recovery: ${recoveredStats}") + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + sql """set query_freshness_tolerance_ms = 5000""" + def res = sql """SELECT count(*) FROM ${tableName}""" + assert res[0][0].toString() == "48" : "target query should see all rows after failure recovery: ${res}" + + def stoppedStatsBeIndex = dstBeIndexes[0] as int + def stoppedStatsBe = cluster.getBeByIndex(stoppedStatsBeIndex) + cluster.stopBackends(stoppedStatsBeIndex) + waitUntil("target BE ${stoppedStatsBe.backendId} to be marked dead", 30000) { + def row = sql("SHOW BACKENDS").find { + it[0].toString() == stoppedStatsBe.backendId.toString() + } + return row != null && row[9].toString() == "false" + } + def beDownInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert beDownInfo[0][3] in ["RUNNING", "PENDING"] : + "SHOW should keep the job visible while one target BE is down, row=${beDownInfo[0]}" + def beDownStats = WarmupMetricsUtils.parseSyncStats(beDownInfo) + logger.info("SyncStats with one target BE down: ${beDownStats}") + assert !beDownStats.isEmpty() : + "SHOW should return degraded SyncStats when one target BE is down" + assert beDownStats.seg_num.finish_5m > 0 : + "remaining target BE stats should still be aggregated when one target BE is down, stats=${beDownStats}" + } finally { + if (statsApiDebugBe != null) { + try { + GetDebugPoint().disableDebugPoint(statsApiDebugBe[1].toString(), + statsApiDebugBe[4] as int, NodeType.BE, + "WarmUpStatsAction.handle.return_error") + } catch (Exception ignored) {} + } + if (statsApiSleepBe != null) { + try { + GetDebugPoint().disableDebugPoint(statsApiSleepBe[1].toString(), + statsApiSleepBe[4] as int, NodeType.BE, + "WarmUpStatsAction.handle.sleep") + } catch (Exception ignored) {} + } + if (!downloadDebugBes.isEmpty()) { + try { GetDebugPoint().clearDebugPointsForAllBEs() } catch (Exception ignored) {} + } + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS ${tableName}""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_canonicalization.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_canonicalization.groovy new file mode 100644 index 00000000000000..5ca684acbd5064 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_canonicalization.groovy @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import groovy.json.JsonSlurper + +suite('test_warm_up_event_on_tables_canonicalization', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_canon_db" + def dbOther = "test_on_tables_canon_other_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbOther}""" + + sql """use ${dbName}""" + sql """CREATE TABLE orders (id INT) DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1""" + sql """CREATE TABLE tmp_staging (id INT) DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1""" + sql """use ${dbOther}""" + sql """CREATE TABLE logs (id INT) DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1""" + sql """use @${clusterName1}""" + + // Create a job with specific rule order + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*', + EXCLUDE '${dbName}.tmp_*', + INCLUDE '${dbOther}.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + jobIds << jobId + + def jobInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + def tableFilter = jobInfo[0][13] + logger.info("TableFilter: ${tableFilter}") + + // Try creating a "duplicate" with rules in different order — should fail + // because canonicalization normalizes rule order + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbOther}.*', + INCLUDE '${dbName}.*', + EXCLUDE '${dbName}.tmp_*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected duplicate job error" + } catch (java.sql.SQLException e) { + logger.info("Expected error for duplicate job: ${e.getMessage()}") + assert e.getMessage().contains("already has a runnable job") + } + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS orders""" + sql """DROP TABLE IF EXISTS tmp_staging""" + } catch (Exception ignored) {} + try { + sql """use ${dbOther}""" + sql """DROP TABLE IF EXISTS logs""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbOther}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_dynamic.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_dynamic.groovy new file mode 100644 index 00000000000000..c9de7fb56ee821 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_dynamic.groovy @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +suite('test_warm_up_event_on_tables_dynamic', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_dynamic_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + + // ===== Test 1: New table auto-included after job creation ===== + logger.info("===== Test 1: New table auto-included =====") + + sql """CREATE TABLE IF NOT EXISTS fact_orders (id INT, amount DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + sql """use @${clusterName1}""" + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.fact_*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + jobIds << jobId + logger.info("Warm-up job ID: ${jobId}") + + sleep(3000) + + // Verify initial matched tables + def initMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.fact_orders".toString()] as Set) + logger.info("Initial MatchedTables: ${initMatched}") + assert "${dbName}.fact_orders".toString() in initMatched + + // Create a new table that matches the pattern + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS fact_sales (id INT, revenue DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + // Also create a table that does NOT match the pattern + sql """CREATE TABLE IF NOT EXISTS dim_product (id INT, name STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + // Poll until new matching table is auto-included + def matchedAfterCreate = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.fact_orders".toString(), "${dbName}.fact_sales".toString()] as Set, + ["${dbName}.dim_product".toString()] as Set) + logger.info("MatchedTables after create: ${matchedAfterCreate}") + assert "${dbName}.fact_orders".toString() in matchedAfterCreate + assert "${dbName}.fact_sales".toString() in matchedAfterCreate + assert !("${dbName}.dim_product".toString() in matchedAfterCreate) + + // Verify warmup works for the new table — with quantitative metric check + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + def numInserts = 5 + sql """use ${dbName}""" + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO fact_sales VALUES (${i}, ${i * 100.0})""" + } + + def finalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + baseMetrics.finished + numInserts) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + def reqDelta = finalMetrics.requested - baseMetrics.requested + def subDelta = finalMetrics.submitted - baseMetrics.submitted + def finDelta = finalMetrics.finished - baseMetrics.finished + def failDelta = finalMetrics.failed - baseMetrics.failed + logger.info("Test1 deltas: requested=${reqDelta}, submitted=${subDelta}, finished=${finDelta}, failed=${failDelta}") + assert reqDelta >= numInserts : "Expected requested >= ${numInserts}, got ${reqDelta}" + assert subDelta >= numInserts : "Expected submitted >= ${numInserts}, got ${subDelta}" + assert finDelta >= numInserts : "Expected finished >= ${numInserts}, got ${finDelta}" + assert failDelta == 0 : "Expected 0 failed, got ${failDelta}" + + // Negative proof: insert into dim_product (not matched) + def metricsBeforeDim = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, clusterName1, clusterName2) + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO dim_product VALUES (${i}, 'product_${i}')""" + } + sleep(5000) + def metricsAfterDim = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + def dimSubDelta = metricsAfterDim.submitted - metricsBeforeDim.submitted + def dimFinDelta = metricsAfterDim.finished - metricsBeforeDim.finished + assert dimSubDelta == 0 : "dim_product inserts should not trigger warmup, submitted delta=${dimSubDelta}" + assert dimFinDelta == 0 : "dim_product inserts should not trigger warmup, finished delta=${dimFinDelta}" + + // ===== Test 2: Dropped table auto-excluded ===== + logger.info("===== Test 2: Dropped table auto-excluded =====") + + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS fact_orders""" + + // Poll until dropped table is removed + def matchedAfterDrop = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.fact_sales".toString()] as Set, + ["${dbName}.fact_orders".toString()] as Set) + logger.info("MatchedTables after drop: ${matchedAfterDrop}") + assert !("${dbName}.fact_orders".toString() in matchedAfterDrop) + assert "${dbName}.fact_sales".toString() in matchedAfterDrop + + // Job should still be running + def jobInfoAfterDrop = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert jobInfoAfterDrop[0][3] in ["RUNNING", "PENDING"] + + // ===== Test 3: Rename table — pattern re-evaluation ===== + logger.info("===== Test 3: Rename table =====") + + // Rename fact_sales to archive_sales (no longer matches fact_*) + sql """ALTER TABLE ${dbName}.fact_sales RENAME archive_sales""" + + def matchedAfterRename = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + [] as Set, + ["${dbName}.fact_sales".toString(), "${dbName}.archive_sales".toString()] as Set) + logger.info("MatchedTables after rename to archive_sales: ${matchedAfterRename}") + assert !("${dbName}.fact_sales".toString() in matchedAfterRename) + assert !("${dbName}.archive_sales".toString() in matchedAfterRename) + + // Job still running even with no matched tables + def jobInfoAfterRename = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert jobInfoAfterRename[0][3] in ["RUNNING", "PENDING"] + + // Rename back to a matching name + sql """ALTER TABLE ${dbName}.archive_sales RENAME fact_revenue""" + + def matchedAfterRenameBack = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.fact_revenue".toString()] as Set) + logger.info("MatchedTables after rename to fact_revenue: ${matchedAfterRenameBack}") + assert "${dbName}.fact_revenue".toString() in matchedAfterRenameBack + + // Verify warmup still works after rename-back — with quantitative metric check + def metricsBeforeRenameInsert = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + def numRenameInserts = 5 + sql """use ${dbName}""" + for (int i = 0; i < numRenameInserts; i++) { + sql """INSERT INTO fact_revenue VALUES (${i + 100}, ${i * 50.0})""" + } + + def metricsAfterRenameInsert = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + metricsBeforeRenameInsert.finished + numRenameInserts) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + def renameReqDelta = metricsAfterRenameInsert.requested - metricsBeforeRenameInsert.requested + def renameSubDelta = metricsAfterRenameInsert.submitted - metricsBeforeRenameInsert.submitted + def renameFinDelta = metricsAfterRenameInsert.finished - metricsBeforeRenameInsert.finished + def renameFailDelta = metricsAfterRenameInsert.failed - metricsBeforeRenameInsert.failed + logger.info("Rename test deltas: requested=${renameReqDelta}, submitted=${renameSubDelta}, finished=${renameFinDelta}, failed=${renameFailDelta}") + assert renameReqDelta >= numRenameInserts : "Expected requested >= ${numRenameInserts}, got ${renameReqDelta}" + assert renameSubDelta >= numRenameInserts : "Expected submitted >= ${numRenameInserts}, got ${renameSubDelta}" + assert renameFinDelta >= numRenameInserts : "Expected finished >= ${numRenameInserts}, got ${renameFinDelta}" + assert renameFailDelta == 0 : "Expected 0 failed, got ${renameFailDelta}" + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS fact_orders""" + sql """DROP TABLE IF EXISTS fact_sales""" + sql """DROP TABLE IF EXISTS fact_revenue""" + sql """DROP TABLE IF EXISTS archive_sales""" + sql """DROP TABLE IF EXISTS dim_product""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_error_and_lifecycle.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_error_and_lifecycle.groovy new file mode 100644 index 00000000000000..34d68357ea31ef --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_error_and_lifecycle.groovy @@ -0,0 +1,387 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils +import groovy.json.JsonSlurper + +suite('test_warm_up_event_on_tables_error_and_lifecycle', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_err_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS base_table (id INT, val STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + sql """use @${clusterName1}""" + + // ===== Error Test 1: Exclude-only (no INCLUDE) ===== + logger.info("===== Error Test 1: Exclude-only =====") + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + EXCLUDE '${dbName}.tmp_*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected error for exclude-only ON TABLES" + } catch (java.sql.SQLException e) { + logger.info("Expected error: ${e.getMessage()}") + assert e.getMessage().contains("at least one INCLUDE") : "Error should mention INCLUDE requirement: ${e.getMessage()}" + } + + // ===== Error Test 2: Invalid pattern format (missing db.table) ===== + logger.info("===== Error Test 2: Invalid pattern format =====") + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE 'orders' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected error for invalid pattern format" + } catch (java.sql.SQLException e) { + logger.info("Expected error: ${e.getMessage()}") + assert e.getMessage().contains("db.table") : "Error should mention db.table format: ${e.getMessage()}" + } + + // ===== Error Test 3: ON TABLES with non-event-driven sync mode ===== + logger.info("===== Error Test 3: ON TABLES with periodic sync mode =====") + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "periodic", + "sync_interval_sec" = "10" + ) + """ + assert false : "Expected error for ON TABLES with periodic sync" + } catch (java.sql.SQLException e) { + logger.info("Expected error: ${e.getMessage()}") + assert e.getMessage().contains("event_driven") : "Error should mention event_driven requirement: ${e.getMessage()}" + } + + // ===== Error Test 4: No tables match the pattern ===== + logger.info("===== Error Test 4: No matching tables =====") + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE 'nonexistent_db_xyz.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected error for no matching tables" + } catch (java.sql.SQLException e) { + logger.info("Expected error: ${e.getMessage()}") + def msg = e.getMessage().toLowerCase() + assert msg.contains("no tables matched") || msg.contains("no table") : "Error should indicate no tables matched: ${e.getMessage()}" + } + + // ===== Error Test 5: ON TABLES with ONCE sync mode ===== + logger.info("===== Error Test 5: ON TABLES with once sync mode =====") + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "once" + ) + """ + assert false : "Expected error for ON TABLES with once sync" + } catch (java.sql.SQLException e) { + logger.info("Expected error: ${e.getMessage()}") + assert e.getMessage().contains("event_driven") : "Error should mention event_driven: ${e.getMessage()}" + } + + // ===== Lifecycle Test 1: Cluster-level and table-level jobs are mutually exclusive ===== + logger.info("===== Lifecycle Test 1: Cross-level conflict =====") + + // Create cluster-level event-driven job + def clusterJobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def clusterJobId = clusterJobId_[0][0] + jobIds << clusterJobId + logger.info("Cluster-level job ID: ${clusterJobId}") + + // Creating a table-level load-event job for the same source and destination should fail. + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.base_table' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected table-level job to conflict with existing cluster-level load-event job" + } catch (java.sql.SQLException e) { + logger.info("Expected cross-level conflict: ${e.getMessage()}") + assert e.getMessage().contains("Cannot create table-level load-event warm up job") : e.getMessage() + assert e.getMessage().contains("conflicting cluster-level load-event warm up job ${clusterJobId}") : + e.getMessage() + assert e.getMessage().contains("Cancel existing load-event warm up job ${clusterJobId}") : + e.getMessage() + } + + def clusterJobInfo = sql """SHOW WARM UP JOB WHERE ID = ${clusterJobId}""" + assert clusterJobInfo[0][13] == "" : "Cluster-level job should have empty TableFilter" + assert clusterJobInfo[0][14] == "" : "Cluster-level job should have empty MatchedTables" + + sql """CANCEL WARM UP JOB WHERE ID = ${clusterJobId}""" + def clusterCancelInfo = sql """SHOW WARM UP JOB WHERE ID = ${clusterJobId}""" + assert clusterCancelInfo[0][3] == "CANCELLED" + + // Create table-level event-driven job after cancelling the conflicting cluster-level job. + def tableJobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.base_table' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def tableJobId = tableJobId_[0][0] + jobIds << tableJobId + logger.info("Table-level job ID: ${tableJobId}") + + def tableJobInfo = sql """SHOW WARM UP JOB WHERE ID = ${tableJobId}""" + + // Table-level job should have non-empty TableFilter and MatchedTables + assert tableJobInfo[0][13].length() > 0 : "Table-level job should have non-empty TableFilter" + def tableJobMatched = WarmupMetricsUtils.parseMatchedTables(tableJobInfo) + assert "${dbName}.base_table".toString() in tableJobMatched : "Table-level job MatchedTables should contain base_table" + + // Creating a cluster-level load-event job should also fail while the table-level job is running. + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected cluster-level job to conflict with existing table-level load-event job" + } catch (java.sql.SQLException e) { + logger.info("Expected reverse cross-level conflict: ${e.getMessage()}") + assert e.getMessage().contains("Cannot create cluster-level load-event warm up job") : e.getMessage() + assert e.getMessage().contains("conflicting table-level load-event warm up job ${tableJobId}") : + e.getMessage() + assert e.getMessage().contains("Cancel existing load-event warm up job ${tableJobId}") : + e.getMessage() + } + + // ===== Lifecycle Test 2: Duplicate detection with normalized rules ===== + logger.info("===== Lifecycle Test 2: Duplicate detection =====") + + // Try creating same table-level job again + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.base_table' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected duplicate job error" + } catch (java.sql.SQLException e) { + logger.info("Expected error for duplicate: ${e.getMessage()}") + assert e.getMessage().contains("already has a runnable job") : e.getMessage() + } + + // Different filter should succeed + def tableJobId2_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def tableJobId2 = tableJobId2_[0][0] + jobIds << tableJobId2 + logger.info("Table-level job2 ID (different filter): ${tableJobId2}") + + // ===== Lifecycle Test 3: Cancel and recreate ===== + logger.info("===== Lifecycle Test 3: Cancel and recreate =====") + + sql """CANCEL WARM UP JOB WHERE ID = ${tableJobId}""" + def cancelInfo = sql """SHOW WARM UP JOB WHERE ID = ${tableJobId}""" + assert cancelInfo[0][3] == "CANCELLED" + + // After cancelling, we should be able to create a job with the same filter + def tableJobId3_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.base_table' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def tableJobId3 = tableJobId3_[0][0] + jobIds << tableJobId3 + logger.info("Table-level job3 ID (after cancel+recreate): ${tableJobId3}") + + // Verify new job is running + def jobInfo3 = sql """SHOW WARM UP JOB WHERE ID = ${tableJobId3}""" + assert jobInfo3[0][3] in ["RUNNING", "PENDING"] : "Recreated job should be running" + + // ===== Lifecycle Test 4: ? wildcard matching with quantitative metrics ===== + logger.info("===== Lifecycle Test 4: ? wildcard =====") + + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS log_a (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS log_b (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS log_ab (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + sql """use @${clusterName1}""" + def jobIdQ_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.log_?' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobIdQ = jobIdQ_[0][0] + jobIds << jobIdQ + logger.info("Wildcard ? job ID: ${jobIdQ}") + + sleep(3000) + + def jobInfoQ = sql """SHOW WARM UP JOB WHERE ID = ${jobIdQ}""" + def matchedSetQ = WarmupMetricsUtils.parseMatchedTables(jobInfoQ) + logger.info("MatchedTables for ? wildcard: ${matchedSetQ}") + assert "${dbName}.log_a".toString() in matchedSetQ : "log_a should match log_? pattern" + assert "${dbName}.log_b".toString() in matchedSetQ : "log_b should match log_? pattern" + assert !("${dbName}.log_ab".toString() in matchedSetQ) : "log_ab should NOT match log_? (? matches exactly one char)" + + // Quantitative metric verification for ? wildcard + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + // Insert into matched tables log_a and log_b + def numInserts = 3 + sql """use ${dbName}""" + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO log_a VALUES (${i}, 'msg_a_${i}')""" + sql """INSERT INTO log_b VALUES (${i}, 'msg_b_${i}')""" + } + def expectedSegments = numInserts * 2 // 2 matched tables + + // Insert into non-matched table log_ab + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO log_ab VALUES (${i}, 'msg_ab_${i}')""" + } + + def finalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + baseMetrics.finished + expectedSegments) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + def reqDelta = finalMetrics.requested - baseMetrics.requested + def subDelta = finalMetrics.submitted - baseMetrics.submitted + def finDelta = finalMetrics.finished - baseMetrics.finished + def failDelta = finalMetrics.failed - baseMetrics.failed + logger.info("? wildcard deltas: requested=${reqDelta}, submitted=${subDelta}, finished=${finDelta}, failed=${failDelta}") + + // Only log_a and log_b should have been warmed, not log_ab + assert reqDelta >= expectedSegments : "Expected requested >= ${expectedSegments}, got ${reqDelta}" + assert subDelta >= expectedSegments : "Expected submitted >= ${expectedSegments}, got ${subDelta}" + assert finDelta >= expectedSegments : "Expected finished >= ${expectedSegments}, got ${finDelta}" + assert failDelta == 0 : "Expected 0 failed, got ${failDelta}" + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS base_table""" + sql """DROP TABLE IF EXISTS log_a""" + sql """DROP TABLE IF EXISTS log_b""" + sql """DROP TABLE IF EXISTS log_ab""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include.groovy new file mode 100644 index 00000000000000..c2b02d8085a400 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include.groovy @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils +import groovy.json.JsonSlurper + +suite('test_warm_up_event_on_tables_include', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_inc_db" + def dbExcluded = "test_on_tables_exc_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbExcluded}""" + + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS customers (id INT, name STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE VIEW IF NOT EXISTS view_orders AS SELECT id, amount FROM orders""" + + sql """use ${dbExcluded}""" + sql """CREATE TABLE IF NOT EXISTS logs (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + // Create INCLUDE wildcard job + sql """use @${clusterName1}""" + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + jobIds << jobId + logger.info("Warm-up job ID: ${jobId}") + + sleep(3000) + + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + logger.info("Baseline metrics: ${baseMetrics}") + + // Negative proof: insert ONLY into excluded db + def numExcludedInserts = 5 + sql """use ${dbExcluded}""" + for (int i = 0; i < numExcludedInserts; i++) { + sql """INSERT INTO logs VALUES (${i}, 'log_message_${i}')""" + } + sleep(5000) + + def metricsAfterExcluded = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + def excludedSubmittedDelta = metricsAfterExcluded.submitted - baseMetrics.submitted + def excludedFinishedDelta = metricsAfterExcluded.finished - baseMetrics.finished + assert excludedSubmittedDelta == 0 : "Excluded inserts should not submit segments, delta=${excludedSubmittedDelta}" + assert excludedFinishedDelta == 0 : "Excluded inserts should not finish segments, delta=${excludedFinishedDelta}" + + // Positive proof: insert into included db + def numIncludedInserts = 5 + def expectedSegments = numIncludedInserts * 2 // 2 tables: orders + customers + sql """use ${dbName}""" + for (int i = 0; i < numIncludedInserts; i++) { + sql """INSERT INTO orders VALUES (${i}, ${i * 10.5})""" + sql """INSERT INTO customers VALUES (${i}, 'customer_${i}')""" + } + + def finalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + metricsAfterExcluded.finished + expectedSegments) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + def requestedDelta = finalMetrics.requested - metricsAfterExcluded.requested + def submittedDelta = finalMetrics.submitted - metricsAfterExcluded.submitted + def finishedDelta = finalMetrics.finished - metricsAfterExcluded.finished + def failedDelta = finalMetrics.failed - metricsAfterExcluded.failed + logger.info("Included warmup deltas: requested=${requestedDelta}, submitted=${submittedDelta}, finished=${finishedDelta}, failed=${failedDelta}") + assert requestedDelta >= expectedSegments : "Expected requested >= ${expectedSegments}, got ${requestedDelta}" + assert submittedDelta >= expectedSegments : "Expected submitted >= ${expectedSegments}, got ${submittedDelta}" + assert finishedDelta >= expectedSegments : "Expected finished >= ${expectedSegments}, got ${finishedDelta}" + assert failedDelta == 0 : "Expected 0 failed segments, got ${failedDelta}" + + // Verify SHOW WARM UP JOB output + def jobInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert jobInfo[0][0] == jobId + assert jobInfo[0][1] == clusterName1 + assert jobInfo[0][2] == clusterName2 + assert jobInfo[0][3] in ["RUNNING", "PENDING"] + assert jobInfo[0][4] == "TABLES" + assert jobInfo[0][5] == "EVENT_DRIVEN (LOAD)" + + def tableFilter = jobInfo[0][13] + logger.info("TableFilter: ${tableFilter}") + assert tableFilter != null && tableFilter.length() > 0 + def filterJson = new JsonSlurper().parseText(tableFilter) + assert filterJson.include.contains("${dbName}.*".toString()) + assert !filterJson.containsKey("exclude") + + def matchedSet = WarmupMetricsUtils.parseMatchedTables(jobInfo) + logger.info("MatchedTables set: ${matchedSet}") + assert "${dbName}.orders".toString() in matchedSet + assert "${dbName}.customers".toString() in matchedSet + assert !matchedSet.contains("${dbName}.view_orders".toString()) + assert !matchedSet.any { it.startsWith("${dbExcluded}.") } + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS orders""" + sql """DROP TABLE IF EXISTS customers""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + try { + sql """use ${dbExcluded}""" + sql """DROP TABLE IF EXISTS logs""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbExcluded}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include_exclude.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include_exclude.groovy new file mode 100644 index 00000000000000..9fa2bdafc69d3b --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include_exclude.groovy @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils +import groovy.json.JsonSlurper + +suite('test_warm_up_event_on_tables_include_exclude', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_ie_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS customers (id INT, name STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS tmp_staging (id INT, data STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS orders_bak (id INT, amount DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + sql """use @${clusterName1}""" + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*', + EXCLUDE '${dbName}.tmp_*', + EXCLUDE '${dbName}.*_bak' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + jobIds << jobId + logger.info("Warm-up job ID: ${jobId}") + + sleep(3000) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + // Negative proof: insert only into excluded tables + def numExcInserts = 5 + sql """use ${dbName}""" + for (int i = 0; i < numExcInserts; i++) { + sql """INSERT INTO tmp_staging VALUES (${i}, 'staging_${i}')""" + sql """INSERT INTO orders_bak VALUES (${i}, ${i * 5.0})""" + } + sleep(5000) + + def metricsAfterExc = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + assert metricsAfterExc.submitted - baseMetrics.submitted == 0 : \ + "Excluded tables should not submit warmup segments" + assert metricsAfterExc.finished - baseMetrics.finished == 0 : \ + "Excluded tables should not finish warmup segments" + + // Positive proof: insert into included tables + def numIncInserts = 5 + def expectedSeg = numIncInserts * 2 // orders + customers + for (int i = 0; i < numIncInserts; i++) { + sql """INSERT INTO orders VALUES (${i + 100}, ${i * 20.5})""" + sql """INSERT INTO customers VALUES (${i + 100}, 'new_customer_${i}')""" + } + + def finalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + metricsAfterExc.finished + expectedSeg) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + def reqDelta = finalMetrics.requested - metricsAfterExc.requested + def subDelta = finalMetrics.submitted - metricsAfterExc.submitted + def finDelta = finalMetrics.finished - metricsAfterExc.finished + def failDelta = finalMetrics.failed - metricsAfterExc.failed + logger.info("Included deltas: requested=${reqDelta}, submitted=${subDelta}, finished=${finDelta}, failed=${failDelta}") + assert reqDelta >= expectedSeg : "Expected requested >= ${expectedSeg}, got ${reqDelta}" + assert subDelta >= expectedSeg : "Expected submitted >= ${expectedSeg}, got ${subDelta}" + assert finDelta >= expectedSeg : "Expected finished >= ${expectedSeg}, got ${finDelta}" + assert failDelta == 0 : "Expected 0 failed, got ${failDelta}" + + // Verify SHOW output + def jobInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + def tableFilter = jobInfo[0][13] + logger.info("TableFilter: ${tableFilter}") + def filterJson = new JsonSlurper().parseText(tableFilter) + assert filterJson.include.contains("${dbName}.*".toString()) + assert filterJson.exclude.contains("${dbName}.*_bak".toString()) + assert filterJson.exclude.contains("${dbName}.tmp_*".toString()) + + def matchedSet = WarmupMetricsUtils.parseMatchedTables(jobInfo) + logger.info("MatchedTables set: ${matchedSet}") + assert "${dbName}.orders".toString() in matchedSet + assert "${dbName}.customers".toString() in matchedSet + assert !("${dbName}.tmp_staging".toString() in matchedSet) + assert !("${dbName}.orders_bak".toString() in matchedSet) + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS orders""" + sql """DROP TABLE IF EXISTS customers""" + sql """DROP TABLE IF EXISTS tmp_staging""" + sql """DROP TABLE IF EXISTS orders_bak""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy new file mode 100644 index 00000000000000..50956454d1a10b --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy @@ -0,0 +1,221 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import groovy.json.JsonSlurper +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Covers table-level event-driven warmup on a MOW table with upsert writes, target reads, and full compaction. +suite('test_warm_up_event_on_tables_mow_compaction', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=20000', + 'warm_up_rowset_sync_wait_max_timeout_ms=30000', + ] + options.cloudMode = true + options.beNum = 1 + + def httpJson = { String method, String url, int readTimeoutMs = 180000 -> + def conn = new URL(url).openConnection() + conn.setRequestMethod(method) + conn.setConnectTimeout(10000) + conn.setReadTimeout(readTimeoutMs) + def text = conn.responseCode >= 400 ? conn.errorStream?.text : conn.inputStream.text + assert text != null && !text.trim().isEmpty() : "empty HTTP response from ${url}" + return new JsonSlurper().parseText(text.trim()) + } + + def triggerFullCompaction = { ip, port, tabletId -> + def status = httpJson("POST", + "http://${ip}:${port}/api/compaction/run?tablet_id=${tabletId}&compact_type=full") + assert status.status.toString().toLowerCase() in ["success", "already_exist"] : + "trigger compaction failed on ${ip}:${port}, tablet=${tabletId}, status=${status}" + return status + } + + def waitForCompactionFinish = { ip, port, tabletId, timeoutMs -> + long deadline = System.currentTimeMillis() + timeoutMs + def lastStatus = null + while (System.currentTimeMillis() < deadline) { + lastStatus = httpJson("GET", + "http://${ip}:${port}/api/compaction/run_status?tablet_id=${tabletId}", 10000) + assert lastStatus.status.toLowerCase() == "success" : + "compaction failed on ${ip}:${port}, tablet=${tabletId}, status=${lastStatus}" + if (!lastStatus.run_status) { + return lastStatus + } + sleep(1000) + } + assert false : "compaction did not finish on ${ip}:${port}, tablet=${tabletId}, last=${lastStatus}" + } + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_mow_compaction_db" + def tableName = "mow_tbl" + def jobIds = [] + + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster) + + try { + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS ${tableName} ( + id INT NOT NULL, + value INT, + tag STRING + ) + UNIQUE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "enable_unique_key_merge_on_write" = "true", + "file_cache_ttl_seconds" = "3600", + "disable_auto_compaction" = "true" + )""" + + def jobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE '${dbName}.${tableName}') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << jobId + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.${tableName}".toString()] as Set) == + ["${dbName}.${tableName}".toString()] as Set + + WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """INSERT INTO ${tableName} VALUES (1, 10, 'a'), (2, 20, 'b'), (3, 30, 'c')""" + def afterInitialLoad = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + baseMetrics.finished + 1, 60000) + assert afterInitialLoad.finished >= baseMetrics.finished + 1 : + "initial MOW warmup should finish, metrics=${afterInitialLoad}" + assert afterInitialLoad.failed == baseMetrics.failed : + "initial MOW warmup should not fail, metrics=${afterInitialLoad}" + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + def initialRead = sql """SELECT count(*), sum(value) FROM ${tableName}""" + assert initialRead[0][0].toString() == "3" : "target initial MOW count mismatch: ${initialRead}" + assert initialRead[0][1].toString() == "60" : "target initial MOW sum mismatch: ${initialRead}" + + def beforeUpsert = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """INSERT INTO ${tableName} VALUES (2, 200, 'b2'), (3, 300, 'c2')""" + def afterUpsert = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeUpsert.finished + 1, 60000) + assert afterUpsert.finished >= beforeUpsert.finished + 1 : + "first MOW upsert warmup should finish, metrics=${afterUpsert}" + assert afterUpsert.failed == beforeUpsert.failed : + "first MOW upsert warmup should not fail, metrics=${afterUpsert}" + + def beforeSecondUpsert = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """INSERT INTO ${tableName} VALUES (2, 220, 'b3'), (4, 40, 'd')""" + def afterSecondUpsert = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeSecondUpsert.finished + 1, 60000) + assert afterSecondUpsert.finished >= beforeSecondUpsert.finished + 1 : + "second MOW upsert warmup should finish, metrics=${afterSecondUpsert}" + assert afterSecondUpsert.failed == beforeSecondUpsert.failed : + "second MOW upsert warmup should not fail, metrics=${afterSecondUpsert}" + + def tablets = sql_return_maparray """SHOW TABLETS FROM ${tableName}""" + assert tablets.size() == 1 : "${tableName} should have one tablet, tablets=${tablets}" + def tabletId = tablets[0].TabletId.toString() + def sourceBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, srcCluster)[0] + def beforeCompaction = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + + triggerFullCompaction(sourceBe[1].toString(), sourceBe[4].toString(), tabletId) + waitForCompactionFinish(sourceBe[1].toString(), sourceBe[4].toString(), tabletId, 90000) + + def afterCompaction = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeCompaction.finished + 1, 90000) + assert afterCompaction.finished >= beforeCompaction.finished + 1 : + "MOW full compaction rowset warmup should finish, metrics=${afterCompaction}" + assert afterCompaction.failed == beforeCompaction.failed : + "MOW full compaction rowset warmup should not fail, metrics=${afterCompaction}" + + def beforePostCompactionUpsert = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """INSERT INTO ${tableName} VALUES (2, 222, 'b4'), (4, 44, 'd2'), (5, 50, 'e')""" + def afterPostCompactionUpsert = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforePostCompactionUpsert.finished + 1, 60000) + assert afterPostCompactionUpsert.finished >= beforePostCompactionUpsert.finished + 1 : + "post-compaction MOW upsert warmup should finish, metrics=${afterPostCompactionUpsert}" + assert afterPostCompactionUpsert.failed == beforePostCompactionUpsert.failed : + "post-compaction MOW upsert warmup should not fail, metrics=${afterPostCompactionUpsert}" + + def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { it.seg_num.finish_5m >= 5 && it.seg_num.fail_5m == 0 }, 60000) + logger.info("MOW warmup SyncStats: ${stats}") + assert stats.seg_num.fail_5m == 0 : "MOW warmup SyncStats should have no failures: ${stats}" + assert stats.seg_num.gap_5m == 0 : "MOW warmup SyncStats should converge: ${stats}" + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + profile("mow_compaction_target_profile") { + sql """set enable_profile = true""" + sql """set profile_level = 2""" + run { + def res = sql """/* mow_compaction_target_profile */ SELECT count(*), sum(value) FROM ${tableName}""" + assert res[0][0].toString() == "5" : "target final MOW count mismatch: ${res}" + assert res[0][1].toString() == "626" : "target final MOW sum mismatch: ${res}" + sleep(1000) + } + check { profileString, exception -> + if (exception != null) { + throw exception + } + assert profileString.contains("NumRemoteIOTotal") : "profile should contain file cache counters" + def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumRemoteIOTotal") + def localTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumLocalIOTotal") + logger.info("MOW profile NumRemoteIOTotal=${remoteTotal}, NumLocalIOTotal=${localTotal}") + assert remoteTotal == 0 : "warmed MOW target query should not read remote data" + assert localTotal > 0 : "warmed MOW target query should hit local file cache" + } + } + + def finalRead = sql """SELECT id, value, tag FROM ${tableName} ORDER BY id""" + assert finalRead.toString() == "[[1, 10, a], [2, 222, b4], [3, 300, c2], [4, 44, d2], [5, 50, e]]" : + "target MOW rows mismatch after upsert and full compaction: ${finalRead}" + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS ${tableName}""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_dst.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_dst.groovy new file mode 100644 index 00000000000000..7a220876378558 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_dst.groovy @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils +import groovy.json.JsonSlurper + +suite('test_warm_up_event_on_tables_multi_dst', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster1 = "warmup_target_1" + def dstCluster2 = "warmup_target_2" + + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster1) + cluster.addBackend(1, dstCluster2) + + sql """use @${srcCluster}""" + + def dbName = "test_on_tables_multi_dst_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + + sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS logs (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + // ===== Create job1: source -> target1, only 'orders' ===== + sql """use @${srcCluster}""" + def jobId1_ = sql """ + WARM UP CLUSTER ${dstCluster1} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${dbName}.orders' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId1 = jobId1_[0][0] + jobIds << jobId1 + logger.info("Job1 (source -> target1, orders only): ID=${jobId1}") + + // ===== Create job2: source -> target2, all tables ===== + def jobId2_ = sql """ + WARM UP CLUSTER ${dstCluster2} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId2 = jobId2_[0][0] + jobIds << jobId2 + logger.info("Job2 (source -> target2, all tables): ID=${jobId2}") + + sleep(3000) + + // Verify matched tables for each job + def matched1 = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId1, + ["${dbName}.orders".toString()] as Set, + ["${dbName}.logs".toString()] as Set) + logger.info("Job1 MatchedTables: ${matched1}") + assert "${dbName}.orders".toString() in matched1 + assert !("${dbName}.logs".toString() in matched1) + + def matched2 = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId2, + ["${dbName}.orders".toString(), "${dbName}.logs".toString()] as Set) + logger.info("Job2 MatchedTables: ${matched2}") + assert "${dbName}.orders".toString() in matched2 + assert "${dbName}.logs".toString() in matched2 + + // ===== Test 1: Insert into 'orders' — both targets should warm up ===== + logger.info("===== Test 1: orders -> both targets =====") + + def baseDst1 = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster1) + def baseDst2 = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster2) + logger.info("Baseline target1: ${baseDst1}, target2: ${baseDst2}") + + def numInserts = 5 + sql """use ${dbName}""" + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO orders VALUES (${i}, ${i * 10.5})""" + } + + // Wait for both targets to finish + def finalDst1 = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster1, + baseDst1.finished + numInserts) + def finalDst2 = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster2, + baseDst2.finished + numInserts) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, srcCluster, dstCluster1) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, srcCluster, dstCluster2) + + // Verify target1 + def dst1SubDelta = finalDst1.submitted - baseDst1.submitted + def dst1FinDelta = finalDst1.finished - baseDst1.finished + def dst1FailDelta = finalDst1.failed - baseDst1.failed + logger.info("Target1 deltas: submitted=${dst1SubDelta}, finished=${dst1FinDelta}, failed=${dst1FailDelta}") + assert dst1SubDelta >= numInserts : "Target1: expected submitted >= ${numInserts}, got ${dst1SubDelta}" + assert dst1FinDelta >= numInserts : "Target1: expected finished >= ${numInserts}, got ${dst1FinDelta}" + assert dst1FailDelta == 0 : "Target1: expected 0 failed, got ${dst1FailDelta}" + + // Verify target2 + def dst2SubDelta = finalDst2.submitted - baseDst2.submitted + def dst2FinDelta = finalDst2.finished - baseDst2.finished + def dst2FailDelta = finalDst2.failed - baseDst2.failed + logger.info("Target2 deltas: submitted=${dst2SubDelta}, finished=${dst2FinDelta}, failed=${dst2FailDelta}") + assert dst2SubDelta >= numInserts : "Target2: expected submitted >= ${numInserts}, got ${dst2SubDelta}" + assert dst2FinDelta >= numInserts : "Target2: expected finished >= ${numInserts}, got ${dst2FinDelta}" + assert dst2FailDelta == 0 : "Target2: expected 0 failed, got ${dst2FailDelta}" + + // ===== Test 2: Insert into 'logs' — only target2 should warm up ===== + logger.info("===== Test 2: logs -> only target2 =====") + + // Wait for metrics to stabilize before negative proof + WarmupMetricsUtils.waitForMetricsStable(sqlRunner, srcCluster, dstCluster1) + def baseDst1ForLogs = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster1) + def baseDst2ForLogs = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster2) + + sql """use ${dbName}""" + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO logs VALUES (${i}, 'log_${i}')""" + } + + // Wait for target2 to finish (logs is matched by job2) + def finalDst2ForLogs = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster2, + baseDst2ForLogs.finished + numInserts) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, srcCluster, dstCluster2) + + // Verify target2 warmed logs + def dst2LogsSubDelta = finalDst2ForLogs.submitted - baseDst2ForLogs.submitted + def dst2LogsFinDelta = finalDst2ForLogs.finished - baseDst2ForLogs.finished + def dst2LogsFailDelta = finalDst2ForLogs.failed - baseDst2ForLogs.failed + logger.info("Target2 logs deltas: submitted=${dst2LogsSubDelta}, finished=${dst2LogsFinDelta}, failed=${dst2LogsFailDelta}") + assert dst2LogsSubDelta >= numInserts : "Target2: expected submitted >= ${numInserts}, got ${dst2LogsSubDelta}" + assert dst2LogsFinDelta >= numInserts : "Target2: expected finished >= ${numInserts}, got ${dst2LogsFinDelta}" + assert dst2LogsFailDelta == 0 : "Target2: expected 0 failed, got ${dst2LogsFailDelta}" + + // Verify target1 did NOT warm logs (negative proof) + sleep(5000) + def finalDst1ForLogs = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, srcCluster, dstCluster1) + def dst1LogsSubDelta = finalDst1ForLogs.submitted - baseDst1ForLogs.submitted + def dst1LogsFinDelta = finalDst1ForLogs.finished - baseDst1ForLogs.finished + logger.info("Target1 logs deltas: submitted=${dst1LogsSubDelta}, finished=${dst1LogsFinDelta}") + assert dst1LogsSubDelta == 0 : "Target1 should NOT warm logs, submitted delta=${dst1LogsSubDelta}" + assert dst1LogsFinDelta == 0 : "Target1 should NOT warm logs, finished delta=${dst1LogsFinDelta}" + + // ===== Verify SHOW WARM UP JOB for both jobs ===== + logger.info("===== Verify SHOW WARM UP JOB output =====") + + def jobInfo1 = sql """SHOW WARM UP JOB WHERE ID = ${jobId1}""" + assert jobInfo1[0][2] == dstCluster1 + assert jobInfo1[0][3] in ["RUNNING", "PENDING"] + def filter1 = new JsonSlurper().parseText(jobInfo1[0][13]) + assert filter1.include.contains("${dbName}.orders".toString()) + assert !filter1.containsKey("exclude") + + def jobInfo2 = sql """SHOW WARM UP JOB WHERE ID = ${jobId2}""" + assert jobInfo2[0][2] == dstCluster2 + assert jobInfo2[0][3] in ["RUNNING", "PENDING"] + def filter2 = new JsonSlurper().parseText(jobInfo2[0][13]) + assert filter2.include.contains("${dbName}.*".toString()) + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS orders""" + sql """DROP TABLE IF EXISTS logs""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_include.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_include.groovy new file mode 100644 index 00000000000000..1faccc40e10cdf --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_include.groovy @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +suite('test_warm_up_event_on_tables_multi_include', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_mi_db" + def dbOther = "test_on_tables_mi_other_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbOther}""" + + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS customers (id INT, name STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + sql """use ${dbOther}""" + sql """CREATE TABLE IF NOT EXISTS logs (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + // Multiple INCLUDE: orders from dbName + logs from dbOther (but NOT customers) + sql """use @${clusterName1}""" + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.orders', + INCLUDE '${dbOther}.logs' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + jobIds << jobId + logger.info("Warm-up job ID: ${jobId}") + + sleep(3000) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + // Insert into matched (orders, logs) and unmatched (customers) + def numInserts = 5 + sql """use ${dbName}""" + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO orders VALUES (${i}, ${i * 30.0})""" + sql """INSERT INTO customers VALUES (${i}, 'extra_${i}')""" + } + sql """use ${dbOther}""" + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO logs VALUES (${i}, 'important_${i}')""" + } + + // Expected: orders(5) + logs(5) = 10 segments; customers(5) NOT included + def expectedSeg = numInserts * 2 // orders + logs + def finalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + baseMetrics.finished + expectedSeg) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + def reqDelta = finalMetrics.requested - baseMetrics.requested + def subDelta = finalMetrics.submitted - baseMetrics.submitted + def finDelta = finalMetrics.finished - baseMetrics.finished + def failDelta = finalMetrics.failed - baseMetrics.failed + logger.info("Multi-include deltas: requested=${reqDelta}, submitted=${subDelta}, finished=${finDelta}, failed=${failDelta}") + assert reqDelta >= expectedSeg : "Expected requested >= ${expectedSeg}, got ${reqDelta}" + assert subDelta >= expectedSeg : "Expected submitted >= ${expectedSeg}, got ${subDelta}" + assert finDelta >= expectedSeg : "Expected finished >= ${expectedSeg}, got ${finDelta}" + // customers(5 inserts) should NOT contribute; if they did, submitted would be >= 15 + assert subDelta < expectedSeg + numInserts : \ + "customers should NOT be warmed (submitted=${subDelta} should be < ${expectedSeg + numInserts})" + assert failDelta == 0 : "Expected 0 failed, got ${failDelta}" + + def jobInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + def matchedSet = WarmupMetricsUtils.parseMatchedTables(jobInfo) + logger.info("MatchedTables set: ${matchedSet}") + assert "${dbName}.orders".toString() in matchedSet + assert "${dbOther}.logs".toString() in matchedSet + assert !("${dbName}.customers".toString() in matchedSet) + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS orders""" + sql """DROP TABLE IF EXISTS customers""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + try { + sql """use ${dbOther}""" + sql """DROP TABLE IF EXISTS logs""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbOther}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_overlap_and_mv.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_overlap_and_mv.groovy new file mode 100644 index 00000000000000..fd55d170e483a9 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_overlap_and_mv.groovy @@ -0,0 +1,332 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +suite('test_warm_up_event_on_tables_overlap_and_mv', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def dstCluster = "warmup_target" + + def clusters = sql """SHOW CLUSTERS""" + assert !clusters.isEmpty() : "SHOW CLUSTERS should return the default source cluster" + def defaultCluster = clusters.find { + it[1].toString().equalsIgnoreCase("true") + } + def srcCluster = (defaultCluster ?: clusters[0])[0].toString() + logger.info("use default source cluster for overlap and mv warmup case: ${srcCluster}") + cluster.addBackend(1, dstCluster) + + def overlapDb = "test_on_tables_overlap_extra_db" + def mvDb = "test_on_tables_mv_extra_db" + def jobIds = [] + + try { + // FT-10: overlapping table-level jobs can coexist without duplicate target downloads. + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${overlapDb}""" + sql """use ${overlapDb}""" + sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount INT) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS customers (id INT, name STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS audit_log (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + def ordersJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${overlapDb}.orders' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << ordersJobId + + def customersJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${overlapDb}.customers' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << customersJobId + + def overlapJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${overlapDb}.*', + EXCLUDE '${overlapDb}.audit_*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << overlapJobId + + def ordersMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, ordersJobId, + ["${overlapDb}.orders".toString()] as Set, + ["${overlapDb}.customers".toString(), "${overlapDb}.audit_log".toString()] as Set) + def customersMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, customersJobId, + ["${overlapDb}.customers".toString()] as Set, + ["${overlapDb}.orders".toString(), "${overlapDb}.audit_log".toString()] as Set) + def overlapMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, overlapJobId, + ["${overlapDb}.orders".toString(), "${overlapDb}.customers".toString()] as Set, + ["${overlapDb}.audit_log".toString()] as Set) + assert ordersMatched == ["${overlapDb}.orders".toString()] as Set + assert customersMatched == ["${overlapDb}.customers".toString()] as Set + assert overlapMatched == ["${overlapDb}.customers".toString(), "${overlapDb}.orders".toString()] as Set + sleep(3000) + + def overlapBaseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + int rowsPerTable = 4 + for (int i = 0; i < rowsPerTable; i++) { + sql """INSERT INTO orders VALUES (${i}, ${i * 10})""" + sql """INSERT INTO customers VALUES (${i}, 'customer_${i}')""" + sql """INSERT INTO audit_log VALUES (${i}, 'audit_${i}')""" + } + + int uniqueMatchedSegments = rowsPerTable * 2 + int jobMatchedSegments = rowsPerTable * 4 + def overlapFinalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + overlapBaseMetrics.finished + uniqueMatchedSegments) + def requestedDelta = overlapFinalMetrics.requested - overlapBaseMetrics.requested + def submittedDelta = overlapFinalMetrics.submitted - overlapBaseMetrics.submitted + def finishedDelta = overlapFinalMetrics.finished - overlapBaseMetrics.finished + def failedDelta = overlapFinalMetrics.failed - overlapBaseMetrics.failed + logger.info("overlap deltas requested=${requestedDelta}, submitted=${submittedDelta}, " + + "finished=${finishedDelta}, failed=${failedDelta}") + assert requestedDelta >= jobMatchedSegments : + "source requested should count each matching job, expected >= ${jobMatchedSegments}, got ${requestedDelta}" + assert submittedDelta >= uniqueMatchedSegments : + "target should warm all unique matched rowsets, expected >= ${uniqueMatchedSegments}, got ${submittedDelta}" + assert submittedDelta <= uniqueMatchedSegments : + "overlap jobs should not amplify target downloads, expected <= ${uniqueMatchedSegments}, got ${submittedDelta}" + assert finishedDelta >= uniqueMatchedSegments : + "target should finish all unique matched rowsets, expected >= ${uniqueMatchedSegments}, got ${finishedDelta}" + assert failedDelta == 0 : "overlap jobs should not fail, got failed delta ${failedDelta}" + + def ordersStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, ordersJobId, { stats -> + stats.seg_num.requested_5m >= rowsPerTable + }) + def customersStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, customersJobId, { stats -> + stats.seg_num.requested_5m >= rowsPerTable + }) + def overlapStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, overlapJobId, { stats -> + stats.seg_num.requested_5m >= rowsPerTable * 2 + }) + assert ordersStats.seg_num.requested_5m < rowsPerTable * 2 : + "orders-only job should not include customers/audit, stats=${ordersStats}" + assert customersStats.seg_num.requested_5m < rowsPerTable * 2 : + "customers-only job should not include orders/audit, stats=${customersStats}" + assert overlapStats.seg_num.requested_5m >= rowsPerTable * 2 : + "overlap job should include orders and customers, stats=${overlapStats}" + + for (jid in [ordersJobId, customersJobId, overlapJobId]) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + WarmupMetricsUtils.waitForMetricsStable(sqlRunner, srcCluster, dstCluster) + + // FT-12: async MV is independently matchable, while sync MV/rollup warms with the base table. + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${mvDb}""" + sql """use ${mvDb}""" + def baseTable = "fact_rollup" + def rollupName = "rollup_sum" + def asyncMv = "mv_async_summary" + + sql """CREATE TABLE IF NOT EXISTS ${baseTable} (k INT, v INT) + DUPLICATE KEY(k) DISTRIBUTED BY HASH(k) BUCKETS 1 + PROPERTIES ( + "file_cache_ttl_seconds" = "3600", + "disable_auto_compaction" = "true" + )""" + sql """INSERT INTO ${baseTable} VALUES (1, 10), (2, 20)""" + sql """DROP MATERIALIZED VIEW IF EXISTS ${rollupName} ON ${baseTable}""" + sql """CREATE MATERIALIZED VIEW ${rollupName} AS + SELECT k AS rollup_k, sum(v) AS rollup_total_v FROM ${baseTable} GROUP BY k""" + waitingMVTaskFinishedByMvName(mvDb, baseTable, rollupName) + + sql """DROP MATERIALIZED VIEW IF EXISTS ${asyncMv}""" + sql """ + CREATE MATERIALIZED VIEW ${asyncMv} + BUILD DEFERRED REFRESH COMPLETE ON MANUAL + DISTRIBUTED BY HASH(k) BUCKETS 1 + PROPERTIES ('replication_num' = '1') + AS SELECT k, sum(v) AS total_v FROM ${baseTable} GROUP BY k + """ + + def baseJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${mvDb}.${baseTable}' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << baseJobId + + def mvJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${mvDb}.mv_*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << mvJobId + + def baseMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, baseJobId, + ["${mvDb}.${baseTable}".toString()] as Set, + ["${mvDb}.${asyncMv}".toString(), "${mvDb}.${rollupName}".toString()] as Set) + def mvMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, mvJobId, + ["${mvDb}.${asyncMv}".toString()] as Set, + ["${mvDb}.${baseTable}".toString(), "${mvDb}.${rollupName}".toString()] as Set) + assert baseMatched == ["${mvDb}.${baseTable}".toString()] as Set : + "base filter should match only base table, got ${baseMatched}" + assert mvMatched == ["${mvDb}.${asyncMv}".toString()] as Set : + "mv_* filter should match only async MV, got ${mvMatched}" + sleep(3000) + + def mvBaseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + int baseInsertRows = 3 + for (int i = 0; i < baseInsertRows; i++) { + sql """INSERT INTO ${baseTable} VALUES (${i + 1}, ${i + 1})""" + } + + int expectedBaseWarmupSegments = baseInsertRows + def afterBaseLoad = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + mvBaseMetrics.finished + expectedBaseWarmupSegments) + def baseLoadFinishedDelta = afterBaseLoad.finished - mvBaseMetrics.finished + logger.info("base table load with rollup warmup finished delta: ${baseLoadFinishedDelta}") + assert baseLoadFinishedDelta >= expectedBaseWarmupSegments : + "base load should warm while rollup exists, expected >= ${expectedBaseWarmupSegments}, got ${baseLoadFinishedDelta}" + + def baseJobStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, baseJobId, { stats -> + stats.seg_num.requested_5m >= expectedBaseWarmupSegments + && stats.seg_num.finish_5m >= expectedBaseWarmupSegments + }) + assert baseJobStats.seg_num.requested_5m >= expectedBaseWarmupSegments : + "base job should warm base table with rollup present without matching rollup as a table, stats=${baseJobStats}" + + def beforeMvRefresh = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, srcCluster, dstCluster) + sql """REFRESH MATERIALIZED VIEW ${asyncMv} COMPLETE""" + waitingMTMVTaskFinishedByMvName(asyncMv, mvDb) + def afterMvRefresh = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeMvRefresh.finished + 1) + assert afterMvRefresh.finished > beforeMvRefresh.finished : + "async MV refresh should trigger event-driven warmup after mv_* job is created" + + def mvJobStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, mvJobId, { stats -> + stats.seg_num.requested_5m >= 1 && stats.seg_num.finish_5m >= 1 + }) + assert mvJobStats.seg_num.requested_5m >= 1 : + "mv_* job should independently warm async MV rowsets, stats=${mvJobStats}" + + sql """use @${dstCluster}""" + sql """use ${mvDb}""" + def asyncMvRewriteQuerySql = + "SELECT k, sum(v) AS total_v FROM ${baseTable} GROUP BY k ORDER BY k" + mv_rewrite_success(asyncMvRewriteQuerySql, asyncMv, true) + profile("ft12_async_mv_target_profile") { + sql """set enable_profile = true""" + sql """set profile_level = 2""" + run { + def res = sql """/* ft12_async_mv_target_profile */ ${asyncMvRewriteQuerySql}""" + assert res.collect { [it[0].toString(), it[1].toString()] } == + [["1", "11"], ["2", "22"], ["3", "3"]] : + "target aggregate query should be rewritten to async MV and return MV data, got ${res}" + sleep(1000) + } + check { profileString, exception -> + if (exception != null) { + throw exception + } + assert profileString.contains("NumRemoteIOTotal") : + "profile should contain file cache counters" + def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, + "NumRemoteIOTotal") + def localTotal = WarmupMetricsUtils.sumProfileCounter(profileString, + "NumLocalIOTotal") + logger.info("async MV target profile NumRemoteIOTotal=${remoteTotal}, " + + "NumLocalIOTotal=${localTotal}") + assert remoteTotal == 0 : + "rewritten async MV query should not read remote data after warmup" + assert localTotal > 0 : + "rewritten async MV query should hit local file cache after warmup" + } + } + + def rollupQuery = sql """SELECT k, sum(v) FROM ${baseTable} GROUP BY k ORDER BY k""" + assert rollupQuery.collect { [it[0].toString(), it[1].toString()] } == + [["1", "11"], ["2", "22"], ["3", "3"]] : + "target cluster should read base table with rollup data correctly, got ${rollupQuery}" + def asyncMvQuery = sql """SELECT k, total_v FROM ${asyncMv} ORDER BY k""" + assert asyncMvQuery.collect { [it[0].toString(), it[1].toString()] } == + [["1", "11"], ["2", "22"], ["3", "3"]] : + "target cluster should read async MV correctly, got ${asyncMvQuery}" + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${overlapDb}""" + sql """DROP TABLE IF EXISTS orders""" + sql """DROP TABLE IF EXISTS customers""" + sql """DROP TABLE IF EXISTS audit_log""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${overlapDb}""" } catch (Exception ignored) {} + try { + sql """use @${srcCluster}""" + sql """use ${mvDb}""" + sql """DROP MATERIALIZED VIEW IF EXISTS mv_async_summary""" + sql """DROP MATERIALIZED VIEW IF EXISTS rollup_sum ON fact_rollup""" + sql """DROP TABLE IF EXISTS fact_rollup""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${mvDb}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_show_and_cancel.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_show_and_cancel.groovy new file mode 100644 index 00000000000000..1287e57999f8af --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_show_and_cancel.groovy @@ -0,0 +1,384 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import groovy.json.JsonSlurper +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +suite('test_warm_up_event_on_tables_show_and_cancel', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster) + + def showDb = "test_on_tables_show_extra_db" + def cancelDb = "test_on_tables_cancel_extra_db" + def jobIds = [] + def slurper = new JsonSlurper() + + def getJobRow = { jobId -> + def rows = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert rows.size() == 1 : "expected one row for job ${jobId}, got ${rows}" + assert rows[0].size() == 16 : "SHOW WARM UP JOB should expose 16 columns, got ${rows[0].size()}" + return rows[0] + } + + def waitForJobState = { jobId, Set expectedStates, long timeoutMs = 60000 -> + long deadline = System.currentTimeMillis() + timeoutMs + def row = null + while (System.currentTimeMillis() < deadline) { + row = getJobRow(jobId) + if (expectedStates.contains(row[3].toString())) { + return row + } + sleep(1000) + } + return row + } + + def assertEmptyNewColumns = { row, String jobDesc -> + assert row[13]?.toString() == "" : "${jobDesc} should have empty TableFilter, row=${row}" + assert row[14]?.toString() == "" : "${jobDesc} should have empty MatchedTables, row=${row}" + assert row[15]?.toString() == "" : "${jobDesc} should have empty SyncStats, row=${row}" + } + + def assertDetailedSyncStats = { row, String jobDesc -> + def stats = WarmupMetricsUtils.parseSyncStats([row]) + assert !stats.isEmpty() : "${jobDesc} should have detailed SyncStats, row=${row}" + assert stats.containsKey("seg_num") : "${jobDesc} detailed SyncStats should contain seg_num: ${stats}" + assert stats.containsKey("seg_size") : "${jobDesc} detailed SyncStats should contain seg_size: ${stats}" + assert stats.containsKey("idx_num") : "${jobDesc} detailed SyncStats should contain idx_num: ${stats}" + assert stats.containsKey("idx_size") : "${jobDesc} detailed SyncStats should contain idx_size: ${stats}" + assert stats.containsKey("last_trigger_ts") : + "${jobDesc} detailed SyncStats should contain last_trigger_ts: ${stats}" + assert stats.containsKey("last_finish_ts") : + "${jobDesc} detailed SyncStats should contain last_finish_ts: ${stats}" + assert !stats.containsKey("window") : "${jobDesc} WHERE ID output should not be compact summary: ${stats}" + assert !stats.containsKey("src_size") : "${jobDesc} WHERE ID output should not be compact summary: ${stats}" + assert !stats.containsKey("dst_size") : "${jobDesc} WHERE ID output should not be compact summary: ${stats}" + assert !stats.containsKey("gap_size") : "${jobDesc} WHERE ID output should not be compact summary: ${stats}" + return stats + } + + def assertSummarySyncStats = { row, String jobDesc -> + def raw = row[15]?.toString()?.trim() + assert raw != null && raw.length() > 0 : "${jobDesc} should have compact SyncStats summary, row=${row}" + def stats = slurper.parseText(raw) + assert stats.window == "30m" : "${jobDesc} list output should use 30m summary, row=${row}" + assert stats.src_size instanceof String : "${jobDesc} summary src_size should be a string: ${stats}" + assert stats.dst_size instanceof String : "${jobDesc} summary dst_size should be a string: ${stats}" + assert stats.gap_size instanceof String : "${jobDesc} summary gap_size should be a string: ${stats}" + assert !stats.containsKey("seg_num") : "${jobDesc} list output should not include detailed seg_num" + assert !stats.containsKey("seg_size") : "${jobDesc} list output should not include detailed seg_size" + assert !stats.containsKey("idx_num") : "${jobDesc} list output should not include detailed idx_num" + assert !stats.containsKey("idx_size") : "${jobDesc} list output should not include detailed idx_size" + assert !stats.containsKey("last_trigger_ts") : + "${jobDesc} list output should not include detailed last_trigger_ts" + assert !stats.containsKey("last_finish_ts") : + "${jobDesc} list output should not include detailed last_finish_ts" + return stats + } + + try { + // FT-05: SHOW WARM UP JOB mixes new ON TABLES jobs with old once/periodic/table jobs. + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${showDb}""" + sql """use ${showDb}""" + sql """CREATE TABLE IF NOT EXISTS show_base (id INT, val STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS show_extra (id INT, val STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """INSERT INTO show_base VALUES (0, 'seed')""" + + def oldTableJobId = sql("""WARM UP CLUSTER ${dstCluster} WITH TABLE show_base""")[0][0] + jobIds << oldTableJobId + def oldTableRow = waitForJobState(oldTableJobId, ["FINISHED", "RUNNING", "PENDING"] as Set) + assert oldTableRow[4] == "TABLE" : "old WITH TABLE job should be TABLE, row=${oldTableRow}" + assert oldTableRow[5].toString().startsWith("ONCE") : "old WITH TABLE job should be ONCE, row=${oldTableRow}" + assert oldTableRow[12].toString().contains("${showDb}.show_base".toString()) : + "old WITH TABLE job should show warmed table, row=${oldTableRow}" + assertEmptyNewColumns(oldTableRow, "old WITH TABLE job") + + def periodicJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + PROPERTIES ( + "sync_mode" = "periodic", + "sync_interval_sec" = "10" + ) + """)[0][0] + jobIds << periodicJobId + + def clusterJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << clusterJobId + + sleep(2000) + sql """use ${showDb}""" + sql """INSERT INTO show_base VALUES (1, 'cluster_base')""" + sql """INSERT INTO show_extra VALUES (1, 'cluster_extra')""" + + def clusterStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, clusterJobId, { stats -> + stats.seg_num.requested_5m > 0 && stats.seg_num.finish_5m > 0 + }, 8000) + assert !clusterStats.isEmpty() : "cluster-level event job should expose SyncStats" + assert clusterStats.seg_num.requested_5m > 0 : + "cluster event job SyncStats should observe load requests: ${clusterStats}" + assert clusterStats.seg_num.finish_5m > 0 : + "cluster event job SyncStats should observe target finishes: ${clusterStats}" + def runningClusterRow = getJobRow(clusterJobId) + def runningClusterStats = assertDetailedSyncStats(runningClusterRow, "cluster event job") + assert runningClusterStats.seg_num.requested_5m > 0 : + "cluster event job detailed SyncStats should observe load requests: ${runningClusterStats}" + assert runningClusterStats.seg_num.finish_5m > 0 : + "cluster event job detailed SyncStats should observe target finishes: ${runningClusterStats}" + def runningListRows = sql """SHOW WARM UP JOB""" + def runningClusterSummaryRow = runningListRows.find { it[0].toString() == clusterJobId.toString() } + def runningClusterSummary = assertSummarySyncStats(runningClusterSummaryRow, "cluster event job") + assert !runningClusterSummary.containsKey("data_size") : + "cluster event job summary should merge data and index sizes: ${runningClusterSummary}" + assert !runningClusterSummary.containsKey("index_size") : + "cluster event job summary should merge data and index sizes: ${runningClusterSummary}" + + try { + sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${showDb}.show_base' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """) + assert false : "Expected ON TABLES load-event job to conflict with existing cluster-level job" + } catch (java.sql.SQLException e) { + logger.info("Expected cross-level conflict: ${e.getMessage()}") + assert e.getMessage().contains("Cannot create table-level load-event warm up job") : e.getMessage() + assert e.getMessage().contains("conflicting cluster-level load-event warm up job ${clusterJobId}") : + e.getMessage() + assert e.getMessage().contains("Cancel existing load-event warm up job ${clusterJobId}") : + e.getMessage() + } + + sql """CANCEL WARM UP JOB WHERE ID = ${clusterJobId}""" + def cancelledClusterRow = getJobRow(clusterJobId) + assert cancelledClusterRow[3] == "CANCELLED" : "cluster event job should be cancelled, row=${cancelledClusterRow}" + + def tableJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${showDb}.show_base' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << tableJobId + + WarmupMetricsUtils.waitForMatchedTables(sqlRunner, tableJobId, + ["${showDb}.show_base".toString()] as Set, + ["${showDb}.show_extra".toString()] as Set) + + sleep(2000) + sql """use ${showDb}""" + sql """INSERT INTO show_base VALUES (2, 'table_base')""" + sql """INSERT INTO show_extra VALUES (2, 'table_extra')""" + + def tableStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, tableJobId, { stats -> + stats.seg_num.requested_5m > 0 && stats.seg_num.finish_5m > 0 + }, 8000) + assert !tableStats.isEmpty() : "table-level event job should expose SyncStats" + assert tableStats.seg_num.requested_5m > 0 : + "table-level event job SyncStats should observe load requests: ${tableStats}" + assert tableStats.seg_num.finish_5m > 0 : + "table-level event job SyncStats should observe target finishes: ${tableStats}" + + def periodicRow = getJobRow(periodicJobId) + assert periodicRow[4] == "CLUSTER" : "periodic job type should be CLUSTER, row=${periodicRow}" + assert periodicRow[5] == "PERIODIC (10s)" : "periodic job sync mode mismatch, row=${periodicRow}" + assertEmptyNewColumns(periodicRow, "periodic cluster job") + + def clusterRow = getJobRow(clusterJobId) + assert clusterRow[3] == "CANCELLED" : "cluster event job should remain visible after cancel, row=${clusterRow}" + assert clusterRow[4] == "CLUSTER" : "cluster event job type should be CLUSTER, row=${clusterRow}" + assert clusterRow[5] == "EVENT_DRIVEN (LOAD)" : "cluster event sync mode mismatch, row=${clusterRow}" + assert clusterRow[13] == "" : "cluster event job should not have TableFilter, row=${clusterRow}" + assert clusterRow[14] == "" : "cluster event job should not have MatchedTables, row=${clusterRow}" + + def tableRow = getJobRow(tableJobId) + assert tableRow[4] == "TABLES" : "ON TABLES job type should be TABLES, row=${tableRow}" + assert tableRow[5] == "EVENT_DRIVEN (LOAD)" : "ON TABLES sync mode mismatch, row=${tableRow}" + def tableFilter = slurper.parseText(tableRow[13].toString()) + assert tableFilter.include == ["${showDb}.show_base".toString()] : + "table filter should show the canonical include rule, row=${tableRow}" + def matched = WarmupMetricsUtils.parseMatchedTables([tableRow]) + assert matched == ["${showDb}.show_base".toString()] as Set : + "MatchedTables should contain only show_base, got ${matched}" + def detailedTableStats = assertDetailedSyncStats(tableRow, "ON TABLES job") + assert detailedTableStats.seg_num.requested_5m > 0 : + "ON TABLES detailed SyncStats should observe load requests: ${detailedTableStats}" + assert detailedTableStats.seg_num.finish_5m > 0 : + "ON TABLES detailed SyncStats should observe target finishes: ${detailedTableStats}" + + def listRows = sql """SHOW WARM UP JOB""" + for (jobId in [oldTableJobId, periodicJobId, clusterJobId, tableJobId]) { + def row = listRows.find { it[0].toString() == jobId.toString() } + assert row != null : "SHOW WARM UP JOB should include job ${jobId}, rows=${listRows}" + assert row.size() == 16 : "SHOW WARM UP JOB list row should expose 16 columns, row=${row}" + } + def tableSummaryRow = listRows.find { it[0].toString() == tableJobId.toString() } + def tableSummary = assertSummarySyncStats(tableSummaryRow, "ON TABLES job") + assert !tableSummary.containsKey("data_size") : + "ON TABLES job summary should merge data and index sizes: ${tableSummary}" + assert !tableSummary.containsKey("index_size") : + "ON TABLES job summary should merge data and index sizes: ${tableSummary}" + + for (jid in [periodicJobId, clusterJobId, tableJobId]) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + + // FT-11: cancel keeps existing cache but removes the job from subsequent load triggers. + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${cancelDb}""" + sql """use ${cancelDb}""" + sql """CREATE TABLE IF NOT EXISTS cancel_base (id INT, val STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + def cancelJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${cancelDb}.cancel_base' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << cancelJobId + WarmupMetricsUtils.waitForMatchedTables(sqlRunner, cancelJobId, + ["${cancelDb}.cancel_base".toString()] as Set) + sleep(3000) + + def firstBaseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + int firstLoadRows = 4 + for (int i = 0; i < firstLoadRows; i++) { + sql """INSERT INTO cancel_base VALUES (${i}, 'before_cancel_${i}')""" + } + WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + firstBaseMetrics.finished + firstLoadRows) + def stableBeforeCancel = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, srcCluster, dstCluster) + def targetCacheBeforeCancel = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, "ttl_cache_size") + assert targetCacheBeforeCancel > 0 : + "target cache should be populated before cancel, size=${targetCacheBeforeCancel}" + + sql """CANCEL WARM UP JOB WHERE ID = ${cancelJobId}""" + def cancelledRow = getJobRow(cancelJobId) + assert cancelledRow[3] == "CANCELLED" : "job should be CANCELLED, row=${cancelledRow}" + def targetCacheAfterCancel = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, "ttl_cache_size") + assert targetCacheAfterCancel >= targetCacheBeforeCancel : + "cancel should not evict existing cache, before=${targetCacheBeforeCancel}, after=${targetCacheAfterCancel}" + + int afterCancelRows = 3 + for (int i = 0; i < afterCancelRows; i++) { + sql """INSERT INTO cancel_base VALUES (${i + 100}, 'after_cancel_${i}')""" + } + sleep(8000) + def afterCancelledLoad = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, srcCluster, dstCluster) + assert afterCancelledLoad.requested == stableBeforeCancel.requested : + "cancelled job should not request new segments after load" + assert afterCancelledLoad.submitted == stableBeforeCancel.submitted : + "cancelled job should not submit new segments after load" + assert afterCancelledLoad.finished == stableBeforeCancel.finished : + "cancelled job should not finish new segments after load" + def targetCacheAfterCancelledLoad = + WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, "ttl_cache_size") + assert targetCacheAfterCancelledLoad >= targetCacheBeforeCancel : + "existing cache should remain after post-cancel load" + + sql """use @${dstCluster}""" + sql """use ${cancelDb}""" + profile("ft11_cancel_target_profile") { + sql """set enable_profile = true""" + sql """set profile_level = 2""" + run { + def expectedRows = firstLoadRows + afterCancelRows + def expectedSum = (0.. + if (exception != null) { + throw exception + } + assert profileString.contains("NumRemoteIOTotal") : "profile should contain file cache counters" + def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumRemoteIOTotal") + def localTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumLocalIOTotal") + logger.info("cancel target profile NumRemoteIOTotal=${remoteTotal}, NumLocalIOTotal=${localTotal}") + assert remoteTotal > 0 : + "post-cancel target query should read remote data for segments loaded after cancel" + assert localTotal > 0 : "post-cancel target query should still hit existing warmed cache" + } + } + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${showDb}""" + sql """DROP TABLE IF EXISTS show_base""" + sql """DROP TABLE IF EXISTS show_extra""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${showDb}""" } catch (Exception ignored) {} + try { + sql """use ${cancelDb}""" + sql """DROP TABLE IF EXISTS cancel_base""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${cancelDb}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy new file mode 100644 index 00000000000000..0def53a822b126 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy @@ -0,0 +1,298 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils +import groovy.json.JsonSlurper + +suite('test_warm_up_event_on_tables_sync_stats', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + 'cloud_warm_up_sync_stats_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + Closure fetchFeMetrics = { -> + def masterFe = cluster.getMasterFe() + WarmupMetricsUtils.getPrometheusMetrics(masterFe.host, masterFe.httpPort) + } + Closure waitForWarmUpSyncJobMetrics = { + Object jobId, String jobType, String srcClusterName, String dstClusterName -> + def commonLabels = [ + job_id: jobId.toString(), + job_type: jobType, + src_cluster_name: srcClusterName, + dst_cluster_name: dstClusterName + ] + String lastDebug = "" + long deadline = System.currentTimeMillis() + 30000 + while (System.currentTimeMillis() < deadline) { + def metricsText = fetchFeMetrics() + def infoLabels = [ + job_id: jobId.toString(), + job_type: jobType, + sync_mode: "EVENT_DRIVEN", + sync_event: "LOAD", + job_state: "RUNNING", + src_cluster_name: srcClusterName, + dst_cluster_name: dstClusterName + ] + def info = WarmupMetricsUtils.findPrometheusMetricValue(metricsText, + "doris_fe_file_cache_warm_up_sync_job_info", infoLabels) + def sizeMetrics = [:] + boolean allSizeMetricsPositive = true + for (window in ["5m", "30m", "1h"]) { + for (side in ["src", "dst"]) { + def key = "${side}_${window}".toString() + sizeMetrics[key] = WarmupMetricsUtils.findPrometheusMetricValue(metricsText, + "doris_fe_file_cache_warm_up_sync_job_size_bytes", + commonLabels + [side: side, window: window]) + if (sizeMetrics[key] == null || sizeMetrics[key] <= 0) { + allSizeMetricsPositive = false + } + } + } + + if (info == 1G && allSizeMetricsPositive) { + logger.info("FE warm-up sync metrics for job ${jobId}: ${sizeMetrics}") + return + } + lastDebug = metricsText.readLines() + .findAll { it.contains("file_cache_warm_up_sync_job") && it.contains("job_id=\"${jobId}\"") } + .join("\n") + sleep(1000) + } + assert false : "Timed out waiting FE warm-up sync metrics for ${jobType} job ${jobId}. " + + "Last matching metrics:\n${lastDebug}" + } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_sync_stats_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS t1 (id INT, val STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + // Create event-driven warmup job + sql """use @${clusterName1}""" + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + jobIds << jobId + logger.info("Warm-up job ID: ${jobId}") + + sleep(3000) + + // Capture baseline BEFORE inserts so we know the target + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + logger.info("Baseline metrics: ${baseMetrics}") + + // Insert data to trigger warmup + def numInserts = 5 + sql """use ${dbName}""" + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO t1 VALUES (${i}, 'value_${i}')""" + } + + // Wait for warmup to finish using bvar metrics + def metrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + baseMetrics.finished + numInserts) + logger.info("Warmup metrics after finish: ${metrics}") + + // Compute bvar deltas (source submitted, target finished) + def submittedDelta = metrics.submitted - baseMetrics.submitted + def finishedDelta = metrics.finished - baseMetrics.finished + logger.info("Bvar deltas: submitted=${submittedDelta}, finished=${finishedDelta}") + + // Poll SHOW WARM UP JOB until windowed metrics catch up with bvar values + // (bvar::Window samples every ~1s; values need time to accumulate) + def syncStats = null + def syncStatsStr = "" + long deadline = System.currentTimeMillis() + 30000 + while (System.currentTimeMillis() < deadline) { + def jobInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert jobInfo.size() > 0 : "SHOW WARM UP JOB returned no rows" + syncStatsStr = jobInfo[0][15]?.toString()?.trim() + if (syncStatsStr != null && syncStatsStr.length() > 0) { + syncStats = new JsonSlurper().parseText(syncStatsStr) + if (syncStats.seg_num.requested_5m >= submittedDelta + && syncStats.seg_num.finish_5m >= finishedDelta + && syncStats.trigger_gap_ms == 0) { + break + } + } + sleep(2000) + } + logger.info("SyncStats column: ${syncStatsStr}") + assert syncStats != null : "SyncStats should not be empty for event-driven job" + + // Verify top-level keys + assert syncStats.containsKey("seg_num") : "Missing seg_num" + assert syncStats.containsKey("seg_size") : "Missing seg_size" + assert syncStats.containsKey("idx_num") : "Missing idx_num" + assert syncStats.containsKey("idx_size") : "Missing idx_size" + assert syncStats.containsKey("last_trigger_ts") : "Missing last_trigger_ts" + assert syncStats.containsKey("last_finish_ts") : "Missing last_finish_ts" + assert syncStats.containsKey("trigger_gap_ms") : "Missing trigger_gap_ms" + assert !syncStats.containsKey("window") : "Detailed SyncStats should not be compact summary" + assert !syncStats.containsKey("src_size") : "Detailed SyncStats should not be compact summary" + assert !syncStats.containsKey("dst_size") : "Detailed SyncStats should not be compact summary" + assert !syncStats.containsKey("gap_size") : "Detailed SyncStats should not be compact summary" + + // Verify detailed stats have the expected window keys. + def assertWindowFields = { groupName, group -> + for (window in ["5m", "30m", "1h"]) { + for (field in ["requested", "finish", "gap", "fail"]) { + def key = "${field}_${window}".toString() + assert group.containsKey(key) : "Missing ${groupName}.${key}" + } + } + } + def segNum = syncStats.seg_num + assertWindowFields("seg_num", segNum) + assertWindowFields("seg_size", syncStats.seg_size) + assertWindowFields("idx_num", syncStats.idx_num) + assertWindowFields("idx_size", syncStats.idx_size) + + // Verify absolute segment counts match bvar deltas + logger.info("seg_num.requested_5m=${segNum.requested_5m}, bvar submitted delta=${submittedDelta}") + logger.info("seg_num.finish_5m=${segNum.finish_5m}, bvar finished delta=${finishedDelta}") + assert segNum.requested_5m == submittedDelta : + "seg_num.requested_5m(${segNum.requested_5m}) should equal source submitted delta(${submittedDelta})" + assert segNum.finish_5m == finishedDelta : + "seg_num.finish_5m(${segNum.finish_5m}) should equal target finished delta(${finishedDelta})" + + // Verify gap is 0 after warmup completes (all requested segments finished) + assert segNum.gap_5m == 0 : "Expected gap_5m == 0 after warmup completes, got ${segNum.gap_5m}" + assert syncStats.trigger_gap_ms == 0 : + "Expected trigger_gap_ms == 0 after warmup completes, got ${syncStats.trigger_gap_ms}, stats=${syncStats}" + + // Verify fail count is 0 + assert segNum.fail_5m == 0 : "Expected no failures, got fail_5m=${segNum.fail_5m}" + + // Verify seg_size values are human-readable strings + def segSize = syncStats.seg_size + logger.info("seg_size.requested_5m = ${segSize.requested_5m}") + assert segSize.requested_5m instanceof String : "seg_size values should be strings" + assert syncStats.idx_size.requested_5m instanceof String : "idx_size values should be strings" + assert syncStats.idx_num.requested_5m instanceof Number : "idx_num values should be numbers" + + // Verify timestamps are non-empty (warmup has occurred) + logger.info("last_trigger_ts = ${syncStats.last_trigger_ts}, last_finish_ts = ${syncStats.last_finish_ts}") + assert syncStats.last_trigger_ts != null && syncStats.last_trigger_ts.toString().length() > 0 : + "last_trigger_ts should be non-empty after warmup" + assert syncStats.last_finish_ts != null && syncStats.last_finish_ts.toString().length() > 0 : + "last_finish_ts should be non-empty after warmup" + + // SHOW WARM UP JOB list output should show a compact 30m summary, not the detailed SyncStats. + def allJobInfo = sql """SHOW WARM UP JOB""" + def summaryRow = allJobInfo.find { row -> row[0]?.toString() == jobId.toString() } + assert summaryRow != null : "SHOW WARM UP JOB should include job ${jobId}" + def summaryStatsStr = summaryRow[15]?.toString()?.trim() + logger.info("SyncStats summary column: ${summaryStatsStr}") + assert summaryStatsStr != null && summaryStatsStr.length() > 0 : + "SyncStats summary should not be empty for event-driven job" + def summaryStats = new JsonSlurper().parseText(summaryStatsStr) + assert summaryStats.window == "30m" : "Summary should use 30m window" + assert summaryStats.src_size instanceof String : "Summary src_size should be a string" + assert summaryStats.dst_size instanceof String : "Summary dst_size should be a string" + assert summaryStats.gap_size instanceof String : "Summary gap_size should be a string" + assert summaryStats.trigger_gap_ms == 0 : + "Summary trigger_gap_ms should be 0 after warmup completes, got ${summaryStats.trigger_gap_ms}" + assert !summaryStats.containsKey("seg_num") : "List summary should not include detailed seg_num" + assert !summaryStats.containsKey("seg_size") : "List summary should not include detailed seg_size" + assert !summaryStats.containsKey("idx_num") : "List summary should not include detailed idx_num" + assert !summaryStats.containsKey("idx_size") : "List summary should not include detailed idx_size" + assert !summaryStats.containsKey("last_trigger_ts") : "List summary should not include detailed timestamp" + assert !summaryStats.containsKey("last_finish_ts") : "List summary should not include detailed timestamp" + assert !summaryStats.containsKey("data_size") : "List summary should merge data and index sizes" + assert !summaryStats.containsKey("index_size") : "List summary should merge data and index sizes" + + waitForWarmUpSyncJobMetrics(jobId, "TABLES", clusterName1, clusterName2) + + sql """CANCEL WARM UP JOB WHERE ID = ${jobId}""" + sleep(1000) + + def clusterJobIdRows = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def clusterJobId = clusterJobIdRows[0][0] + jobIds << clusterJobId + logger.info("Cluster-level warm-up job ID: ${clusterJobId}") + sleep(3000) + + def clusterBaseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + logger.info("Cluster job baseline metrics: ${clusterBaseMetrics}") + + for (int i = numInserts; i < numInserts * 2; i++) { + sql """INSERT INTO t1 VALUES (${i}, 'value_${i}')""" + } + + def clusterMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + clusterBaseMetrics.finished + numInserts) + logger.info("Cluster job warmup metrics after finish: ${clusterMetrics}") + def clusterSubmittedDelta = clusterMetrics.submitted - clusterBaseMetrics.submitted + def clusterFinishedDelta = clusterMetrics.finished - clusterBaseMetrics.finished + assert clusterSubmittedDelta > 0 : "Cluster-level job should submit source warm-up requests" + assert clusterFinishedDelta > 0 : "Cluster-level job should finish target warm-up requests" + waitForWarmUpSyncJobMetrics(clusterJobId, "CLUSTER", clusterName1, clusterName2) + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS t1""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_cluster_change.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_cluster_change.groovy new file mode 100644 index 00000000000000..9cbeea1a99cb89 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_cluster_change.groovy @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import groovy.json.JsonSlurper +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test point covered: ST-12. +suite('test_warm_up_event_on_tables_system_cluster_change', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_st12_source" + def dstCluster = "warmup_st12_target" + def dstCluster2 = "warmup_st12_target2" + def dstClusterRenamed = "warmup_st12_target_renamed" + def srcClusterRenamed = "warmup_st12_source_renamed" + def dbName = "test_on_tables_system_cluster_change_db" + def tableName = "base_tbl" + def jobIds = [] + def jsonSlurper = new JsonSlurper() + def metaService = cluster.getAllMetaservices().get(0) + + def waitForCluster = { String clusterName, boolean expectedPresent -> + List clusters = [] + for (int i = 0; i < 60; i++) { + clusters = sql """SHOW CLUSTERS""" + boolean present = clusters.any { it[0].toString() == clusterName } + if (present == expectedPresent) { + return + } + sleep(1000) + } + assert false : "cluster ${clusterName} present=${!expectedPresent} did not become ${expectedPresent}, clusters=${clusters}" + } + + def getClusterId = { String clusterName -> + def tag = getCloudBeTagByName(clusterName) + return jsonSlurper.parseText(tag).compute_group_id.toString() + } + + def prepareSourceTable = { String clusterName -> + sql """use @${clusterName}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS ${tableName} ( + id INT, + val STRING + ) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + } + + def createTableWarmupJob = { String source, String target -> + prepareSourceTable(source) + def jobId = sql(""" + WARM UP CLUSTER ${target} WITH CLUSTER ${source} + ON TABLES (INCLUDE '${dbName}.${tableName}') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << jobId + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.${tableName}".toString()] as Set) == + ["${dbName}.${tableName}".toString()] as Set + return jobId + } + + def showJob = { jobId -> + def rows = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert !rows.isEmpty() : "warmup job ${jobId} should exist" + return rows[0] + } + + def waitForSystemCancelled = { jobId, String phase -> + def row = null + for (int i = 0; i < 60; i++) { + row = showJob(jobId) + if (row[3].toString() == "CANCELLED" + && row[11].toString().toLowerCase().contains("system cancel")) { + return row + } + sleep(1000) + } + assert false : "${phase}: expected system-cancelled warmup job ${jobId}, row=${row}" + } + + try { + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster) + waitForCluster(srcCluster, true) + waitForCluster(dstCluster, true) + + def alterJobId = createTableWarmupJob(srcCluster, dstCluster) + sql """ALTER COMPUTE GROUP ${srcCluster} PROPERTIES ('balance_type'='without_warmup')""" + sql """ALTER COMPUTE GROUP ${dstCluster} PROPERTIES ('balance_type'='without_warmup')""" + sleep(5000) + def alterRow = showJob(alterJobId) + assert alterRow[3].toString() in ["RUNNING", "PENDING"] : + "altering compute group properties should not cancel table warmup job, row=${alterRow}" + + sql """ALTER SYSTEM RENAME COMPUTE GROUP ${dstCluster} ${dstClusterRenamed}""" + waitForCluster(dstClusterRenamed, true) + waitForSystemCancelled(alterJobId, "target rename") + + sql """ALTER SYSTEM RENAME COMPUTE GROUP ${dstClusterRenamed} ${dstCluster}""" + waitForCluster(dstCluster, true) + + def sourceRenameJobId = createTableWarmupJob(srcCluster, dstCluster) + sql """ALTER SYSTEM RENAME COMPUTE GROUP ${srcCluster} ${srcClusterRenamed}""" + waitForCluster(srcClusterRenamed, true) + waitForSystemCancelled(sourceRenameJobId, "source rename") + + sql """ALTER SYSTEM RENAME COMPUTE GROUP ${srcClusterRenamed} ${srcCluster}""" + waitForCluster(srcCluster, true) + + def targetDropJobId = createTableWarmupJob(srcCluster, dstCluster) + def dstClusterId = getClusterId(dstCluster) + drop_cluster(dstCluster, dstClusterId, metaService) + waitForCluster(dstCluster, false) + waitForSystemCancelled(targetDropJobId, "target drop") + + cluster.addBackend(1, dstCluster2) + waitForCluster(dstCluster2, true) + def sourceDropJobId = createTableWarmupJob(srcCluster, dstCluster2) + def srcClusterId = getClusterId(srcCluster) + drop_cluster(srcCluster, srcClusterId, metaService) + waitForCluster(srcCluster, false) + waitForSystemCancelled(sourceDropJobId, "source drop") + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS ${tableName}""" + sql """DROP DATABASE IF EXISTS ${dbName}""" + } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy new file mode 100644 index 00000000000000..97873d514b97ff --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test point covered: ST-04. +suite('test_warm_up_event_on_tables_system_compaction_sync_wait', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=20000', + 'warm_up_rowset_sync_wait_max_timeout_ms=30000', + ] + options.enableDebugPoints() + options.cloudMode = true + + def waitForMetricAtLeast = { ip, port, metricName, target, timeoutMs -> + long deadline = System.currentTimeMillis() + timeoutMs + long last = 0 + while (System.currentTimeMillis() < deadline) { + last = WarmupMetricsUtils.getBrpcMetric(ip.toString(), port.toString(), metricName) + if (last >= target) { + return last + } + sleep(500) + } + assert false : "metric ${metricName} on ${ip}:${port} did not reach ${target}, last=${last}" + } + + def httpJson = { String method, String url, int readTimeoutMs = 180000 -> + def conn = new URL(url).openConnection() + conn.setRequestMethod(method) + conn.setConnectTimeout(10000) + conn.setReadTimeout(readTimeoutMs) + def text = conn.responseCode >= 400 ? conn.errorStream?.text : conn.inputStream.text + assert text != null && !text.trim().isEmpty() : "empty HTTP response from ${url}" + return parseJson(text.trim()) + } + + def triggerCumulativeCompaction = { ip, port, tabletId -> + def status = httpJson("POST", + "http://${ip}:${port}/api/compaction/run?tablet_id=${tabletId}&compact_type=cumulative") + assert status.status.toLowerCase() in ["success", "already_exist"] : + "trigger compaction failed on ${ip}:${port}, tablet=${tabletId}, status=${status}" + return status + } + + def waitForCompactionFinish = { ip, port, tabletId, timeoutMs -> + long deadline = System.currentTimeMillis() + timeoutMs + def lastStatus = null + while (System.currentTimeMillis() < deadline) { + lastStatus = httpJson("GET", + "http://${ip}:${port}/api/compaction/run_status?tablet_id=${tabletId}", 10000) + assert lastStatus.status.toLowerCase() == "success" : + "compaction failed on ${ip}:${port}, tablet=${tabletId}, status=${lastStatus}" + if (!lastStatus.run_status) { + return lastStatus + } + sleep(1000) + } + assert false : "compaction did not finish on ${ip}:${port}, tablet=${tabletId}, last=${lastStatus}" + } + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_system_compaction_db" + def jobIds = [] + def debugEnabled = false + def targetBe = null + def sourceBe = null + def compactionFuture = null + def loadCount = 8 + + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster) + + try { + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS compact_tbl ( + id INT NOT NULL, + payload STRING + ) + UNIQUE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "file_cache_ttl_seconds" = "3600", + "disable_auto_compaction" = "true" + )""" + + def jobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE '${dbName}.compact_tbl') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << jobId + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.compact_tbl".toString()] as Set) == + ["${dbName}.compact_tbl".toString()] as Set + + WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + + sql """use @${srcCluster}""" + sql """use ${dbName}""" + for (int i = 0; i < loadCount; i++) { + sql """INSERT INTO compact_tbl VALUES (${i}, 'row_${i}')""" + } + def metrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + baseMetrics.finished + loadCount, 90000) + assert metrics.failed == baseMetrics.failed : "initial rowset warmup should not fail, metrics=${metrics}" + sleep(15000) + + def tablets = sql_return_maparray """SHOW TABLETS FROM compact_tbl""" + assert tablets.size() == 1 : "compact_tbl should have one tablet, tablets=${tablets}" + def tabletId = tablets[0].TabletId.toString() + sourceBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, srcCluster)[0] + targetBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster)[0] + def beforeSubmitted = WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(), + WarmupMetricsUtils.METRIC_SUBMITTED) + def beforeFinished = WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(), + WarmupMetricsUtils.METRIC_FINISHED) + def beforeWaitCompaction = WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(), + "file_cache_warm_up_rowset_wait_for_compaction_num") + def beforeWaitTimeout = WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(), + "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") + + GetDebugPoint().enableDebugPoint(targetBe[1].toString(), targetBe[4] as int, NodeType.BE, + "S3FileReader::read_at_impl.io_slow", [sleep: 10]) + debugEnabled = true + + compactionFuture = thread { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + triggerCumulativeCompaction(sourceBe[1].toString(), sourceBe[4].toString(), tabletId) + waitForCompactionFinish(sourceBe[1].toString(), sourceBe[4].toString(), tabletId, 90000) + } + + waitForMetricAtLeast(targetBe[1], targetBe[5], + "file_cache_warm_up_rowset_wait_for_compaction_num", beforeWaitCompaction + 1, 60000) + assert WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(), + WarmupMetricsUtils.METRIC_SUBMITTED) >= beforeSubmitted + 1 : + "compaction rowset should submit one more target warmup" + assert WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(), + WarmupMetricsUtils.METRIC_FINISHED) >= beforeFinished : + "finished warmup metric should not regress" + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + assert sql("""SELECT count(*) FROM compact_tbl""")[0][0].toString() == loadCount.toString() + + compactionFuture.get() + assert WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(), + WarmupMetricsUtils.METRIC_FINISHED) >= beforeFinished + 1 : + "compaction rowset warmup should finish after sync wait" + assert WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(), + "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") == beforeWaitTimeout : + "compaction sync wait should not time out" + + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """INSERT INTO compact_tbl VALUES (${loadCount + 1}, 'after_compaction')""" + WarmupMetricsUtils.waitForMetricsStable(sqlRunner, srcCluster, dstCluster, 30000) + sql """use @${dstCluster}""" + sql """use ${dbName}""" + assert sql("""SELECT count(*) FROM compact_tbl""")[0][0].toString() == (loadCount + 1).toString() + + def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { it.seg_num.finish_5m >= loadCount + 2 && it.seg_num.fail_5m == 0 }, 60000) + assert stats.seg_num.gap_5m == 0 : "compaction warmup should converge, stats=${stats}" + } finally { + if (debugEnabled) { + try { GetDebugPoint().clearDebugPointsForAllBEs() } catch (Exception ignored) {} + } + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS compact_tbl""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_e2e_multi_be.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_e2e_multi_be.groovy new file mode 100644 index 00000000000000..1429498764f834 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_e2e_multi_be.groovy @@ -0,0 +1,269 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test points covered: ST-01, ST-02, ST-10. +suite('test_warm_up_event_on_tables_system_e2e_multi_be', 'docker') { + def options = new ClusterOptions() + options.beNum = 1 + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_system_e2e_db" + def jobIds = [] + + cluster.addBackend(3, srcCluster) + cluster.addBackend(3, dstCluster) + + try { + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount INT) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 9 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS customers (id INT, name STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 9 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS audit_log (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 3 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + def ordersJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE '${dbName}.orders') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << ordersJobId + def customersJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE '${dbName}.customers') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << customersJobId + def wildcardJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${dbName}.*', + EXCLUDE '${dbName}.audit_*' + ) + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << wildcardJobId + + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, ordersJobId, + ["${dbName}.orders".toString()] as Set, + ["${dbName}.customers".toString(), "${dbName}.audit_log".toString()] as Set) == + ["${dbName}.orders".toString()] as Set + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, customersJobId, + ["${dbName}.customers".toString()] as Set, + ["${dbName}.orders".toString(), "${dbName}.audit_log".toString()] as Set) == + ["${dbName}.customers".toString()] as Set + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, wildcardJobId, + ["${dbName}.orders".toString(), "${dbName}.customers".toString()] as Set, + ["${dbName}.audit_log".toString()] as Set) == + ["${dbName}.orders".toString(), "${dbName}.customers".toString()] as Set + + assert WarmupMetricsUtils.getClusterBackends(sqlRunner, srcCluster).size() == 3 + assert WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster).size() == 3 + + WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + def targetFinishedBefore = WarmupMetricsUtils.getClusterMetricValues(sqlRunner, + dstCluster, WarmupMetricsUtils.METRIC_FINISHED) + + sql """use @${srcCluster}""" + sql """use ${dbName}""" + int rowCount = 90 + long expectedOrderSum = ((long) rowCount - 1) * rowCount / 2 * 10 + def orderValues = (0..= 18 : "target should warm every bucket for orders/customers" + assert requestedDelta >= submittedDelta * 2 : + "overlapping jobs should request the same matched rowsets independently, got requested=${requestedDelta}, submitted=${submittedDelta}" + assert finishedDelta == submittedDelta : "all target downloads should finish" + assert failedDelta == 0 : "warmup should not fail" + + def targetFinishedAfter = WarmupMetricsUtils.getClusterMetricValues(sqlRunner, + dstCluster, WarmupMetricsUtils.METRIC_FINISHED) + def targetFinishedDeltas = targetFinishedAfter.collectEntries { + [(it.key): it.value - (targetFinishedBefore[it.key] ?: 0)] + } + logger.info("target finished deltas by BE: ${targetFinishedDeltas}") + assert targetFinishedDeltas.size() == 3 : "target cluster should have 3 BEs" + assert targetFinishedDeltas.every { it.value > 0 } : + "each target BE should finish warmup tasks, got ${targetFinishedDeltas}" + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + profile("st01_target_profile") { + sql """set enable_profile = true""" + sql """set profile_level = 2""" + run { + def res = sql """/* st01_target_profile */ SELECT count(*), sum(amount) FROM orders""" + assert res[0][0].toString() == rowCount.toString() : "target query row count mismatch: ${res}" + assert res[0][1].toString() == expectedOrderSum.toString() : "target query sum mismatch: ${res}" + sleep(1000) + } + check { profileString, exception -> + if (exception != null) { + throw exception + } + assert profileString.contains("NumRemoteIOTotal") : "profile should contain file cache counters" + def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumRemoteIOTotal") + def localTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumLocalIOTotal") + logger.info("target profile NumRemoteIOTotal=${remoteTotal}, NumLocalIOTotal=${localTotal}") + assert remoteTotal == 0 : "warmed target query should not read remote data" + assert localTotal > 0 : "warmed target query should hit local file cache" + } + } + + def targetTtl = [:] + long targetTtlDeadline = System.currentTimeMillis() + 30000 + while (System.currentTimeMillis() < targetTtlDeadline) { + targetTtl = WarmupMetricsUtils.getClusterMetricValues(sqlRunner, dstCluster, "ttl_cache_size") + if (targetTtl.size() == 3 && targetTtl.values().sum() > 0) { + break + } + sleep(2000) + } + logger.info("target ttl cache by BE: ${targetTtl}") + assert targetTtl.size() == 3 : "target cluster should have 3 BEs" + assert targetTtl.values().sum() > 0 : "target cluster should own warmed cache, got ${targetTtl}" + + def ordersStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, ordersJobId, { stats -> + stats.seg_num.requested_5m > 0 + && stats.seg_num.finish_5m == stats.seg_num.requested_5m + && stats.seg_num.gap_5m == 0 + && stats.seg_num.fail_5m == 0 + && stats.seg_size.finish_5m == stats.seg_size.requested_5m + && stats.seg_size.gap_5m == "0b" + && stats.seg_size.fail_5m == "0b" + }, 30000) + logger.info("system e2e SyncStats for orders job ${ordersJobId}: ${ordersStats}") + assert ordersStats.seg_num.requested_5m > 0 : + "orders job should have requested segments in SyncStats: ${ordersStats}" + assert ordersStats.seg_num.finish_5m == ordersStats.seg_num.requested_5m : + "orders job should count already-warmed overlapping rowsets as finished: ${ordersStats}" + assert ordersStats.seg_num.gap_5m == 0 : + "orders job should have no SyncStats segment gap after warmup: ${ordersStats}" + assert ordersStats.seg_num.fail_5m == 0 : + "orders job should have no SyncStats segment failures: ${ordersStats}" + assert ordersStats.seg_size.finish_5m == ordersStats.seg_size.requested_5m : + "orders job should count already-warmed overlapping rowset bytes as finished: ${ordersStats}" + assert ordersStats.seg_size.gap_5m == "0b" : + "orders job should have no SyncStats size gap after warmup: ${ordersStats}" + assert ordersStats.seg_size.fail_5m == "0b" : + "orders job should have no SyncStats size failures: ${ordersStats}" + + def customersStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, customersJobId, { stats -> + stats.seg_num.requested_5m > 0 + && stats.seg_num.finish_5m == stats.seg_num.requested_5m + && stats.seg_num.gap_5m == 0 + && stats.seg_num.fail_5m == 0 + && stats.seg_size.finish_5m == stats.seg_size.requested_5m + && stats.seg_size.gap_5m == "0b" + && stats.seg_size.fail_5m == "0b" + }, 30000) + logger.info("system e2e SyncStats for customers job ${customersJobId}: ${customersStats}") + assert customersStats.seg_num.requested_5m > 0 : + "customers job should have requested segments in SyncStats: ${customersStats}" + assert customersStats.seg_num.finish_5m == customersStats.seg_num.requested_5m : + "customers job should count already-warmed overlapping rowsets as finished: ${customersStats}" + assert customersStats.seg_num.gap_5m == 0 : + "customers job should have no SyncStats segment gap after warmup: ${customersStats}" + assert customersStats.seg_num.fail_5m == 0 : + "customers job should have no SyncStats segment failures: ${customersStats}" + assert customersStats.seg_size.finish_5m == customersStats.seg_size.requested_5m : + "customers job should count already-warmed overlapping rowset bytes as finished: ${customersStats}" + assert customersStats.seg_size.gap_5m == "0b" : + "customers job should have no SyncStats size gap after warmup: ${customersStats}" + assert customersStats.seg_size.fail_5m == "0b" : + "customers job should have no SyncStats size failures: ${customersStats}" + + def wildcardStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, wildcardJobId, { stats -> + stats.seg_num.requested_5m > 0 + && stats.seg_num.finish_5m == stats.seg_num.requested_5m + && stats.seg_num.gap_5m == 0 + && stats.seg_num.fail_5m == 0 + && stats.seg_size.finish_5m == stats.seg_size.requested_5m + && stats.seg_size.gap_5m == "0b" + && stats.seg_size.fail_5m == "0b" + }, 30000) + logger.info("system e2e SyncStats for wildcard job ${wildcardJobId}: ${wildcardStats}") + assert wildcardStats.seg_num.requested_5m > 0 : + "wildcard job should have requested segments in SyncStats: ${wildcardStats}" + assert wildcardStats.seg_num.finish_5m == wildcardStats.seg_num.requested_5m : + "wildcard job should count already-warmed overlapping rowsets as finished: ${wildcardStats}" + assert wildcardStats.seg_num.gap_5m == 0 : + "wildcard job should have no SyncStats segment gap after warmup: ${wildcardStats}" + assert wildcardStats.seg_num.fail_5m == 0 : + "wildcard job should have no SyncStats segment failures: ${wildcardStats}" + assert wildcardStats.seg_size.finish_5m == wildcardStats.seg_size.requested_5m : + "wildcard job should count already-warmed overlapping rowset bytes as finished: ${wildcardStats}" + assert wildcardStats.seg_size.gap_5m == "0b" : + "wildcard job should have no SyncStats size gap after warmup: ${wildcardStats}" + assert wildcardStats.seg_size.fail_5m == "0b" : + "wildcard job should have no SyncStats size failures: ${wildcardStats}" + def wildcardOverlapMessage = "wildcard job should cover both overlapping tables, orders=${ordersStats}, customers=${customersStats}, wildcard=${wildcardStats}" + assert wildcardStats.seg_num.requested_5m >= + ordersStats.seg_num.requested_5m + customersStats.seg_num.requested_5m : + wildcardOverlapMessage + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS orders""" + sql """DROP TABLE IF EXISTS customers""" + sql """DROP TABLE IF EXISTS audit_log""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy new file mode 100644 index 00000000000000..3dad6a269425c4 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test point covered: ST-06. +suite('test_warm_up_event_on_tables_system_packed_file', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'enable_packed_file=true', + 'small_file_threshold_bytes=102400', + 'disable_auto_compaction=true', + ] + options.cloudMode = true + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_system_packed_file_db" + def tableName = "packed_tbl" + def jobIds = [] + def loadCount = 30 + + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster) + + try { + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS ${tableName} ( + id INT, + name STRING, + payload STRING + ) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "file_cache_ttl_seconds" = "3600", + "disable_auto_compaction" = "true" + )""" + + def jobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE '${dbName}.${tableName}') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << jobId + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.${tableName}".toString()] as Set) == + ["${dbName}.${tableName}".toString()] as Set + + WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + def basePackedFiles = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, srcCluster, + "packed_file_total_small_file_num") + def baseTargetCacheSize = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, + "ttl_cache_size") + + sql """use @${srcCluster}""" + sql """use ${dbName}""" + for (int i = 0; i < loadCount; i++) { + sql """INSERT INTO ${tableName} VALUES (${i}, 'packed_${i}', repeat('x', 128))""" + } + + def metrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + baseMetrics.finished + loadCount, 90000) + def requestedDelta = metrics.requested - baseMetrics.requested + def submittedDelta = metrics.submitted - baseMetrics.submitted + def finishedDelta = metrics.finished - baseMetrics.finished + def failedDelta = metrics.failed - baseMetrics.failed + logger.info("packed file bvar deltas requested=${requestedDelta}, submitted=${submittedDelta}, " + + "finished=${finishedDelta}, failed=${failedDelta}") + assert requestedDelta >= loadCount : "source bvar should request packed small-file rowsets" + assert submittedDelta >= loadCount : "target bvar should submit packed small-file rowsets" + assert finishedDelta == submittedDelta : "target bvar should finish all submitted packed rowsets" + assert failedDelta == 0 : "packed-file warmup should not fail" + + def packedFilesDelta = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, srcCluster, + "packed_file_total_small_file_num") - basePackedFiles + logger.info("packed_file_total_small_file_num delta=${packedFilesDelta}") + assert packedFilesDelta > 0 : "source cluster should write small files into packed file" + + def targetCacheSizeDelta = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, + "ttl_cache_size") - baseTargetCacheSize + logger.info("target ttl_cache_size delta=${targetCacheSizeDelta}") + assert targetCacheSizeDelta > 0 : "target packed-file warmup should populate TTL file cache" + + def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { it.seg_num.finish_5m >= loadCount && it.seg_num.fail_5m == 0 }, 60000) + logger.info("packed file SyncStats: ${stats}") + assert stats.seg_num.requested_5m > 0 : "SyncStats should observe packed small-file rowset requests" + assert stats.seg_num.finish_5m >= loadCount : "SyncStats should finish packed small-file rowsets" + assert stats.seg_num.fail_5m == 0 : "SyncStats should have no packed-file failures" + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + profile("st06_packed_file_profile") { + sql """set enable_profile = true""" + sql """set profile_level = 2""" + run { + def res = sql """/* st06_packed_file_profile */ SELECT count(*), sum(id) FROM ${tableName}""" + assert res[0][0].toString() == loadCount.toString() : "packed table count mismatch: ${res}" + assert res[0][1].toString() == "435" : "packed table sum mismatch: ${res}" + sleep(1000) + } + check { profileString, exception -> + if (exception != null) { + throw exception + } + assert profileString.contains("NumRemoteIOTotal") : "profile should contain file cache counters" + def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumRemoteIOTotal") + def localTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumLocalIOTotal") + logger.info("packed profile NumRemoteIOTotal=${remoteTotal}, NumLocalIOTotal=${localTotal}") + assert remoteTotal == 0 : "warmed packed-file query should not read remote data" + assert localTotal > 0 : "warmed packed-file query should hit local file cache" + } + } + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS ${tableName}""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_restart_and_resize.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_restart_and_resize.groovy new file mode 100644 index 00000000000000..a0d09791868c19 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_restart_and_resize.groovy @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test points covered: ST-07, ST-08. +suite('test_warm_up_event_on_tables_system_restart_and_resize', 'docker') { + def options = new ClusterOptions() + options.feNum = 3 + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + + def restartMasterFe = { + def oldMasterFe = cluster.getMasterFe() + cluster.restartFrontends(oldMasterFe.index) + boolean hasRestart = false + for (int i = 0; i < 30; i++) { + if (cluster.getFeByIndex(oldMasterFe.index).alive) { + hasRestart = true + break + } + sleep(1000) + } + assert hasRestart : "master FE did not restart" + context.reconnectFe() + } + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_system_restart_resize_db" + def jobIds = [] + + def srcBeIndexes = cluster.addBackend(1, srcCluster) + def dstBeIndexes = cluster.addBackend(1, dstCluster) + + def assertWarmupReached = { Map metrics, long expectedFinished, String phase -> + assert metrics.finished >= expectedFinished : + "${phase}: expected finished >= ${expectedFinished}, metrics=${metrics}" + assert metrics.finished + metrics.failed >= metrics.submitted : + "${phase}: submitted warmup tasks should be terminal, metrics=${metrics}" + } + + def aliveFrontends = { String phase -> + def fes = [] + for (int i = 0; i < 30; i++) { + fes = cluster.getAllFrontends(true) + if (fes.size() == options.feNum) { + return fes + } + sleep(1000) + } + assert false : "${phase}: expected ${options.feNum} alive FEs, got ${fes}" + } + + def assertShowWarmupOnAllFes = { Object jobId, Set expectedTables, String phase -> + for (fe in aliveFrontends(phase)) { + def feLabel = "fe-${fe.index}" + def jdbcUrl = String.format( + "jdbc:mysql://%s:%s/?useLocalSessionState=true&allowLoadLocalInfile=false", + fe.host, fe.queryPort) + connect(context.config.jdbcUser, context.config.jdbcPassword, jdbcUrl) { + def rows = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert rows.size() == 1 : "${phase}: ${feLabel} should show one warmup job, rows=${rows}" + def row = rows[0] + assert row[0].toString() == jobId.toString() : + "${phase}: ${feLabel} job id mismatch, row=${row}" + assert row[1].toString() == srcCluster : + "${phase}: ${feLabel} source cluster mismatch, row=${row}" + assert row[2].toString() == dstCluster : + "${phase}: ${feLabel} target cluster mismatch, row=${row}" + assert row[3] in ["RUNNING", "PENDING"] : + "${phase}: ${feLabel} job should be running or pending, row=${row}" + assert row[4].toString() == "TABLES" : + "${phase}: ${feLabel} job type mismatch, row=${row}" + assert row[5].toString().startsWith("EVENT_DRIVEN") : + "${phase}: ${feLabel} sync mode mismatch, row=${row}" + def matched = WarmupMetricsUtils.parseMatchedTables(rows) + assert matched.containsAll(expectedTables) : + "${phase}: ${feLabel} matched tables mismatch, expected=${expectedTables}, matched=${matched}" + logger.info("${phase}: SHOW WARM UP JOB on ${feLabel}(${fe.host}:${fe.queryPort}) row=${row}") + } + } + } + + try { + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS ha_tbl ( + id INT, + val STRING + ) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 2 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + def jobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE '${dbName}.*') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << jobId + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.ha_tbl".toString()] as Set).contains("${dbName}.ha_tbl".toString()) + assertShowWarmupOnAllFes(jobId, ["${dbName}.ha_tbl".toString()] as Set, + "after creating table-level warmup job") + + restartMasterFe() + sql """use @${srcCluster}""" + sql """use ${dbName}""" + + WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + for (int i = 0; i < 4; i++) { + sql """INSERT INTO ha_tbl VALUES (${i}, 'before_restart_${i}')""" + } + def afterFeRestart = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + baseMetrics.finished + 4, 90000) + assertWarmupReached(afterFeRestart, baseMetrics.finished + 4, "after master FE restart") + assert afterFeRestart.failed == baseMetrics.failed : + "warmup should continue after master FE restart, metrics=${afterFeRestart}" + assertShowWarmupOnAllFes(jobId, ["${dbName}.ha_tbl".toString()] as Set, + "after master FE restart") + + cluster.restartBackends(dstBeIndexes[0] as int) + sleep(5000) + sql """use @${srcCluster}""" + sql """use ${dbName}""" + def beforeTargetRestartLoad = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + for (int i = 4; i < 8; i++) { + sql """INSERT INTO ha_tbl VALUES (${i}, 'after_target_restart_${i}')""" + } + def afterTargetRestart = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeTargetRestartLoad.finished + 4, 90000) + assertWarmupReached(afterTargetRestart, beforeTargetRestartLoad.finished + 4, + "after target BE restart") + assert afterTargetRestart.failed == beforeTargetRestartLoad.failed : + "warmup should continue after target BE restart, metrics=${afterTargetRestart}" + + def targetBeforeScale = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster) + .collect { it[0].toString() } as Set + cluster.addBackend(1, dstCluster) + sleep(5000) + def targetAfterScale = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster) + def newTargetBes = targetAfterScale.findAll { !targetBeforeScale.contains(it[0].toString()) } + assert newTargetBes.size() == 1 : "expected one new target BE, before=${targetBeforeScale}, after=${targetAfterScale}" + def newTargetBe = newTargetBes[0] + + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS scale_tbl ( + id INT, + val STRING + ) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 4 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.ha_tbl".toString(), "${dbName}.scale_tbl".toString()] as Set) + .contains("${dbName}.scale_tbl".toString()) + assertShowWarmupOnAllFes(jobId, + ["${dbName}.ha_tbl".toString(), "${dbName}.scale_tbl".toString()] as Set, + "after target scale-out table match") + + def newBeFinishedBefore = WarmupMetricsUtils.getBrpcMetric(newTargetBe[1].toString(), + newTargetBe[5].toString(), WarmupMetricsUtils.METRIC_FINISHED) + def beforeScaleLoad = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + for (int i = 0; i < 8; i++) { + sql """INSERT INTO scale_tbl VALUES (${i}, 'scale_${i}')""" + } + def afterScaleLoad = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeScaleLoad.finished + 8, 90000) + assertWarmupReached(afterScaleLoad, beforeScaleLoad.finished + 8, "after target scale-out") + assert afterScaleLoad.failed == beforeScaleLoad.failed : + "warmup should continue after target scale-out, metrics=${afterScaleLoad}" + def newBeFinishedAfter = WarmupMetricsUtils.getBrpcMetric(newTargetBe[1].toString(), + newTargetBe[5].toString(), WarmupMetricsUtils.METRIC_FINISHED) + assert newBeFinishedAfter > newBeFinishedBefore : + "new target BE should participate in later table-level warmup" + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + assert sql("""SELECT count(*) FROM ha_tbl""")[0][0].toString() == "8" + assert sql("""SELECT count(*) FROM scale_tbl""")[0][0].toString() == "8" + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS ha_tbl""" + sql """DROP TABLE IF EXISTS scale_tbl""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy new file mode 100644 index 00000000000000..25fa92d25808d9 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test points covered: ST-03, ST-05. +suite('test_warm_up_event_on_tables_system_schema_index', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'disable_auto_compaction=true', + ] + options.cloudMode = true + + def waitLatestColumnAlter = { tableName -> + long deadline = System.currentTimeMillis() + 60000 + def last = [] + while (System.currentTimeMillis() < deadline) { + last = sql """SHOW ALTER TABLE COLUMN WHERE TableName = '${tableName}' + ORDER BY CreateTime DESC LIMIT 1""" + if (last.isEmpty() || last[0].toString().contains("FINISHED")) { + sleep(1000) + return + } + sleep(1000) + } + assert false : "schema change on ${tableName} did not finish, last=${last}" + } + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_system_schema_index_db" + def jobIds = [] + + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster) + + try { + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS schema_tbl ( + id INT, + v INT, + tag STRING + ) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 2 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS idx_tbl ( + id INT, + body STRING, + city STRING, + INDEX idx_body(body) USING INVERTED + PROPERTIES("parser" = "english", "support_phrase" = "true") + ) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "file_cache_ttl_seconds" = "3600", + "storage_format" = "V2" + )""" + + def jobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${dbName}.schema_*', + INCLUDE '${dbName}.idx_*' + ) + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << jobId + + def matched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.schema_tbl".toString(), + "${dbName}.idx_tbl".toString()] as Set) + assert matched == ["${dbName}.schema_tbl".toString(), + "${dbName}.idx_tbl".toString()] as Set : + "unexpected matched tables: ${matched}" + + WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + + sql """use @${srcCluster}""" + sql """use ${dbName}""" + + sql """INSERT INTO schema_tbl VALUES (1, 10, 'a'), (2, 20, 'b')""" + sql """ALTER TABLE schema_tbl ADD COLUMN extra STRING DEFAULT 'x'""" + waitLatestColumnAlter("schema_tbl") + sql """INSERT INTO schema_tbl(id, v, tag, extra) VALUES (3, 30, 'c', 'c_extra')""" + sql """ALTER TABLE schema_tbl RENAME COLUMN tag label""" + waitLatestColumnAlter("schema_tbl") + sql """INSERT INTO schema_tbl(id, v, label, extra) VALUES (4, 40, 'd', 'd_extra')""" + sql """ALTER TABLE schema_tbl MODIFY COLUMN v BIGINT NULL""" + waitLatestColumnAlter("schema_tbl") + sql """ALTER TABLE schema_tbl DROP COLUMN extra""" + waitLatestColumnAlter("schema_tbl") + sql """INSERT INTO schema_tbl(id, v, label) VALUES (5, 50, 'e')""" + + sql """INSERT INTO idx_tbl VALUES + (1, 'quick brown fox', 'beijing'), + (2, 'slow yellow fox', 'shanghai'), + (3, 'quick blue whale', 'beijing')""" + + def metrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + baseMetrics.finished + 5, 90000) + assert metrics.failed == baseMetrics.failed : "warmup should not fail, metrics=${metrics}, base=${baseMetrics}" + + def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { it.seg_num.finish_5m >= 5 && it.idx_num.finish_5m > 0 && it.seg_num.fail_5m == 0 }, + 60000) + logger.info("schema/index SyncStats: ${stats}") + assert stats.idx_num.finish_5m > 0 : "inverted index files should be warmed, stats=${stats}" + assert stats.idx_num.fail_5m == 0 : "inverted index warmup should not fail, stats=${stats}" + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + def schemaRes = sql """SELECT count(*), sum(v) FROM schema_tbl""" + assert schemaRes[0][0].toString() == "5" : "schema table count mismatch: ${schemaRes}" + assert schemaRes[0][1].toString() == "150" : "schema table sum mismatch: ${schemaRes}" + + profile("st05_inverted_index_profile") { + sql """set enable_profile = true""" + sql """set profile_level = 2""" + sql """set enable_common_expr_pushdown = true""" + run { + def res = sql """/* st05_inverted_index_profile */ SELECT id FROM idx_tbl + WHERE body MATCH_ALL 'quick' ORDER BY id""" + assert res.collect { it[0].toString() } == ["1", "3"] : "index query mismatch: ${res}" + sleep(1000) + } + check { profileString, exception -> + if (exception != null) { + throw exception + } + assert profileString.contains("InvertedIndexNumRemoteIOTotal") : + "profile should contain inverted index file cache counters" + def idxRemoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, + "InvertedIndexNumRemoteIOTotal") + def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumRemoteIOTotal") + logger.info("index profile remote counters: data=${remoteTotal}, inverted_index=${idxRemoteTotal}") + assert idxRemoteTotal == 0 : "warmed inverted index query should not read index files remotely" + assert remoteTotal == 0 : "warmed inverted index query should not read data files remotely" + } + } + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS schema_tbl""" + sql """DROP TABLE IF EXISTS idx_tbl""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} From d3233b7047905cb359256db5d9f768881f3e0b7f Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 14 May 2026 14:28:05 +0800 Subject: [PATCH 2/7] branch-selectdb-doris-3.1: [Fix](regression) Fix some unstable cases (#9031) --- .../test_warm_up_event_on_tables_mow_compaction.groovy | 2 +- .../test_warm_up_event_on_tables_sync_stats.groovy | 6 ++++-- ...rm_up_event_on_tables_system_compaction_sync_wait.groovy | 3 ++- .../test_warm_up_event_on_tables_system_packed_file.groovy | 3 ++- .../test_warm_up_event_on_tables_system_schema_index.groovy | 3 ++- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy index 50956454d1a10b..129451ddd862b4 100644 --- a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy @@ -174,7 +174,7 @@ suite('test_warm_up_event_on_tables_mow_compaction', 'docker') { "post-compaction MOW upsert warmup should not fail, metrics=${afterPostCompactionUpsert}" def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, - { it.seg_num.finish_5m >= 5 && it.seg_num.fail_5m == 0 }, 60000) + { it.seg_num.finish_5m >= 5 && it.seg_num.fail_5m == 0 && it.seg_num.gap_5m == 0 }, 60000) logger.info("MOW warmup SyncStats: ${stats}") assert stats.seg_num.fail_5m == 0 : "MOW warmup SyncStats should have no failures: ${stats}" assert stats.seg_num.gap_5m == 0 : "MOW warmup SyncStats should converge: ${stats}" diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy index 0def53a822b126..73b4577a45472e 100644 --- a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy @@ -158,8 +158,10 @@ suite('test_warm_up_event_on_tables_sync_stats', 'docker') { syncStatsStr = jobInfo[0][15]?.toString()?.trim() if (syncStatsStr != null && syncStatsStr.length() > 0) { syncStats = new JsonSlurper().parseText(syncStatsStr) - if (syncStats.seg_num.requested_5m >= submittedDelta - && syncStats.seg_num.finish_5m >= finishedDelta + if (syncStats.seg_num.requested_5m == submittedDelta + && syncStats.seg_num.finish_5m == finishedDelta + && syncStats.seg_num.gap_5m == 0 + && syncStats.seg_num.fail_5m == 0 && syncStats.trigger_gap_ms == 0) { break } diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy index 97873d514b97ff..fdf04d3a56f4f3 100644 --- a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy @@ -193,7 +193,8 @@ suite('test_warm_up_event_on_tables_system_compaction_sync_wait', 'docker') { assert sql("""SELECT count(*) FROM compact_tbl""")[0][0].toString() == (loadCount + 1).toString() def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, - { it.seg_num.finish_5m >= loadCount + 2 && it.seg_num.fail_5m == 0 }, 60000) + { it.seg_num.finish_5m >= loadCount + 2 && it.seg_num.fail_5m == 0 && it.seg_num.gap_5m == 0 }, + 60000) assert stats.seg_num.gap_5m == 0 : "compaction warmup should converge, stats=${stats}" } finally { if (debugEnabled) { diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy index 3dad6a269425c4..a85cc48d99b1ff 100644 --- a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy @@ -110,7 +110,8 @@ suite('test_warm_up_event_on_tables_system_packed_file', 'docker') { assert targetCacheSizeDelta > 0 : "target packed-file warmup should populate TTL file cache" def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, - { it.seg_num.finish_5m >= loadCount && it.seg_num.fail_5m == 0 }, 60000) + { it.seg_num.requested_5m > 0 && it.seg_num.finish_5m >= loadCount && it.seg_num.fail_5m == 0 }, + 60000) logger.info("packed file SyncStats: ${stats}") assert stats.seg_num.requested_5m > 0 : "SyncStats should observe packed small-file rowset requests" assert stats.seg_num.finish_5m >= loadCount : "SyncStats should finish packed small-file rowsets" diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy index 25fa92d25808d9..75e7b5c9af8ae1 100644 --- a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy @@ -129,7 +129,8 @@ suite('test_warm_up_event_on_tables_system_schema_index', 'docker') { assert metrics.failed == baseMetrics.failed : "warmup should not fail, metrics=${metrics}, base=${baseMetrics}" def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, - { it.seg_num.finish_5m >= 5 && it.idx_num.finish_5m > 0 && it.seg_num.fail_5m == 0 }, + { it.seg_num.finish_5m >= 5 && it.idx_num.finish_5m > 0 + && it.seg_num.fail_5m == 0 && it.idx_num.fail_5m == 0 }, 60000) logger.info("schema/index SyncStats: ${stats}") assert stats.idx_num.finish_5m > 0 : "inverted index files should be warmed, stats=${stats}" From da5a8b8484758effe521f4776403ce38d0bc97e3 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Fri, 29 May 2026 10:23:56 +0800 Subject: [PATCH 3/7] [fix](cloud) Fix warm up manager test invocation ### What problem does this PR solve? Issue Number: None Related PR: #63832 Problem Summary: The table-level warm-up change adds a table_id argument before sync_wait_timeout_ms in CloudWarmUpManager::warm_up_rowset. After rebasing onto the latest master, the existing CloudWarmUpManagerTest calls still used the old two-argument form, so the positive-timeout test passed 1000 as table_id and left sync_wait_timeout_ms at its default -1. That made the test take the async non-positive-timeout branch, so the before-wait sync point was never reached and the spurious notify assertion failed. Update the test calls to pass table_id and sync_wait_timeout_ms explicitly. ### Release note None ### Check List (For Author) - Test: - Unit Test: ./run-be-ut.sh --run --filter=CloudWarmUpManagerTest.* -j100 - Behavior changed: No. - Does this need documentation: No. --- be/test/cloud/cloud_warm_up_manager_test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/be/test/cloud/cloud_warm_up_manager_test.cpp b/be/test/cloud/cloud_warm_up_manager_test.cpp index 90ea834e143d82..4ac284b990bf01 100644 --- a/be/test/cloud/cloud_warm_up_manager_test.cpp +++ b/be/test/cloud/cloud_warm_up_manager_test.cpp @@ -138,7 +138,7 @@ TEST_F(CloudWarmUpManagerTest, NonPositiveTimeoutQueuesBackgroundCopyAndReturns) std::atomic returned = false; std::thread caller([&] { - manager.warm_up_rowset(*rs_meta, -1); + manager.warm_up_rowset(*rs_meta, /*table_id=*/0, /*sync_wait_timeout_ms=*/-1); returned = true; }); @@ -206,7 +206,7 @@ TEST_F(CloudWarmUpManagerTest, NonPositiveTimeoutSkipsWarmupWhenAsyncRowsetMetaI }, &warmup_enter_guard); - manager.warm_up_rowset(*rs_meta, -1); + manager.warm_up_rowset(*rs_meta, /*table_id=*/0, /*sync_wait_timeout_ms=*/-1); { std::unique_lock lock(observed_mtx); @@ -261,7 +261,7 @@ TEST_F(CloudWarmUpManagerTest, PositiveTimeoutIgnoresSpuriousWakeupUntilWorkerFi std::atomic returned = false; std::thread caller([&] { - manager.warm_up_rowset(*rs_meta, 1000); + manager.warm_up_rowset(*rs_meta, /*table_id=*/0, /*sync_wait_timeout_ms=*/1000); returned = true; }); From 4864a5c79bde047b197687bb6fbe8bc2553de1c2 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Fri, 29 May 2026 11:54:34 +0800 Subject: [PATCH 4/7] [fix](cloud) Relax warm-up table filter perf thresholds ### What problem does this PR solve? Issue Number: None Related PR: #63832 Problem Summary: The table-level warm-up table filter performance tests used tight wall-clock thresholds for the 200K and 500K wildcard match-all cases. CI machines can run these scale tests slightly slower than local runs even though the matching implementation remains efficient. Relax the 200K threshold from 1s to 1.5s and the 500K threshold from 2s to 3s while keeping the existing functional assertions and smaller or more selective performance checks. ### Release note None ### Check List (For Author) - Test: - Unit Test: ./run-fe-ut.sh --run org.apache.doris.cloud.CacheHotspotManagerTableFilterTest - Behavior changed: No. - Does this need documentation: No. --- .../doris/cloud/CacheHotspotManagerTableFilterTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java index 12b9b8ef127f1c..5f6300913e9774 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java @@ -844,8 +844,8 @@ public void testShouldWarmUpPerformance200kTables() { Assertions.assertEquals(200000, matched); System.out.println("[Perf] 200K tables, wildcard match-all: " + elapsedMs + " ms"); - Assertions.assertTrue(elapsedMs < 1000, - "200K regex matches should complete within 1s, took " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 1500, + "200K regex matches should complete within 1.5s, took " + elapsedMs + " ms"); } @Test @@ -865,8 +865,8 @@ public void testShouldWarmUpPerformance500kTables() { Assertions.assertEquals(500000, matched); System.out.println("[Perf] 500K tables, wildcard match-all: " + elapsedMs + " ms"); - Assertions.assertTrue(elapsedMs < 2000, - "500K regex matches should complete within 2s, took " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 3000, + "500K regex matches should complete within 3s, took " + elapsedMs + " ms"); } @Test From 44f6b857b97dff575a3f7de796c924bde6e80b05 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Fri, 29 May 2026 14:13:32 +0800 Subject: [PATCH 5/7] [fix](cloud) Relax many-rule warm-up filter perf threshold ### What problem does this PR solve? Issue Number: None Related PR: #63832 Problem Summary: The table-level warm-up table filter performance test for 200K tables with 15 include/exclude rules still used a tight 2s wall-clock threshold. CI can exceed that threshold under load while the matcher remains functionally correct. Relax the threshold to 3s and keep the matched-table assertion unchanged. ### Release note None ### Check List (For Author) - Test: - Unit Test: ./run-fe-ut.sh --run org.apache.doris.cloud.CacheHotspotManagerTableFilterTest - Behavior changed: No. - Does this need documentation: No. --- .../doris/cloud/CacheHotspotManagerTableFilterTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java index 5f6300913e9774..ad749929779ce3 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java @@ -950,8 +950,8 @@ public void testShouldWarmUpPerformanceManyRules200k() { // 10 dbs × 2000 tables = 20000 included, minus 10 × 5 excluded = 19950 Assertions.assertEquals(19950, matched); System.out.println("[Perf] 200K tables, 15 rules (10 incl + 5 excl): " + elapsedMs + " ms"); - Assertions.assertTrue(elapsedMs < 2000, - "200K regex matches with 15 rules should complete within 2s, took " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 3000, + "200K regex matches with 15 rules should complete within 3s, took " + elapsedMs + " ms"); } @Test From 40f2522b7ee691077b8986e09e63852b436a6d57 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 1 Jun 2026 10:44:35 +0800 Subject: [PATCH 6/7] [fix](cloud) Add table id to warm-up rowset failure ### What problem does this PR solve? Issue Number: None Related PR: #63832 Problem Summary: The aggregated warm-up rowset failure message included the tablet id and rowset id but omitted the table id, making table-level event-driven warm-up failures harder to diagnose. Pass table_id into the aggregated failure builder and include it in the error text. Extend the helper unit tests to assert the table id is reported. ### Release note None ### Check List (For Author) - Test: - Unit Test: ./run-be-ut.sh --run --filter=CloudWarmUpManagerFilterTest.* -j100 - Behavior changed: No. - Does this need documentation: No. --- be/src/cloud/cloud_warm_up_manager.cpp | 11 ++++++----- be/src/cloud/cloud_warm_up_manager.h | 2 +- be/test/cloud/cloud_warm_up_manager_filter_test.cpp | 8 +++++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index ba8234539b93a4..31384608d9f3d9 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ b/be/src/cloud/cloud_warm_up_manager.cpp @@ -708,7 +708,7 @@ void CloudWarmUpManager::_warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, Status CloudWarmUpManager::_build_warm_up_rowset_result( const std::vector& failures, size_t replica_count, int64_t tablet_id, - const std::string& rowset_id) { + int64_t table_id, const std::string& rowset_id) { if (failures.empty()) { return Status::OK(); } @@ -726,9 +726,10 @@ Status CloudWarmUpManager::_build_warm_up_rowset_result( } return Status::Error(code, - "warm up rowset failed on {}/{} replicas, tablet_id={}, rowset_id={}, " - "failures=[{}]", - failures.size(), replica_count, tablet_id, rowset_id, failure_msg); + "warm up rowset failed on {}/{} replicas, tablet_id={}, table_id={}, " + "rowset_id={}, failures=[{}]", + failures.size(), replica_count, tablet_id, table_id, rowset_id, + failure_msg); } Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, @@ -882,7 +883,7 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, int64_t table add_failure(info, target, status); } } - return _build_warm_up_rowset_result(failures, replicas.size(), tablet_id, + return _build_warm_up_rowset_result(failures, replicas.size(), tablet_id, table_id, rs_meta.rowset_id().to_string()); } diff --git a/be/src/cloud/cloud_warm_up_manager.h b/be/src/cloud/cloud_warm_up_manager.h index eb656790599a95..f4102915705457 100644 --- a/be/src/cloud/cloud_warm_up_manager.h +++ b/be/src/cloud/cloud_warm_up_manager.h @@ -118,7 +118,7 @@ class CloudWarmUpManager { static Status _build_warm_up_rowset_result(const std::vector& failures, size_t replica_count, int64_t tablet_id, - const std::string& rowset_id); + int64_t table_id, const std::string& rowset_id); void schedule_remove_balanced_tablet(int64_t tablet_id); static void clean_up_expired_mappings(void* arg); diff --git a/be/test/cloud/cloud_warm_up_manager_filter_test.cpp b/be/test/cloud/cloud_warm_up_manager_filter_test.cpp index 1ebae403422601..c55d6f49d77711 100644 --- a/be/test/cloud/cloud_warm_up_manager_filter_test.cpp +++ b/be/test/cloud/cloud_warm_up_manager_filter_test.cpp @@ -194,7 +194,7 @@ TEST_F(CloudWarmUpManagerFilterTest, GetReplicaInfoBypassesFilterWhenTableIdUnkn } TEST_F(CloudWarmUpManagerFilterTest, BuildWarmUpRowsetResultReturnsOkWithoutFailures) { - auto st = CloudWarmUpManager::_build_warm_up_rowset_result({}, 2, 4001, "rowset-1"); + auto st = CloudWarmUpManager::_build_warm_up_rowset_result({}, 2, 4001, 5001, "rowset-1"); EXPECT_TRUE(st.ok()); } @@ -205,12 +205,13 @@ TEST_F(CloudWarmUpManagerFilterTest, BuildWarmUpRowsetResultAggregatesAllFailure {ErrorCode::INTERNAL_ERROR, "job_id=2, backend_id=22, target=127.0.0.1:8022, status=[INTERNAL_ERROR]rpc two"}}; - auto st = CloudWarmUpManager::_build_warm_up_rowset_result(failures, 3, 4002, "rowset-2"); + auto st = CloudWarmUpManager::_build_warm_up_rowset_result(failures, 3, 4002, 5002, "rowset-2"); EXPECT_FALSE(st.ok()); EXPECT_EQ(ErrorCode::THRIFT_RPC_ERROR, st.code()); std::string msg = st.to_string_no_stack(); EXPECT_NE(std::string::npos, msg.find("failed on 2/3 replicas")); + EXPECT_NE(std::string::npos, msg.find("table_id=5002")); EXPECT_NE(std::string::npos, msg.find("rpc one")); EXPECT_NE(std::string::npos, msg.find("rpc two")); } @@ -222,11 +223,12 @@ TEST_F(CloudWarmUpManagerFilterTest, BuildWarmUpRowsetResultKeepsTableNotFoundRe {ErrorCode::TABLE_NOT_FOUND, "job_id=2, backend_id=22, target=127.0.0.1:8022, status=[TABLET_MISSING]missing"}}; - auto st = CloudWarmUpManager::_build_warm_up_rowset_result(failures, 2, 4003, "rowset-3"); + auto st = CloudWarmUpManager::_build_warm_up_rowset_result(failures, 2, 4003, 5003, "rowset-3"); EXPECT_FALSE(st.ok()); EXPECT_TRUE(st.is()); std::string msg = st.to_string_no_stack(); + EXPECT_NE(std::string::npos, msg.find("table_id=5003")); EXPECT_NE(std::string::npos, msg.find("rpc one")); EXPECT_NE(std::string::npos, msg.find("missing")); } From e9ef9da91a90b5477b1c626ea0a4e5a22347ad5e Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 1 Jun 2026 12:00:30 +0800 Subject: [PATCH 7/7] [test](fe) Relax 500K table filter perf threshold ### What problem does this PR solve? Issue Number: None Related PR: #63832 Problem Summary: The 500K table-filter performance unit test can exceed the previous 3s threshold under CI load even though the matcher behavior remains correct. Relax the assertion to 4s to avoid treating small runtime variance as a test failure. ### Release note None ### Check List (For Author) - Test: - Unit Test: ./run-fe-ut.sh --run org.apache.doris.cloud.CacheHotspotManagerTableFilterTest - Behavior changed: No. - Does this need documentation: No. --- .../doris/cloud/CacheHotspotManagerTableFilterTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java index ad749929779ce3..255268e66c5a74 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java @@ -865,8 +865,8 @@ public void testShouldWarmUpPerformance500kTables() { Assertions.assertEquals(500000, matched); System.out.println("[Perf] 500K tables, wildcard match-all: " + elapsedMs + " ms"); - Assertions.assertTrue(elapsedMs < 3000, - "500K regex matches should complete within 3s, took " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 4000, + "500K regex matches should complete within 4s, took " + elapsedMs + " ms"); } @Test