diff --git a/be/src/cloud/cloud_backend_service.cpp b/be/src/cloud/cloud_backend_service.cpp index 403da0b76c6ee5..efcc096c313d4a 100644 --- a/be/src/cloud/cloud_backend_service.cpp +++ b/be/src/cloud/cloud_backend_service.cpp @@ -104,7 +104,11 @@ void CloudBackendService::warm_up_tablets(TWarmUpTabletsResponse& response, .tag("request_type", "SET_JOB") .tag("job_id", request.job_id); if (request.__isset.event) { - st = manager.set_event(request.job_id, request.event); + const std::vector* table_ids_ptr = nullptr; + if (request.__isset.table_ids) { + table_ids_ptr = &request.table_ids; + } + st = manager.set_event(request.job_id, request.event, false, table_ids_ptr); if (st.ok()) { break; } diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index 9e685a8f90bc13..d8b67eab439f66 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -20,13 +20,19 @@ #include #include +#include +#include +#include +#include #include #include +#include #include "cloud/cloud_storage_engine.h" #include "cloud/cloud_tablet.h" #include "cloud/cloud_tablet_mgr.h" #include "cloud/cloud_warm_up_manager.h" +#include "cloud/cloud_warmup_metrics.h" #include "cloud/config.h" #include "io/cache/block_file_cache.h" #include "io/cache/block_file_cache_downloader.h" @@ -34,6 +40,7 @@ #include "runtime/thread_context.h" #include "runtime/workload_management/io_throttle.h" #include "util/async_io.h" +#include "util/bvar_windowed_adder.h" #include "util/debug_points.h" namespace doris { @@ -407,10 +414,103 @@ bvar::Adder g_file_cache_warm_up_rowset_wait_for_compaction_num( bvar::Adder g_file_cache_warm_up_rowset_wait_for_compaction_timeout_num( "file_cache_warm_up_rowset_wait_for_compaction_timeout_num"); +// Per-job windowed metrics for target BE +// bvar::Window enforces MAX_SECONDS_LIMIT = 3600, so the longest window is 1h. 
+static constexpr int WINDOW_5M = 300; +static constexpr int WINDOW_30M = 1800; +static constexpr int WINDOW_1H = 3600; + +MBvarWindowedAdder g_warmup_ed_finish_segment_num("warmup_ed_finish_segment_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_finish_segment_size("warmup_ed_finish_segment_size", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_finish_index_num("warmup_ed_finish_index_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_finish_index_size("warmup_ed_finish_index_size", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_fail_segment_num("warmup_ed_fail_segment_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_fail_segment_size("warmup_ed_fail_segment_size", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_fail_index_num("warmup_ed_fail_index_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_fail_index_size("warmup_ed_fail_index_size", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +bvar::MultiDimension> g_warmup_ed_last_finish_ts({"job_id"}); + +void update_warmup_ed_last_finish_ts(const std::string& job_id_str) { + auto* finish_ts = g_warmup_ed_last_finish_ts.get_stats(std::list {job_id_str}); + if (finish_ts) { + finish_ts->set_value(std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count()); + } +} + +void record_warmup_ed_finish_segment(const std::string& job_id_str, int64_t segment_size) { + g_warmup_ed_finish_segment_num.put({job_id_str}, 1); + g_warmup_ed_finish_segment_size.put({job_id_str}, segment_size); + update_warmup_ed_last_finish_ts(job_id_str); +} + +void record_warmup_ed_finish_index(const std::string& job_id_str, int64_t idx_size) { + g_warmup_ed_finish_index_num.put({job_id_str}, 1); + 
g_warmup_ed_finish_index_size.put({job_id_str}, idx_size); + update_warmup_ed_last_finish_ts(job_id_str); +} + +void record_warmup_ed_fail_segment(const std::string& job_id_str, int64_t segment_size) { + g_warmup_ed_fail_segment_num.put({job_id_str}, 1); + g_warmup_ed_fail_segment_size.put({job_id_str}, segment_size); +} + +void record_warmup_ed_fail_index(const std::string& job_id_str, int64_t idx_size) { + g_warmup_ed_fail_index_num.put({job_id_str}, 1); + g_warmup_ed_fail_index_size.put({job_id_str}, idx_size); +} + +void record_warmup_ed_skipped_rowset_as_finished(RowsetMeta& rs_meta, + const std::string& job_id_str) { + auto schema_ptr = rs_meta.tablet_schema(); + bool has_inverted_index = schema_ptr->has_inverted_index() || schema_ptr->has_ann_index(); + auto idx_version = schema_ptr->get_inverted_index_storage_format(); + for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) { + record_warmup_ed_finish_segment(job_id_str, rs_meta.segment_file_size(segment_id)); + + if (!has_inverted_index) { + continue; + } + auto&& inverted_index_info = rs_meta.inverted_index_file_info(segment_id); + if (idx_version == InvertedIndexStorageFormatPB::V1) { + std::unordered_map index_size_map; + for (const auto& info : inverted_index_info.index_info()) { + if (info.index_file_size() != -1) { + index_size_map[info.index_id()] = info.index_file_size(); + } else { + VLOG_DEBUG << "Invalid index_file_size for segment_id " << segment_id + << ", index_id " << info.index_id(); + } + } + for (const auto& index : schema_ptr->inverted_indexes()) { + record_warmup_ed_finish_index(job_id_str, index_size_map[index->index_id()]); + } + } else { // InvertedIndexStorageFormatPB::V2 + int64_t idx_size = 0; + if (inverted_index_info.has_index_size()) { + idx_size = inverted_index_info.index_size(); + } else { + VLOG_DEBUG << "index_size is not set for segment " << segment_id; + } + record_warmup_ed_finish_index(job_id_str, idx_size); + } + } +} + void 
handle_segment_download_done(Status st, int64_t tablet_id, const RowsetId& rowset_id, int64_t segment_id, std::shared_ptr tablet, std::shared_ptr wait, Version version, - int64_t segment_size, int64_t request_ts, int64_t handle_ts) { + int64_t segment_size, int64_t request_ts, int64_t handle_ts, + std::string job_id_str, int64_t upstream_trigger_ts_ms) { DBUG_EXECUTE_IF("CloudInternalServiceImpl::warm_up_rowset.download_segment", { auto sleep_time = dp->param("sleep", 3); LOG_INFO("[verbose] block download for rowset={}, version={}, sleep={}", @@ -428,6 +528,7 @@ void handle_segment_download_done(Status st, int64_t tablet_id, const RowsetId& if (st.ok()) { g_file_cache_event_driven_warm_up_finished_segment_num << 1; g_file_cache_event_driven_warm_up_finished_segment_size << segment_size; + record_warmup_ed_finish_segment(job_id_str, segment_size); int64_t now_ts = current_unix_time_us(); g_file_cache_warm_up_rowset_last_finish_unix_ts.set_value(now_ts); auto rowset_latency_us = warm_up_rowset_cross_host_latency_us(request_ts, now_ts); @@ -451,6 +552,7 @@ void handle_segment_download_done(Status st, int64_t tablet_id, const RowsetId& } else { g_file_cache_event_driven_warm_up_failed_segment_num << 1; g_file_cache_event_driven_warm_up_failed_segment_size << segment_size; + record_warmup_ed_fail_segment(job_id_str, segment_size); LOG(WARNING) << "download segment failed, tablet_id: " << tablet_id << " rowset_id: " << rowset_id.to_string() << ", error: " << st; } @@ -460,6 +562,7 @@ void handle_segment_download_done(Status st, int64_t tablet_id, const RowsetId& VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" << rowset_id.to_string() << ") completed"; } + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id_str, upstream_trigger_ts_ms); if (wait) { wait->signal(); } @@ -470,7 +573,8 @@ void handle_inverted_index_download_done(Status st, int64_t tablet_id, const Row std::shared_ptr tablet, std::shared_ptr wait, Version version, uint64_t idx_size, 
int64_t request_ts, - int64_t handle_ts) { + int64_t handle_ts, std::string job_id_str, + int64_t upstream_trigger_ts_ms) { DBUG_EXECUTE_IF("CloudInternalServiceImpl::warm_up_rowset.download_inverted_idx", { auto sleep_time = dp->param("sleep", 3); LOG_INFO( @@ -482,6 +586,7 @@ void handle_inverted_index_download_done(Status st, int64_t tablet_id, const Row if (st.ok()) { g_file_cache_event_driven_warm_up_finished_index_num << 1; g_file_cache_event_driven_warm_up_finished_index_size << idx_size; + record_warmup_ed_finish_index(job_id_str, static_cast(idx_size)); int64_t now_ts = current_unix_time_us(); g_file_cache_warm_up_rowset_last_finish_unix_ts.set_value(now_ts); auto rowset_latency_us = warm_up_rowset_cross_host_latency_us(request_ts, now_ts); @@ -505,6 +610,7 @@ void handle_inverted_index_download_done(Status st, int64_t tablet_id, const Row } else { g_file_cache_event_driven_warm_up_failed_index_num << 1; g_file_cache_event_driven_warm_up_failed_index_size << idx_size; + record_warmup_ed_fail_index(job_id_str, static_cast(idx_size)); LOG(WARNING) << "download inverted index failed, tablet_id: " << tablet_id << " rowset_id: " << rowset_id << ", error: " << st; } @@ -514,6 +620,7 @@ void handle_inverted_index_download_done(Status st, int64_t tablet_id, const Row VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" << rowset_id.to_string() << ") completed"; } + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id_str, upstream_trigger_ts_ms); if (wait) { wait->signal(); } @@ -534,6 +641,11 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c due_time = butil::milliseconds_from_now(request->sync_wait_timeout_ms()); } + // Extract job_id from request (0 if not set, for backward compatibility) + std::string job_id_str = std::to_string(request->has_job_id() ? request->job_id() : 0); + int64_t upstream_trigger_ts_ms = + request->has_upstream_trigger_ts_ms() ? 
request->upstream_trigger_ts_ms() : 0; + for (auto& rs_meta_pb : request->rowset_metas()) { RowsetMeta rs_meta; rs_meta.init_from_pb(rs_meta_pb); @@ -581,8 +693,15 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c if (!tablet->add_rowset_warmup_state(rs_meta, WarmUpTriggerSource::EVENT_DRIVEN)) { LOG(INFO) << "found duplicate warmup task for rowset " << rowset_id.to_string() << ", skip it"; + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id_str, + upstream_trigger_ts_ms); + record_warmup_ed_skipped_rowset_as_finished(rs_meta, job_id_str); continue; } + if (rs_meta.num_segments() == 0) { + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id_str, + upstream_trigger_ts_ms); + } for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) { if (!config::file_cache_enable_only_warm_up_idx) { @@ -605,7 +724,8 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c [=, version = rs_meta.version()](Status st) { handle_segment_download_done( st, tablet_id, rowset_id, segment_id, tablet, wait, - version, segment_size, request_ts, handle_ts); + version, segment_size, request_ts, handle_ts, + job_id_str, upstream_trigger_ts_ms); }, .tablet_id = tablet_id}; @@ -614,12 +734,15 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c if (wait) { wait->add_count(); } + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id_str, + upstream_trigger_ts_ms); _engine.file_cache_block_downloader().submit_download_task(download_meta); } // Use rs_meta.fs() to support packed files for inverted index download. 
- auto download_inverted_index = [&, tablet](std::string index_path, uint64_t idx_size) { + auto download_inverted_index = [&, tablet, job_id_str](std::string index_path, + uint64_t idx_size) { io::DownloadFileMeta download_meta { .path = io::Path(index_path), .file_size = static_cast(idx_size), @@ -632,9 +755,11 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c [=, version = rs_meta.version()](Status st) { handle_inverted_index_download_done( st, tablet_id, rowset_id, segment_id, index_path, - tablet, wait, version, idx_size, request_ts, handle_ts); + tablet, wait, version, idx_size, request_ts, handle_ts, + job_id_str, upstream_trigger_ts_ms); }, - .tablet_id = tablet_id}; + .tablet_id = tablet_id, + }; g_file_cache_event_driven_warm_up_submitted_index_num << 1; g_file_cache_event_driven_warm_up_submitted_index_size << idx_size; tablet->update_rowset_warmup_state_inverted_idx_num( @@ -642,6 +767,8 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c if (wait) { wait->add_count(); } + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id_str, + upstream_trigger_ts_ms); _engine.file_cache_block_downloader().submit_download_task(download_meta); }; diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index d84a54cd1e9b2f..f6817411f54c5d 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -1512,7 +1512,7 @@ Status CloudMetaMgr::commit_rowset(RowsetMeta& rs_meta, const std::string& job_i << ", with timeout: " << timeout_ms << " ms"; } auto& manager = ExecEnv::GetInstance()->storage_engine().to_cloud().cloud_warm_up_manager(); - manager.warm_up_rowset(rs_meta, timeout_ms); + manager.warm_up_rowset(rs_meta, table_id, timeout_ms); return st; } diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index 40d0066e2eee76..31384608d9f3d9 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ 
b/be/src/cloud/cloud_warm_up_manager.cpp @@ -26,13 +26,17 @@ #include #include +#include +#include #include +#include #include "bvar/bvar.h" #include "cloud/cloud_tablet.h" #include "cloud/cloud_tablet_mgr.h" #include "cloud/config.h" #include "common/cast_set.h" +#include "common/config.h" #include "common/logging.h" #include "cpp/sync_point.h" #include "io/cache/block_file_cache_downloader.h" @@ -41,7 +45,9 @@ #include "storage/rowset/beta_rowset.h" #include "storage/tablet/tablet.h" #include "util/brpc_client_cache.h" // BrpcClientCache +#include "util/bvar_windowed_adder.h" #include "util/client_cache.h" +#include "util/defer_op.h" #include "util/stack_util.h" #include "util/thrift_rpc_helper.h" #include "util/time.h" @@ -90,6 +96,23 @@ bvar::Adder g_balance_tablet_be_mapping_size("balance_tablet_be_mappin bvar::LatencyRecorder g_file_cache_warm_up_rowset_wait_for_compaction_latency( "file_cache_warm_up_rowset_wait_for_compaction_latency"); +// Per-job windowed metrics for source BE +// bvar::Window enforces MAX_SECONDS_LIMIT = 3600, so the longest window is 1h. 
+static constexpr int WINDOW_5M = 300; +static constexpr int WINDOW_30M = 1800; +static constexpr int WINDOW_1H = 3600; + +MBvarWindowedAdder g_warmup_ed_requested_segment_num("warmup_ed_requested_segment_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_requested_segment_size("warmup_ed_requested_segment_size", + {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_requested_index_num("warmup_ed_requested_index_num", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +MBvarWindowedAdder g_warmup_ed_requested_index_size("warmup_ed_requested_index_size", {"job_id"}, + {WINDOW_5M, WINDOW_30M, WINDOW_1H}, false); +bvar::MultiDimension> g_warmup_ed_last_trigger_ts({"job_id"}); + CloudWarmUpManager::CloudWarmUpManager(CloudStorageEngine& engine) : _engine(engine) { _download_thread = std::thread(&CloudWarmUpManager::handle_jobs, this); static_cast(ThreadPoolBuilder("CloudWarmUpManagerThreadPool") @@ -460,7 +483,8 @@ Status CloudWarmUpManager::clear_job(int64_t job_id) { return st; } -Status CloudWarmUpManager::set_event(int64_t job_id, TWarmUpEventType::type event, bool clear) { +Status CloudWarmUpManager::set_event(int64_t job_id, TWarmUpEventType::type event, bool clear, + const std::vector* table_ids) { DBUG_EXECUTE_IF("CloudWarmUpManager.set_event.ignore_all", { LOG(INFO) << "Ignore set_event request, job_id=" << job_id << ", event=" << event << ", clear=" << clear; @@ -471,10 +495,28 @@ Status CloudWarmUpManager::set_event(int64_t job_id, TWarmUpEventType::type even if (event == TWarmUpEventType::type::LOAD) { if (clear) { _tablet_replica_cache.erase(job_id); + _event_driven_filters.erase(job_id); LOG(INFO) << "Clear event driven sync, job_id=" << job_id << ", event=" << event; } else if (!_tablet_replica_cache.contains(job_id)) { static_cast(_tablet_replica_cache[job_id]); - LOG(INFO) << "Set event driven sync, job_id=" << job_id << ", event=" << event; + if (table_ids != 
nullptr) { + // table-level filter: set to the given table_id set (may be empty, + // meaning all matched tables were deleted — warm up nothing) + _event_driven_filters[job_id] = + std::unordered_set(table_ids->begin(), table_ids->end()); + LOG(INFO) << "Set event driven sync with table filter, job_id=" << job_id + << ", event=" << event << ", table_ids_size=" << table_ids->size(); + } else { + // cluster-level: no filter, warm up all tables + _event_driven_filters[job_id] = std::nullopt; + LOG(INFO) << "Set event driven sync, job_id=" << job_id << ", event=" << event; + } + } else if (table_ids != nullptr) { + // Update table_ids for an existing job (may be empty) + _event_driven_filters[job_id] = + std::unordered_set(table_ids->begin(), table_ids->end()); + LOG(INFO) << "Updated table filter for event driven sync, job_id=" << job_id + << ", table_ids_size=" << table_ids->size(); } } else { st = Status::InternalError("The event {} is not supported yet", event); @@ -482,13 +524,29 @@ Status CloudWarmUpManager::set_event(int64_t job_id, TWarmUpEventType::type even return st; } -std::vector CloudWarmUpManager::get_replica_info(int64_t tablet_id, bool bypass_cache, - bool& cache_hit) { - std::vector replicas; +std::vector CloudWarmUpManager::get_replica_info(int64_t tablet_id, + int64_t table_id, + bool bypass_cache, + bool& cache_hit) { + std::vector replicas; std::vector cancelled_jobs; std::lock_guard lock(_mtx); cache_hit = false; for (auto& [job_id, cache] : _tablet_replica_cache) { + // Check table-level filter: skip this job if table_id doesn't match + // table_id == 0 means the caller doesn't have table context (e.g., recycle_cache), + // so skip filtering + if (table_id != 0) { + auto filter_it = _event_driven_filters.find(job_id); + if (filter_it != _event_driven_filters.end() && filter_it->second.has_value()) { + if (filter_it->second->find(table_id) == filter_it->second->end()) { + VLOG_DEBUG << "get_replica_info: table_id=" << table_id + << " not in 
filter for job_id=" << job_id << ", skipping"; + continue; + } + } + } + if (!bypass_cache) { auto it = cache.find(tablet_id); if (it != cache.end()) { @@ -496,9 +554,9 @@ std::vector CloudWarmUpManager::get_replica_info(int64_t tablet_id auto now = std::chrono::steady_clock::now(); auto sec = std::chrono::duration_cast(now - it->second.first); if (sec.count() < config::warmup_tablet_replica_info_cache_ttl_sec) { - replicas.push_back(it->second.second); - LOG(INFO) << "get_replica_info: cache hit, tablet_id=" << tablet_id - << ", job_id=" << job_id; + replicas.push_back(JobReplicaInfo {job_id, it->second.second}); + VLOG_DEBUG << "get_replica_info: cache hit, tablet_id=" << tablet_id + << ", job_id=" << job_id; cache_hit = true; continue; } else { @@ -566,7 +624,7 @@ std::vector CloudWarmUpManager::get_replica_info(int64_t tablet_id << " replica_infos, tablet id=" << tid << ", job_id=" << job_id; for (const auto& replica : it.second) { cache[tid] = std::make_pair(std::chrono::steady_clock::now(), replica); - replicas.push_back(replica); + replicas.push_back(JobReplicaInfo {job_id, replica}); LOG(INFO) << "get_replica_info: cache add, tablet_id=" << tid << ", job_id=" << job_id; } @@ -581,10 +639,12 @@ std::vector CloudWarmUpManager::get_replica_info(int64_t tablet_id return replicas; } -void CloudWarmUpManager::warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_timeout_ms) { +void CloudWarmUpManager::warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, + int64_t sync_wait_timeout_ms) { if (sync_wait_timeout_ms <= 0) { auto rs_meta_pb = std::make_shared(rs_meta.get_rowset_pb()); - auto st = _thread_pool_token->submit_func([this, rs_meta_pb, sync_wait_timeout_ms]() { + auto st = _thread_pool_token->submit_func([this, rs_meta_pb, table_id, + sync_wait_timeout_ms]() { RowsetMeta async_rs_meta; bool init_succeed = async_rs_meta.init_from_pb(*rs_meta_pb); TEST_SYNC_POINT_CALLBACK("CloudWarmUpManager::warm_up_rowset.async_init_from_pb", @@ -593,7 +653,7 @@ void 
CloudWarmUpManager::warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_t LOG(WARNING) << "Failed to init rowset meta when warming up rowset asynchronously"; return; } - _warm_up_rowset(async_rs_meta, sync_wait_timeout_ms); + _warm_up_rowset(async_rs_meta, table_id, sync_wait_timeout_ms); }); if (!st.ok()) { LOG(WARNING) << "Failed to submit warm up rowset task: " << st; @@ -607,7 +667,7 @@ void CloudWarmUpManager::warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_t bool finished = false; std::unique_lock lock(mu); auto st = _thread_pool_token->submit_func([&, this]() { - _warm_up_rowset(rs_meta, sync_wait_timeout_ms); + _warm_up_rowset(rs_meta, table_id, sync_wait_timeout_ms); std::unique_lock l(mu); finished = true; cv.notify_one(); @@ -623,21 +683,22 @@ void CloudWarmUpManager::warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_t } } -void CloudWarmUpManager::_warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_timeout_ms) { +void CloudWarmUpManager::_warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, + int64_t sync_wait_timeout_ms) { TEST_SYNC_POINT_CALLBACK("CloudWarmUpManager::_warm_up_rowset.enter", &rs_meta, &sync_wait_timeout_ms); bool cache_hit = false; - auto replicas = get_replica_info(rs_meta.tablet_id(), false, cache_hit); + auto replicas = get_replica_info(rs_meta.tablet_id(), table_id, false, cache_hit); if (replicas.empty()) { VLOG_DEBUG << "There is no need to warmup tablet=" << rs_meta.tablet_id() << ", skipping rowset=" << rs_meta.rowset_id().to_string(); g_file_cache_event_driven_warm_up_skipped_rowset_num << 1; return; } - Status st = _do_warm_up_rowset(rs_meta, replicas, sync_wait_timeout_ms, !cache_hit); + Status st = _do_warm_up_rowset(rs_meta, table_id, replicas, sync_wait_timeout_ms, !cache_hit); if (cache_hit && !st.ok() && st.is()) { - replicas = get_replica_info(rs_meta.tablet_id(), true, cache_hit); - st = _do_warm_up_rowset(rs_meta, replicas, sync_wait_timeout_ms, true); + replicas = 
get_replica_info(rs_meta.tablet_id(), table_id, true, cache_hit); + st = _do_warm_up_rowset(rs_meta, table_id, replicas, sync_wait_timeout_ms, true); } if (!st.ok()) { LOG(WARNING) << "Failed to warm up rowset, tablet_id=" << rs_meta.tablet_id() @@ -645,8 +706,34 @@ void CloudWarmUpManager::_warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_ } } -Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, - std::vector& replicas, +Status CloudWarmUpManager::_build_warm_up_rowset_result( + const std::vector& failures, size_t replica_count, int64_t tablet_id, + int64_t table_id, const std::string& rowset_id) { + if (failures.empty()) { + return Status::OK(); + } + + int code = failures.front().code; + std::string failure_msg; + for (size_t i = 0; i < failures.size(); ++i) { + if (failures[i].code == ErrorCode::TABLE_NOT_FOUND) { + code = ErrorCode::TABLE_NOT_FOUND; + } + if (i > 0) { + failure_msg.append("; "); + } + failure_msg.append(failures[i].reason); + } + + return Status::Error(code, + "warm up rowset failed on {}/{} replicas, tablet_id={}, table_id={}, " + "rowset_id={}, failures=[{}]", + failures.size(), replica_count, tablet_id, table_id, rowset_id, + failure_msg); +} + +Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, + std::vector& replicas, int64_t sync_wait_timeout_ms, bool skip_existence_check) { auto tablet_id = rs_meta.tablet_id(); @@ -654,34 +741,53 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, std::chrono::system_clock::now().time_since_epoch()) .count(); g_file_cache_warm_up_rowset_last_call_unix_ts.set_value(now_ts); - auto ret_st = Status::OK(); + std::vector failures; + auto add_failure = [&failures](const JobReplicaInfo& info, const std::string& target, + const Status& st) { + failures.push_back(WarmUpRowsetFailure { + .code = st.code(), + .reason = "job_id=" + std::to_string(info.job_id) + + ", backend_id=" + std::to_string(info.replica.backend_id) + + ", target=" + target 
+ ", status=" + st.to_string_no_stack()}); + }; + + for (auto& info : replicas) { + std::string job_id_str = std::to_string(info.job_id); + std::string target = get_host_port(info.replica.host, info.replica.brpc_port); + int64_t trigger_ts_ms = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + PWarmUpRowsetRequest request; + request.add_rowset_metas()->CopyFrom(rs_meta.get_rowset_pb()); + request.set_unix_ts_us(now_ts); + request.set_sync_wait_timeout_ms(sync_wait_timeout_ms); + request.set_skip_existence_check(skip_existence_check); + request.set_job_id(info.job_id); + request.set_upstream_trigger_ts_ms(trigger_ts_ms); - PWarmUpRowsetRequest request; - request.add_rowset_metas()->CopyFrom(rs_meta.get_rowset_pb()); - request.set_unix_ts_us(now_ts); - request.set_sync_wait_timeout_ms(sync_wait_timeout_ms); - request.set_skip_existence_check(skip_existence_check); - for (auto& replica : replicas) { // send sync request - std::string host = replica.host; + std::string host = info.replica.host; auto dns_cache = ExecEnv::GetInstance()->dns_cache(); if (dns_cache == nullptr) { LOG(WARNING) << "DNS cache is not initialized, skipping hostname resolve"; - } else if (!is_valid_ip(replica.host)) { - Status status = dns_cache->get(replica.host, &host); + } else if (!is_valid_ip(info.replica.host)) { + Status status = dns_cache->get(info.replica.host, &host); if (!status.ok()) { - LOG(WARNING) << "failed to get ip from host " << replica.host << ": " + LOG(WARNING) << "failed to get ip from host " << info.replica.host << ": " << status.to_string(); + add_failure(info, target, status); continue; } } - std::string brpc_addr = get_host_port(host, replica.brpc_port); + std::string brpc_addr = get_host_port(host, info.replica.brpc_port); Status st = Status::OK(); std::shared_ptr brpc_stub = ExecEnv::GetInstance()->brpc_internal_client_cache()->get_new_client_no_cache( brpc_addr); if (!brpc_stub) { st = Status::RpcError("Address {} is 
wrong", brpc_addr); + add_failure(info, target, st); continue; } @@ -689,9 +795,13 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, auto schema_ptr = rs_meta.tablet_schema(); auto idx_version = schema_ptr->get_inverted_index_storage_format(); for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) { + auto seg_size = rs_meta.segment_file_size(cast_set(segment_id)); + g_file_cache_event_driven_warm_up_requested_segment_num << 1; - g_file_cache_event_driven_warm_up_requested_segment_size - << rs_meta.segment_file_size(cast_set(segment_id)); + g_warmup_ed_requested_segment_num.put({job_id_str}, 1); + + g_file_cache_event_driven_warm_up_requested_segment_size << seg_size; + g_warmup_ed_requested_segment_size.put({job_id_str}, seg_size); if (schema_ptr->has_inverted_index() || schema_ptr->has_ann_index()) { if (idx_version == InvertedIndexStorageFormatPB::V1) { @@ -701,23 +811,31 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, VLOG_DEBUG << "No index info available for segment " << segment_id; continue; } - for (const auto& info : inverted_index_info.index_info()) { + for (const auto& idx_info : inverted_index_info.index_info()) { g_file_cache_event_driven_warm_up_requested_index_num << 1; - if (info.index_file_size() != -1) { + g_warmup_ed_requested_index_num.put({job_id_str}, 1); + + if (idx_info.index_file_size() != -1) { g_file_cache_event_driven_warm_up_requested_index_size - << info.index_file_size(); + << idx_info.index_file_size(); + g_warmup_ed_requested_index_size.put({job_id_str}, + idx_info.index_file_size()); } else { VLOG_DEBUG << "Invalid index_file_size for segment_id " << segment_id - << ", index_id " << info.index_id(); + << ", index_id " << idx_info.index_id(); } } } else { // InvertedIndexStorageFormatPB::V2 auto&& inverted_index_info = rs_meta.inverted_index_file_info(cast_set(segment_id)); g_file_cache_event_driven_warm_up_requested_index_num << 1; + 
g_warmup_ed_requested_index_num.put({job_id_str}, 1); + if (inverted_index_info.has_index_size()) { g_file_cache_event_driven_warm_up_requested_index_size << inverted_index_info.index_size(); + g_warmup_ed_requested_index_size.put({job_id_str}, + inverted_index_info.index_size()); } else { VLOG_DEBUG << "index_size is not set for segment " << segment_id; } @@ -725,6 +843,13 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, } } + // Update last trigger timestamp + auto* trigger_ts = + g_warmup_ed_last_trigger_ts.get_stats(std::list {job_id_str}); + if (trigger_ts) { + trigger_ts->set_value(trigger_ts_ms); + } + brpc::Controller cntl; if (sync_wait_timeout_ms > 0) { cntl.set_timeout_ms(sync_wait_timeout_ms + 1000); @@ -736,7 +861,8 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, if (cntl.Failed()) { LOG_WARNING("warm up rowset {} for tablet {} failed, rpc error: {}", rs_meta.rowset_id().to_string(), tablet_id, cntl.ErrorText()); - return Status::RpcError(cntl.ErrorText()); + add_failure(info, target, Status::RpcError(cntl.ErrorText())); + continue; } if (sync_wait_timeout_ms > 0) { auto cost_us = watch.elapsed_time_microseconds(); @@ -752,12 +878,13 @@ Status CloudWarmUpManager::_do_warm_up_rowset(RowsetMeta& rs_meta, if (response.has_status() && !status.ok()) { LOG(INFO) << "warm_up_rowset failed, tablet_id=" << rs_meta.tablet_id() << ", rowset_id=" << rs_meta.rowset_id().to_string() - << ", target=" << replica.host << ", skip_existence_check" + << ", target=" << info.replica.host << ", skip_existence_check" << skip_existence_check << ", status=" << status; - ret_st = status; + add_failure(info, target, status); } } - return ret_st; + return _build_warm_up_rowset_result(failures, replicas.size(), tablet_id, table_id, + rs_meta.rowset_id().to_string()); } void CloudWarmUpManager::recycle_cache(int64_t tablet_id, @@ -782,7 +909,7 @@ void CloudWarmUpManager::_recycle_cache(int64_t tablet_id, const std::vector& rowsets) { 
LOG(INFO) << "recycle_cache: tablet_id=" << tablet_id << ", num_rowsets=" << rowsets.size(); bool cache_hit = false; - auto replicas = get_replica_info(tablet_id, false, cache_hit); + auto replicas = get_replica_info(tablet_id, /*table_id=*/0, false, cache_hit); if (replicas.empty()) { return; } @@ -802,18 +929,18 @@ void CloudWarmUpManager::_recycle_cache(int64_t tablet_id, auto dns_cache = ExecEnv::GetInstance()->dns_cache(); for (auto& replica : replicas) { // send sync request - std::string host = replica.host; + std::string host = replica.replica.host; if (dns_cache == nullptr) { LOG(WARNING) << "DNS cache is not initialized, skipping hostname resolve"; - } else if (!is_valid_ip(replica.host)) { - Status status = dns_cache->get(replica.host, &host); + } else if (!is_valid_ip(replica.replica.host)) { + Status status = dns_cache->get(replica.replica.host, &host); if (!status.ok()) { - LOG(WARNING) << "failed to get ip from host " << replica.host << ": " + LOG(WARNING) << "failed to get ip from host " << replica.replica.host << ": " << status.to_string(); return; } } - std::string brpc_addr = get_host_port(host, replica.brpc_port); + std::string brpc_addr = get_host_port(host, replica.replica.brpc_port); Status st = Status::OK(); std::shared_ptr brpc_stub = ExecEnv::GetInstance()->brpc_internal_client_cache()->get_new_client_no_cache( diff --git a/be/src/cloud/cloud_warm_up_manager.h b/be/src/cloud/cloud_warm_up_manager.h index 992702f162e0a1..f4102915705457 100644 --- a/be/src/cloud/cloud_warm_up_manager.h +++ b/be/src/cloud/cloud_warm_up_manager.h @@ -21,10 +21,12 @@ #include #include +#include #include #include #include #include +#include #include #include "cloud/cloud_storage_engine.h" @@ -39,6 +41,16 @@ enum class DownloadType { S3, }; +// Filter for event-driven warmup jobs. 
+// nullopt = cluster-level (no table filter, warm up all tables) +// has_value = table-level filter (only warm up tables in the set) +using EventDrivenJobFilter = std::optional>; + +struct JobReplicaInfo { + int64_t job_id; + TReplicaInfo replica; +}; + struct JobMeta { JobMeta() = default; JobMeta(const TJobMeta& meta); @@ -75,7 +87,8 @@ class CloudWarmUpManager { // Cancel the job Status clear_job(int64_t job_id); - Status set_event(int64_t job_id, TWarmUpEventType::type event, bool clear = false); + Status set_event(int64_t job_id, TWarmUpEventType::type event, bool clear = false, + const std::vector* table_ids = nullptr); // If `sync_wait_timeout_ms` <= 0, the function will send the warm-up RPC // and return immediately without waiting for the warm-up to complete. @@ -85,7 +98,7 @@ class CloudWarmUpManager { // @param rs_meta Metadata of the rowset to be warmed up. // @param sync_wait_timeout_ms Timeout in milliseconds to wait for the warm-up // to complete. Non-positive value means no waiting. 
- void warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_timeout_ms = -1); + void warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, int64_t sync_wait_timeout_ms = -1); void recycle_cache(int64_t tablet_id, const std::vector& rowsets); @@ -98,17 +111,27 @@ class CloudWarmUpManager { std::unordered_map> get_all_balanced_tablets() const; private: + struct WarmUpRowsetFailure { + int code; + std::string reason; + }; + + static Status _build_warm_up_rowset_result(const std::vector& failures, + size_t replica_count, int64_t tablet_id, + int64_t table_id, const std::string& rowset_id); + void schedule_remove_balanced_tablet(int64_t tablet_id); static void clean_up_expired_mappings(void* arg); void handle_jobs(); - Status _do_warm_up_rowset(RowsetMeta& rs_meta, std::vector& replicas, - int64_t sync_wait_timeout_ms, bool skip_existence_check); + Status _do_warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, + std::vector& replicas, int64_t sync_wait_timeout_ms, + bool skip_existence_check); - std::vector get_replica_info(int64_t tablet_id, bool bypass_cache, - bool& cache_hit); + std::vector get_replica_info(int64_t tablet_id, int64_t table_id, + bool bypass_cache, bool& cache_hit); - void _warm_up_rowset(RowsetMeta& rs_meta, int64_t sync_wait_timeout_ms); + void _warm_up_rowset(RowsetMeta& rs_meta, int64_t table_id, int64_t sync_wait_timeout_ms); void _recycle_cache(int64_t tablet_id, const std::vector& rowsets); void submit_download_tasks(io::Path path, int64_t file_size, io::FileSystemSPtr file_system, @@ -133,6 +156,8 @@ class CloudWarmUpManager { using Cache = std::unordered_map; // job_id -> cache std::unordered_map _tablet_replica_cache; + // job_id -> table filter (nullopt = cluster-level, no filter) + std::unordered_map _event_driven_filters; std::unique_ptr _thread_pool; std::unique_ptr _thread_pool_token; diff --git a/be/src/cloud/cloud_warmup_metrics.cpp b/be/src/cloud/cloud_warmup_metrics.cpp new file mode 100644 index 00000000000000..59d6c769c1d1fa 
--- /dev/null +++ b/be/src/cloud/cloud_warmup_metrics.cpp @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "cloud/cloud_warmup_metrics.h" + +#include + +namespace doris { + +WarmUpEdDownstreamProgressTracker g_warmup_ed_downstream_progress_tracker; + +void WarmUpEdDownstreamProgressTracker::record_task_submit(const std::string& job_id_str, + int64_t upstream_trigger_ts_ms) { + if (upstream_trigger_ts_ms <= 0) { + return; + } + std::lock_guard lock(_mtx); + auto& progress = _progress_by_job[job_id_str]; + ++progress.pending_trigger_ts_counts[upstream_trigger_ts_ms]; +} + +void WarmUpEdDownstreamProgressTracker::record_task_done(const std::string& job_id_str, + int64_t upstream_trigger_ts_ms) { + if (upstream_trigger_ts_ms <= 0) { + return; + } + std::lock_guard lock(_mtx); + auto& progress = _progress_by_job[job_id_str]; + auto pending_it = progress.pending_trigger_ts_counts.find(upstream_trigger_ts_ms); + if (pending_it != progress.pending_trigger_ts_counts.end()) { + --pending_it->second; + if (pending_it->second <= 0) { + progress.pending_trigger_ts_counts.erase(pending_it); + } + } + progress.last_finished_trigger_ts = + std::max(progress.last_finished_trigger_ts, 
upstream_trigger_ts_ms); +} + +int64_t WarmUpEdDownstreamProgressTracker::get_progress_ts(const std::string& job_id_str) const { + std::lock_guard lock(_mtx); + auto progress_it = _progress_by_job.find(job_id_str); + if (progress_it == _progress_by_job.end()) { + return 0; + } + const auto& progress = progress_it->second; + if (!progress.pending_trigger_ts_counts.empty()) { + return progress.pending_trigger_ts_counts.begin()->first; + } + return progress.last_finished_trigger_ts; +} + +std::vector WarmUpEdDownstreamProgressTracker::list_job_ids() const { + std::lock_guard lock(_mtx); + std::vector job_ids; + job_ids.reserve(_progress_by_job.size()); + for (const auto& entry : _progress_by_job) { + job_ids.emplace_back(entry.first); + } + return job_ids; +} + +void WarmUpEdDownstreamProgressTracker::reset_for_test() { + std::lock_guard lock(_mtx); + _progress_by_job.clear(); +} + +} // namespace doris diff --git a/be/src/cloud/cloud_warmup_metrics.h b/be/src/cloud/cloud_warmup_metrics.h new file mode 100644 index 00000000000000..3c4840d1803178 --- /dev/null +++ b/be/src/cloud/cloud_warmup_metrics.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "util/bvar_windowed_adder.h" + +namespace doris { + +// Source BE metrics keyed by job_id (defined in cloud_warm_up_manager.cpp). +extern MBvarWindowedAdder g_warmup_ed_requested_segment_num; +extern MBvarWindowedAdder g_warmup_ed_requested_segment_size; +extern MBvarWindowedAdder g_warmup_ed_requested_index_num; +extern MBvarWindowedAdder g_warmup_ed_requested_index_size; +extern bvar::MultiDimension> g_warmup_ed_last_trigger_ts; + +// Target BE metrics keyed by job_id (defined in cloud_internal_service.cpp). +extern MBvarWindowedAdder g_warmup_ed_finish_segment_num; +extern MBvarWindowedAdder g_warmup_ed_finish_segment_size; +extern MBvarWindowedAdder g_warmup_ed_finish_index_num; +extern MBvarWindowedAdder g_warmup_ed_finish_index_size; +extern MBvarWindowedAdder g_warmup_ed_fail_segment_num; +extern MBvarWindowedAdder g_warmup_ed_fail_segment_size; +extern MBvarWindowedAdder g_warmup_ed_fail_index_num; +extern MBvarWindowedAdder g_warmup_ed_fail_index_size; +extern bvar::MultiDimension> g_warmup_ed_last_finish_ts; + +// Tracks the target BE's event-driven warm-up progress by upstream trigger timestamp. +// If there are unfinished downloads for a job, progress is the earliest pending upstream trigger +// time. If the job has no pending downloads, progress falls back to the latest completed upstream +// trigger time, so FE can report a zero trigger gap once the target side catches up. 
+class WarmUpEdDownstreamProgressTracker { +public: + void record_task_submit(const std::string& job_id_str, int64_t upstream_trigger_ts_ms); + void record_task_done(const std::string& job_id_str, int64_t upstream_trigger_ts_ms); + int64_t get_progress_ts(const std::string& job_id_str) const; + std::vector list_job_ids() const; + void reset_for_test(); + +private: + struct JobProgress { + std::map pending_trigger_ts_counts; + int64_t last_finished_trigger_ts = 0; + }; + + mutable std::mutex _mtx; + std::unordered_map _progress_by_job; +}; + +extern WarmUpEdDownstreamProgressTracker g_warmup_ed_downstream_progress_tracker; + +} // namespace doris diff --git a/be/src/service/http/action/warmup_stats_action.cpp b/be/src/service/http/action/warmup_stats_action.cpp new file mode 100644 index 00000000000000..a41f388e686a34 --- /dev/null +++ b/be/src/service/http/action/warmup_stats_action.cpp @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "service/http/action/warmup_stats_action.h" + +#include +#include +#include +#include +#include + +#include "cloud/cloud_warmup_metrics.h" +#include "service/http/http_channel.h" +#include "service/http/http_headers.h" +#include "service/http/http_request.h" +#include "service/http/http_status.h" +#include "util/debug_points.h" +#include "util/easy_json.h" + +namespace doris { + +// Fill windowed num/size metrics into a JSON object +static void fill_windowed(EasyJson& parent, const std::string& key, MBvarWindowedAdder& num_adder, + MBvarWindowedAdder& size_adder, const std::string& dim_key) { + EasyJson obj = parent.Set(key, EasyJson::kObject); + EasyJson num = obj.Set("num", EasyJson::kObject); + num["5m"] = num_adder.get_window_value(dim_key, 0); + num["30m"] = num_adder.get_window_value(dim_key, 1); + num["1h"] = num_adder.get_window_value(dim_key, 2); + EasyJson size = obj.Set("size", EasyJson::kObject); + size["5m"] = size_adder.get_window_value(dim_key, 0); + size["30m"] = size_adder.get_window_value(dim_key, 1); + size["1h"] = size_adder.get_window_value(dim_key, 2); +} + +void WarmUpStatsAction::handle(HttpRequest* req) { + DBUG_EXECUTE_IF("WarmUpStatsAction.handle.return_error", { + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, + "injected warmup stats error"); + return; + }); + DBUG_EXECUTE_IF("WarmUpStatsAction.handle.sleep", { + auto sleep_ms = dp->param("sleep_ms", 6000); + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); + }); + + // Collect all job_id dimension keys from all metrics + std::set all_keys; + for (auto& k : g_warmup_ed_requested_segment_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_requested_index_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_finish_segment_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_finish_index_num.list_dimensions()) all_keys.insert(k); + for (auto& k : 
g_warmup_ed_fail_segment_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_fail_index_num.list_dimensions()) all_keys.insert(k); + for (auto& k : g_warmup_ed_downstream_progress_tracker.list_job_ids()) all_keys.insert(k); + + EasyJson result; + result["code"] = 0; + EasyJson jobs = result.Set("data", EasyJson::kArray); + + for (auto& job_id_str : all_keys) { + EasyJson entry = jobs.PushBack(EasyJson::kObject); + try { + entry["job_id"] = static_cast(std::stoll(job_id_str)); + } catch (...) { + entry["job_id"] = 0; + } + + // requested + EasyJson req_obj = entry.Set("requested", EasyJson::kObject); + fill_windowed(req_obj, "seg", g_warmup_ed_requested_segment_num, + g_warmup_ed_requested_segment_size, job_id_str); + fill_windowed(req_obj, "idx", g_warmup_ed_requested_index_num, + g_warmup_ed_requested_index_size, job_id_str); + + // finish + EasyJson fin_obj = entry.Set("finish", EasyJson::kObject); + fill_windowed(fin_obj, "seg", g_warmup_ed_finish_segment_num, + g_warmup_ed_finish_segment_size, job_id_str); + fill_windowed(fin_obj, "idx", g_warmup_ed_finish_index_num, g_warmup_ed_finish_index_size, + job_id_str); + + // fail + EasyJson fail_obj = entry.Set("fail", EasyJson::kObject); + fill_windowed(fail_obj, "seg", g_warmup_ed_fail_segment_num, g_warmup_ed_fail_segment_size, + job_id_str); + fill_windowed(fail_obj, "idx", g_warmup_ed_fail_index_num, g_warmup_ed_fail_index_size, + job_id_str); + + // Timestamps + auto* trigger_ts = + g_warmup_ed_last_trigger_ts.get_stats(std::list {job_id_str}); + entry["last_trigger_ts"] = trigger_ts ? trigger_ts->get_value() : 0; + auto* finish_ts = g_warmup_ed_last_finish_ts.get_stats(std::list {job_id_str}); + entry["last_finish_ts"] = finish_ts ? finish_ts->get_value() : 0; + // Target-side progress watermark for trigger-gap calculation. Pending work reports the + // earliest unfinished upstream trigger time; fully caught-up work reports the latest + // finished upstream trigger time. 
+ entry["progress_trigger_ts"] = + g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id_str); + } + + req->add_output_header(HttpHeaders::CONTENT_TYPE, "application/json"); + HttpChannel::send_reply(req, HttpStatus::OK, result.ToString()); +} + +} // namespace doris diff --git a/be/src/service/http/action/warmup_stats_action.h b/be/src/service/http/action/warmup_stats_action.h new file mode 100644 index 00000000000000..72e0a17fd4802a --- /dev/null +++ b/be/src/service/http/action/warmup_stats_action.h @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "service/http/http_handler_with_auth.h" + +namespace doris { + +class ExecEnv; + +// HTTP action for /api/warmup_event_driven_stats +// Returns per-job_id windowed warmup metrics as JSON. 
+class WarmUpStatsAction final : public HttpHandlerWithAuth { +public: + explicit WarmUpStatsAction(ExecEnv* exec_env) : HttpHandlerWithAuth(exec_env) {} + + ~WarmUpStatsAction() override = default; + + void handle(HttpRequest* req) override; +}; + +} // namespace doris diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index 93a50a81cc10ff..a7c46e267fd6cf 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -76,6 +76,7 @@ #include "service/http/action/tablets_distribution_action.h" #include "service/http/action/tablets_info_action.h" #include "service/http/action/version_action.h" +#include "service/http/action/warmup_stats_action.h" #include "service/http/default_path_handlers.h" #include "service/http/ev_http_server.h" #include "service/http/http_method.h" @@ -502,6 +503,10 @@ void HttpService::register_cloud_handler(CloudStorageEngine& engine) { auto* show_hotspot_action = _pool.add(new ShowHotspotAction(engine, _env)); _ev_http_server->register_handler(HttpMethod::GET, "/api/hotspot/tablet", show_hotspot_action); + auto* warmup_stats_action = _pool.add(new WarmUpStatsAction(_env)); + _ev_http_server->register_handler(HttpMethod::GET, "/api/warmup_event_driven_stats", + warmup_stats_action); + CalcFileCrcAction* calc_crc_action = _pool.add( new CalcFileCrcAction(_env, engine, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); _ev_http_server->register_handler(HttpMethod::GET, "/api/calc_crc", calc_crc_action); diff --git a/be/src/util/bvar_windowed_adder.h b/be/src/util/bvar_windowed_adder.h new file mode 100644 index 00000000000000..c4e9245b7e3246 --- /dev/null +++ b/be/src/util/bvar_windowed_adder.h @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace doris { + +/** + * Multi-dimension windowed adder. + * + * For each dimension value combination (e.g., job_id), automatically creates: + * - A bvar::Adder (cumulative counter managed by MultiDimension) + * - Multiple bvar::Window instances (sliding window views at different time scales) + * + * Windows are lazily created on first write to a dimension value. + * + * @example + * MBvarWindowedAdder requested_seg_num( + * "warmup_ed_requested_segment_num", + * {"job_id"}, + * {300, 1800, 3600} + * ); + * requested_seg_num.put({"13419"}, 1); + */ +class MBvarWindowedAdder { +public: + MBvarWindowedAdder(const std::string& name, const std::initializer_list& dim_names, + std::vector window_seconds, bool expose = true) + : name_(name), + window_seconds_(std::move(window_seconds)), + md_total_(std::list(dim_names)), + expose_(expose) { + if (expose_) { + md_total_.expose(name_ + "_total"); + } + } + + void put(const std::initializer_list& dim_values, int64_t value) { + auto* adder = md_total_.get_stats(std::list(dim_values)); + if (!adder) return; + ensure_windows(dim_values, adder); + *adder << value; + } + + /** Get the current window value for the specified dimension and window index.
*/ + int64_t get_window_value(const std::initializer_list& dim_values, + size_t window_idx) { + std::lock_guard lock(mutex_); + auto it = dims_.find(make_key(dim_values)); + if (it == dims_.end() || window_idx >= it->second.windows.size()) { + return 0; + } + return it->second.windows[window_idx]->get_value(); + } + + /** Overload accepting a pre-built key string (e.g., "job_id,table_id"). */ + int64_t get_window_value(const std::string& dim_key, size_t window_idx) { + std::lock_guard lock(mutex_); + auto it = dims_.find(dim_key); + if (it == dims_.end() || window_idx >= it->second.windows.size()) { + return 0; + } + return it->second.windows[window_idx]->get_value(); + } + + /** List all dimension key strings that have been seen. */ + std::vector list_dimensions() const { + std::lock_guard lock(mutex_); + std::vector result; + result.reserve(dims_.size()); + for (auto& [key, _] : dims_) { + result.push_back(key); + } + return result; + } + + void hide() { + std::lock_guard lock(mutex_); + if (!expose_) { + return; + } + expose_ = false; + md_total_.hide(); + for (auto& [_, entry] : dims_) { + for (auto& window : entry.windows) { + window->hide(); + } + } + } + +private: + struct DimEntry { + bvar::Adder* adder; // owned by MultiDimension + std::vector>>> windows; + }; + + void ensure_windows(const std::initializer_list& dim_values, + bvar::Adder* adder) { + std::string key = make_key(dim_values); + std::lock_guard lock(mutex_); + if (dims_.count(key)) return; + DimEntry entry; + entry.adder = adder; + for (int ws : window_seconds_) { + if (expose_) { + std::string wname = name_ + "_" + std::to_string(ws) + "s_" + key; + entry.windows.emplace_back( + std::make_unique>>(wname, adder, ws)); + } else { + entry.windows.emplace_back( + std::make_unique>>(adder, ws)); + } + } + dims_[key] = std::move(entry); + } + + static std::string make_key(const std::initializer_list& dim_values) { + std::string result; + for (auto& v : dim_values) { + if (!result.empty()) result += 
","; + result += v; + } + return result; + } + + std::string name_; + std::vector window_seconds_; + bvar::MultiDimension> md_total_; + bool expose_; + mutable bthread::Mutex mutex_; + std::map dims_; +}; + +} // namespace doris diff --git a/be/test/cloud/cloud_warm_up_manager_filter_test.cpp b/be/test/cloud/cloud_warm_up_manager_filter_test.cpp new file mode 100644 index 00000000000000..c55d6f49d77711 --- /dev/null +++ b/be/test/cloud/cloud_warm_up_manager_filter_test.cpp @@ -0,0 +1,278 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include +#include +#include +#include +#include + +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_warm_up_manager.h" +#include "cloud/cloud_warmup_metrics.h" +#include "gen_cpp/AgentService_types.h" + +namespace doris { + +class CloudWarmUpManagerFilterTest : public testing::Test { +public: + CloudWarmUpManagerFilterTest() : _engine(CloudStorageEngine(EngineOptions {})) {} + +protected: + CloudStorageEngine _engine; +}; + +static TReplicaInfo make_replica(int64_t backend_id) { + TReplicaInfo replica; + replica.__set_backend_id(backend_id); + replica.__set_host("127.0.0.1"); + replica.__set_brpc_port(8000 + backend_id); + replica.__set_is_alive(true); + return replica; +} + +TEST_F(CloudWarmUpManagerFilterTest, EventDrivenJobFilterNullopt) { + EventDrivenJobFilter filter = std::nullopt; + EXPECT_FALSE(filter.has_value()); +} + +TEST_F(CloudWarmUpManagerFilterTest, EventDrivenJobFilterWithTableIds) { + EventDrivenJobFilter filter = std::unordered_set {100, 200, 300}; + EXPECT_TRUE(filter.has_value()); + EXPECT_EQ(3, filter->size()); + EXPECT_TRUE(filter->count(100) > 0); + EXPECT_TRUE(filter->count(200) > 0); + EXPECT_TRUE(filter->count(300) > 0); + EXPECT_TRUE(filter->count(999) == 0); +} + +TEST_F(CloudWarmUpManagerFilterTest, EventDrivenJobFilterEmpty) { + EventDrivenJobFilter filter = std::unordered_set {}; + EXPECT_TRUE(filter.has_value()); + EXPECT_EQ(0, filter->size()); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventWithoutTableIdsStoresClusterLevelFilter) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1001; + + auto st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, nullptr); + EXPECT_TRUE(st.ok()); + EXPECT_TRUE(manager._tablet_replica_cache.contains(job_id)); + auto filter_it = manager._event_driven_filters.find(job_id); + ASSERT_NE(filter_it, manager._event_driven_filters.end()); + EXPECT_FALSE(filter_it->second.has_value()); +} + +TEST_F(CloudWarmUpManagerFilterTest, 
SetEventWithTableIdsStoresFilter) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1002; + std::vector table_ids = {10, 20, 30}; + + auto st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, &table_ids); + EXPECT_TRUE(st.ok()); + EXPECT_TRUE(manager._tablet_replica_cache.contains(job_id)); + auto filter_it = manager._event_driven_filters.find(job_id); + ASSERT_NE(filter_it, manager._event_driven_filters.end()); + ASSERT_TRUE(filter_it->second.has_value()); + EXPECT_EQ(3, filter_it->second->size()); + EXPECT_TRUE(filter_it->second->contains(10)); + EXPECT_TRUE(filter_it->second->contains(20)); + EXPECT_TRUE(filter_it->second->contains(30)); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventWithEmptyTableIdsStoresEmptyFilter) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1003; + std::vector table_ids = {}; + + auto st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, &table_ids); + EXPECT_TRUE(st.ok()); + auto filter_it = manager._event_driven_filters.find(job_id); + ASSERT_NE(filter_it, manager._event_driven_filters.end()); + ASSERT_TRUE(filter_it->second.has_value()); + EXPECT_TRUE(filter_it->second->empty()); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventClearRemovesFilterAndCache) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1004; + std::vector table_ids = {10, 20}; + + auto st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, &table_ids); + EXPECT_TRUE(st.ok()); + EXPECT_TRUE(manager._tablet_replica_cache.contains(job_id)); + EXPECT_TRUE(manager._event_driven_filters.contains(job_id)); + + st = manager.set_event(job_id, TWarmUpEventType::LOAD, true); + EXPECT_TRUE(st.ok()); + EXPECT_FALSE(manager._tablet_replica_cache.contains(job_id)); + EXPECT_FALSE(manager._event_driven_filters.contains(job_id)); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventUpdateTableIdsReplacesFilter) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1005; + + std::vector initial_ids = {10, 20}; + auto 
st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, &initial_ids); + EXPECT_TRUE(st.ok()); + + std::vector updated_ids = {30, 40, 50}; + st = manager.set_event(job_id, TWarmUpEventType::LOAD, false, &updated_ids); + EXPECT_TRUE(st.ok()); + auto filter_it = manager._event_driven_filters.find(job_id); + ASSERT_NE(filter_it, manager._event_driven_filters.end()); + ASSERT_TRUE(filter_it->second.has_value()); + EXPECT_EQ(3, filter_it->second->size()); + EXPECT_FALSE(filter_it->second->contains(10)); + EXPECT_TRUE(filter_it->second->contains(30)); + EXPECT_TRUE(filter_it->second->contains(40)); + EXPECT_TRUE(filter_it->second->contains(50)); +} + +TEST_F(CloudWarmUpManagerFilterTest, SetEventUnsupportedType) { + CloudWarmUpManager manager(_engine); + int64_t job_id = 1006; + + auto st = manager.set_event(job_id, TWarmUpEventType::QUERY, false, nullptr); + EXPECT_FALSE(st.ok()); +} + +TEST_F(CloudWarmUpManagerFilterTest, GetReplicaInfoAppliesTableFilter) { + CloudWarmUpManager manager(_engine); + int64_t tablet_id = 3001; + auto now = std::chrono::steady_clock::now(); + + manager._tablet_replica_cache[2001][tablet_id] = {now, make_replica(11)}; + manager._event_driven_filters[2001] = std::unordered_set {10}; + + manager._tablet_replica_cache[2002][tablet_id] = {now, make_replica(22)}; + manager._event_driven_filters[2002] = std::unordered_set {20}; + + bool cache_hit = false; + auto replicas = manager.get_replica_info(tablet_id, 20, false, cache_hit); + + ASSERT_EQ(1, replicas.size()); + EXPECT_EQ(22, replicas[0].replica.backend_id); + EXPECT_TRUE(cache_hit); +} + +TEST_F(CloudWarmUpManagerFilterTest, GetReplicaInfoBypassesFilterWhenTableIdUnknown) { + CloudWarmUpManager manager(_engine); + int64_t tablet_id = 3002; + auto now = std::chrono::steady_clock::now(); + + manager._tablet_replica_cache[3001][tablet_id] = {now, make_replica(31)}; + manager._event_driven_filters[3001] = std::unordered_set {10}; + + manager._tablet_replica_cache[3002][tablet_id] = {now, 
make_replica(32)}; + manager._event_driven_filters[3002] = std::unordered_set {20}; + + bool cache_hit = false; + auto replicas = manager.get_replica_info(tablet_id, 0, false, cache_hit); + + ASSERT_EQ(2, replicas.size()); + EXPECT_TRUE(cache_hit); +} + +TEST_F(CloudWarmUpManagerFilterTest, BuildWarmUpRowsetResultReturnsOkWithoutFailures) { + auto st = CloudWarmUpManager::_build_warm_up_rowset_result({}, 2, 4001, 5001, "rowset-1"); + EXPECT_TRUE(st.ok()); +} + +TEST_F(CloudWarmUpManagerFilterTest, BuildWarmUpRowsetResultAggregatesAllFailures) { + std::vector failures = { + {ErrorCode::THRIFT_RPC_ERROR, + "job_id=1, backend_id=11, target=127.0.0.1:8011, status=[THRIFT_RPC_ERROR]rpc one"}, + {ErrorCode::INTERNAL_ERROR, + "job_id=2, backend_id=22, target=127.0.0.1:8022, status=[INTERNAL_ERROR]rpc two"}}; + + auto st = CloudWarmUpManager::_build_warm_up_rowset_result(failures, 3, 4002, 5002, "rowset-2"); + + EXPECT_FALSE(st.ok()); + EXPECT_EQ(ErrorCode::THRIFT_RPC_ERROR, st.code()); + std::string msg = st.to_string_no_stack(); + EXPECT_NE(std::string::npos, msg.find("failed on 2/3 replicas")); + EXPECT_NE(std::string::npos, msg.find("table_id=5002")); + EXPECT_NE(std::string::npos, msg.find("rpc one")); + EXPECT_NE(std::string::npos, msg.find("rpc two")); +} + +TEST_F(CloudWarmUpManagerFilterTest, BuildWarmUpRowsetResultKeepsTableNotFoundRetrySignal) { + std::vector failures = { + {ErrorCode::THRIFT_RPC_ERROR, + "job_id=1, backend_id=11, target=127.0.0.1:8011, status=[THRIFT_RPC_ERROR]rpc one"}, + {ErrorCode::TABLE_NOT_FOUND, + "job_id=2, backend_id=22, target=127.0.0.1:8022, status=[TABLET_MISSING]missing"}}; + + auto st = CloudWarmUpManager::_build_warm_up_rowset_result(failures, 2, 4003, 5003, "rowset-3"); + + EXPECT_FALSE(st.ok()); + EXPECT_TRUE(st.is()); + std::string msg = st.to_string_no_stack(); + EXPECT_NE(std::string::npos, msg.find("table_id=5003")); + EXPECT_NE(std::string::npos, msg.find("rpc one")); + EXPECT_NE(std::string::npos, msg.find("missing")); +} 
+ +TEST_F(CloudWarmUpManagerFilterTest, DownstreamProgressTracksEarliestPendingTrigger) { + g_warmup_ed_downstream_progress_tracker.reset_for_test(); + std::string job_id = "9001"; + + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id, 1000); + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id, 1500); + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id, 1000); + + EXPECT_EQ(1000, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 1000); + EXPECT_EQ(1000, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 1000); + EXPECT_EQ(1500, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 1500); + EXPECT_EQ(1500, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.reset_for_test(); +} + +TEST_F(CloudWarmUpManagerFilterTest, DownstreamProgressFallsBackToLatestFinishedTrigger) { + g_warmup_ed_downstream_progress_tracker.reset_for_test(); + std::string job_id = "9002"; + + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 2000); + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 1800); + EXPECT_EQ(2000, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.record_task_submit(job_id, 1900); + EXPECT_EQ(1900, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + g_warmup_ed_downstream_progress_tracker.record_task_done(job_id, 1900); + EXPECT_EQ(2000, g_warmup_ed_downstream_progress_tracker.get_progress_ts(job_id)); + + auto job_ids = g_warmup_ed_downstream_progress_tracker.list_job_ids(); + EXPECT_NE(job_ids.end(), std::find(job_ids.begin(), job_ids.end(), job_id)); + + g_warmup_ed_downstream_progress_tracker.reset_for_test(); 
+} + +} // namespace doris diff --git a/be/test/cloud/cloud_warm_up_manager_test.cpp b/be/test/cloud/cloud_warm_up_manager_test.cpp index 90ea834e143d82..4ac284b990bf01 100644 --- a/be/test/cloud/cloud_warm_up_manager_test.cpp +++ b/be/test/cloud/cloud_warm_up_manager_test.cpp @@ -138,7 +138,7 @@ TEST_F(CloudWarmUpManagerTest, NonPositiveTimeoutQueuesBackgroundCopyAndReturns) std::atomic returned = false; std::thread caller([&] { - manager.warm_up_rowset(*rs_meta, -1); + manager.warm_up_rowset(*rs_meta, /*table_id=*/0, /*sync_wait_timeout_ms=*/-1); returned = true; }); @@ -206,7 +206,7 @@ TEST_F(CloudWarmUpManagerTest, NonPositiveTimeoutSkipsWarmupWhenAsyncRowsetMetaI }, &warmup_enter_guard); - manager.warm_up_rowset(*rs_meta, -1); + manager.warm_up_rowset(*rs_meta, /*table_id=*/0, /*sync_wait_timeout_ms=*/-1); { std::unique_lock lock(observed_mtx); @@ -261,7 +261,7 @@ TEST_F(CloudWarmUpManagerTest, PositiveTimeoutIgnoresSpuriousWakeupUntilWorkerFi std::atomic returned = false; std::thread caller([&] { - manager.warm_up_rowset(*rs_meta, 1000); + manager.warm_up_rowset(*rs_meta, /*table_id=*/0, /*sync_wait_timeout_ms=*/1000); returned = true; }); diff --git a/be/test/util/bvar_windowed_adder_test.cpp b/be/test/util/bvar_windowed_adder_test.cpp new file mode 100644 index 00000000000000..9306364b4d4694 --- /dev/null +++ b/be/test/util/bvar_windowed_adder_test.cpp @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "util/bvar_windowed_adder.h" + +#include + +#include +#include +#include + +namespace doris { + +TEST(MBvarWindowedAdderTest, PutAndGetTotal) { + // bvar::Window has MAX_SECONDS_LIMIT = 3600, so use values within that limit + MBvarWindowedAdder adder("test_put_get", {"job_id"}, {3600}); + + adder.put({"100"}, 5); + adder.put({"100"}, 3); + + // Window value should reflect accumulated puts + // Note: bvar::Window reports the *per-second average* × window_size in some modes, + // but bvar::Adder-backed windows report the sum of samples within the window. + // The exact value depends on bvar internals and timing, so only verify it is non-negative.
+ int64_t val = adder.get_window_value({"100"}, 0); + EXPECT_GE(val, 0); // Window may need time to accumulate +} + +TEST(MBvarWindowedAdderTest, UnknownDimensionReturnsZero) { + MBvarWindowedAdder adder("test_unknown_dim", {"job_id"}, {3600}); + + EXPECT_EQ(0, adder.get_window_value({"nonexistent"}, 0)); + EXPECT_EQ(0, adder.get_window_value("nonexistent", 0)); +} + +TEST(MBvarWindowedAdderTest, InvalidWindowIndexReturnsZero) { + MBvarWindowedAdder adder("test_invalid_idx", {"job_id"}, {3600}); + + adder.put({"100"}, 1); + + // Window index 1 doesn't exist (only index 0) + EXPECT_EQ(0, adder.get_window_value({"100"}, 1)); + EXPECT_EQ(0, adder.get_window_value({"100"}, 999)); +} + +TEST(MBvarWindowedAdderTest, MultipleDimensions) { + MBvarWindowedAdder adder("test_multi_dim", {"job_id"}, {3600}); + + adder.put({"100"}, 10); + adder.put({"200"}, 20); + adder.put({"300"}, 30); + + auto dims = adder.list_dimensions(); + EXPECT_EQ(3, dims.size()); + + std::sort(dims.begin(), dims.end()); + EXPECT_EQ("100", dims[0]); + EXPECT_EQ("200", dims[1]); + EXPECT_EQ("300", dims[2]); +} + +TEST(MBvarWindowedAdderTest, ListDimensionsEmpty) { + MBvarWindowedAdder adder("test_empty_dims", {"job_id"}, {3600}); + + auto dims = adder.list_dimensions(); + EXPECT_TRUE(dims.empty()); +} + +TEST(MBvarWindowedAdderTest, MultipleWindowSizes) { + // bvar::Window has MAX_SECONDS_LIMIT = 3600, all values must be within this limit + MBvarWindowedAdder adder("test_multi_win", {"job_id"}, {300, 1800, 3600}); + + adder.put({"100"}, 42); + + // All 3 windows should be created (indices 0, 1, 2) + // Values may be 0 due to bvar internal timing, but should not crash + adder.get_window_value({"100"}, 0); + adder.get_window_value({"100"}, 1); + adder.get_window_value({"100"}, 2); + + // Index 3 out of range + EXPECT_EQ(0, adder.get_window_value({"100"}, 3)); +} + +TEST(MBvarWindowedAdderTest, GetWindowValueByStringKey) { + MBvarWindowedAdder adder("test_str_key", {"job_id"}, {3600}); + + 
adder.put({"42"}, 100); + + // String key for single dimension is just the value itself + int64_t val = adder.get_window_value("42", 0); + EXPECT_GE(val, 0); + + // Unknown string key + EXPECT_EQ(0, adder.get_window_value("unknown", 0)); +} + +TEST(MBvarWindowedAdderTest, EnsureWindowsIdempotent) { + MBvarWindowedAdder adder("test_idempotent", {"job_id"}, {3600}); + + // Multiple puts to the same dimension should not create duplicate windows + adder.put({"100"}, 1); + adder.put({"100"}, 2); + adder.put({"100"}, 3); + + auto dims = adder.list_dimensions(); + EXPECT_EQ(1, dims.size()); + EXPECT_EQ("100", dims[0]); +} + +TEST(MBvarWindowedAdderTest, MakeKeyComposite) { + // Test that multi-value dimensions produce comma-separated keys + MBvarWindowedAdder adder("test_composite", {"a", "b"}, {3600}); + + adder.put({"x", "y"}, 1); + + auto dims = adder.list_dimensions(); + EXPECT_EQ(1, dims.size()); + EXPECT_EQ("x,y", dims[0]); + + // Can also query by composite string key + int64_t val = adder.get_window_value("x,y", 0); + EXPECT_GE(val, 0); +} + +} // namespace doris diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index bdd64284c9af27..2b7bb7d8d4a17e 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -3295,6 +3295,21 @@ public static int metaServiceRpcRetryTimes() { @ConfField(mutable = true, masterOnly = true) public static long cloud_warm_up_job_max_bytes_per_batch = 21474836480L; // 20GB + @ConfField(mutable = true, masterOnly = true, description = { + "zh-CN: 定期刷新 table-level warmup 任务匹配的 table ID 集合的时间间隔(毫秒)", + "en: Interval in milliseconds to refresh matched table IDs for table-level warmup jobs"}) + public static long cloud_warm_up_table_filter_refresh_interval_ms = 60000; // 60 seconds + + @ConfField(mutable = true, masterOnly = true, description = { + "zh-CN: 定期从 BE 
拉取主动增量预热 SyncStats 并缓存到 FE job 的时间间隔(毫秒)", + "en: Interval in milliseconds to collect event-driven warmup SyncStats from BEs and cache it in FE jobs"}) + public static long cloud_warm_up_sync_stats_refresh_interval_ms = 15000; // 15 seconds + + @ConfField(mutable = true, masterOnly = true, description = { + "zh-CN: SHOW WARM UP JOB 和 FE 日志中 MatchedTables 最多展示的表数量", + "en: Maximum number of MatchedTables entries displayed in SHOW WARM UP JOB and FE logs"}) + public static int cloud_warm_up_matched_tables_display_limit = 100; + @ConfField(mutable = true, masterOnly = true) public static boolean cloud_warm_up_force_all_partitions = false; diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java index 500f65de153df4..2b4ffa287477ce 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/CacheHotspotManager.java @@ -18,12 +18,14 @@ package org.apache.doris.cloud; import org.apache.doris.catalog.Database; +import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.MaterializedIndex; import org.apache.doris.catalog.MaterializedIndex.IndexExtState; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; import org.apache.doris.catalog.Table; +import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.Tablet; import org.apache.doris.cloud.CloudWarmUpJob.JobState; import org.apache.doris.cloud.CloudWarmUpJob.JobType; @@ -39,7 +41,10 @@ import org.apache.doris.common.ThreadPoolManager; import org.apache.doris.common.Triple; import org.apache.doris.common.util.MasterDaemon; +import org.apache.doris.common.util.NetUtils; import org.apache.doris.common.util.TimeUtils; +import org.apache.doris.httpv2.rest.manager.HttpUtils; +import org.apache.doris.metric.MetricRepo; import 
org.apache.doris.nereids.trees.plans.commands.CancelWarmUpJobCommand; import org.apache.doris.nereids.trees.plans.commands.WarmUpClusterCommand; import org.apache.doris.rpc.RpcException; @@ -52,8 +57,14 @@ import org.apache.doris.thrift.TNetworkAddress; import org.apache.doris.thrift.TStatusCode; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.thrift.TException; @@ -67,6 +78,7 @@ import java.util.Collections; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -78,10 +90,14 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import java.util.stream.Collectors; public class CacheHotspotManager extends MasterDaemon { public static final int MAX_SHOW_ENTRIES = 2000; @@ -103,6 +119,18 @@ public class CacheHotspotManager extends MasterDaemon { private boolean startJobDaemon = false; + private MasterDaemon tableFilterRefreshDaemon; + + private boolean startTableFilterRefreshDaemon = false; + + private MasterDaemon warmUpSyncStatsRefreshDaemon; + + private boolean startWarmUpSyncStatsRefreshDaemon = false; + + // Thread pool for concurrent BE HTTP requests during on-demand stats 
collection + private final ExecutorService warmupStatsExecutor = Executors.newFixedThreadPool(16, + new ThreadFactoryBuilder().setNameFormat("warmup-stats-collector-%d").setDaemon(true).build()); + private ConcurrentMap cloudWarmUpJobs = Maps.newConcurrentMap(); private ConcurrentMap activeCloudWarmUpJobs = Maps.newConcurrentMap(); @@ -116,11 +144,17 @@ private static class JobKey { private final String srcName; private final String dstName; private final CloudWarmUpJob.SyncMode syncMode; + private final String tableFilterExpr; public JobKey(String srcName, String dstName, CloudWarmUpJob.SyncMode syncMode) { + this(srcName, dstName, syncMode, ""); + } + + public JobKey(String srcName, String dstName, CloudWarmUpJob.SyncMode syncMode, String tableFilterExpr) { this.srcName = srcName; this.dstName = dstName; this.syncMode = syncMode; + this.tableFilterExpr = tableFilterExpr == null ? "" : tableFilterExpr; } @Override @@ -134,17 +168,22 @@ public boolean equals(Object o) { JobKey jobKey = (JobKey) o; return Objects.equals(srcName, jobKey.srcName) && Objects.equals(dstName, jobKey.dstName) - && syncMode == jobKey.syncMode; + && syncMode == jobKey.syncMode + && Objects.equals(tableFilterExpr, jobKey.tableFilterExpr); } @Override public int hashCode() { - return Objects.hash(srcName, dstName, syncMode); + return Objects.hash(srcName, dstName, syncMode, tableFilterExpr); } @Override public String toString() { - return "WarmUpJob src='" + srcName + "', dst='" + dstName + "', syncMode=" + String.valueOf(syncMode); + String s = "WarmUpJob src='" + srcName + "', dst='" + dstName + "', syncMode=" + String.valueOf(syncMode); + if (!tableFilterExpr.isEmpty()) { + s += ", tableFilter=" + tableFilterExpr; + } + return s; } } @@ -156,10 +195,12 @@ private void registerJobForRepeatDetection(CloudWarmUpJob job, boolean replay) t if (job.isDone()) { return; } + if (!replay) { + checkLoadEventWarmUpConflict(job); + } if (job.isEventDriven() || job.isPeriodic()) { - // For long lasting 
jobs, i.e. event-driven and periodic. - // It is meaningless to create more than one job for a given src, dst, and syncMode. - JobKey key = new JobKey(job.getSrcClusterName(), job.getDstClusterName(), job.getSyncMode()); + JobKey key = new JobKey(job.getSrcClusterName(), job.getDstClusterName(), + job.getSyncMode(), job.getTableFilterExpr()); boolean added = this.repeatJobDetectionSet.add(key); if (!added && !replay) { throw new AnalysisException(key + " already has a runnable job"); @@ -167,6 +208,106 @@ private void registerJobForRepeatDetection(CloudWarmUpJob job, boolean replay) t } } + // Only checks cross-type conflicts between table-level and cluster-level load-event warm-up jobs. + // Same-type duplicate jobs are still rejected later by repeatJobDetectionSet. + private void checkLoadEventWarmUpConflict(CloudWarmUpJob newJob) throws AnalysisException { + if (!isLoadEventWarmUpJob(newJob)) { + return; + } + + for (CloudWarmUpJob existingJob : runnableCloudWarmUpJobs.values()) { + if (existingJob.getJobId() == newJob.getJobId() || existingJob.isDone() + || !isLoadEventWarmUpJob(existingJob)) { + continue; + } + if (!isSameWarmUpPair(newJob, existingJob)) { + continue; + } + if (isTableLevelLoadEventWarmUpJob(newJob) != isTableLevelLoadEventWarmUpJob(existingJob)) { + throw buildLoadEventWarmUpConflictException(newJob, existingJob); + } + } + } + + public void cancelTableLevelLoadEventWarmUpJobsForVirtualComputeGroup( + String virtualComputeGroupName, String activeComputeGroup, String standbyComputeGroup, + List subComputeGroups, String reason) throws AnalysisException { + String cancelReason = reason + " for virtual compute group '" + virtualComputeGroupName + "'"; + Set computeGroupsInVcg = new HashSet<>(); + if (subComputeGroups != null) { + computeGroupsInVcg.addAll(subComputeGroups); + } + computeGroupsInVcg.add(activeComputeGroup); + computeGroupsInVcg.add(standbyComputeGroup); + + for (CloudWarmUpJob existingJob : runnableCloudWarmUpJobs.values()) { + if 
(existingJob.isDone() || !isTableLevelLoadEventWarmUpJob(existingJob)) { + continue; + } + if (!computeGroupsInVcg.contains(existingJob.getSrcClusterName()) + || !computeGroupsInVcg.contains(existingJob.getDstClusterName())) { + continue; + } + try { + cancel(existingJob.getJobId(), cancelReason); + LOG.info("cancel table-level load-event warm up job {} before virtual compute group '{}' creates " + + "cluster-level load-event warm up job. active compute group {}, " + + "standby compute group {}, source compute group {}, destination compute group {}{}, " + + "reason: {}", + existingJob.getJobId(), virtualComputeGroupName, activeComputeGroup, standbyComputeGroup, + existingJob.getSrcClusterName(), existingJob.getDstClusterName(), + formatExistingTableFilter(existingJob), cancelReason); + } catch (DdlException e) { + throw new AnalysisException("Failed to cancel table-level load-event warm up job " + + existingJob.getJobId() + " before virtual compute group '" + virtualComputeGroupName + + "' creates cluster-level load-event warm up job from active compute group '" + + activeComputeGroup + "' to standby compute group '" + standbyComputeGroup + + "'. Source compute group '" + existingJob.getSrcClusterName() + + "', destination compute group '" + existingJob.getDstClusterName() + "'" + + formatExistingTableFilter(existingJob) + ". 
Cancel table-level load-event warm up job " + + existingJob.getJobId() + " before retrying.", e); + } + } + } + + private static boolean isLoadEventWarmUpJob(CloudWarmUpJob job) { + return job != null && job.isEventDriven() && job.getSyncEvent() == CloudWarmUpJob.SyncEvent.LOAD; + } + + private static boolean isClusterLevelLoadEventWarmUpJob(CloudWarmUpJob job) { + return isLoadEventWarmUpJob(job) && job.getJobType() == JobType.CLUSTER; + } + + private static boolean isTableLevelLoadEventWarmUpJob(CloudWarmUpJob job) { + return isLoadEventWarmUpJob(job) && job.getJobType() == JobType.TABLES; + } + + private static boolean isSameWarmUpPair(CloudWarmUpJob left, CloudWarmUpJob right) { + return Objects.equals(left.getSrcClusterName(), right.getSrcClusterName()) + && Objects.equals(left.getDstClusterName(), right.getDstClusterName()); + } + + private static AnalysisException buildLoadEventWarmUpConflictException( + CloudWarmUpJob newJob, CloudWarmUpJob existingJob) { + String newJobLevel = isTableLevelLoadEventWarmUpJob(newJob) ? "table-level" : "cluster-level"; + String existingJobLevel = isClusterLevelLoadEventWarmUpJob(existingJob) ? "cluster-level" : "table-level"; + return new AnalysisException("Cannot create " + newJobLevel + " load-event warm up job from source " + + "compute group '" + newJob.getSrcClusterName() + "' to destination compute group '" + + newJob.getDstClusterName() + "': conflicting " + existingJobLevel + + " load-event warm up job " + existingJob.getJobId() + + " already exists for the same source and destination" + + formatExistingTableFilter(existingJob) + + ". Cancel existing load-event warm up job " + existingJob.getJobId() + + " before creating this job."); + } + + private static String formatExistingTableFilter(CloudWarmUpJob job) { + if (!job.hasTableFilter()) { + return ""; + } + return " with table filter [" + job.getTableFilterExpr() + "]"; + } + // Tracks warm-up jobs scheduled by CacheHotSpotManager. 
// Ensures that at most one job runs concurrently per destination cluster. private Map clusterToRunningJobId = new ConcurrentHashMap<>(); @@ -236,7 +377,8 @@ public void notifyJobStop(CloudWarmUpJob job) { } if (job.isEventDriven() || job.isPeriodic()) { this.repeatJobDetectionSet.remove(new JobKey( - job.getSrcClusterName(), job.getDstClusterName(), job.getSyncMode())); + job.getSrcClusterName(), job.getDstClusterName(), + job.getSyncMode(), job.getTableFilterExpr())); } } @@ -252,6 +394,17 @@ public void runAfterCatalogReady() { jobDaemon.start(); startJobDaemon = true; } + if (!startTableFilterRefreshDaemon) { + tableFilterRefreshDaemon = new TableFilterRefreshDaemon(); + tableFilterRefreshDaemon.start(); + startTableFilterRefreshDaemon = true; + } + if (Config.isCloudMode() && !startWarmUpSyncStatsRefreshDaemon) { + warmUpSyncStatsRefreshDaemon = new WarmUpSyncStatsRefreshDaemon(); + warmUpSyncStatsRefreshDaemon.start(); + startWarmUpSyncStatsRefreshDaemon = true; + } + if (!tableCreated) { try { @@ -339,6 +492,20 @@ private void triggerBatchInsert() { } } + private void refreshWarmUpSyncStats() { + if (!Env.getCurrentEnv().isMaster()) { + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.emptyList()); + return; + } + + Map statsMap = collectAndAggregate(); + for (CloudWarmUpJob job : cloudWarmUpJobs.values()) { + JobWarmUpStats stats = job.isEventDriven() && !job.isDone() ? 
statsMap.get(job.getJobId()) : null; + job.setSyncStats(stats); + } + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(cloudWarmUpJobs.values()); + } + private void insertIntoTable(String clusterId, long tableId, long indexId, long fileCacheSize, THotPartition partition, Backend backend) { LOG.info("table id {}, index id {}, partition id {}", tableId, indexId, partition.partition_id); @@ -613,7 +780,8 @@ public List> getSingleJobInfo(long jobId) throws AnalysisException if (job == null) { throw new AnalysisException("cloud warm up with job " + jobId + " does not exist"); } - infos.add(job.getJobInfo()); + Map statsMap = collectAndAggregate(); + infos.add(job.getJobInfo(statsMap.get(jobId), true)); return infos; } @@ -634,6 +802,190 @@ public void runAfterCatalogReady() { } } + private class TableFilterRefreshDaemon extends MasterDaemon { + TableFilterRefreshDaemon() { + super("TableFilterRefreshDaemon", Config.cloud_warm_up_table_filter_refresh_interval_ms); + LOG.info("start table filter refresh daemon, interval={}ms", + Config.cloud_warm_up_table_filter_refresh_interval_ms); + } + + @Override + public void runAfterCatalogReady() { + if (getInterval() != Config.cloud_warm_up_table_filter_refresh_interval_ms) { + setInterval(Config.cloud_warm_up_table_filter_refresh_interval_ms); + LOG.info("update table filter refresh daemon interval to {}ms", getInterval()); + } + refreshAllTableFilters(); + } + } + + private class WarmUpSyncStatsRefreshDaemon extends MasterDaemon { + WarmUpSyncStatsRefreshDaemon() { + super("WarmUpSyncStatsRefreshDaemon", Config.cloud_warm_up_sync_stats_refresh_interval_ms); + LOG.info("start warm up sync stats refresh daemon, interval={}ms", + Config.cloud_warm_up_sync_stats_refresh_interval_ms); + } + + @Override + public void runAfterCatalogReady() { + if (getInterval() != Config.cloud_warm_up_sync_stats_refresh_interval_ms) { + setInterval(Config.cloud_warm_up_sync_stats_refresh_interval_ms); + LOG.info("update warm up sync stats refresh
daemon interval to {}ms", getInterval()); + } + refreshWarmUpSyncStats(); + } + } + + + /** + * Collect warmup stats from the alive BEs of every cluster used by a runnable event-driven job and aggregate per-job. + * Called by the SHOW WARM UP JOB paths and by the periodic sync-stats refresh (refreshWarmUpSyncStats). + * + * @return per-job aggregated warmup stats; empty map if no runnable event-driven job or no alive BE exists + */ + private Map collectAndAggregate() { + Map result = new HashMap<>(); + + // 1. Collect all clusters involved in event-driven jobs + Set allClusters = new HashSet<>(); + for (CloudWarmUpJob job : runnableCloudWarmUpJobs.values()) { + if (job.isEventDriven()) { + allClusters.add(job.getSrcClusterName()); + allClusters.add(job.getDstClusterName()); + } + } + if (allClusters.isEmpty()) { + return result; + } + + // 2. Enumerate all (cluster, BE) pairs + List> allTargets = new ArrayList<>(); + for (String cluster : allClusters) { + for (Backend be : getBackendsFromCluster(cluster)) { + if (be.isAlive()) { + allTargets.add(Pair.of(cluster, be)); + } + } + } + if (allTargets.isEmpty()) { + return result; + } + + // 3. Concurrent HTTP requests to all BEs + ExecutorCompletionService> completionService = + new ExecutorCompletionService<>(warmupStatsExecutor); + + // Acquire auth token once for all BE requests (needed when enable_all_http_auth is on) + Map authHeaders = new HashMap<>(); + try { + String token = Env.getCurrentEnv().getTokenManager().acquireToken(); + authHeaders.put("Auth-Token", token); + } catch (Exception e) { + LOG.warn("Failed to acquire auth token for warmup stats collection, " + + "requests may fail if enable_all_http_auth is enabled: {}", e.getMessage()); + } + + for (Pair target : allTargets) { + String cluster = target.first; + Backend be = target.second; + completionService.submit(() -> { + String url = "http://" + + NetUtils.getHostPortInAccessibleFormat(be.getHost(), be.getHttpPort()) + + "/api/warmup_event_driven_stats"; + String json = HttpUtils.doGet(url, authHeaders, 5000); + return Pair.of(cluster, json); + }); + } + + // 4.
Collect results and merge by cluster → jobId + Map> clusterStats = new HashMap<>(); + for (int i = 0; i < allTargets.size(); i++) { + try { + Future> future = completionService.take(); + Pair resultPair = future.get(10, TimeUnit.SECONDS); + String cluster = resultPair.first; + String json = resultPair.second; + Map jobMap = + clusterStats.computeIfAbsent(cluster, k -> new HashMap<>()); + mergeStatsFromJson(jobMap, json); + } catch (Exception e) { + LOG.warn("Failed to collect warmup stats: {}", e.getMessage()); + } + } + + // 5. Aggregate per-job + for (CloudWarmUpJob job : runnableCloudWarmUpJobs.values()) { + if (!job.isEventDriven()) { + continue; + } + JobWarmUpStats stats = aggregateStatsForJob(job, clusterStats); + result.put(job.getJobId(), stats); + } + return result; + } + + /** + * Parse BE JSON response and merge into jobMap. + * JSON structure: data[].{job_id, requested, finish, fail, ...} + */ + private void mergeStatsFromJson( + Map jobMap, String json) { + try { + JsonObject root = JsonParser.parseString(json).getAsJsonObject(); + JsonArray data = root.getAsJsonArray("data"); + if (data == null) { + return; + } + for (JsonElement jobElem : data) { + JsonObject jobObj = jobElem.getAsJsonObject(); + long jobId = jobObj.get("job_id").getAsLong(); + TableWarmUpWindowedStats stats = TableWarmUpWindowedStats.fromJson(jobObj); + jobMap.compute(jobId, (id, existing) -> { + if (existing == null) { + return stats; + } + existing.merge(stats); + return existing; + }); + } + } catch (Exception e) { + LOG.warn("Failed to parse warmup stats JSON: {}", e.getMessage()); + } + } + + /** + * Aggregate per-job stats: from srcCluster take requested, from dstCluster take finished. 
+ */ + @VisibleForTesting + JobWarmUpStats aggregateStatsForJob( + CloudWarmUpJob job, + Map> clusterStats) { + JobWarmUpStats result = new JobWarmUpStats(); + long jobId = job.getJobId(); + String srcCluster = job.getSrcClusterName(); + String dstCluster = job.getDstClusterName(); + + TableWarmUpWindowedStats srcStat = clusterStats + .getOrDefault(srcCluster, Collections.emptyMap()) + .get(jobId); + TableWarmUpWindowedStats dstStat = clusterStats + .getOrDefault(dstCluster, Collections.emptyMap()) + .get(jobId); + + if (srcStat != null) { + result.mergeRequested(srcStat); + } + if (dstStat != null) { + // Target-side progress timestamp is a watermark, not an additive counter. The merge + // keeps the minimum positive watermark across BEs so FE reports the slowest target + // progress for trigger-gap calculation. + result.mergeFinished(dstStat); + } + result.computeGap(); + return result; + } + + private void clearFinishedOrCancelCloudWarmUpJob() { Iterator> iterator = runnableCloudWarmUpJobs.entrySet().iterator(); while (iterator.hasNext()) { @@ -664,22 +1016,34 @@ public CloudWarmUpJob getCloudWarmUpJob(long jobId) { } public List> getAllJobInfos(int limit) { + Map statsMap = collectAndAggregate(); List> infos = Lists.newArrayList(); Collection allJobs = cloudWarmUpJobs.values(); allJobs.stream().sorted(Comparator.comparing(CloudWarmUpJob::getCreateTimeMs).reversed()) .limit(limit).forEach(t -> { - infos.add(t.getJobInfo()); + infos.add(t.getJobInfo(statsMap.get(t.getJobId()), false)); }); return infos; } public void addCloudWarmUpJob(CloudWarmUpJob job) throws AnalysisException { + restoreTableFilterState(job); registerJobForRepeatDetection(job, false); cloudWarmUpJobs.put(job.getJobId(), job); LOG.info("add cloud warm up job {}", job.getJobId()); runnableCloudWarmUpJobs.put(job.getJobId(), job); } + private void restoreTableFilterState(CloudWarmUpJob job) { + if (!job.hasTableFilter()) { + return; + } + job.rebuildOnTablesFilter(); + Map tableIdNames = 
resolveTableIds(job.getOnTablesFilter()); + job.setCurrentTableIdNames(tableIdNames); + logMatchedTables("restored table filter for job " + job.getJobId(), tableIdNames); + } + public List getPartitionsFromTriple(Triple tableTriple) { String dbName = tableTriple.getLeft(); String tableName = tableTriple.getMiddle(); @@ -826,10 +1190,35 @@ public long createJob(WarmUpClusterCommand stmt) throws AnalysisException { } builder.setSyncMode(SyncMode.EVENT_DRIVEN) .setSyncEvent(syncEvent); + + // Handle ON TABLES rules + List onTablesRules = stmt.getOnTablesRules(); + if (onTablesRules != null && !onTablesRules.isEmpty()) { + builder.setJobType(JobType.TABLES); + List persistedRules = new ArrayList<>(); + for (OnTablesFilter.TableFilterRule rule : onTablesRules) { + CloudWarmUpJob.PersistedTableFilterRule pr = new CloudWarmUpJob.PersistedTableFilterRule(); + pr.ruleType = rule.getRuleType().name(); + pr.pattern = rule.getRawPattern(); + persistedRules.add(pr); + } + builder.setTableFilterRules(persistedRules); + } } else { builder.setSyncMode(SyncMode.ONCE); } warmUpJob = builder.build(); + + // For event-driven jobs with ON TABLES, rebuild filter and resolve initial table IDs + if (warmUpJob.hasTableFilter()) { + warmUpJob.rebuildOnTablesFilter(); + Map initialTableIdNames = resolveTableIds(warmUpJob.getOnTablesFilter()); + logMatchedTables("created table filter for job " + jobId, initialTableIdNames); + if (initialTableIdNames.isEmpty()) { + throw new AnalysisException("No tables matched the ON TABLES filter"); + } + warmUpJob.setCurrentTableIdNames(initialTableIdNames); + } } addCloudWarmUpJob(warmUpJob); @@ -858,6 +1247,26 @@ public void cancel(long jobId, String msg) throws DdlException { } } + public void cancelTableFilterJobsForClusterChange(String clusterName, String reason) { + for (CloudWarmUpJob job : runnableCloudWarmUpJobs.values()) { + if (job.isDone() || !job.hasTableFilter()) { + continue; + } + if (!Objects.equals(clusterName, job.getSrcClusterName()) + 
&& !Objects.equals(clusterName, job.getDstClusterName())) { + continue; + } + try { + cancel(job.getJobId(), reason); + LOG.info("cancel table-level cloud warm up job {} because compute group {} changed: {}", + job.getJobId(), clusterName, reason); + } catch (DdlException e) { + LOG.warn("failed to cancel table-level cloud warm up job {} after compute group {} changed", + job.getJobId(), clusterName, e); + } + } + } + private void runCloudWarmUpJob() { runnableCloudWarmUpJobs.values().forEach(cloudWarmUpJob -> { if (cloudWarmUpJob.shouldWait()) { @@ -887,6 +1296,9 @@ public void replayCloudWarmUpJob(CloudWarmUpJob cloudWarmUpJob) throws Exception runnableCloudWarmUpJobs.put(cloudWarmUpJob.getJobId(), cloudWarmUpJob); cloudWarmUpJobs.put(cloudWarmUpJob.getJobId(), cloudWarmUpJob); LOG.info("replay cloud warm up job {}, state {}", cloudWarmUpJob.getJobId(), cloudWarmUpJob.getJobState()); + + restoreTableFilterState(cloudWarmUpJob); + if (cloudWarmUpJob.isDone()) { notifyJobStop(cloudWarmUpJob); } else { @@ -904,4 +1316,67 @@ public void replayCloudWarmUpJob(CloudWarmUpJob cloudWarmUpJob) throws Exception } } + /** + * Resolve glob-based ON TABLES filter to a map of matching table ID → "db.table" name + * by iterating all databases in the internal catalog; only tables where isManagedTable() is true are included. + */ + public Map resolveTableIds(OnTablesFilter filter) { + Map result = new HashMap<>(); + if (filter == null) { + return result; + } + Collection> allDbs = + Env.getCurrentInternalCatalog().getAllDbs(); + for (DatabaseIf dbIf : allDbs) { + String dbName = dbIf.getFullName(); + // Strip everything up to and including the first ':' (e.g. a "default_cluster:" prefix) if present + if (dbName.contains(":")) { + dbName = dbName.substring(dbName.indexOf(':') + 1); + } + Set tableNames = dbIf.getTableNamesOrEmptyWithLock(); + for (String tableName : tableNames) { + TableIf table = dbIf.getTableNullable(tableName); + if (table != null && table.isManagedTable() && filter.shouldWarmUp(dbName, tableName)) { + result.put(table.getId(), dbName + "."
+ tableName); + } + } + } + return result; + } + + private void logMatchedTables(String action, Map tableIdNames) { + String matchedTables = CloudWarmUpJob.formatMatchedTablesForDisplay(tableIdNames.entrySet().stream() + .sorted(Map.Entry.comparingByKey()) + .map(entry -> entry.getKey() + ":" + entry.getValue()) + .collect(Collectors.toList())); + LOG.info("{}: matched_table_count={}, matched_tables=[{}]", + action, tableIdNames.size(), matchedTables); + } + + /** + * Periodically refresh table IDs for all running event-driven jobs with ON TABLES filter. + * Called from the daemon loop to pick up newly created/dropped tables matching glob patterns. + */ + public void refreshAllTableFilters() { + for (CloudWarmUpJob job : runnableCloudWarmUpJobs.values()) { + if (job.isDone() || !job.isEventDriven() || !job.hasTableFilter()) { + continue; + } + try { + Map newTableIdNames = resolveTableIds(job.getOnTablesFilter()); + logMatchedTables("refreshed table filter for job " + job.getJobId(), newTableIdNames); + Set oldTableIds = job.getCurrentTableIds(); + if (!newTableIdNames.equals(job.getCurrentTableIdNames())) { + job.setCurrentTableIdNames(newTableIdNames); + LOG.info("refreshed table filter for job {}: {} -> {} tables", + job.getJobId(), + oldTableIds == null ? 
0 : oldTableIds.size(), + newTableIdNames.size()); + } + } catch (Exception e) { + LOG.warn("failed to refresh table filter for job {}", job.getJobId(), e); + } + } + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/CloudWarmUpJob.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/CloudWarmUpJob.java index ae12107c3ddac4..c4f47ed9269eb6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/CloudWarmUpJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/CloudWarmUpJob.java @@ -46,6 +46,8 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Lists; +import com.google.gson.JsonArray; +import com.google.gson.JsonObject; import com.google.gson.annotations.SerializedName; import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; @@ -55,10 +57,13 @@ import java.io.DataOutput; import java.io.IOException; import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; public class CloudWarmUpJob implements Writable { @@ -77,7 +82,8 @@ public boolean isFinalState() { public enum JobType { CLUSTER, - TABLE; + TABLE, + TABLES; } public enum SyncMode { @@ -139,6 +145,32 @@ public enum SyncEvent { @SerializedName(value = "syncEvent") protected SyncEvent syncEvent; + @SerializedName(value = "tableFilterRules") + protected List tableFilterRules = new ArrayList<>(); + + // Computed from tableFilterRules via canonicalize(); not persisted. + private transient String tableFilterExpr = ""; + private transient OnTablesFilter onTablesFilter; + // Maps table ID → "db.table" qualified name for matched tables. + private transient volatile Map currentTableIdNames = new ConcurrentHashMap<>(); + + // Latest event-driven SyncStats collected by FE background metrics refresh. Not persisted. 
+ private transient volatile JobWarmUpStats syncStats; + + /** + * Serializable rule for GSON persistence. + */ + public static class PersistedTableFilterRule { + @SerializedName("ruleType") + public String ruleType; + @SerializedName("pattern") + public String pattern; + } + + private static final Comparator TABLE_FILTER_RULE_COMPARATOR = + Comparator.comparingInt(CloudWarmUpJob::tableFilterRuleTypeOrder) + .thenComparing(rule -> StringUtils.defaultString(rule.pattern)); + private Map beToClient; private Map beToAddr; @@ -159,6 +191,7 @@ public static class Builder { private SyncMode syncMode = SyncMode.ONCE; private SyncEvent syncEvent; private long syncInterval; + private List tableFilterRules = new ArrayList<>(); public Builder() {} @@ -197,6 +230,11 @@ public Builder setSyncInterval(long syncInterval) { return this; } + public Builder setTableFilterRules(List tableFilterRules) { + this.tableFilterRules = tableFilterRules; + return this; + } + public CloudWarmUpJob build() { if (jobId == 0 || srcClusterName == null || dstClusterName == null || jobType == null || syncMode == null) { throw new IllegalStateException("Missing required fields for CloudWarmUpJob"); @@ -214,6 +252,8 @@ private CloudWarmUpJob(Builder builder) { this.syncMode = builder.syncMode; this.syncEvent = builder.syncEvent; this.syncInterval = builder.syncInterval; + this.tableFilterRules = normalizeTableFilterRules(builder.tableFilterRules); + this.tableFilterExpr = computeTableFilterExpr(); this.createTimeMs = System.currentTimeMillis(); } @@ -273,7 +313,7 @@ public void fetchBeToTabletIdBatches() { if (FeConstants.runningUnitTest) { return; } - if (jobType == JobType.TABLE) { + if (jobType == JobType.TABLE || jobType == JobType.TABLES) { // warm up with table will have to set tablets on creation return; } @@ -338,6 +378,10 @@ public long getCreateTimeMs() { return createTimeMs; } + public long getStartTimeMs() { + return startTimeMs; + } + public String getErrMsg() { return errMsg; } @@ -366,6 
+410,18 @@ public SyncMode getSyncMode() { return syncMode; } + public SyncEvent getSyncEvent() { + return syncEvent; + } + + public JobWarmUpStats getSyncStats() { + return syncStats; + } + + public void setSyncStats(JobWarmUpStats syncStats) { + this.syncStats = syncStats; + } + public String getSyncModeString() { if (syncMode == null) { // For backward compatibility: older FE versions did not set syncMode for jobs, @@ -390,7 +446,11 @@ public String getSyncModeString() { return sb.toString(); } - public List getJobInfo() { + public List getJobInfo(JobWarmUpStats stats) { + return getJobInfo(stats, true); + } + + public List getJobInfo(JobWarmUpStats stats, boolean showDetailedSyncStats) { List info = Lists.newArrayList(); info.add(String.valueOf(jobId)); info.add(srcClusterName); @@ -416,9 +476,42 @@ public List getJobInfo() { ? t.getLeft() + "." + t.getMiddle() : t.getLeft() + "." + t.getMiddle() + "." + t.getRight()) .collect(Collectors.joining(", "))); + info.add(tableFilterExpr == null ? "" : tableFilterExpr); + info.add(getMatchedTablesString()); + // SyncStats: only for event-driven jobs + if (isEventDriven() && stats != null) { + info.add(showDetailedSyncStats ? 
stats.toJsonString() : stats.toSummaryJsonString()); + } else { + info.add(""); + } return info; } + private String getMatchedTablesString() { + if (currentTableIdNames == null || currentTableIdNames.isEmpty()) { + return ""; + } + return formatMatchedTablesForDisplay(currentTableIdNames.values().stream() + .sorted() + .collect(Collectors.toList())); + } + + static String formatMatchedTablesForDisplay(List matchedTables) { + if (matchedTables == null || matchedTables.isEmpty()) { + return ""; + } + int displayLimit = Math.max(0, Config.cloud_warm_up_matched_tables_display_limit); + int shownCount = Math.min(matchedTables.size(), displayLimit); + String result = matchedTables.stream() + .limit(shownCount) + .collect(Collectors.joining(", ")); + if (matchedTables.size() <= displayLimit) { + return result; + } + String truncatedSuffix = "... (truncated, " + shownCount + " of " + matchedTables.size() + " shown)"; + return result.isEmpty() ? truncatedSuffix : result + ", " + truncatedSuffix; + } + public void setJobState(JobState jobState) { this.jobState = jobState; } @@ -477,6 +570,153 @@ public String getSrcClusterName() { return srcClusterName; } + public boolean hasTableFilter() { + return tableFilterRules != null && !tableFilterRules.isEmpty(); + } + + public String getTableFilterExpr() { + return tableFilterExpr; + } + + public List getTableFilterRules() { + return tableFilterRules; + } + + public OnTablesFilter getOnTablesFilter() { + return onTablesFilter; + } + + /** + * Returns the set of currently matched table IDs. + */ + public Set getCurrentTableIds() { + if (currentTableIdNames == null) { + currentTableIdNames = new ConcurrentHashMap<>(); + } + return currentTableIdNames.keySet(); + } + + /** + * Sets the current matched table ID-to-name mapping. 
+ */ + public void setCurrentTableIdNames(Map idNames) { + this.currentTableIdNames = new ConcurrentHashMap<>(idNames); + } + + public Map getCurrentTableIdNames() { + if (currentTableIdNames == null) { + currentTableIdNames = new ConcurrentHashMap<>(); + } + return currentTableIdNames; + } + + /** + * Compute the canonical table filter expression from persisted rules. + * Returns empty string when no table filter rules exist. + */ + private String computeTableFilterExpr() { + List normalizedRules = normalizeTableFilterRules(tableFilterRules); + tableFilterRules = normalizedRules; + if (normalizedRules.isEmpty()) { + return ""; + } + return canonicalizeNormalizedRules(normalizedRules); + } + + /** + * Generate canonical JSON from persisted rules for JobKey dedup and SHOW output. + * Steps: group by type → sort alphabetically → deduplicate → compact JSON. + */ + public static String canonicalize(List rules) { + return canonicalizeNormalizedRules(normalizeTableFilterRules(rules)); + } + + private static String canonicalizeNormalizedRules(List normalizedRules) { + List includes = normalizedRules.stream() + .filter(r -> "INCLUDE".equals(r.ruleType)) + .map(r -> r.pattern) + .collect(Collectors.toList()); + List excludes = normalizedRules.stream() + .filter(r -> "EXCLUDE".equals(r.ruleType)) + .map(r -> r.pattern) + .collect(Collectors.toList()); + + JsonObject json = new JsonObject(); + JsonArray incArr = new JsonArray(); + includes.forEach(incArr::add); + json.add("include", incArr); + if (!excludes.isEmpty()) { + JsonArray excArr = new JsonArray(); + excludes.forEach(excArr::add); + json.add("exclude", excArr); + } + return json.toString(); + } + + /** + * Rebuild the transient OnTablesFilter and tableFilterExpr from persisted tableFilterRules. + * Called after deserialization (EditLog replay, FE restart). 
+ */ + public void rebuildOnTablesFilter() { + if (currentTableIdNames == null) { + currentTableIdNames = new ConcurrentHashMap<>(); + } + if (tableFilterRules == null || tableFilterRules.isEmpty()) { + this.tableFilterRules = new ArrayList<>(); + this.tableFilterExpr = ""; + this.onTablesFilter = null; + return; + } + this.tableFilterExpr = computeTableFilterExpr(); + List rules = tableFilterRules.stream() + .map(r -> new OnTablesFilter.TableFilterRule( + "INCLUDE".equals(r.ruleType) + ? OnTablesFilter.TableFilterRule.RuleType.INCLUDE + : OnTablesFilter.TableFilterRule.RuleType.EXCLUDE, + r.pattern)) + .collect(Collectors.toList()); + this.onTablesFilter = new OnTablesFilter(rules); + } + + private static int tableFilterRuleTypeOrder(PersistedTableFilterRule rule) { + return "INCLUDE".equals(rule.ruleType) ? 0 : 1; + } + + private static String normalizeTableFilterRuleType(String ruleType) { + Preconditions.checkNotNull(ruleType, "table filter rule type cannot be null"); + Preconditions.checkState("INCLUDE".equalsIgnoreCase(ruleType) || "EXCLUDE".equalsIgnoreCase(ruleType), + "Unexpected table filter rule type: %s", ruleType); + return "INCLUDE".equalsIgnoreCase(ruleType) ? 
"INCLUDE" : "EXCLUDE"; + } + + private static PersistedTableFilterRule copyNormalizedTableFilterRule(PersistedTableFilterRule rule) { + PersistedTableFilterRule normalizedRule = new PersistedTableFilterRule(); + normalizedRule.ruleType = normalizeTableFilterRuleType(rule.ruleType); + normalizedRule.pattern = rule.pattern; + return normalizedRule; + } + + private static List normalizeTableFilterRules(List rules) { + if (rules == null || rules.isEmpty()) { + return new ArrayList<>(); + } + List sortedRules = rules.stream() + .map(CloudWarmUpJob::copyNormalizedTableFilterRule) + .sorted(TABLE_FILTER_RULE_COMPARATOR) + .collect(Collectors.toList()); + List normalizedRules = new ArrayList<>(); + String lastRuleKey = null; + for (PersistedTableFilterRule rule : sortedRules) { + String ruleKey = rule.ruleType + "\0" + StringUtils.defaultString(rule.pattern); + if (ruleKey.equals(lastRuleKey)) { + continue; + } + normalizedRules.add(rule); + lastRuleKey = ruleKey; + } + return normalizedRules; + } + public synchronized void run() { if (isTimeout()) { cancel("Timeout", false); @@ -741,8 +981,13 @@ private void runEventDrivenJob() throws Exception { throw new IllegalArgumentException("Unknown SyncEvent " + syncEvent); } request.setEvent(event); - LOG.debug("send warm up request to BE {} ({}). job_id={}, event={}, request_type=SET_JOB(EVENT)", - entry.getKey(), getBackendEndpoint(entry.getKey()), jobId, syncEvent); + if (hasTableFilter()) { + request.setTableIds(new ArrayList<>(getCurrentTableIds())); + } + LOG.debug("send warm up request to BE {} ({}). job_id={}, event={}, " + + "request_type=SET_JOB(EVENT), table_ids_count={}", + entry.getKey(), getBackendEndpoint(entry.getKey()), jobId, syncEvent, + hasTableFilter() ? 
getCurrentTableIdNames().size() : "all"); TWarmUpTabletsResponse response = entry.getValue().warmUpTablets(request); if (response.getStatus().getStatusCode() != TStatusCode.OK) { if (!response.getStatus().getErrorMsgs().isEmpty()) { @@ -753,6 +998,7 @@ private void runEventDrivenJob() throws Exception { } } } catch (Exception e) { + errMsg = e.getMessage(); LOG.warn("send warm up request job_id={} failed with exception {}", jobId, e); } finally { @@ -901,6 +1147,8 @@ public void write(DataOutput out) throws IOException { public static CloudWarmUpJob read(DataInput in) throws IOException { String json = Text.readString(in); - return GsonUtils.GSON.fromJson(json, CloudWarmUpJob.class); + CloudWarmUpJob job = GsonUtils.GSON.fromJson(json, CloudWarmUpJob.class); + job.rebuildOnTablesFilter(); + return job; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/JobWarmUpStats.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/JobWarmUpStats.java new file mode 100644 index 00000000000000..cdb293216a1b54 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/JobWarmUpStats.java @@ -0,0 +1,285 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.cloud; + +import org.apache.doris.monitor.unit.ByteSizeValue; + +import com.google.gson.JsonObject; + +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; + +/** + * Per-Job aggregated warmup statistics. + * Aggregates requested (from source cluster) and finished/failed (from target cluster) + * across all matched tables, then computes gap = requested - finished. + */ +public class JobWarmUpStats { + private static final DateTimeFormatter TIME_FMT = DateTimeFormatter.ofPattern("HH:mm:ss"); + + // Aggregated requested + public long requestedSegmentNum5m; + public long requestedSegmentNum30m; + public long requestedSegmentNum1h; + public long requestedSegmentSize5m; + public long requestedSegmentSize30m; + public long requestedSegmentSize1h; + public long requestedIndexNum5m; + public long requestedIndexNum30m; + public long requestedIndexNum1h; + public long requestedIndexSize5m; + public long requestedIndexSize30m; + public long requestedIndexSize1h; + public long lastTriggerTs; + + // Aggregated finished + public long finishSegmentNum5m; + public long finishSegmentNum30m; + public long finishSegmentNum1h; + public long finishSegmentSize5m; + public long finishSegmentSize30m; + public long finishSegmentSize1h; + public long finishIndexNum5m; + public long finishIndexNum30m; + public long finishIndexNum1h; + public long finishIndexSize5m; + public long finishIndexSize30m; + public long finishIndexSize1h; + + // Aggregated failed + public long failSegmentNum5m; + public long failSegmentNum30m; + public long failSegmentNum1h; + public long failSegmentSize5m; + public long failSegmentSize30m; + public long failSegmentSize1h; + public long failIndexNum5m; + public long failIndexNum30m; + public long failIndexNum1h; + public long failIndexSize5m; + public long failIndexSize30m; + public long failIndexSize1h; + public long lastFinishTs; + // Aggregated from target BEs. 
FE takes the minimum positive target progress watermark so the + // slowest target BE decides how far the job has caught up to source-side triggers. + public long progressTriggerTs; + + // gap = requested - finished + public long gapSegmentNum5m; + public long gapSegmentNum30m; + public long gapSegmentNum1h; + public long gapSegmentSize5m; + public long gapSegmentSize30m; + public long gapSegmentSize1h; + public long gapIndexNum5m; + public long gapIndexNum30m; + public long gapIndexNum1h; + public long gapIndexSize5m; + public long gapIndexSize30m; + public long gapIndexSize1h; + // Source last trigger timestamp minus target progress watermark. A caught-up target reports its + // latest finished trigger as progress, so this value naturally becomes 0. + public long triggerGapMs; + + /** Accumulate requested stats from a table in the source cluster. */ + public void mergeRequested(TableWarmUpWindowedStats t) { + requestedSegmentNum5m += t.requestedSegmentNum5m; + requestedSegmentNum30m += t.requestedSegmentNum30m; + requestedSegmentNum1h += t.requestedSegmentNum1h; + requestedSegmentSize5m += t.requestedSegmentSize5m; + requestedSegmentSize30m += t.requestedSegmentSize30m; + requestedSegmentSize1h += t.requestedSegmentSize1h; + requestedIndexNum5m += t.requestedIndexNum5m; + requestedIndexNum30m += t.requestedIndexNum30m; + requestedIndexNum1h += t.requestedIndexNum1h; + requestedIndexSize5m += t.requestedIndexSize5m; + requestedIndexSize30m += t.requestedIndexSize30m; + requestedIndexSize1h += t.requestedIndexSize1h; + lastTriggerTs = Math.max(lastTriggerTs, t.lastTriggerTs); + } + + /** Accumulate finished/failed stats from a table in the target cluster. 
*/ + public void mergeFinished(TableWarmUpWindowedStats t) { + finishSegmentNum5m += t.finishSegmentNum5m; + finishSegmentNum30m += t.finishSegmentNum30m; + finishSegmentNum1h += t.finishSegmentNum1h; + finishSegmentSize5m += t.finishSegmentSize5m; + finishSegmentSize30m += t.finishSegmentSize30m; + finishSegmentSize1h += t.finishSegmentSize1h; + finishIndexNum5m += t.finishIndexNum5m; + finishIndexNum30m += t.finishIndexNum30m; + finishIndexNum1h += t.finishIndexNum1h; + finishIndexSize5m += t.finishIndexSize5m; + finishIndexSize30m += t.finishIndexSize30m; + finishIndexSize1h += t.finishIndexSize1h; + failSegmentNum5m += t.failSegmentNum5m; + failSegmentNum30m += t.failSegmentNum30m; + failSegmentNum1h += t.failSegmentNum1h; + failSegmentSize5m += t.failSegmentSize5m; + failSegmentSize30m += t.failSegmentSize30m; + failSegmentSize1h += t.failSegmentSize1h; + failIndexNum5m += t.failIndexNum5m; + failIndexNum30m += t.failIndexNum30m; + failIndexNum1h += t.failIndexNum1h; + failIndexSize5m += t.failIndexSize5m; + failIndexSize30m += t.failIndexSize30m; + failIndexSize1h += t.failIndexSize1h; + lastFinishTs = Math.max(lastFinishTs, t.lastFinishTs); + progressTriggerTs = minPositive(progressTriggerTs, t.progressTriggerTs); + } + + /** Compute gap = requested - finished for all window/metric combinations. 
*/ + public void computeGap() { + gapSegmentNum5m = requestedSegmentNum5m - finishSegmentNum5m; + gapSegmentNum30m = requestedSegmentNum30m - finishSegmentNum30m; + gapSegmentNum1h = requestedSegmentNum1h - finishSegmentNum1h; + gapSegmentSize5m = requestedSegmentSize5m - finishSegmentSize5m; + gapSegmentSize30m = requestedSegmentSize30m - finishSegmentSize30m; + gapSegmentSize1h = requestedSegmentSize1h - finishSegmentSize1h; + gapIndexNum5m = requestedIndexNum5m - finishIndexNum5m; + gapIndexNum30m = requestedIndexNum30m - finishIndexNum30m; + gapIndexNum1h = requestedIndexNum1h - finishIndexNum1h; + gapIndexSize5m = requestedIndexSize5m - finishIndexSize5m; + gapIndexSize30m = requestedIndexSize30m - finishIndexSize30m; + gapIndexSize1h = requestedIndexSize1h - finishIndexSize1h; + triggerGapMs = lastTriggerTs > 0 && progressTriggerTs > 0 + ? Math.max(0, lastTriggerTs - progressTriggerTs) : 0; + } + + /** Serialize compact 30m SyncStats summary for SHOW WARM UP JOB list output. */ + public String toSummaryJsonString() { + JsonObject root = new JsonObject(); + root.addProperty("window", "30m"); + long srcSize = requestedSegmentSize30m + requestedIndexSize30m; + long dstSize = finishSegmentSize30m + finishIndexSize30m; + root.addProperty("src_size", humanReadableSize(srcSize)); + root.addProperty("dst_size", humanReadableSize(dstSize)); + root.addProperty("gap_size", humanReadableSize(srcSize - dstSize)); + // Compact SHOW WARM UP JOB output still exposes the active incremental warm-up time lag. + root.addProperty("trigger_gap_ms", triggerGapMs); + return root.toString(); + } + + /** Serialize detailed SyncStats JSON for SHOW WARM UP JOB WHERE ID = ... output. 
*/ + public String toJsonString() { + JsonObject root = new JsonObject(); + + // seg_num + JsonObject segNum = new JsonObject(); + segNum.addProperty("requested_5m", requestedSegmentNum5m); + segNum.addProperty("finish_5m", finishSegmentNum5m); + segNum.addProperty("gap_5m", gapSegmentNum5m); + segNum.addProperty("fail_5m", failSegmentNum5m); + segNum.addProperty("requested_30m", requestedSegmentNum30m); + segNum.addProperty("finish_30m", finishSegmentNum30m); + segNum.addProperty("gap_30m", gapSegmentNum30m); + segNum.addProperty("fail_30m", failSegmentNum30m); + segNum.addProperty("requested_1h", requestedSegmentNum1h); + segNum.addProperty("finish_1h", finishSegmentNum1h); + segNum.addProperty("gap_1h", gapSegmentNum1h); + segNum.addProperty("fail_1h", failSegmentNum1h); + root.add("seg_num", segNum); + + // seg_size + JsonObject segSize = new JsonObject(); + segSize.addProperty("requested_5m", humanReadableSize(requestedSegmentSize5m)); + segSize.addProperty("finish_5m", humanReadableSize(finishSegmentSize5m)); + segSize.addProperty("gap_5m", humanReadableSize(gapSegmentSize5m)); + segSize.addProperty("fail_5m", humanReadableSize(failSegmentSize5m)); + segSize.addProperty("requested_30m", humanReadableSize(requestedSegmentSize30m)); + segSize.addProperty("finish_30m", humanReadableSize(finishSegmentSize30m)); + segSize.addProperty("gap_30m", humanReadableSize(gapSegmentSize30m)); + segSize.addProperty("fail_30m", humanReadableSize(failSegmentSize30m)); + segSize.addProperty("requested_1h", humanReadableSize(requestedSegmentSize1h)); + segSize.addProperty("finish_1h", humanReadableSize(finishSegmentSize1h)); + segSize.addProperty("gap_1h", humanReadableSize(gapSegmentSize1h)); + segSize.addProperty("fail_1h", humanReadableSize(failSegmentSize1h)); + root.add("seg_size", segSize); + + // idx_num + JsonObject idxNum = new JsonObject(); + idxNum.addProperty("requested_5m", requestedIndexNum5m); + idxNum.addProperty("finish_5m", finishIndexNum5m); + 
idxNum.addProperty("gap_5m", gapIndexNum5m); + idxNum.addProperty("fail_5m", failIndexNum5m); + idxNum.addProperty("requested_30m", requestedIndexNum30m); + idxNum.addProperty("finish_30m", finishIndexNum30m); + idxNum.addProperty("gap_30m", gapIndexNum30m); + idxNum.addProperty("fail_30m", failIndexNum30m); + idxNum.addProperty("requested_1h", requestedIndexNum1h); + idxNum.addProperty("finish_1h", finishIndexNum1h); + idxNum.addProperty("gap_1h", gapIndexNum1h); + idxNum.addProperty("fail_1h", failIndexNum1h); + root.add("idx_num", idxNum); + + // idx_size + JsonObject idxSize = new JsonObject(); + idxSize.addProperty("requested_5m", humanReadableSize(requestedIndexSize5m)); + idxSize.addProperty("finish_5m", humanReadableSize(finishIndexSize5m)); + idxSize.addProperty("gap_5m", humanReadableSize(gapIndexSize5m)); + idxSize.addProperty("fail_5m", humanReadableSize(failIndexSize5m)); + idxSize.addProperty("requested_30m", humanReadableSize(requestedIndexSize30m)); + idxSize.addProperty("finish_30m", humanReadableSize(finishIndexSize30m)); + idxSize.addProperty("gap_30m", humanReadableSize(gapIndexSize30m)); + idxSize.addProperty("fail_30m", humanReadableSize(failIndexSize30m)); + idxSize.addProperty("requested_1h", humanReadableSize(requestedIndexSize1h)); + idxSize.addProperty("finish_1h", humanReadableSize(finishIndexSize1h)); + idxSize.addProperty("gap_1h", humanReadableSize(gapIndexSize1h)); + idxSize.addProperty("fail_1h", humanReadableSize(failIndexSize1h)); + root.add("idx_size", idxSize); + + // timestamps + root.addProperty("last_trigger_ts", formatEpochMs(lastTriggerTs)); + root.addProperty("last_finish_ts", formatEpochMs(lastFinishTs)); + root.addProperty("progress_trigger_ts", formatEpochMs(progressTriggerTs)); + root.addProperty("trigger_gap_ms", triggerGapMs); + + return root.toString(); + } + + private static long minPositive(long current, long candidate) { + if (current <= 0) { + return Math.max(candidate, 0); + } + if (candidate <= 0) { + return 
current; + } + return Math.min(current, candidate); + } + + private static String humanReadableSize(long bytes) { + if (bytes < 0) { + return "-" + new ByteSizeValue(-bytes).toString(); + } + return new ByteSizeValue(bytes).toString(); + } + + private static String formatEpochMs(long epochMs) { + if (epochMs <= 0) { + return ""; + } + try { + return LocalDateTime.ofInstant(Instant.ofEpochMilli(epochMs), ZoneId.systemDefault()) + .format(TIME_FMT); + } catch (Exception e) { + return ""; + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/OnTablesFilter.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/OnTablesFilter.java new file mode 100644 index 00000000000000..f0aaef8c7de891 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/OnTablesFilter.java @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * ON TABLES clause filter for table-level event-driven warmup. + * + * Semantics: INCLUDE union − EXCLUDE union. 
/**
 * Filter built from the ON TABLES clause of a table-level event-driven warm-up job.
 *
 * A table passes when its "db.table" name matches at least one INCLUDE rule and matches
 * no EXCLUDE rule.
 */
public class OnTablesFilter {

    /**
     * One INCLUDE or EXCLUDE rule; the glob pattern is compiled to a Java regex once, at
     * construction time.
     */
    public static class TableFilterRule {
        public enum RuleType {
            INCLUDE,
            EXCLUDE
        }

        // Characters that are copied into the regex with a backslash in front of them.
        private static final String ESCAPED_CHARS = ".()[]{}\\^$|+";

        private final RuleType ruleType;
        private final String rawPattern;
        private final Pattern compiledPattern;

        public TableFilterRule(RuleType ruleType, String globPattern) {
            this.ruleType = ruleType;
            this.rawPattern = globPattern;
            this.compiledPattern = compileGlob(globPattern);
        }

        /**
         * Translate a glob into an anchored regex: '*' becomes ".*", '?' becomes ".",
         * the characters in ESCAPED_CHARS are backslash-escaped, everything else is copied as is.
         */
        private static Pattern compileGlob(String glob) {
            StringBuilder regex = new StringBuilder("^");
            for (int i = 0; i < glob.length(); i++) {
                char ch = glob.charAt(i);
                if (ch == '*') {
                    regex.append(".*");
                } else if (ch == '?') {
                    regex.append(".");
                } else {
                    if (ESCAPED_CHARS.indexOf(ch) >= 0) {
                        regex.append('\\');
                    }
                    regex.append(ch);
                }
            }
            regex.append("$");
            return Pattern.compile(regex.toString());
        }

        /** Whether the whole of {@code fullTableName} matches this rule's pattern. */
        public boolean matches(String fullTableName) {
            return compiledPattern.matcher(fullTableName).matches();
        }

        public RuleType getRuleType() {
            return ruleType;
        }

        public String getRawPattern() {
            return rawPattern;
        }
    }

    private final List<TableFilterRule> includeRules;
    private final List<TableFilterRule> excludeRules;

    /** Split {@code rules} into INCLUDE rules and all remaining rules, keeping input order. */
    public OnTablesFilter(List<TableFilterRule> rules) {
        this.includeRules = Collections.unmodifiableList(rules.stream()
                .filter(rule -> rule.getRuleType() == TableFilterRule.RuleType.INCLUDE)
                .collect(Collectors.toList()));
        this.excludeRules = Collections.unmodifiableList(rules.stream()
                .filter(rule -> rule.getRuleType() != TableFilterRule.RuleType.INCLUDE)
                .collect(Collectors.toList()));
    }

    /**
     * Decide whether "dbName.tableName" should be warmed up: it must match some INCLUDE rule
     * and must not match any EXCLUDE rule.
     */
    public boolean shouldWarmUp(String dbName, String tableName) {
        String qualifiedName = dbName + "." + tableName;
        return matchesAny(includeRules, qualifiedName) && !matchesAny(excludeRules, qualifiedName);
    }

    private static boolean matchesAny(List<TableFilterRule> candidates, String qualifiedName) {
        for (TableFilterRule candidate : candidates) {
            if (candidate.matches(qualifiedName)) {
                return true;
            }
        }
        return false;
    }

    public List<TableFilterRule> getIncludeRules() {
        return includeRules;
    }

    public List<TableFilterRule> getExcludeRules() {
        return excludeRules;
    }

    /**
     * Return a new list holding the include rules followed by the exclude rules.
     */
    public List<TableFilterRule> getAllRules() {
        List<TableFilterRule> combined = new ArrayList<>(includeRules);
        combined.addAll(excludeRules);
        return combined;
    }

    /**
     * Render the raw include and exclude patterns for logging.
     */
    @Override
    public String toString() {
        return "OnTablesFilter{include=" + joinPatterns(includeRules)
                + ", exclude=" + joinPatterns(excludeRules) + "}";
    }

    private static String joinPatterns(List<TableFilterRule> rules) {
        StringBuilder joined = new StringBuilder();
        for (TableFilterRule rule : rules) {
            if (joined.length() > 0) {
                joined.append(", ");
            }
            joined.append(rule.getRawPattern());
        }
        return joined.toString();
    }
}
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import com.google.gson.JsonObject; + +/** + * Per-job windowed warmup statistics collected from a single BE. + * Contains requested, finish, and fail counters for segments and indexes + * across 3 time windows (5m, 30m, 1h). + */ +public class TableWarmUpWindowedStats { + + // requested (source BE populates these) + public long requestedSegmentNum5m; + public long requestedSegmentNum30m; + public long requestedSegmentNum1h; + public long requestedSegmentSize5m; + public long requestedSegmentSize30m; + public long requestedSegmentSize1h; + public long requestedIndexNum5m; + public long requestedIndexNum30m; + public long requestedIndexNum1h; + public long requestedIndexSize5m; + public long requestedIndexSize30m; + public long requestedIndexSize1h; + public long lastTriggerTs; + // Target BE progress watermark carried from source BE trigger time. Pending downloads use the + // earliest unfinished trigger time; when no downloads are pending, BE reports the latest + // finished trigger time. 
+ public long progressTriggerTs; + + // finish (target BE populates these) + public long finishSegmentNum5m; + public long finishSegmentNum30m; + public long finishSegmentNum1h; + public long finishSegmentSize5m; + public long finishSegmentSize30m; + public long finishSegmentSize1h; + public long finishIndexNum5m; + public long finishIndexNum30m; + public long finishIndexNum1h; + public long finishIndexSize5m; + public long finishIndexSize30m; + public long finishIndexSize1h; + public long lastFinishTs; + + // fail (target BE populates these) + public long failSegmentNum5m; + public long failSegmentNum30m; + public long failSegmentNum1h; + public long failSegmentSize5m; + public long failSegmentSize30m; + public long failSegmentSize1h; + public long failIndexNum5m; + public long failIndexNum30m; + public long failIndexNum1h; + public long failIndexSize5m; + public long failIndexSize30m; + public long failIndexSize1h; + + /** + * Parse from BE JSON response. + * JSON hierarchy: {requested|finish|fail}.{seg|idx}.{num|size}.{5m|30m|1h} + */ + public static TableWarmUpWindowedStats fromJson(JsonObject obj) { + TableWarmUpWindowedStats s = new TableWarmUpWindowedStats(); + + JsonObject req = obj.getAsJsonObject("requested"); + if (req != null) { + s.requestedSegmentNum5m = getWindow(req, "seg", "num", "5m"); + s.requestedSegmentNum30m = getWindow(req, "seg", "num", "30m"); + s.requestedSegmentNum1h = getWindow(req, "seg", "num", "1h"); + s.requestedSegmentSize5m = getWindow(req, "seg", "size", "5m"); + s.requestedSegmentSize30m = getWindow(req, "seg", "size", "30m"); + s.requestedSegmentSize1h = getWindow(req, "seg", "size", "1h"); + s.requestedIndexNum5m = getWindow(req, "idx", "num", "5m"); + s.requestedIndexNum30m = getWindow(req, "idx", "num", "30m"); + s.requestedIndexNum1h = getWindow(req, "idx", "num", "1h"); + s.requestedIndexSize5m = getWindow(req, "idx", "size", "5m"); + s.requestedIndexSize30m = getWindow(req, "idx", "size", "30m"); + s.requestedIndexSize1h = 
getWindow(req, "idx", "size", "1h"); + } + + JsonObject fin = obj.getAsJsonObject("finish"); + if (fin != null) { + s.finishSegmentNum5m = getWindow(fin, "seg", "num", "5m"); + s.finishSegmentNum30m = getWindow(fin, "seg", "num", "30m"); + s.finishSegmentNum1h = getWindow(fin, "seg", "num", "1h"); + s.finishSegmentSize5m = getWindow(fin, "seg", "size", "5m"); + s.finishSegmentSize30m = getWindow(fin, "seg", "size", "30m"); + s.finishSegmentSize1h = getWindow(fin, "seg", "size", "1h"); + s.finishIndexNum5m = getWindow(fin, "idx", "num", "5m"); + s.finishIndexNum30m = getWindow(fin, "idx", "num", "30m"); + s.finishIndexNum1h = getWindow(fin, "idx", "num", "1h"); + s.finishIndexSize5m = getWindow(fin, "idx", "size", "5m"); + s.finishIndexSize30m = getWindow(fin, "idx", "size", "30m"); + s.finishIndexSize1h = getWindow(fin, "idx", "size", "1h"); + } + + JsonObject fail = obj.getAsJsonObject("fail"); + if (fail != null) { + s.failSegmentNum5m = getWindow(fail, "seg", "num", "5m"); + s.failSegmentNum30m = getWindow(fail, "seg", "num", "30m"); + s.failSegmentNum1h = getWindow(fail, "seg", "num", "1h"); + s.failSegmentSize5m = getWindow(fail, "seg", "size", "5m"); + s.failSegmentSize30m = getWindow(fail, "seg", "size", "30m"); + s.failSegmentSize1h = getWindow(fail, "seg", "size", "1h"); + s.failIndexNum5m = getWindow(fail, "idx", "num", "5m"); + s.failIndexNum30m = getWindow(fail, "idx", "num", "30m"); + s.failIndexNum1h = getWindow(fail, "idx", "num", "1h"); + s.failIndexSize5m = getWindow(fail, "idx", "size", "5m"); + s.failIndexSize30m = getWindow(fail, "idx", "size", "30m"); + s.failIndexSize1h = getWindow(fail, "idx", "size", "1h"); + } + + s.lastTriggerTs = obj.has("last_trigger_ts") ? obj.get("last_trigger_ts").getAsLong() : 0; + s.lastFinishTs = obj.has("last_finish_ts") ? obj.get("last_finish_ts").getAsLong() : 0; + s.progressTriggerTs = obj.has("progress_trigger_ts") + ? 
obj.get("progress_trigger_ts").getAsLong() : 0; + return s; + } + + private static long getWindow(JsonObject parent, String type, String metric, String window) { + JsonObject typeObj = parent.getAsJsonObject(type); + if (typeObj == null) { + return 0; + } + JsonObject metricObj = typeObj.getAsJsonObject(metric); + if (metricObj == null) { + return 0; + } + return metricObj.has(window) ? metricObj.get(window).getAsLong() : 0; + } + + /** Merge stats from another BE in the same cluster (additive for counts, max for timestamps). */ + public void merge(TableWarmUpWindowedStats other) { + requestedSegmentNum5m += other.requestedSegmentNum5m; + requestedSegmentNum30m += other.requestedSegmentNum30m; + requestedSegmentNum1h += other.requestedSegmentNum1h; + requestedSegmentSize5m += other.requestedSegmentSize5m; + requestedSegmentSize30m += other.requestedSegmentSize30m; + requestedSegmentSize1h += other.requestedSegmentSize1h; + requestedIndexNum5m += other.requestedIndexNum5m; + requestedIndexNum30m += other.requestedIndexNum30m; + requestedIndexNum1h += other.requestedIndexNum1h; + requestedIndexSize5m += other.requestedIndexSize5m; + requestedIndexSize30m += other.requestedIndexSize30m; + requestedIndexSize1h += other.requestedIndexSize1h; + + finishSegmentNum5m += other.finishSegmentNum5m; + finishSegmentNum30m += other.finishSegmentNum30m; + finishSegmentNum1h += other.finishSegmentNum1h; + finishSegmentSize5m += other.finishSegmentSize5m; + finishSegmentSize30m += other.finishSegmentSize30m; + finishSegmentSize1h += other.finishSegmentSize1h; + finishIndexNum5m += other.finishIndexNum5m; + finishIndexNum30m += other.finishIndexNum30m; + finishIndexNum1h += other.finishIndexNum1h; + finishIndexSize5m += other.finishIndexSize5m; + finishIndexSize30m += other.finishIndexSize30m; + finishIndexSize1h += other.finishIndexSize1h; + + failSegmentNum5m += other.failSegmentNum5m; + failSegmentNum30m += other.failSegmentNum30m; + failSegmentNum1h += other.failSegmentNum1h; + 
failSegmentSize5m += other.failSegmentSize5m; + failSegmentSize30m += other.failSegmentSize30m; + failSegmentSize1h += other.failSegmentSize1h; + failIndexNum5m += other.failIndexNum5m; + failIndexNum30m += other.failIndexNum30m; + failIndexNum1h += other.failIndexNum1h; + failIndexSize5m += other.failIndexSize5m; + failIndexSize30m += other.failIndexSize30m; + failIndexSize1h += other.failIndexSize1h; + + lastTriggerTs = Math.max(lastTriggerTs, other.lastTriggerTs); + lastFinishTs = Math.max(lastFinishTs, other.lastFinishTs); + progressTriggerTs = minPositive(progressTriggerTs, other.progressTriggerTs); + } + + private static long minPositive(long current, long candidate) { + if (current <= 0) { + return Math.max(candidate, 0); + } + if (candidate <= 0) { + return current; + } + return Math.min(current, candidate); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java index 338d619604f372..0445d48545cdcb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java @@ -146,14 +146,16 @@ private void checkToDelCluster(Map remoteClusterIdToPB, Set toDel = - new ArrayList<>(finalClusterIdToBackend.getOrDefault(delId, new ArrayList<>())); - cloudSystemInfoService.updateCloudBackends(new ArrayList<>(), toDel); - // del clusterName String delClusterName = cloudSystemInfoService.getClusterNameByClusterId(delId); if (delClusterName.isEmpty()) { return; } + ((CloudEnv) Env.getCurrentEnv()).getCacheHotspotMgr().cancelTableFilterJobsForClusterChange( + delClusterName, "system cancel: compute group " + delClusterName + " dropped"); + List toDel = + new ArrayList<>(finalClusterIdToBackend.getOrDefault(delId, new ArrayList<>())); + cloudSystemInfoService.updateCloudBackends(new ArrayList<>(), toDel); + // del clusterName // del 
clusterID MetricRepo.unregisterCloudMetrics(delId, delClusterName, toDel); cloudSystemInfoService.dropCluster(delId, delClusterName); @@ -262,6 +264,9 @@ private void checkDiffNode(Map remoteClusterIdToPB, LOG.info("cluster_name corresponding to cluster_id has been changed," + " cluster_id : {} , current_cluster_name : {}, new_cluster_name :{}", cid, currentClusterName, newClusterName); + ((CloudEnv) Env.getCurrentEnv()).getCacheHotspotMgr().cancelTableFilterJobsForClusterChange( + currentClusterName, "system cancel: compute group " + currentClusterName + + " renamed to " + newClusterName); // change all be's cluster_name currentBes.forEach(b -> b.setCloudClusterName(newClusterName)); // update clusterNameToId diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java index 90c8ea42573504..4b3b7cf79b735d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java +++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java @@ -284,8 +284,12 @@ private void syncFileCacheTasksForVirtualGroup(Cloud.ClusterPB virtualGroupInMs, if (virtualGroupInFe.isNeedRebuildFileCache()) { String srcCg = virtualGroupInFe.getActiveComputeGroup(); String dstCg = virtualGroupInFe.getStandbyComputeGroup(); - cancelCacheJobs(virtualGroupInFe, jobIdsInMs); try { + cacheHotspotManager.cancelTableLevelLoadEventWarmUpJobsForVirtualComputeGroup( + virtualGroupInFe.getName(), srcCg, dstCg, virtualGroupInFe.getSubComputeGroups(), + "vcg cancel table-level load-event warm up job before rebuilding file cache jobs"); + cancelCacheJobs(virtualGroupInFe, jobIdsInMs); + // all Map periodicProperties = new HashMap<>(); // "sync_mode" = "periodic", "sync_interval_sec" = "fetch_cluster_cache_hotspot_interval_ms" @@ -316,7 +320,8 @@ private void syncFileCacheTasksForVirtualGroup(Cloud.ClusterPB 
virtualGroupInMs, LOG.info("virtual compute group {}, generate new jobIds periodic={}, event={}, and old jobIds {}", virtualGroupInFe, jobIdPeriodic, jobIdEvent, jobIdsInMs); } catch (AnalysisException e) { - LOG.warn("virtual compute err, name: {}, analysis error", virtualGroupInFe.getName(), e); + LOG.warn("virtual compute err, name: {}, failed to generate file cache warm up jobs: {}", + virtualGroupInFe.getName(), e.getMessage(), e); return; } virtualGroupInFe.setNeedRebuildFileCache(false); diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java index 723cb3f6eb188e..6248be5386429b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java @@ -22,6 +22,8 @@ import org.apache.doris.catalog.Database; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.TabletSlidingWindowAccessStats; +import org.apache.doris.cloud.CloudWarmUpJob; +import org.apache.doris.cloud.JobWarmUpStats; import org.apache.doris.cloud.catalog.CloudTabletRebalancer; import org.apache.doris.cloud.system.CloudSystemInfoService; import org.apache.doris.common.Config; @@ -63,14 +65,19 @@ import org.apache.logging.log4j.Logger; import java.util.ArrayList; +import java.util.Collection; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.SortedMap; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.function.Predicate; import java.util.function.Supplier; +import java.util.function.ToLongFunction; public final class MetricRepo { private static final Logger LOG = LogManager.getLogger(MetricRepo.class); @@ -134,6 +141,9 @@ public final class MetricRepo { public static LongCounterMetric COUNTER_SQL_CACHE_HIT; public 
static LongCounterMetric COUNTER_SQL_SQL_CACHE_TOTAL_SEARCH_TIMES; + private static final Map CLOUD_WARM_UP_SYNC_JOB_METRICS = + new ConcurrentHashMap<>(); + public static LongCounterMetric COUNTER_UPDATE_TABLET_STAT_FAILED; public static GaugeMetric GAUGE_TABLET_ACCESS_RECENT; @@ -1624,6 +1634,199 @@ public static void visitHistograms(MetricVisitor visitor) { DORIS_METRIC_REGISTER.acceptHistograms(visitor); } + /* + * Example Prometheus output for a running event-driven cluster warm-up job. MetricVisitor adds the + * "doris_fe_" prefix to the metric names registered below. + * + * doris_fe_file_cache_warm_up_sync_job_info{ + * job_id="1778211593204", job_type="CLUSTER", sync_mode="EVENT_DRIVEN", + * sync_event="LOAD", job_state="RUNNING", src_cluster_name="warmup_source", + * dst_cluster_name="warmup_target" + * } 1 + * doris_fe_file_cache_warm_up_sync_job_size_bytes{ + * job_id="1778211593204", job_type="CLUSTER", src_cluster_name="warmup_source", + * dst_cluster_name="warmup_target", side="src", window="5m" + * } 113246208 + * doris_fe_file_cache_warm_up_sync_job_size_bytes{ + * job_id="1778211593204", job_type="CLUSTER", src_cluster_name="warmup_source", + * dst_cluster_name="warmup_target", side="dst", window="5m" + * } 100663296 + * + * The size metric emits the same label shape for side=(src,dst) and window=(5m,30m,1h). + */ + public static void syncCloudWarmUpSyncJobMetricDefinitions(Collection jobs) { + if (!MetricRepo.isInit || Config.isNotCloudMode()) { + clearCloudWarmUpSyncJobMetrics(); + return; + } + + Collection currentJobs = jobs == null ? 
Collections.emptyList() : jobs; + Set currentMetricKeys = new HashSet<>(); + for (CloudWarmUpJob job : currentJobs) { + if (job == null) { + continue; + } + registerCloudWarmUpSyncJobMetrics(job, currentMetricKeys); + } + CLOUD_WARM_UP_SYNC_JOB_METRICS.entrySet().removeIf(entry -> { + if (currentMetricKeys.contains(entry.getKey())) { + return false; + } + CloudWarmUpSyncJobGauge metric = entry.getValue(); + DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(metric.getName(), metric.getLabels()); + return true; + }); + } + + private static void clearCloudWarmUpSyncJobMetrics() { + CLOUD_WARM_UP_SYNC_JOB_METRICS.forEach((key, metric) -> + DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(metric.getName(), metric.getLabels())); + CLOUD_WARM_UP_SYNC_JOB_METRICS.clear(); + } + + private static void registerCloudWarmUpSyncJobMetrics(CloudWarmUpJob job, Set currentMetricKeys) { + String jobId = String.valueOf(job.getJobId()); + String jobType = labelValue(job.getJobType()); + String srcClusterName = labelValue(job.getSrcClusterName()); + String dstClusterName = labelValue(job.getDstClusterName()); + + List infoLabels = new ArrayList<>(); + infoLabels.add(new MetricLabel("job_id", jobId)); + infoLabels.add(new MetricLabel("job_type", jobType)); + infoLabels.add(new MetricLabel("sync_mode", labelValue(job.getSyncMode()))); + infoLabels.add(new MetricLabel("sync_event", labelValue(job.getSyncEvent()))); + infoLabels.add(new MetricLabel("job_state", labelValue(job.getJobState()))); + infoLabels.add(new MetricLabel("src_cluster_name", srcClusterName)); + infoLabels.add(new MetricLabel("dst_cluster_name", dstClusterName)); + addCloudWarmUpSyncJobGauge(currentMetricKeys, "file_cache_warm_up_sync_job_info", MetricUnit.NOUNIT, + "warm up sync job info", infoLabels, job, currentJob -> 1L); + + if (!job.isEventDriven() || job.isDone()) { + return; + } + + for (String window : new String[] {"5m", "30m", "1h"}) { + for (String side : new String[] {"src", "dst"}) { + List labels = new 
ArrayList<>(); + labels.add(new MetricLabel("job_id", jobId)); + labels.add(new MetricLabel("job_type", jobType)); + labels.add(new MetricLabel("src_cluster_name", srcClusterName)); + labels.add(new MetricLabel("dst_cluster_name", dstClusterName)); + labels.add(new MetricLabel("side", side)); + labels.add(new MetricLabel("window", window)); + addCloudWarmUpSyncJobGauge(currentMetricKeys, "file_cache_warm_up_sync_job_size_bytes", + MetricUnit.BYTES, "warm up sync job source or target total size in bytes", + labels, job, currentJob -> getCloudWarmUpSyncJobSizeBytes(currentJob, side, window)); + } + } + + List labels = new ArrayList<>(); + labels.add(new MetricLabel("job_id", jobId)); + labels.add(new MetricLabel("job_type", jobType)); + labels.add(new MetricLabel("src_cluster_name", srcClusterName)); + labels.add(new MetricLabel("dst_cluster_name", dstClusterName)); + // Trigger gap observes active event-driven warm-up lag in time: source latest trigger time + // minus the target-side progress watermark collected from BEs. 
+ addCloudWarmUpSyncJobGauge(currentMetricKeys, "file_cache_warm_up_sync_job_trigger_gap_ms", + MetricUnit.MILLISECONDS, "warm up sync job trigger progress gap in milliseconds", + labels, job, MetricRepo::getCloudWarmUpSyncJobTriggerGapMs); + } + + private static void addCloudWarmUpSyncJobGauge(Set currentMetricKeys, String name, MetricUnit unit, + String description, List labels, CloudWarmUpJob job, + ToLongFunction valueFunction) { + String key = metricKey(name, labels); + currentMetricKeys.add(key); + CloudWarmUpSyncJobGauge gauge = new CloudWarmUpSyncJobGauge(name, unit, description, labels, job, + valueFunction); + CloudWarmUpSyncJobGauge existingGauge = CLOUD_WARM_UP_SYNC_JOB_METRICS.putIfAbsent(key, gauge); + if (existingGauge == null) { + DORIS_METRIC_REGISTER.addMetrics(gauge); + } else { + existingGauge.setJob(job); + } + } + + private static class CloudWarmUpSyncJobGauge extends GaugeMetric { + private volatile CloudWarmUpJob job; + private final ToLongFunction valueFunction; + + CloudWarmUpSyncJobGauge(String name, MetricUnit unit, String description, List labels, + CloudWarmUpJob job, ToLongFunction valueFunction) { + super(name, unit, description); + this.job = job; + this.valueFunction = valueFunction; + setLabels(labels); + } + + void setJob(CloudWarmUpJob job) { + this.job = job; + } + + @Override + public Long getValue() { + CloudWarmUpJob currentJob = job; + if (currentJob == null) { + return 0L; + } + try { + return valueFunction.applyAsLong(currentJob); + } catch (Exception e) { + return 0L; + } + } + } + + private static String metricKey(String name, List labels) { + StringBuilder sb = new StringBuilder(name); + for (MetricLabel label : labels) { + sb.append('|').append(label.getKey()).append('=').append(label.getValue()); + } + return sb.toString(); + } + + private static String labelValue(Object value) { + return value == null ? 
"" : value.toString(); + } + + private static JobWarmUpStats getCloudWarmUpSyncJobStats(CloudWarmUpJob job) { + JobWarmUpStats stats = job.getSyncStats(); + return stats == null ? new JobWarmUpStats() : stats; + } + + private static long getCloudWarmUpSyncJobSizeBytes(CloudWarmUpJob job, String side, String window) { + JobWarmUpStats stats = getCloudWarmUpSyncJobStats(job); + switch (side) { + case "src": + return byWindow(window, stats.requestedSegmentSize5m + stats.requestedIndexSize5m, + stats.requestedSegmentSize30m + stats.requestedIndexSize30m, + stats.requestedSegmentSize1h + stats.requestedIndexSize1h); + case "dst": + return byWindow(window, stats.finishSegmentSize5m + stats.finishIndexSize5m, + stats.finishSegmentSize30m + stats.finishIndexSize30m, + stats.finishSegmentSize1h + stats.finishIndexSize1h); + default: + return 0L; + } + } + + private static long getCloudWarmUpSyncJobTriggerGapMs(CloudWarmUpJob job) { + return getCloudWarmUpSyncJobStats(job).triggerGapMs; + } + + private static long byWindow(String window, long value5m, long value30m, long value1h) { + switch (window) { + case "5m": + return value5m; + case "30m": + return value30m; + case "1h": + return value1h; + default: + return 0; + } + } + // update some metrics to make a ready to be visited private static void updateMetrics() { SYSTEM_METRICS.update(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index 0bb14eff373d39..5e8ab9decabed9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -55,6 +55,7 @@ import org.apache.doris.catalog.info.PartitionNamesInfo; import org.apache.doris.catalog.info.TableNameInfo; import org.apache.doris.catalog.info.TagOptions; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; import 
org.apache.doris.cloud.stage.StageUtil; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; @@ -9463,7 +9464,19 @@ public LogicalPlan visitWarmUpCluster(DorisParser.WarmUpClusterContext ctx) { isForce = true; } ImmutableMap properties = ImmutableMap.copyOf(visitPropertyClause(ctx.properties)); - return new WarmUpClusterCommand(warmUpItems, srcCluster, dstCluster, isForce, isWarmUpWithTable, properties); + List onTablesRules = new ArrayList<>(); + if (ctx.onTablesClause() != null) { + for (DorisParser.OnTablesFilterRuleContext ruleContext + : ctx.onTablesClause().onTablesFilterRule()) { + TableFilterRule.RuleType ruleType = ruleContext.INCLUDE() != null + ? TableFilterRule.RuleType.INCLUDE + : TableFilterRule.RuleType.EXCLUDE; + onTablesRules.add(new TableFilterRule( + ruleType, stripQuotes(ruleContext.STRING_LITERAL().getText()))); + } + } + return new WarmUpClusterCommand(warmUpItems, srcCluster, dstCluster, isForce, + isWarmUpWithTable, properties, onTablesRules); } void fileCacheAdmissionCheck(DorisParser.WarmUpSelectContext ctx) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowWarmUpCommand.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowWarmUpCommand.java index 29f9487dcc13b9..063cfea68bbf7f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowWarmUpCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowWarmUpCommand.java @@ -54,6 +54,9 @@ public class ShowWarmUpCommand extends ShowCommand { .add("FinishTime") .add("ErrMsg") .add("Tables") + .add("TableFilter") + .add("MatchedTables") + .add("SyncStats") .build(); private Expression whereClause; private boolean showAllJobs = false; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/WarmUpClusterCommand.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/WarmUpClusterCommand.java index 1a077573417db0..4c6d74c898f54f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/WarmUpClusterCommand.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/WarmUpClusterCommand.java @@ -23,6 +23,7 @@ import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.info.TableNameInfo; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; import org.apache.doris.cloud.catalog.CloudEnv; import org.apache.doris.cloud.catalog.ComputeGroup; import org.apache.doris.cloud.system.CloudSystemInfoService; @@ -64,6 +65,7 @@ public class WarmUpClusterCommand extends Command implements ForwardWithSync { private boolean isWarmUpWithTable; private List> tables = new ArrayList<>(); private Map properties = new HashMap<>(); + private List onTablesRules = new ArrayList<>(); /** * WarmUpClusterCommand @@ -87,8 +89,20 @@ public WarmUpClusterCommand(List warmUpItems, boolean isForce, boolean isWarmUpWithTable, Map properties) { + this(warmUpItems, srcCluster, dstCluster, isForce, isWarmUpWithTable, properties, + new ArrayList<>()); + } + + public WarmUpClusterCommand(List warmUpItems, + String srcCluster, + String dstCluster, + boolean isForce, + boolean isWarmUpWithTable, + Map properties, + List onTablesRules) { this(warmUpItems, srcCluster, dstCluster, isForce, isWarmUpWithTable); - this.properties = properties; + this.properties = properties == null ? new HashMap<>() : properties; + this.onTablesRules = onTablesRules == null ? 
new ArrayList<>() : onTablesRules; } public List getWarmUpItems() { @@ -115,6 +129,10 @@ public List> getTables() { return tables; } + public List getOnTablesRules() { + return onTablesRules; + } + @Override public void run(ConnectContext ctx, StmtExecutor executor) throws Exception { validate(ctx); @@ -140,10 +158,16 @@ private void checkWarmupCgs(CloudSystemInfoService cloudSys) throws AnalysisExce if (!Strings.isNullOrEmpty(srcCluster) && !Strings.isNullOrEmpty(dstCluster)) { String srcMayOwnedVcg = cloudSys.ownedByVirtualComputeGroup(srcCluster); - String dstMayOwnedVcg = cloudSys.ownedByVirtualComputeGroup(srcCluster); - if (srcMayOwnedVcg != null && srcMayOwnedVcg.equals(dstMayOwnedVcg)) { - throw new AnalysisException("The srcClusterName " + srcCluster + " dstClusterName " + dstCluster - + " is owned by virtual compute group " + srcMayOwnedVcg + " not support"); + String dstMayOwnedVcg = cloudSys.ownedByVirtualComputeGroup(dstCluster); + if (srcMayOwnedVcg != null && Objects.equals(srcMayOwnedVcg, dstMayOwnedVcg)) { + StringBuilder message = new StringBuilder("Cannot create warm up job from source compute group '") + .append(srcCluster).append("' to destination compute group '").append(dstCluster) + .append("': "); + message.append("source compute group '").append(srcCluster) + .append("' and destination compute group '").append(dstCluster) + .append("' are both owned by virtual compute group '").append(srcMayOwnedVcg) + .append("', not support"); + throw new AnalysisException(message.toString()); } } } @@ -180,6 +204,11 @@ public void validate(ConnectContext connectContext) throws UserException { + " is same with srcClusterName: " + srcCluster); } + boolean hasOnTablesRules = onTablesRules != null && !onTablesRules.isEmpty(); + if (hasOnTablesRules && isWarmUpWithTable) { + throw new AnalysisException("ON TABLES clause cannot be used with WITH TABLE warmup"); + } + if (isWarmUpWithTable) { for (WarmUpItem warmUpItem : warmUpItems) { TableNameInfo 
tableNameInfo = warmUpItem.getTableNameInfo(); @@ -203,6 +232,24 @@ public void validate(ConnectContext connectContext) throws UserException { tables.add(Triple.of(dbName, tableNameInfo.getTbl(), partitionName)); } } + + if (hasOnTablesRules) { + boolean hasInclude = onTablesRules.stream() + .anyMatch(r -> r.getRuleType() == TableFilterRule.RuleType.INCLUDE); + if (!hasInclude) { + throw new AnalysisException("ON TABLES clause must contain at least one INCLUDE rule"); + } + for (TableFilterRule rule : onTablesRules) { + if (!rule.getRawPattern().contains(".")) { + throw new AnalysisException("ON TABLES pattern must be in 'db.table' format: '" + + rule.getRawPattern() + "'"); + } + } + String syncMode = properties.get("sync_mode"); + if (!"event_driven".equals(syncMode)) { + throw new AnalysisException("ON TABLES clause is only supported with event_driven sync_mode"); + } + } } private void handleWarmUp(ConnectContext ctx, StmtExecutor executor) throws IOException { diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java new file mode 100644 index 00000000000000..255268e66c5a74 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/CacheHotspotManagerTableFilterTest.java @@ -0,0 +1,1003 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import org.apache.doris.catalog.DatabaseIf; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.TableIf; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule.RuleType; +import org.apache.doris.cloud.system.CloudSystemInfoService; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.datasource.CatalogMgr; +import org.apache.doris.datasource.InternalCatalog; +import org.apache.doris.nereids.trees.plans.commands.WarmUpClusterCommand; +import org.apache.doris.persist.EditLog; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.LogEvent; +import org.apache.logging.log4j.core.Logger; +import org.apache.logging.log4j.core.appender.AbstractAppender; +import org.apache.logging.log4j.core.config.Property; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.lang.reflect.Field; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; + +/** + * Tests for CacheHotspotManager's table filter methods: + * resolveTableIds() and refreshAllTableFilters(). + * Uses Mockito to mock Env.getCurrentInternalCatalog() with fake databases/tables. 
+ */ +public class CacheHotspotManagerTableFilterTest { + + private Env env; + private CatalogMgr mockCatalogMgr; + private InternalCatalog mockCatalog; + private EditLog mockEditLog; + private CacheHotspotManager manager; + private List> databases; + private Object originalCatalogMgr; + private EditLog originalEditLog; + private Object originalSystemInfo; + + private static Object getField(Object target, Class clazz, String fieldName) throws Exception { + Field field = clazz.getDeclaredField(fieldName); + field.setAccessible(true); + return field.get(target); + } + + private static void setField(Object target, Class clazz, String fieldName, Object value) throws Exception { + Field field = clazz.getDeclaredField(fieldName); + field.setAccessible(true); + field.set(target, value); + } + + @BeforeEach + public void setUp() throws Exception { + env = Env.getCurrentEnv(); + mockCatalogMgr = Mockito.mock(CatalogMgr.class); + mockCatalog = Mockito.mock(InternalCatalog.class); + mockEditLog = Mockito.mock(EditLog.class); + + originalCatalogMgr = getField(env, Env.class, "catalogMgr"); + originalEditLog = env.getEditLog(); + originalSystemInfo = getField(env, Env.class, "systemInfo"); + setField(env, Env.class, "catalogMgr", mockCatalogMgr); + setField(env, Env.class, "systemInfo", Mockito.mock(CloudSystemInfoService.class)); + env.setEditLog(mockEditLog); + Mockito.when(mockCatalogMgr.getInternalCatalog()).thenReturn(mockCatalog); + + databases = new ArrayList<>(); + Mockito.when(mockCatalog.getAllDbs()).thenAnswer(inv -> databases); + + manager = new CacheHotspotManager(Mockito.mock(CloudSystemInfoService.class)); + } + + @AfterEach + public void tearDown() throws Exception { + setField(env, Env.class, "catalogMgr", originalCatalogMgr); + setField(env, Env.class, "systemInfo", originalSystemInfo); + env.setEditLog(originalEditLog); + } + + @SuppressWarnings("unchecked") + private DatabaseIf mockDb(String name, TableIf... 
tables) { + DatabaseIf db = Mockito.mock(DatabaseIf.class); + Mockito.when(db.getFullName()).thenReturn(name); + // For resolveTableIds: getTableNamesOrEmptyWithLock + getTableNullable + HashSet tableNames = new HashSet<>(); + for (TableIf t : tables) { + tableNames.add(t.getName()); + Mockito.when(db.getTableNullable(t.getName())).thenReturn(t); + } + Mockito.when(db.getTableNamesOrEmptyWithLock()).thenReturn(tableNames); + // Keep getTables for other test paths (refreshAllTableFilters) + Mockito.when(db.getTables()).thenReturn(Arrays.asList(tables)); + return db; + } + + private TableIf mockTable(long id, String name) { + TableIf table = Mockito.mock(TableIf.class); + Mockito.when(table.getId()).thenReturn(id); + Mockito.when(table.getName()).thenReturn(name); + Mockito.when(table.getType()).thenReturn(TableIf.TableType.OLAP); + Mockito.when(table.isManagedTable()).thenReturn(true); + return table; + } + + private TableIf mockMtmv(long id, String name) { + TableIf table = Mockito.mock(TableIf.class); + Mockito.when(table.getId()).thenReturn(id); + Mockito.when(table.getName()).thenReturn(name); + Mockito.when(table.getType()).thenReturn(TableIf.TableType.MATERIALIZED_VIEW); + Mockito.when(table.isManagedTable()).thenReturn(true); + return table; + } + + private TableIf mockView(long id, String name) { + TableIf table = Mockito.mock(TableIf.class); + Mockito.when(table.getId()).thenReturn(id); + Mockito.when(table.getName()).thenReturn(name); + Mockito.when(table.getType()).thenReturn(TableIf.TableType.VIEW); + Mockito.when(table.isManagedTable()).thenReturn(false); + return table; + } + + private OnTablesFilter buildFilter(TableFilterRule... 
rules) { + return new OnTablesFilter(Arrays.asList(rules)); + } + + private Map eventDrivenProperties() { + Map properties = new HashMap<>(); + properties.put("sync_mode", "event_driven"); + properties.put("sync_event", "load"); + return properties; + } + + private WarmUpClusterCommand buildEventDrivenStmt(String src, String dst, TableFilterRule... rules) { + return new WarmUpClusterCommand(new ArrayList<>(), src, dst, false, false, + eventDrivenProperties(), rules.length == 0 ? new ArrayList<>() : Arrays.asList(rules)); + } + + private CloudWarmUpJob createEventDrivenJob(String src, String dst, TableFilterRule... rules) throws Exception { + long jobId = manager.createJob(buildEventDrivenStmt(src, dst, rules)); + CloudWarmUpJob job = manager.getCloudWarmUpJob(jobId); + Assertions.assertNotNull(job); + return job; + } + + private CloudWarmUpJob replayEventDrivenJob(long jobId, String src, String dst, TableFilterRule... rules) + throws Exception { + CloudWarmUpJob.Builder builder = new CloudWarmUpJob.Builder() + .setJobId(jobId) + .setSrcClusterName(src) + .setDstClusterName(dst) + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD); + if (rules.length > 0) { + List persistedRules = new ArrayList<>(); + for (TableFilterRule rule : rules) { + CloudWarmUpJob.PersistedTableFilterRule persistedRule = + new CloudWarmUpJob.PersistedTableFilterRule(); + persistedRule.ruleType = rule.getRuleType().name(); + persistedRule.pattern = rule.getRawPattern(); + persistedRules.add(persistedRule); + } + builder.setTableFilterRules(persistedRules); + } + CloudWarmUpJob job = builder.build(); + manager.replayCloudWarmUpJob(job); + return job; + } + + // ===== resolveTableIds() ===== + + @Test + public void testResolveTableIdsBasicMatching() { + // Scenario: INCLUDE 'ods.*' matches all tables in ods database + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"), + 
mockTable(1003, "tmp_staging"))); + databases.add(mockDb("dw", + mockTable(2001, "fact_sales"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(3, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + Assertions.assertEquals("ods.users", idNames.get(1002L)); + Assertions.assertEquals("ods.tmp_staging", idNames.get(1003L)); + Assertions.assertFalse(idNames.containsKey(2001L)); + } + + @Test + public void testResolveTableIdsWithExclude() { + // Scenario: INCLUDE 'ods.*' EXCLUDE 'ods.tmp_*' — exclude tmp tables + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "tmp_staging"), + mockTable(1003, "tmp_data"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.EXCLUDE, "ods.tmp_*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(1, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + } + + @Test + public void testResolveTableIdsMultipleDatabases() { + // Scenario: INCLUDE 'ods.*', INCLUDE 'dw.fact_*' — match across two databases + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"))); + databases.add(mockDb("dw", + mockTable(2001, "fact_sales"), + mockTable(2002, "dim_product"), + mockTable(2003, "fact_orders"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.INCLUDE, "dw.fact_*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(4, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + Assertions.assertEquals("ods.users", idNames.get(1002L)); + Assertions.assertEquals("dw.fact_sales", idNames.get(2001L)); + Assertions.assertEquals("dw.fact_orders", idNames.get(2003L)); + } + + @Test + public void 
testResolveTableIdsNoMatch() { + // Scenario: pattern matches nothing → empty map + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "nonexistent.*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertTrue(idNames.isEmpty()); + } + + @Test + public void testResolveTableIdsNullFilter() { + Map idNames = manager.resolveTableIds(null); + Assertions.assertTrue(idNames.isEmpty()); + } + + @Test + public void testResolveTableIdsSkipsViews() { + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockView(1002, "orders_view"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(1, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + Assertions.assertFalse(idNames.containsKey(1002L)); + } + + @Test + public void testResolveTableIdsDbNameWithPrefix() { + // CacheHotspotManager strips "default_cluster:" prefix from db name + databases.add(mockDb("default_cluster:ods", + mockTable(1001, "orders"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(1, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + } + + // ===== resolveTableIds() with dynamic table changes ===== + + @Test + public void testResolveTableIdsAfterNewTableCreated() { + // Initial: ods has orders. After new table created, re-resolve picks it up. 
+ DatabaseIf odsDb = mockDb("ods", mockTable(1001, "orders")); + databases.add(odsDb); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + Map ids1 = manager.resolveTableIds(filter); + Assertions.assertEquals(1, ids1.size()); + + // Simulate new table created: replace the db mock to include new table + databases.clear(); + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1004, "payments"))); + + Map ids2 = manager.resolveTableIds(filter); + Assertions.assertEquals(2, ids2.size()); + Assertions.assertEquals("ods.orders", ids2.get(1001L)); + Assertions.assertEquals("ods.payments", ids2.get(1004L)); + } + + @Test + public void testResolveTableIdsAfterTableDropped() { + // Initial: ods has orders and users. After orders dropped, re-resolve removes it. + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + Map ids1 = manager.resolveTableIds(filter); + Assertions.assertEquals(2, ids1.size()); + + databases.clear(); + databases.add(mockDb("ods", mockTable(1002, "users"))); + + Map ids2 = manager.resolveTableIds(filter); + Assertions.assertEquals(1, ids2.size()); + Assertions.assertEquals("ods.users", ids2.get(1002L)); + } + + @Test + public void testResolveTableIdsAfterTableRenamed() { + // Scenario from user guide: INCLUDE 'db.order_*', rename order_2024→archive_2024 → stops matching + databases.add(mockDb("db", + mockTable(1001, "order_2024"), + mockTable(1002, "order_2025"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db.order_*")); + + Map ids1 = manager.resolveTableIds(filter); + Assertions.assertEquals(2, ids1.size()); + + // Rename order_2024 → archive_2024 (no longer matches order_*) + databases.clear(); + databases.add(mockDb("db", + mockTable(1001, "archive_2024"), + mockTable(1002, "order_2025"))); + + Map ids2 = 
manager.resolveTableIds(filter); + Assertions.assertEquals(1, ids2.size()); + Assertions.assertEquals("db.order_2025", ids2.get(1002L)); + } + + @Test + public void testResolveTableIdsAfterAllTablesDropped() { + // User guide: all matched tables dropped → empty set, Job stays RUNNING + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + Map ids1 = manager.resolveTableIds(filter); + Assertions.assertEquals(1, ids1.size()); + + databases.clear(); + databases.add(mockDb("ods")); // empty database + + Map ids2 = manager.resolveTableIds(filter); + Assertions.assertTrue(ids2.isEmpty()); + } + + // ===== refreshAllTableFilters() ===== + + @Test + public void testRefreshAllTableFiltersUpdatesJobTableIds() throws Exception { + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"))); + + CloudWarmUpJob job = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + // Verify initial resolution picked up 2 tables with correct names + Assertions.assertEquals( + new HashSet<>(Arrays.asList(1001L, 1002L)), + job.getCurrentTableIds()); + + // Simulate new table created + databases.clear(); + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"), + mockTable(1003, "payments"))); + + manager.refreshAllTableFilters(); + + // Verify job now has 3 table IDs + Assertions.assertEquals( + new HashSet<>(Arrays.asList(1001L, 1002L, 1003L)), + job.getCurrentTableIds()); + } + + @Test + public void testRefreshAllTableFiltersSkipsClusterLevelJob() throws Exception { + // Cluster-level job (no table filter) should not be affected by refresh + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + CloudWarmUpJob clusterJob = replayEventDrivenJob(200L, "write_cg", "read_cg"); + + // currentTableIds should be empty (no table filter) + 
Assertions.assertTrue(clusterJob.getCurrentTableIds().isEmpty()); + + manager.refreshAllTableFilters(); + + // Still empty after refresh — cluster-level jobs are skipped + Assertions.assertTrue(clusterJob.getCurrentTableIds().isEmpty()); + } + + @Test + public void testRefreshAllTableFiltersHandlesTableDrop() throws Exception { + // Setup: job matching ods.*, initially 2 tables + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"))); + + CloudWarmUpJob job = replayEventDrivenJob(300L, "write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Assertions.assertEquals(2, job.getCurrentTableIds().size()); + + // Drop one table + databases.clear(); + databases.add(mockDb("ods", mockTable(1002, "users"))); + + manager.refreshAllTableFilters(); + + Assertions.assertEquals( + new HashSet<>(Arrays.asList(1002L)), + job.getCurrentTableIds()); + } + + @Test + public void testRefreshAllTableFiltersUpdatesMatchedNamesAfterRenameStillMatches() throws Exception { + databases.add(mockDb("db", + mockTable(1001, "order_2024"), + mockTable(1002, "order_2025"))); + + CloudWarmUpJob job = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "db.order_*")); + Assertions.assertEquals("db.order_2024, db.order_2025", job.getJobInfo(null).get(14)); + + databases.clear(); + databases.add(mockDb("db", + mockTable(1001, "order_2024_v2"), + mockTable(1002, "order_2025"))); + + manager.refreshAllTableFilters(); + + Assertions.assertEquals(new HashSet<>(Arrays.asList(1001L, 1002L)), job.getCurrentTableIds()); + Assertions.assertEquals("db.order_2024_v2, db.order_2025", job.getJobInfo(null).get(14)); + } + + @Test + public void testCreateJobRejectsOnTablesWithoutInitialMatches() { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + WarmUpClusterCommand stmt = buildEventDrivenStmt("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "dw.*")); + + AnalysisException exception = 
Assertions.assertThrows(AnalysisException.class, + () -> manager.createJob(stmt)); + Assertions.assertTrue(exception.getMessage().contains("No tables matched the ON TABLES filter")); + } + + @Test + public void testCreateJobRejectsEquivalentDuplicateTableFilter() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + databases.add(mockDb("dw", + mockTable(2001, "fact_sales"), + mockTable(2002, "tmp_staging"))); + + WarmUpClusterCommand first = buildEventDrivenStmt("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.INCLUDE, "dw.*"), + new TableFilterRule(RuleType.EXCLUDE, "dw.tmp_*")); + WarmUpClusterCommand second = buildEventDrivenStmt("write_cg", "read_cg", + new TableFilterRule(RuleType.EXCLUDE, "dw.tmp_*"), + new TableFilterRule(RuleType.INCLUDE, "dw.*"), + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + manager.createJob(first); + + AnalysisException exception = Assertions.assertThrows(AnalysisException.class, + () -> manager.createJob(second)); + Assertions.assertTrue(exception.getMessage().contains("already has a runnable job")); + } + + @Test + public void testCreateJobRejectsTableLevelWhenClusterLevelLoadEventExists() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + WarmUpClusterCommand clusterLevel = buildEventDrivenStmt("write_cg", "read_cg"); + WarmUpClusterCommand tableLevel = buildEventDrivenStmt("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + + long clusterJobId = manager.createJob(clusterLevel); + AnalysisException exception = Assertions.assertThrows(AnalysisException.class, + () -> manager.createJob(tableLevel)); + + Assertions.assertTrue(exception.getMessage().contains( + "Cannot create table-level load-event warm up job")); + Assertions.assertTrue(exception.getMessage().contains("cluster-level load-event warm up job " + + clusterJobId)); + 
Assertions.assertTrue(exception.getMessage().contains( + "Cancel existing load-event warm up job " + clusterJobId)); + Assertions.assertTrue(exception.getMessage().contains( + "source compute group 'write_cg' to destination compute group 'read_cg'")); + Assertions.assertEquals(1, manager.getAllJobInfos(10).size()); + } + + @Test + public void testCreateJobRejectsClusterLevelWhenTableLevelLoadEventExists() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + WarmUpClusterCommand tableLevel = buildEventDrivenStmt("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + WarmUpClusterCommand clusterLevel = buildEventDrivenStmt("write_cg", "read_cg"); + + long tableJobId = manager.createJob(tableLevel); + AnalysisException exception = Assertions.assertThrows(AnalysisException.class, + () -> manager.createJob(clusterLevel)); + + Assertions.assertTrue(exception.getMessage().contains( + "Cannot create cluster-level load-event warm up job")); + Assertions.assertTrue(exception.getMessage().contains("table-level load-event warm up job " + + tableJobId)); + Assertions.assertTrue(exception.getMessage().contains( + "Cancel existing load-event warm up job " + tableJobId)); + Assertions.assertTrue(exception.getMessage().contains( + "with table filter [{\"include\":[\"ods.*\"]}]")); + Assertions.assertEquals(1, manager.getAllJobInfos(10).size()); + } + + @Test + public void testVirtualComputeGroupCancelsExistingTableLevelLoadEvent() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + CloudWarmUpJob tableLevelJob = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob reverseTableLevelJob = createEventDrivenJob("read_cg", "write_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob unrelatedTableLevelJob = createEventDrivenJob("write_cg", "outside_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob 
clusterLevelJob = manager.getCloudWarmUpJob( + manager.createJob(buildEventDrivenStmt("other_write_cg", "other_read_cg"))); + CloudWarmUpJob finishedTableLevelJob = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.orders")); + setField(finishedTableLevelJob, CloudWarmUpJob.class, "jobState", CloudWarmUpJob.JobState.CANCELLED); + + List cancelledJobIds = new ArrayList<>(); + Map cancelReasons = new HashMap<>(); + CacheHotspotManager spyManager = Mockito.spy(manager); + Mockito.doAnswer(invocation -> { + Long jobId = invocation.getArgument(0); + String errMsg = invocation.getArgument(1); + cancelledJobIds.add(jobId); + cancelReasons.put(jobId, errMsg); + return null; + }).when(spyManager).cancel(Mockito.anyLong(), Mockito.anyString()); + + String reason = "vcg cancel table-level load-event warm up job before rebuilding file cache jobs"; + RecordingAppender appender = new RecordingAppender("vcg-cancel-table-warmup-test"); + Logger logger = (Logger) LogManager.getLogger(CacheHotspotManager.class); + appender.start(); + logger.addAppender(appender); + try { + spyManager.cancelTableLevelLoadEventWarmUpJobsForVirtualComputeGroup( + "vcg", "write_cg", "read_cg", Arrays.asList("write_cg", "read_cg"), reason); + } finally { + logger.removeAppender(appender); + appender.stop(); + } + + Assertions.assertEquals(new HashSet<>(Arrays.asList( + tableLevelJob.getJobId(), reverseTableLevelJob.getJobId())), + new HashSet<>(cancelledJobIds)); + Assertions.assertEquals(2, cancelledJobIds.size()); + String expectedReason = reason + " for virtual compute group 'vcg'"; + Assertions.assertEquals(expectedReason, cancelReasons.get(tableLevelJob.getJobId())); + Assertions.assertEquals(expectedReason, cancelReasons.get(reverseTableLevelJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(unrelatedTableLevelJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(clusterLevelJob.getJobId())); + 
Assertions.assertFalse(cancelReasons.containsKey(finishedTableLevelJob.getJobId())); + + String logs = appender.messagesAsString(); + Assertions.assertTrue(logs.contains("virtual compute group 'vcg'"), logs); + Assertions.assertTrue(logs.contains(expectedReason), logs); + } + + @Test + public void testCancelTableFilterJobsForClusterChangeOnlyCancelsMatchingTableFilterJobs() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + + CloudWarmUpJob srcMatchedJob = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob dstMatchedJob = createEventDrivenJob("other_write_cg", "write_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob clusterLevelJob = manager.getCloudWarmUpJob( + manager.createJob(buildEventDrivenStmt("write_cg", "cluster_level_read_cg"))); + CloudWarmUpJob unrelatedJob = createEventDrivenJob("other_write_cg", "other_read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + CloudWarmUpJob finishedJob = createEventDrivenJob("write_cg", "finished_read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + setField(finishedJob, CloudWarmUpJob.class, "jobState", CloudWarmUpJob.JobState.CANCELLED); + + List cancelledJobIds = new ArrayList<>(); + Map cancelReasons = new HashMap<>(); + CacheHotspotManager spyManager = Mockito.spy(manager); + Mockito.doAnswer(invocation -> { + Long jobId = invocation.getArgument(0); + String errMsg = invocation.getArgument(1); + cancelledJobIds.add(jobId); + cancelReasons.put(jobId, errMsg); + return null; + }).when(spyManager).cancel(Mockito.anyLong(), Mockito.anyString()); + + String reason = "system cancel: compute group write_cg renamed to write_cg_new"; + spyManager.cancelTableFilterJobsForClusterChange("write_cg", reason); + + Assertions.assertEquals(new HashSet<>(Arrays.asList( + srcMatchedJob.getJobId(), dstMatchedJob.getJobId())), + new HashSet<>(cancelledJobIds)); + Assertions.assertEquals(2, 
cancelledJobIds.size()); + Assertions.assertEquals(reason, cancelReasons.get(srcMatchedJob.getJobId())); + Assertions.assertEquals(reason, cancelReasons.get(dstMatchedJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(clusterLevelJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(unrelatedJob.getJobId())); + Assertions.assertFalse(cancelReasons.containsKey(finishedJob.getJobId())); + } + + // ===== Async materialized view (MTMV) matching ===== + + @Test + public void testResolveTableIdsMatchesAsyncMaterializedView() { + // Async MVs (MTMV) are separate table entries in the database catalog. + // They should be matched by ON TABLES filter just like regular OlapTables. + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockTable(1002, "users"), + mockMtmv(1003, "mv_order_summary"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(3, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + Assertions.assertEquals("ods.users", idNames.get(1002L)); + Assertions.assertEquals("ods.mv_order_summary", idNames.get(1003L)); + } + + @Test + public void testResolveTableIdsMtmvMatchedByMvPattern() { + // Verify async MVs can be matched by mv_* pattern while base tables are not + databases.add(mockDb("analytics", + mockTable(2001, "fact_sales"), + mockMtmv(2002, "mv_daily_sales"), + mockMtmv(2003, "mv_monthly_revenue"), + mockTable(2004, "dim_product"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "analytics.mv_*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(2, idNames.size()); + Assertions.assertEquals("analytics.mv_daily_sales", idNames.get(2002L)); + Assertions.assertEquals("analytics.mv_monthly_revenue", idNames.get(2003L)); + Assertions.assertFalse(idNames.containsKey(2001L)); + 
Assertions.assertFalse(idNames.containsKey(2004L)); + } + + @Test + public void testResolveTableIdsMtmvExcludedByPattern() { + // Verify async MVs can be excluded by EXCLUDE rule + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockMtmv(1002, "mv_order_summary"), + mockMtmv(1003, "mv_user_stats"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.EXCLUDE, "ods.mv_*")); + Map idNames = manager.resolveTableIds(filter); + + Assertions.assertEquals(1, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + } + + @Test + public void testResolveTableIdsMixedTableTypesAcrossDatabases() { + // Multiple databases with mixed OlapTable and MTMV types + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockMtmv(1002, "mv_orders_agg"))); + databases.add(mockDb("dw", + mockTable(2001, "fact_sales"), + mockMtmv(2002, "mv_daily_report"))); + + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "ods.*"), + new TableFilterRule(RuleType.INCLUDE, "dw.mv_*")); + Map idNames = manager.resolveTableIds(filter); + + // ods.* matches orders + mv_orders_agg; dw.mv_* matches mv_daily_report + Assertions.assertEquals(3, idNames.size()); + Assertions.assertEquals("ods.orders", idNames.get(1001L)); + Assertions.assertEquals("ods.mv_orders_agg", idNames.get(1002L)); + Assertions.assertEquals("dw.mv_daily_report", idNames.get(2002L)); + Assertions.assertFalse(idNames.containsKey(2001L)); + } + + @Test + public void testRefreshAllTableFiltersPicksUpNewMtmv() throws Exception { + // When a new async MV is created after job creation, refreshAllTableFilters picks it up + databases.add(mockDb("ods", + mockTable(1001, "orders"))); + + CloudWarmUpJob job = createEventDrivenJob("write_cg", "read_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*")); + Assertions.assertEquals(1, job.getCurrentTableIds().size()); + + // Simulate async MV created + 
databases.clear(); + databases.add(mockDb("ods", + mockTable(1001, "orders"), + mockMtmv(1002, "mv_order_summary"))); + + manager.refreshAllTableFilters(); + + Assertions.assertEquals( + new HashSet<>(Arrays.asList(1001L, 1002L)), + job.getCurrentTableIds()); + } + + // ========== Performance tests: regex matching throughput at scale ========== + + /** + * Generate table name strings (db.table) for timing shouldWarmUp regex calls. + * No mocks needed — we test the filter's regex matching performance directly. + */ + private List generateTableNames(int dbCount, int tablesPerDb) { + List names = new ArrayList<>(dbCount * tablesPerDb); + for (int d = 0; d < dbCount; d++) { + String db = "db_" + d; + for (int t = 0; t < tablesPerDb; t++) { + names.add(new String[]{db, "tbl_" + String.format("%05d", t)}); + } + } + return names; + } + + @Test + public void testShouldWarmUpPerformance10kTables() { + List names = generateTableNames(10, 1000); // 10K + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_*.*")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + Assertions.assertEquals(10000, matched); + System.out.println("[Perf] 10K tables, wildcard match-all: " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 500, + "10K regex matches should complete within 500ms, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformance50kTables() { + List names = generateTableNames(50, 1000); // 50K + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_*.*")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + Assertions.assertEquals(50000, matched); + 
System.out.println("[Perf] 50K tables, wildcard match-all: " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 500, + "50K regex matches should complete within 500ms, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformance200kTables() { + List names = generateTableNames(100, 2000); // 200K + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_*.*")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + Assertions.assertEquals(200000, matched); + System.out.println("[Perf] 200K tables, wildcard match-all: " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 1500, + "200K regex matches should complete within 1.5s, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformance500kTables() { + List names = generateTableNames(100, 5000); // 500K + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_*.*")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + Assertions.assertEquals(500000, matched); + System.out.println("[Perf] 500K tables, wildcard match-all: " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 4000, + "500K regex matches should complete within 4s, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformanceSelectivePattern50k() { + List names = generateTableNames(50, 1000); // 50K + // Only match tables in db_0 + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_0.*")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = 
(System.nanoTime() - start) / 1_000_000; + + Assertions.assertEquals(1000, matched); + System.out.println("[Perf] 50K tables, selective db_0 pattern: " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 500, + "50K regex matches (selective) should complete within 500ms, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformanceMultipleRules50k() { + List names = generateTableNames(50, 1000); // 50K + // Include db_1* tables, exclude tables ending with digit 9 + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_1*.*"), + new TableFilterRule(RuleType.EXCLUDE, "*.*9")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + // db_1, db_10..db_19 = 11 dbs × 1000 tables = 11000 candidates + // Exclude tables ending with "9": tbl_00009, tbl_00019, ..., tbl_00999 = 100 per db + // Result = 11000 - 11*100 = 9900 + Assertions.assertEquals(9900, matched); + System.out.println("[Perf] 50K tables, include+exclude: " + elapsedMs + " ms, matched=" + matched); + Assertions.assertTrue(elapsedMs < 500, + "50K regex matches (multi-rule) should complete within 500ms, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformanceManyRules200k() { + List names = generateTableNames(100, 2000); // 200K + // 10 include rules + 5 exclude rules + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_0.*"), + new TableFilterRule(RuleType.INCLUDE, "db_1.*"), + new TableFilterRule(RuleType.INCLUDE, "db_2.*"), + new TableFilterRule(RuleType.INCLUDE, "db_3.*"), + new TableFilterRule(RuleType.INCLUDE, "db_4.*"), + new TableFilterRule(RuleType.INCLUDE, "db_5.*"), + new TableFilterRule(RuleType.INCLUDE, "db_6.*"), + new TableFilterRule(RuleType.INCLUDE, "db_7.*"), + new TableFilterRule(RuleType.INCLUDE, "db_8.*"), + new 
TableFilterRule(RuleType.INCLUDE, "db_9.*"), + new TableFilterRule(RuleType.EXCLUDE, "*.tbl_00000"), + new TableFilterRule(RuleType.EXCLUDE, "*.tbl_00001"), + new TableFilterRule(RuleType.EXCLUDE, "*.tbl_00002"), + new TableFilterRule(RuleType.EXCLUDE, "*.tbl_00003"), + new TableFilterRule(RuleType.EXCLUDE, "*.tbl_00004")); + + long start = System.nanoTime(); + int matched = 0; + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + matched++; + } + } + long elapsedMs = (System.nanoTime() - start) / 1_000_000; + + // 10 dbs × 2000 tables = 20000 included, minus 10 × 5 excluded = 19950 + Assertions.assertEquals(19950, matched); + System.out.println("[Perf] 200K tables, 15 rules (10 incl + 5 excl): " + elapsedMs + " ms"); + Assertions.assertTrue(elapsedMs < 3000, + "200K regex matches with 15 rules should complete within 3s, took " + elapsedMs + " ms"); + } + + @Test + public void testShouldWarmUpPerformanceRepeatedCycles200k() { + List names = generateTableNames(100, 2000); // 200K + OnTablesFilter filter = buildFilter( + new TableFilterRule(RuleType.INCLUDE, "db_*.*")); + + // JIT warm-up + for (String[] pair : names) { + filter.shouldWarmUp(pair[0], pair[1]); + } + + long start = System.nanoTime(); + int iterations = 5; + int totalMatched = 0; + for (int i = 0; i < iterations; i++) { + for (String[] pair : names) { + if (filter.shouldWarmUp(pair[0], pair[1])) { + totalMatched++; + } + } + } + long totalMs = (System.nanoTime() - start) / 1_000_000; + long avgMs = totalMs / iterations; + + Assertions.assertEquals(200000 * iterations, totalMatched); + System.out.println("[Perf] 200K tables × 5 cycles: total=" + totalMs + " ms, avg=" + avgMs + " ms/cycle"); + Assertions.assertTrue(avgMs < 1000, + "Avg per refresh cycle for 200K tables should be < 1s, avg=" + avgMs + " ms"); + } + + private static class RecordingAppender extends AbstractAppender { + private final List messages = new ArrayList<>(); + + RecordingAppender(String name) { + 
super(name, null, null, true, Property.EMPTY_ARRAY); + } + + @Override + public void append(LogEvent event) { + messages.add(event.getMessage().getFormattedMessage()); + } + + String messagesAsString() { + return String.join("\n", messages); + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/CloudWarmUpJobTableFilterTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/CloudWarmUpJobTableFilterTest.java new file mode 100644 index 00000000000000..1af6bf284db221 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/CloudWarmUpJobTableFilterTest.java @@ -0,0 +1,461 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.cloud; + +import org.apache.doris.cloud.CloudWarmUpJob.PersistedTableFilterRule; +import org.apache.doris.common.Config; +import org.apache.doris.common.io.Text; + +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Tests for table-filter extensions in {@link CloudWarmUpJob}: + * canonicalize(), rebuildOnTablesFilter(), hasTableFilter(), getJobInfo(), + * getMatchedTablesString(), dynamic table ID tracking, SHOW WARM UP JOB columns. + */ +public class CloudWarmUpJobTableFilterTest { + + private static final int COL_JOB_ID = 0; + private static final int COL_SRC = 1; + private static final int COL_DST = 2; + private static final int COL_STATUS = 3; + private static final int COL_TYPE = 4; + private static final int COL_SYNC_MODE = 5; + private static final int COL_CREATE_TIME = 6; + private static final int COL_START_TIME = 7; + private static final int COL_FINISH_BATCH = 8; + private static final int COL_ALL_BATCH = 9; + private static final int COL_FINISH_TIME = 10; + private static final int COL_ERR_MSG = 11; + private static final int COL_TABLES = 12; + private static final int COL_TABLE_FILTER = 13; + private static final int COL_MATCHED_TABLES = 14; + private static final int COL_SYNC_STATS = 15; + private static final int TOTAL_COLUMNS = 16; + + private PersistedTableFilterRule rule(String type, String pattern) { + PersistedTableFilterRule r = new PersistedTableFilterRule(); + r.ruleType = type; + r.pattern = pattern; + return r; + } + + private CloudWarmUpJob.Builder baseBuilder() { + return new CloudWarmUpJob.Builder() + 
.setJobId(1L) + .setSrcClusterName("write_cg") + .setDstClusterName("read_cg") + .setJobType(CloudWarmUpJob.JobType.TABLES) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN); + } + + private CloudWarmUpJob.Builder clusterBuilder() { + return new CloudWarmUpJob.Builder() + .setJobId(1L) + .setSrcClusterName("write_cg") + .setDstClusterName("read_cg") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN); + } + + // ===== canonicalize() ===== + + @Test + public void testCanonicalizeIncludeOnly() { + List rules = Arrays.asList( + rule("INCLUDE", "dw.*"), + rule("INCLUDE", "ods.*")); + String expr = CloudWarmUpJob.canonicalize(rules); + Assertions.assertEquals("{\"include\":[\"dw.*\",\"ods.*\"]}", expr); + } + + @Test + public void testCanonicalizeWithExclude() { + List rules = Arrays.asList( + rule("INCLUDE", "ods.*"), + rule("INCLUDE", "dw.*"), + rule("EXCLUDE", "dw.tmp_*")); + String expr = CloudWarmUpJob.canonicalize(rules); + Assertions.assertEquals("{\"include\":[\"dw.*\",\"ods.*\"],\"exclude\":[\"dw.tmp_*\"]}", expr); + } + + @Test + public void testCanonicalizeOrderIndependentAndDedup() { + // Different order + duplicates → same canonical form (FAQ: order doesn't matter) + List rules1 = Arrays.asList( + rule("INCLUDE", "ods.*"), rule("INCLUDE", "dw.*"), rule("EXCLUDE", "dw.tmp_*")); + List rules2 = Arrays.asList( + rule("EXCLUDE", "dw.tmp_*"), rule("INCLUDE", "dw.*"), + rule("INCLUDE", "ods.*"), rule("INCLUDE", "ods.*")); + Assertions.assertEquals( + CloudWarmUpJob.canonicalize(rules1), + CloudWarmUpJob.canonicalize(rules2)); + } + + @Test + public void testBuilderNormalizesPersistedTableFilterRules() { + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList( + rule("EXCLUDE", "dw.tmp_*"), + rule("INCLUDE", "ods.*"), + rule("INCLUDE", "dw.*"), + rule("INCLUDE", "ods.*"))) + .build(); + + List normalizedRules = job.getTableFilterRules(); + Assertions.assertEquals(3, normalizedRules.size()); + 
Assertions.assertEquals("INCLUDE", normalizedRules.get(0).ruleType); + Assertions.assertEquals("dw.*", normalizedRules.get(0).pattern); + Assertions.assertEquals("INCLUDE", normalizedRules.get(1).ruleType); + Assertions.assertEquals("ods.*", normalizedRules.get(1).pattern); + Assertions.assertEquals("EXCLUDE", normalizedRules.get(2).ruleType); + Assertions.assertEquals("dw.tmp_*", normalizedRules.get(2).pattern); + } + + @Test + public void testCanonicalizeExcludeKeyAbsentWhenNoExcludes() { + String expr = CloudWarmUpJob.canonicalize(Arrays.asList(rule("INCLUDE", "ods.*"))); + Assertions.assertFalse(expr.contains("exclude")); + } + + @Test + public void testCanonicalizeEmptyRules() { + String expr = CloudWarmUpJob.canonicalize(new ArrayList<>()); + Assertions.assertEquals("{\"include\":[]}", expr); + } + + // ===== rebuildOnTablesFilter() ===== + + @Test + public void testRebuildOnTablesFilter() { + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList( + rule("INCLUDE", "ods.*"), rule("EXCLUDE", "ods.tmp_*"))) + .build(); + job.rebuildOnTablesFilter(); + + OnTablesFilter filter = job.getOnTablesFilter(); + Assertions.assertNotNull(filter); + Assertions.assertTrue(filter.shouldWarmUp("ods", "orders")); + Assertions.assertFalse(filter.shouldWarmUp("ods", "tmp_staging")); + Assertions.assertFalse(filter.shouldWarmUp("dw", "something")); + } + + @Test + public void testRebuildOnTablesFilterAlsoComputesExpr() { + // tableFilterExpr is transient, so after rebuild it should be recomputed from rules + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList( + rule("INCLUDE", "ods.*"), rule("EXCLUDE", "ods.tmp_*"))) + .build(); + job.rebuildOnTablesFilter(); + List info = job.getJobInfo(null); + Assertions.assertEquals("{\"include\":[\"ods.*\"],\"exclude\":[\"ods.tmp_*\"]}", + info.get(COL_TABLE_FILTER)); + } + + @Test + public void testReadNormalizesPersistedTableFilterRules() throws IOException { + String json = "{" + + "\"jobId\":1," + 
+ "\"jobState\":\"PENDING\"," + + "\"srcClusterName\":\"write_cg\"," + + "\"cloudClusterName\":\"read_cg\"," + + "\"JobType\":\"TABLES\"," + + "\"syncMode\":\"EVENT_DRIVEN\"," + + "\"tableFilterRules\":[" + + "{\"ruleType\":\"EXCLUDE\",\"pattern\":\"dw.tmp_*\"}," + + "{\"ruleType\":\"INCLUDE\",\"pattern\":\"ods.*\"}," + + "{\"ruleType\":\"INCLUDE\",\"pattern\":\"dw.*\"}," + + "{\"ruleType\":\"INCLUDE\",\"pattern\":\"ods.*\"}" + + "]" + + "}"; + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + DataOutputStream out = new DataOutputStream(bytes); + Text.writeString(out, json); + out.flush(); + + CloudWarmUpJob job = CloudWarmUpJob.read( + new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()))); + + List normalizedRules = job.getTableFilterRules(); + Assertions.assertEquals(3, normalizedRules.size()); + Assertions.assertEquals("INCLUDE", normalizedRules.get(0).ruleType); + Assertions.assertEquals("dw.*", normalizedRules.get(0).pattern); + Assertions.assertEquals("INCLUDE", normalizedRules.get(1).ruleType); + Assertions.assertEquals("ods.*", normalizedRules.get(1).pattern); + Assertions.assertEquals("EXCLUDE", normalizedRules.get(2).ruleType); + Assertions.assertEquals("dw.tmp_*", normalizedRules.get(2).pattern); + Assertions.assertEquals("{\"include\":[\"dw.*\",\"ods.*\"],\"exclude\":[\"dw.tmp_*\"]}", + job.getTableFilterExpr()); + } + + @Test + public void testRebuildOnTablesFilterNoRules() { + CloudWarmUpJob job = baseBuilder().build(); + job.rebuildOnTablesFilter(); + Assertions.assertNull(job.getOnTablesFilter()); + } + + // ===== hasTableFilter() ===== + + @Test + public void testHasTableFilter() { + CloudWarmUpJob withFilter = baseBuilder() + .setTableFilterRules(Arrays.asList(rule("INCLUDE", "ods.*"))) + .build(); + Assertions.assertTrue(withFilter.hasTableFilter()); + + CloudWarmUpJob withoutFilter = baseBuilder().build(); + Assertions.assertFalse(withoutFilter.hasTableFilter()); + } + + // ===== tableFilterExpr derived from rules 
(single source of truth) ===== + + @Test + public void testTableFilterExprDerivedFromRules() { + // tableFilterExpr should be computed from rules, not set explicitly + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList( + rule("INCLUDE", "dw.*"), rule("INCLUDE", "ods.*"))) + .build(); + List info = job.getJobInfo(null); + Assertions.assertEquals("{\"include\":[\"dw.*\",\"ods.*\"]}", info.get(COL_TABLE_FILTER)); + } + + @Test + public void testTableFilterExprEmptyWhenNoRules() { + CloudWarmUpJob job = baseBuilder().build(); + List info = job.getJobInfo(null); + Assertions.assertEquals("", info.get(COL_TABLE_FILTER)); + } + + // ===== getJobInfo() — SHOW WARM UP JOB output ===== + + @Test + public void testGetJobInfoTableLevelJob() { + // Scenario: user creates a table-level event-driven job and runs SHOW WARM UP JOB + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList( + rule("INCLUDE", "ods.*"), rule("EXCLUDE", "ods.tmp_*"))) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + // Simulate resolved table IDs with db.table names + Map idNames = new HashMap<>(); + idNames.put(1001L, "ods.orders"); + idNames.put(1002L, "ods.products"); + idNames.put(1003L, "ods.users"); + job.setCurrentTableIdNames(idNames); + + List info = job.getJobInfo(null); + Assertions.assertEquals(TOTAL_COLUMNS, info.size()); + Assertions.assertEquals("1", info.get(COL_JOB_ID)); + Assertions.assertEquals("write_cg", info.get(COL_SRC)); + Assertions.assertEquals("read_cg", info.get(COL_DST)); + Assertions.assertEquals("PENDING", info.get(COL_STATUS)); + Assertions.assertEquals("TABLES", info.get(COL_TYPE)); + Assertions.assertTrue(info.get(COL_SYNC_MODE).contains("EVENT_DRIVEN")); + Assertions.assertEquals("{\"include\":[\"ods.*\"],\"exclude\":[\"ods.tmp_*\"]}", + info.get(COL_TABLE_FILTER)); + // MatchedTables should show sorted db.table names + Assertions.assertEquals("ods.orders, ods.products, ods.users", info.get(COL_MATCHED_TABLES)); + } + + 
@Test + public void testGetJobInfoMatchedTablesTruncated() { + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList(rule("INCLUDE", "ods.*"))) + .build(); + Map idNames = new HashMap<>(); + int originalDisplayLimit = Config.cloud_warm_up_matched_tables_display_limit; + Config.cloud_warm_up_matched_tables_display_limit = 3; + int totalTables = 5; + try { + for (int i = 0; i < totalTables; i++) { + idNames.put((long) i, String.format("ods.tbl_%03d", i)); + } + job.setCurrentTableIdNames(idNames); + + String matchedTables = job.getJobInfo(null).get(COL_MATCHED_TABLES); + Assertions.assertEquals("ods.tbl_000, ods.tbl_001, ods.tbl_002, " + + "... (truncated, 3 of 5 shown)", matchedTables); + Assertions.assertFalse(matchedTables.contains("ods.tbl_003")); + Assertions.assertEquals(totalTables, job.getCurrentTableIds().size()); + } finally { + Config.cloud_warm_up_matched_tables_display_limit = originalDisplayLimit; + } + } + + @Test + public void testMatchedTablesLogDisplayTruncated() { + List logEntries = new ArrayList<>(); + int originalDisplayLimit = Config.cloud_warm_up_matched_tables_display_limit; + Config.cloud_warm_up_matched_tables_display_limit = 3; + int totalTables = 5; + try { + for (int i = 0; i < totalTables; i++) { + logEntries.add(String.format("%d:ods.tbl_%03d", i, i)); + } + + String matchedTables = CloudWarmUpJob.formatMatchedTablesForDisplay(logEntries); + Assertions.assertEquals("0:ods.tbl_000, 1:ods.tbl_001, 2:ods.tbl_002, " + + "... 
(truncated, 3 of 5 shown)", matchedTables); + Assertions.assertFalse(matchedTables.contains("3:ods.tbl_003")); + } finally { + Config.cloud_warm_up_matched_tables_display_limit = originalDisplayLimit; + } + } + + @Test + public void testGetJobInfoClusterLevelJob() { + // Scenario: cluster-level job without ON TABLES — TableFilter and MatchedTables are empty + CloudWarmUpJob job = clusterBuilder() + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + List info = job.getJobInfo(null); + Assertions.assertEquals(TOTAL_COLUMNS, info.size()); + Assertions.assertEquals("CLUSTER", info.get(COL_TYPE)); + Assertions.assertEquals("", info.get(COL_TABLE_FILTER)); + Assertions.assertEquals("", info.get(COL_MATCHED_TABLES)); + Assertions.assertEquals("", info.get(COL_TABLES)); + } + + @Test + public void testGetJobInfoClusterLevelEventDrivenJobShowsSyncStats() { + CloudWarmUpJob job = clusterBuilder() + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + JobWarmUpStats stats = new JobWarmUpStats(); + stats.requestedSegmentNum30m = 6; + stats.requestedSegmentSize30m = 2048; + stats.requestedIndexNum30m = 2; + stats.requestedIndexSize30m = 1024; + stats.finishSegmentNum30m = 4; + stats.finishSegmentSize30m = 1024; + stats.finishIndexNum30m = 1; + stats.finishIndexSize30m = 512; + stats.failSegmentNum30m = 1; + stats.lastTriggerTs = 5000; + stats.progressTriggerTs = 4200; + stats.computeGap(); + + List detailed = job.getJobInfo(stats, true); + Assertions.assertEquals(TOTAL_COLUMNS, detailed.size()); + Assertions.assertEquals("CLUSTER", detailed.get(COL_TYPE)); + Assertions.assertEquals("", detailed.get(COL_TABLE_FILTER)); + Assertions.assertEquals("", detailed.get(COL_MATCHED_TABLES)); + + JsonObject detailStats = JsonParser.parseString(detailed.get(COL_SYNC_STATS)).getAsJsonObject(); + JsonObject segNum = detailStats.getAsJsonObject("seg_num"); + Assertions.assertEquals(6, segNum.get("requested_30m").getAsLong()); + Assertions.assertEquals(4, 
segNum.get("finish_30m").getAsLong()); + Assertions.assertEquals(2, segNum.get("gap_30m").getAsLong()); + Assertions.assertEquals(800, detailStats.get("trigger_gap_ms").getAsLong()); + Assertions.assertFalse(detailStats.has("window")); + + List summary = job.getJobInfo(stats, false); + JsonObject summaryStats = JsonParser.parseString(summary.get(COL_SYNC_STATS)).getAsJsonObject(); + Assertions.assertEquals("30m", summaryStats.get("window").getAsString()); + Assertions.assertEquals("3kb", summaryStats.get("src_size").getAsString()); + Assertions.assertEquals("1.5kb", summaryStats.get("dst_size").getAsString()); + Assertions.assertEquals("1.5kb", summaryStats.get("gap_size").getAsString()); + Assertions.assertEquals(800, summaryStats.get("trigger_gap_ms").getAsLong()); + Assertions.assertFalse(summaryStats.has("seg_num")); + } + + @Test + public void testGetJobInfoMatchedTablesEmpty() { + // Scenario: all matched tables have been dropped → MatchedTables becomes empty + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList(rule("INCLUDE", "ods.*"))) + .build(); + // Initially had tables, now all dropped + job.setCurrentTableIdNames(new HashMap<>()); + + List info = job.getJobInfo(null); + Assertions.assertEquals("{\"include\":[\"ods.*\"]}", info.get(COL_TABLE_FILTER)); + Assertions.assertEquals("", info.get(COL_MATCHED_TABLES)); + } + + // ===== Dynamic table ID tracking (simulating create/drop/rename) ===== + + @Test + public void testDynamicTableIdTracking() { + // Scenario: User guide says system re-evaluates every 60s. 
+ // - Initial: tables 1001, 1002 matched + // - New table 1003 created → next refresh adds it + // - Table 1001 dropped → next refresh removes it + CloudWarmUpJob job = baseBuilder() + .setTableFilterRules(Arrays.asList(rule("INCLUDE", "ods.*"))) + .build(); + + // Phase 1: initial resolution + Map initial = new HashMap<>(); + initial.put(1001L, "ods.orders"); + initial.put(1002L, "ods.products"); + job.setCurrentTableIdNames(initial); + Assertions.assertEquals(2, job.getCurrentTableIds().size()); + Assertions.assertTrue(job.getCurrentTableIds().contains(1001L)); + Assertions.assertTrue(job.getCurrentTableIds().contains(1002L)); + // Verify SHOW output shows db.table names + List info1 = job.getJobInfo(null); + Assertions.assertEquals("ods.orders, ods.products", info1.get(COL_MATCHED_TABLES)); + + // Phase 2: new table created + old table dropped (simulate refresh) + Map afterRefresh = new HashMap<>(); + afterRefresh.put(1002L, "ods.products"); + afterRefresh.put(1003L, "ods.users"); + job.setCurrentTableIdNames(afterRefresh); + Assertions.assertEquals(2, job.getCurrentTableIds().size()); + Assertions.assertFalse(job.getCurrentTableIds().contains(1001L)); + Assertions.assertTrue(job.getCurrentTableIds().contains(1003L)); + List info2 = job.getJobInfo(null); + Assertions.assertEquals("ods.products, ods.users", info2.get(COL_MATCHED_TABLES)); + + // Phase 3: all tables dropped → empty set (Job stays RUNNING per user guide) + job.setCurrentTableIdNames(new HashMap<>()); + Assertions.assertTrue(job.getCurrentTableIds().isEmpty()); + // TableFilter expr is still there (job not cancelled) + Assertions.assertTrue(job.hasTableFilter()); + List info3 = job.getJobInfo(null); + Assertions.assertEquals("", info3.get(COL_MATCHED_TABLES)); + } + + // ===== Builder validation ===== + + @Test + public void testBuilderMissingRequiredFieldsThrows() { + Assertions.assertThrows(IllegalStateException.class, () -> { + new CloudWarmUpJob.Builder().build(); + }); + } +} diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/cloud/OnTablesFilterTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/OnTablesFilterTest.java new file mode 100644 index 00000000000000..65bfad86bc8a05 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/OnTablesFilterTest.java @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule.RuleType; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.Collections; + +/** + * Tests for {@link OnTablesFilter}: glob compilation, INCLUDE/EXCLUDE semantics, + * edge cases for wildcards and regex metacharacters. + */ +public class OnTablesFilterTest { + + private OnTablesFilter buildFilter(TableFilterRule... 
rules) { + return new OnTablesFilter(Arrays.asList(rules)); + } + + private TableFilterRule inc(String pattern) { + return new TableFilterRule(RuleType.INCLUDE, pattern); + } + + private TableFilterRule exc(String pattern) { + return new TableFilterRule(RuleType.EXCLUDE, pattern); + } + + // ===== Glob matching semantics ===== + + @Test + public void testGlobWildcards() { + // '*' matches any characters, '?' matches exactly one character + OnTablesFilter filter = buildFilter(inc("db?.tbl_*")); + Assertions.assertTrue(filter.shouldWarmUp("db1", "tbl_orders")); + Assertions.assertTrue(filter.shouldWarmUp("dbA", "tbl_")); + Assertions.assertFalse(filter.shouldWarmUp("db12", "tbl_x")); // '?' must match exactly one char + Assertions.assertFalse(filter.shouldWarmUp("db", "tbl_x")); // '?' requires one char, not zero + Assertions.assertFalse(filter.shouldWarmUp("db1", "orders")); // prefix must match + } + + @Test + public void testDotIsLiteral() { + // '.' is a regex metachar but in glob it should be literal + TableFilterRule rule = inc("ods.tbl"); + Assertions.assertTrue(rule.matches("ods.tbl")); + Assertions.assertFalse(rule.matches("odsXtbl")); // '.' 
must not match arbitrary char + } + + @Test + public void testRegexMetacharsEscaped() { + // All regex metacharacters should be treated as literals in glob + OnTablesFilter filter = buildFilter(inc("db(1).tbl[2]")); + Assertions.assertTrue(filter.shouldWarmUp("db(1)", "tbl[2]")); + Assertions.assertFalse(filter.shouldWarmUp("db1", "tbl2")); + } + + // ===== INCLUDE / EXCLUDE semantics ===== + + @Test + public void testIncludeOnlyMatchesTargetDb() { + OnTablesFilter filter = buildFilter(inc("ods.*")); + Assertions.assertTrue(filter.shouldWarmUp("ods", "orders")); + Assertions.assertTrue(filter.shouldWarmUp("ods", "users")); + Assertions.assertFalse(filter.shouldWarmUp("dw", "orders")); + } + + @Test + public void testExcludeOverridesInclude() { + OnTablesFilter filter = buildFilter(inc("ods.*"), exc("ods.tmp_*")); + Assertions.assertTrue(filter.shouldWarmUp("ods", "orders")); + Assertions.assertFalse(filter.shouldWarmUp("ods", "tmp_staging")); + Assertions.assertFalse(filter.shouldWarmUp("dw", "orders")); // not included + } + + @Test + public void testMultipleIncludesFormUnion() { + OnTablesFilter filter = buildFilter(inc("ods.*"), inc("dw.*")); + Assertions.assertTrue(filter.shouldWarmUp("ods", "orders")); + Assertions.assertTrue(filter.shouldWarmUp("dw", "fact_sales")); + Assertions.assertFalse(filter.shouldWarmUp("staging", "temp")); + } + + @Test + public void testExcludeOnlyNeverMatches() { + // No INCLUDE rules means nothing is included, regardless of EXCLUDE + OnTablesFilter filter = buildFilter(exc("ods.tmp_*")); + Assertions.assertFalse(filter.shouldWarmUp("ods", "orders")); + Assertions.assertFalse(filter.shouldWarmUp("ods", "tmp_staging")); + } + + @Test + public void testEmptyRulesNeverMatches() { + OnTablesFilter filter = new OnTablesFilter(Collections.emptyList()); + Assertions.assertFalse(filter.shouldWarmUp("any", "table")); + } + + // ===== Typical user scenario: multiple databases + selective exclusion ===== + + @Test + public void 
testComplexScenario() { + // Include everything in ods and dw, but exclude all tmp tables and a specific table + OnTablesFilter filter = buildFilter( + inc("ods.*"), inc("dw.*"), + exc("*.tmp_*"), exc("dw.secret_report")); + Assertions.assertTrue(filter.shouldWarmUp("ods", "orders")); + Assertions.assertTrue(filter.shouldWarmUp("dw", "fact_sales")); + Assertions.assertFalse(filter.shouldWarmUp("ods", "tmp_staging")); + Assertions.assertFalse(filter.shouldWarmUp("dw", "tmp_data")); + Assertions.assertFalse(filter.shouldWarmUp("dw", "secret_report")); + Assertions.assertFalse(filter.shouldWarmUp("staging", "anything")); + } + + // ===== Rule partitioning ===== + + @Test + public void testGetRulesPartition() { + OnTablesFilter filter = buildFilter(inc("ods.*"), exc("ods.tmp_*"), inc("dw.*")); + Assertions.assertEquals(2, filter.getIncludeRules().size()); + Assertions.assertEquals(1, filter.getExcludeRules().size()); + Assertions.assertEquals(3, filter.getAllRules().size()); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpClusterOnTablesParseTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpClusterOnTablesParseTest.java new file mode 100644 index 00000000000000..8bec1d2d5eb394 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpClusterOnTablesParseTest.java @@ -0,0 +1,447 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import org.apache.doris.catalog.Env; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule.RuleType; +import org.apache.doris.cloud.catalog.ComputeGroup; +import org.apache.doris.cloud.system.CloudSystemInfoService; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.Config; +import org.apache.doris.nereids.parser.NereidsParser; +import org.apache.doris.nereids.trees.plans.commands.WarmUpClusterCommand; +import org.apache.doris.qe.ConnectContext; + +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.lang.reflect.Field; +import java.util.Arrays; +import java.util.List; + +/** + * Tests parsing of WARM UP CLUSTER ... ON TABLES (...) grammar. + * Covers valid syntax, extracted rule types/patterns, and syntax errors. 
+ */ +public class WarmUpClusterOnTablesParseTest { + + private static ConnectContext connectContext; + private static Env env; + private static Object originalSystemInfo; + + private static void setField(Object target, Class clazz, String fieldName, Object value) throws Exception { + Field field = clazz.getDeclaredField(fieldName); + field.setAccessible(true); + field.set(target, value); + } + + private static Object getField(Object target, Class clazz, String fieldName) throws Exception { + Field field = clazz.getDeclaredField(fieldName); + field.setAccessible(true); + return field.get(target); + } + + @BeforeAll + public static void init() throws Exception { + env = Env.getCurrentEnv(); + originalSystemInfo = getField(env, Env.class, "systemInfo"); + connectContext = new ConnectContext(); + connectContext.setEnv(env); + connectContext.setThreadLocalInfo(); + } + + @AfterAll + public static void tearDown() throws Exception { + setField(env, Env.class, "systemInfo", originalSystemInfo); + ConnectContext.remove(); + } + + private WarmUpClusterCommand parse(String sql) { + try { + return (WarmUpClusterCommand) new NereidsParser().parseSingle(sql); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private void analyze(WarmUpClusterCommand stmt) throws Exception { + stmt.validate(connectContext); + } + + private void mockValidateEnv(String srcCluster, String dstCluster) throws Exception { + CloudSystemInfoService cloudSys = Mockito.mock(CloudSystemInfoService.class); + Mockito.when(cloudSys.containClusterName(srcCluster)).thenReturn(true); + Mockito.when(cloudSys.containClusterName(dstCluster)).thenReturn(true); + setField(env, Env.class, "systemInfo", cloudSys); + } + + private CloudSystemInfoService buildCloudSystemInfoWithVirtualComputeGroup( + String virtualComputeGroupName, String activeComputeGroupName, String standbyComputeGroupName) { + CloudSystemInfoService cloudSys = new CloudSystemInfoService(); + addVirtualComputeGroup(cloudSys, 
virtualComputeGroupName, activeComputeGroupName, standbyComputeGroupName); + return cloudSys; + } + + private void addVirtualComputeGroup(CloudSystemInfoService cloudSys, + String virtualComputeGroupName, String activeComputeGroupName, String standbyComputeGroupName) { + ComputeGroup activeComputeGroup = new ComputeGroup(activeComputeGroupName + "_id", + activeComputeGroupName, ComputeGroup.ComputeTypeEnum.COMPUTE); + ComputeGroup standbyComputeGroup = new ComputeGroup(standbyComputeGroupName + "_id", + standbyComputeGroupName, ComputeGroup.ComputeTypeEnum.COMPUTE); + ComputeGroup virtualComputeGroup = new ComputeGroup(virtualComputeGroupName + "_id", + virtualComputeGroupName, ComputeGroup.ComputeTypeEnum.VIRTUAL); + virtualComputeGroup.setSubComputeGroups(Arrays.asList(activeComputeGroupName, standbyComputeGroupName)); + ComputeGroup.Policy policy = new ComputeGroup.Policy(); + policy.setActiveComputeGroup(activeComputeGroupName); + policy.setStandbyComputeGroup(standbyComputeGroupName); + virtualComputeGroup.setPolicy(policy); + + cloudSys.addComputeGroup(activeComputeGroup.getId(), activeComputeGroup); + cloudSys.addComputeGroup(standbyComputeGroup.getId(), standbyComputeGroup); + cloudSys.addComputeGroup(virtualComputeGroup.getId(), virtualComputeGroup); + } + + // ===== Valid syntax: ON TABLES clause is parsed correctly ===== + + @Test + public void testOnTablesSingleInclude() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + List rules = cmd.getOnTablesRules(); + Assertions.assertNotNull(rules); + Assertions.assertEquals(1, rules.size()); + Assertions.assertEquals(RuleType.INCLUDE, rules.get(0).getRuleType()); + Assertions.assertEquals("ods.*", rules.get(0).getRawPattern()); + } + + @Test + public void testOnTablesMultipleRules() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES 
(INCLUDE 'ods.*', INCLUDE 'dw.*', EXCLUDE 'dw.tmp_*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + List rules = cmd.getOnTablesRules(); + Assertions.assertNotNull(rules); + Assertions.assertEquals(3, rules.size()); + Assertions.assertEquals(RuleType.INCLUDE, rules.get(0).getRuleType()); + Assertions.assertEquals("ods.*", rules.get(0).getRawPattern()); + Assertions.assertEquals(RuleType.INCLUDE, rules.get(1).getRuleType()); + Assertions.assertEquals("dw.*", rules.get(1).getRawPattern()); + Assertions.assertEquals(RuleType.EXCLUDE, rules.get(2).getRuleType()); + Assertions.assertEquals("dw.tmp_*", rules.get(2).getRawPattern()); + } + + @Test + public void testWithoutOnTablesClause() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + Assertions.assertTrue(cmd.getOnTablesRules().isEmpty()); + } + + @Test + public void testOnTablesWithForce() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src FORCE " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + Assertions.assertTrue(cmd.isForce()); + Assertions.assertNotNull(cmd.getOnTablesRules()); + Assertions.assertEquals(1, cmd.getOnTablesRules().size()); + } + + @Test + public void testOnTablesWithComputeGroup() { + WarmUpClusterCommand cmd = parse( + "WARM UP COMPUTE GROUP dst WITH COMPUTE GROUP src " + + "ON TABLES (INCLUDE 'db1.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + Assertions.assertNotNull(cmd.getOnTablesRules()); + Assertions.assertEquals(1, cmd.getOnTablesRules().size()); + } + + // ===== Syntax errors ===== + + @Test + public void testOnTablesEmptyParensFails() { + Assertions.assertThrows(RuntimeException.class, () -> + parse("WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES () " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')")); + } + + @Test + public void 
testOnTablesMissingParensFails() { + Assertions.assertThrows(RuntimeException.class, () -> + parse("WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES INCLUDE 'ods.*' " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')")); + } + + @Test + public void testOnTablesMissingPatternFails() { + Assertions.assertThrows(RuntimeException.class, () -> + parse("WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (INCLUDE) " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')")); + } + + // ===== Validation logic in WarmUpClusterCommand ===== + + @Test + public void testOnTablesExcludeOnlyParsesButLacksInclude() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (EXCLUDE 'ods.tmp_*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + List rules = cmd.getOnTablesRules(); + Assertions.assertEquals(1, rules.size()); + Assertions.assertEquals(RuleType.EXCLUDE, rules.get(0).getRuleType()); + boolean hasInclude = rules.stream() + .anyMatch(r -> r.getRuleType() == RuleType.INCLUDE); + Assertions.assertFalse(hasInclude, "Exclude-only rules should have no INCLUDE"); + } + + @Test + public void testOnTablesNonEventDrivenSyncModeParses() { + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='once')"); + Assertions.assertNotNull(cmd.getOnTablesRules()); + Assertions.assertEquals("once", cmd.getProperties().get("sync_mode")); + } + + @Test + public void testOnTablesExcludeOnlyValidateFails() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + mockValidateEnv("src", "dst"); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (EXCLUDE 'ods.tmp_*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + Assertions.assertThrows(AnalysisException.class, () -> analyze(cmd)); + } catch (Exception e) { 
+ Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesNonEventDrivenValidateFails() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + mockValidateEnv("src", "dst"); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='once')"); + Assertions.assertThrows(AnalysisException.class, () -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesWithExplicitTableValidateFails() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + mockValidateEnv("src", "dst"); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH TABLE db1.orders " + + "ON TABLES (INCLUDE 'ods.*')"); + AnalysisException exception = Assertions.assertThrows( + AnalysisException.class, () -> analyze(cmd)); + Assertions.assertTrue(exception.getMessage().contains("ON TABLES clause cannot be used")); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesPatternWithoutDbTableFormatValidateFails() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + mockValidateEnv("src", "dst"); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER dst WITH CLUSTER src " + + "ON 
TABLES (INCLUDE 'orders') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + Assertions.assertThrows(AnalysisException.class, () -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesLoadEventValidateFailsWhenComputeGroupsOwnedByVirtualComputeGroup() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + setField(env, Env.class, "systemInfo", buildCloudSystemInfoWithVirtualComputeGroup( + "vcg", "active_cg", "standby_cg")); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER standby_cg WITH CLUSTER active_cg " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + + AnalysisException exception = Assertions.assertThrows( + AnalysisException.class, () -> analyze(cmd)); + Assertions.assertTrue(exception.getMessage().contains( + "Cannot create warm up job from source compute group 'active_cg' " + + "to destination compute group 'standby_cg'")); + Assertions.assertTrue(exception.getMessage().contains( + "source compute group 'active_cg' and destination compute group 'standby_cg' " + + "are both owned by virtual compute group 'vcg'")); + Assertions.assertTrue(exception.getMessage().contains( + "not support")); + Assertions.assertFalse(exception.getMessage().contains( + "cancel the conflicting managed warm-up job")); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesLoadEventValidateAllowsDestinationComputeGroupOwnedByVirtualComputeGroupOnly() { + String 
originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + CloudSystemInfoService cloudSys = buildCloudSystemInfoWithVirtualComputeGroup( + "vcg", "active_cg", "standby_cg"); + cloudSys.addComputeGroup("outside_cg_id", + new ComputeGroup("outside_cg_id", "outside_cg", ComputeGroup.ComputeTypeEnum.COMPUTE)); + setField(env, Env.class, "systemInfo", cloudSys); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER standby_cg WITH CLUSTER outside_cg " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + + Assertions.assertDoesNotThrow(() -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void testOnTablesLoadEventValidateAllowsSourceComputeGroupOwnedByVirtualComputeGroupOnly() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + CloudSystemInfoService cloudSys = buildCloudSystemInfoWithVirtualComputeGroup( + "vcg", "active_cg", "standby_cg"); + cloudSys.addComputeGroup("outside_cg_id", + new ComputeGroup("outside_cg_id", "outside_cg", ComputeGroup.ComputeTypeEnum.COMPUTE)); + setField(env, Env.class, "systemInfo", cloudSys); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER outside_cg WITH CLUSTER active_cg " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + + Assertions.assertDoesNotThrow(() -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } + + @Test + public void 
testOnTablesLoadEventValidateAllowsComputeGroupsOwnedByDifferentVirtualComputeGroups() { + String originalCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud"; + try { + CloudSystemInfoService cloudSys = new CloudSystemInfoService(); + addVirtualComputeGroup(cloudSys, "vcg1", "active_cg", "standby_cg"); + addVirtualComputeGroup(cloudSys, "vcg2", "other_active_cg", "other_standby_cg"); + setField(env, Env.class, "systemInfo", cloudSys); + WarmUpClusterCommand cmd = parse( + "WARM UP CLUSTER other_standby_cg WITH CLUSTER active_cg " + + "ON TABLES (INCLUDE 'ods.*') " + + "PROPERTIES('sync_mode'='event_driven', 'sync_event'='LOAD')"); + + Assertions.assertDoesNotThrow(() -> analyze(cmd)); + } catch (Exception e) { + Assertions.fail(e); + } finally { + try { + setField(env, Env.class, "systemInfo", originalSystemInfo); + } catch (Exception e) { + throw new RuntimeException(e); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpStatsTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpStatsTest.java new file mode 100644 index 00000000000000..2f8c35f73941f0 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/WarmUpStatsTest.java @@ -0,0 +1,497 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud; + +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.HashMap; +import java.util.Map; + +/** + * Unit tests for warmup progress observation data models: + * - TableWarmUpWindowedStats: parse BE JSON, merge from multiple BEs + * - JobWarmUpStats: aggregate requested/finished, compute gap, serialize + */ +public class WarmUpStatsTest { + + // ==================== TableWarmUpWindowedStats ==================== + + @Test + public void testFromJsonComplete() { + String json = "{" + + "\"job_id\": 100," + + "\"requested\": {" + + " \"seg\": {\"num\": {\"5m\": 10, \"30m\": 50, \"1h\": 200}," + + " \"size\": {\"5m\": 1024, \"30m\": 5120, \"1h\": 20480}}," + + " \"idx\": {\"num\": {\"5m\": 3, \"30m\": 15, \"1h\": 60}," + + " \"size\": {\"5m\": 512, \"30m\": 2560, \"1h\": 10240}}" + + "}," + + "\"finish\": {" + + " \"seg\": {\"num\": {\"5m\": 8, \"30m\": 45, \"1h\": 190}," + + " \"size\": {\"5m\": 800, \"30m\": 4500, \"1h\": 19000}}," + + " \"idx\": {\"num\": {\"5m\": 2, \"30m\": 12, \"1h\": 55}," + + " \"size\": {\"5m\": 400, \"30m\": 2400, \"1h\": 9500}}" + + "}," + + "\"fail\": {" + + " \"seg\": {\"num\": {\"5m\": 1, \"30m\": 3, \"1h\": 5}," + + " \"size\": {\"5m\": 100, \"30m\": 300, \"1h\": 500}}," + + " \"idx\": {\"num\": {\"5m\": 0, \"30m\": 1, \"1h\": 2}," + + " \"size\": {\"5m\": 0, \"30m\": 50, \"1h\": 100}}" + + "}," + + "\"last_trigger_ts\": 1700000000000," + + "\"last_finish_ts\": 1700000001000," + + "\"progress_trigger_ts\": 1699999999000" + + "}"; + JsonObject obj = JsonParser.parseString(json).getAsJsonObject(); + TableWarmUpWindowedStats stats = TableWarmUpWindowedStats.fromJson(obj); + + // requested + Assertions.assertEquals(10, stats.requestedSegmentNum5m); + Assertions.assertEquals(50, 
stats.requestedSegmentNum30m); + Assertions.assertEquals(200, stats.requestedSegmentNum1h); + Assertions.assertEquals(1024, stats.requestedSegmentSize5m); + Assertions.assertEquals(3, stats.requestedIndexNum5m); + Assertions.assertEquals(512, stats.requestedIndexSize5m); + + // finish + Assertions.assertEquals(8, stats.finishSegmentNum5m); + Assertions.assertEquals(45, stats.finishSegmentNum30m); + Assertions.assertEquals(400, stats.finishIndexSize5m); + + // fail + Assertions.assertEquals(1, stats.failSegmentNum5m); + Assertions.assertEquals(0, stats.failIndexNum5m); + Assertions.assertEquals(100, stats.failSegmentSize5m); + + // timestamps + Assertions.assertEquals(1700000000000L, stats.lastTriggerTs); + Assertions.assertEquals(1700000001000L, stats.lastFinishTs); + Assertions.assertEquals(1699999999000L, stats.progressTriggerTs); + } + + @Test + public void testFromJsonMissingSections() { + // JSON with only requested, no finish or fail + String json = "{" + + "\"job_id\": 200," + + "\"requested\": {" + + " \"seg\": {\"num\": {\"5m\": 5}}" + + "}" + + "}"; + JsonObject obj = JsonParser.parseString(json).getAsJsonObject(); + TableWarmUpWindowedStats stats = TableWarmUpWindowedStats.fromJson(obj); + + Assertions.assertEquals(5, stats.requestedSegmentNum5m); + Assertions.assertEquals(0, stats.requestedSegmentNum30m); + Assertions.assertEquals(0, stats.finishSegmentNum5m); + Assertions.assertEquals(0, stats.failSegmentNum5m); + Assertions.assertEquals(0, stats.lastTriggerTs); + Assertions.assertEquals(0, stats.lastFinishTs); + Assertions.assertEquals(0, stats.progressTriggerTs); + } + + @Test + public void testFromJsonEmptyObject() { + String json = "{\"job_id\": 300}"; + JsonObject obj = JsonParser.parseString(json).getAsJsonObject(); + TableWarmUpWindowedStats stats = TableWarmUpWindowedStats.fromJson(obj); + + Assertions.assertEquals(0, stats.requestedSegmentNum5m); + Assertions.assertEquals(0, stats.finishSegmentNum5m); + Assertions.assertEquals(0, 
stats.failSegmentNum5m); + } + + @Test + public void testMergeAddsCounts() { + TableWarmUpWindowedStats a = new TableWarmUpWindowedStats(); + a.requestedSegmentNum5m = 10; + a.requestedSegmentSize5m = 1000; + a.finishSegmentNum5m = 8; + a.failSegmentNum5m = 1; + a.lastTriggerTs = 100; + a.lastFinishTs = 200; + a.progressTriggerTs = 500; + + TableWarmUpWindowedStats b = new TableWarmUpWindowedStats(); + b.requestedSegmentNum5m = 20; + b.requestedSegmentSize5m = 2000; + b.finishSegmentNum5m = 15; + b.failSegmentNum5m = 2; + b.lastTriggerTs = 150; + b.lastFinishTs = 180; + b.progressTriggerTs = 300; + + a.merge(b); + + Assertions.assertEquals(30, a.requestedSegmentNum5m); + Assertions.assertEquals(3000, a.requestedSegmentSize5m); + Assertions.assertEquals(23, a.finishSegmentNum5m); + Assertions.assertEquals(3, a.failSegmentNum5m); + Assertions.assertEquals(150, a.lastTriggerTs); // max + Assertions.assertEquals(200, a.lastFinishTs); // max + Assertions.assertEquals(300, a.progressTriggerTs); // min positive + } + + @Test + public void testMergeProgressTriggerTsIgnoresMissingValues() { + TableWarmUpWindowedStats a = new TableWarmUpWindowedStats(); + a.progressTriggerTs = 500; + + TableWarmUpWindowedStats missing = new TableWarmUpWindowedStats(); + a.merge(missing); + Assertions.assertEquals(500, a.progressTriggerTs); + + TableWarmUpWindowedStats b = new TableWarmUpWindowedStats(); + b.progressTriggerTs = 300; + missing.merge(b); + Assertions.assertEquals(300, missing.progressTriggerTs); + } + + // ==================== JobWarmUpStats ==================== + + @Test + public void testMergeRequestedAccumulates() { + JobWarmUpStats job = new JobWarmUpStats(); + + TableWarmUpWindowedStats src1 = new TableWarmUpWindowedStats(); + src1.requestedSegmentNum5m = 10; + src1.requestedSegmentSize5m = 1000; + src1.requestedIndexNum5m = 3; + src1.lastTriggerTs = 100; + + TableWarmUpWindowedStats src2 = new TableWarmUpWindowedStats(); + src2.requestedSegmentNum5m = 20; + 
src2.requestedSegmentSize5m = 2000; + src2.requestedIndexNum5m = 5; + src2.lastTriggerTs = 200; + + job.mergeRequested(src1); + job.mergeRequested(src2); + + Assertions.assertEquals(30, job.requestedSegmentNum5m); + Assertions.assertEquals(3000, job.requestedSegmentSize5m); + Assertions.assertEquals(8, job.requestedIndexNum5m); + Assertions.assertEquals(200, job.lastTriggerTs); + } + + @Test + public void testMergeFinishedAccumulates() { + JobWarmUpStats job = new JobWarmUpStats(); + + TableWarmUpWindowedStats dst = new TableWarmUpWindowedStats(); + dst.finishSegmentNum5m = 7; + dst.finishSegmentSize5m = 700; + dst.failSegmentNum5m = 2; + dst.failSegmentSize5m = 200; + dst.lastFinishTs = 300; + dst.progressTriggerTs = 250; + + job.mergeFinished(dst); + + Assertions.assertEquals(7, job.finishSegmentNum5m); + Assertions.assertEquals(700, job.finishSegmentSize5m); + Assertions.assertEquals(2, job.failSegmentNum5m); + Assertions.assertEquals(200, job.failSegmentSize5m); + Assertions.assertEquals(300, job.lastFinishTs); + Assertions.assertEquals(250, job.progressTriggerTs); + } + + @Test + public void testComputeGap() { + JobWarmUpStats job = new JobWarmUpStats(); + job.requestedSegmentNum5m = 100; + job.requestedSegmentNum30m = 500; + job.requestedSegmentNum1h = 2000; + job.requestedSegmentSize5m = 10240; + job.requestedIndexNum5m = 30; + + job.finishSegmentNum5m = 80; + job.finishSegmentNum30m = 450; + job.finishSegmentNum1h = 1900; + job.finishSegmentSize5m = 8192; + job.finishIndexNum5m = 25; + job.lastTriggerTs = 5000; + job.progressTriggerTs = 3000; + + job.computeGap(); + + Assertions.assertEquals(20, job.gapSegmentNum5m); + Assertions.assertEquals(50, job.gapSegmentNum30m); + Assertions.assertEquals(100, job.gapSegmentNum1h); + Assertions.assertEquals(2048, job.gapSegmentSize5m); + Assertions.assertEquals(5, job.gapIndexNum5m); + Assertions.assertEquals(2000, job.triggerGapMs); + } + + @Test + public void testComputeGapNegative() { + // Finished can exceed 
requested in windowed metrics (timing variance) + JobWarmUpStats job = new JobWarmUpStats(); + job.requestedSegmentNum5m = 10; + job.finishSegmentNum5m = 15; + + job.computeGap(); + + Assertions.assertEquals(-5, job.gapSegmentNum5m); + } + + @Test + public void testToJsonStringStructure() { + JobWarmUpStats job = new JobWarmUpStats(); + job.requestedSegmentNum5m = 100; + job.finishSegmentNum5m = 80; + job.failSegmentNum5m = 5; + job.gapSegmentNum5m = 20; + job.requestedSegmentSize5m = 1048576; // 1 MB + job.finishSegmentSize5m = 524288; // 512 KB + job.gapSegmentSize5m = 524288; + + String jsonStr = job.toJsonString(); + JsonObject root = JsonParser.parseString(jsonStr).getAsJsonObject(); + + // Verify structure + Assertions.assertTrue(root.has("seg_num")); + Assertions.assertTrue(root.has("seg_size")); + Assertions.assertTrue(root.has("idx_num")); + Assertions.assertTrue(root.has("idx_size")); + Assertions.assertTrue(root.has("last_trigger_ts")); + Assertions.assertTrue(root.has("last_finish_ts")); + Assertions.assertTrue(root.has("progress_trigger_ts")); + Assertions.assertTrue(root.has("trigger_gap_ms")); + Assertions.assertFalse(root.has("window")); + Assertions.assertFalse(root.has("src_size")); + Assertions.assertFalse(root.has("dst_size")); + Assertions.assertFalse(root.has("gap_size")); + + // seg_num values + JsonObject segNum = root.getAsJsonObject("seg_num"); + Assertions.assertEquals(100, segNum.get("requested_5m").getAsLong()); + Assertions.assertEquals(80, segNum.get("finish_5m").getAsLong()); + Assertions.assertEquals(20, segNum.get("gap_5m").getAsLong()); + Assertions.assertEquals(5, segNum.get("fail_5m").getAsLong()); + + // seg_size values are human-readable strings (via ByteSizeValue) + JsonObject segSize = root.getAsJsonObject("seg_size"); + Assertions.assertEquals("1mb", segSize.get("requested_5m").getAsString()); + Assertions.assertEquals("512kb", segSize.get("finish_5m").getAsString()); + } + + @Test + public void 
testToJsonStringZeroTimestamps() { + JobWarmUpStats job = new JobWarmUpStats(); + // All zeros + String jsonStr = job.toJsonString(); + JsonObject root = JsonParser.parseString(jsonStr).getAsJsonObject(); + + // Zero timestamps should be empty strings + Assertions.assertEquals("", root.get("last_trigger_ts").getAsString()); + Assertions.assertEquals("", root.get("last_finish_ts").getAsString()); + Assertions.assertEquals("", root.get("progress_trigger_ts").getAsString()); + Assertions.assertEquals(0, root.get("trigger_gap_ms").getAsLong()); + + // Zero counts + JsonObject segNum = root.getAsJsonObject("seg_num"); + Assertions.assertEquals(0, segNum.get("requested_5m").getAsLong()); + Assertions.assertEquals(0, segNum.get("gap_5m").getAsLong()); + } + + @Test + public void testToSummaryJsonStringMergesDataAndIndexSize() { + JobWarmUpStats job = new JobWarmUpStats(); + job.requestedSegmentSize30m = 1048576; // 1 MB + job.requestedIndexSize30m = 1048576; // 1 MB + job.finishSegmentSize30m = 524288; // 512 KB + job.finishIndexSize30m = 524288; // 512 KB + job.lastTriggerTs = 5000; + job.progressTriggerTs = 4500; + job.computeGap(); + + String jsonStr = job.toSummaryJsonString(); + JsonObject root = JsonParser.parseString(jsonStr).getAsJsonObject(); + + Assertions.assertEquals("30m", root.get("window").getAsString()); + Assertions.assertEquals("2mb", root.get("src_size").getAsString()); + Assertions.assertEquals("1mb", root.get("dst_size").getAsString()); + Assertions.assertEquals("1mb", root.get("gap_size").getAsString()); + Assertions.assertEquals(500, root.get("trigger_gap_ms").getAsLong()); + Assertions.assertFalse(root.has("seg_num")); + Assertions.assertFalse(root.has("seg_size")); + Assertions.assertFalse(root.has("idx_num")); + Assertions.assertFalse(root.has("idx_size")); + Assertions.assertFalse(root.has("last_trigger_ts")); + Assertions.assertFalse(root.has("last_finish_ts")); + Assertions.assertFalse(root.has("data_size")); + 
Assertions.assertFalse(root.has("index_size")); + } + + @Test + public void testHumanReadableSizeInJson() { + JobWarmUpStats job = new JobWarmUpStats(); + job.requestedSegmentSize5m = 500; // 500 B + job.finishSegmentSize5m = 1536; // 1.5 KB + job.gapSegmentSize5m = 1048576; // 1.0 MB + job.failSegmentSize5m = 1073741824L; // 1.0 GB + + String jsonStr = job.toJsonString(); + JsonObject segSize = JsonParser.parseString(jsonStr).getAsJsonObject() + .getAsJsonObject("seg_size"); + + Assertions.assertEquals("500b", segSize.get("requested_5m").getAsString()); + Assertions.assertEquals("1.5kb", segSize.get("finish_5m").getAsString()); + Assertions.assertEquals("1mb", segSize.get("gap_5m").getAsString()); + Assertions.assertEquals("1gb", segSize.get("fail_5m").getAsString()); + } + + @Test + public void testEndToEndSourceAndTargetAggregation() { + // Simulate: 2 source BEs + 1 target BE → aggregate into JobWarmUpStats + + // Source BE1 + String srcJson1 = "{" + + "\"job_id\": 42," + + "\"requested\": {" + + " \"seg\": {\"num\": {\"5m\": 50, \"30m\": 200, \"1h\": 800}," + + " \"size\": {\"5m\": 5000, \"30m\": 20000, \"1h\": 80000}}," + + " \"idx\": {\"num\": {\"5m\": 10, \"30m\": 40, \"1h\": 160}," + + " \"size\": {\"5m\": 1000, \"30m\": 4000, \"1h\": 16000}}" + + "}," + + "\"last_trigger_ts\": 1000" + + "}"; + + // Source BE2 + String srcJson2 = "{" + + "\"job_id\": 42," + + "\"requested\": {" + + " \"seg\": {\"num\": {\"5m\": 30, \"30m\": 120, \"1h\": 500}," + + " \"size\": {\"5m\": 3000, \"30m\": 12000, \"1h\": 50000}}," + + " \"idx\": {\"num\": {\"5m\": 6, \"30m\": 24, \"1h\": 100}," + + " \"size\": {\"5m\": 600, \"30m\": 2400, \"1h\": 10000}}" + + "}," + + "\"last_trigger_ts\": 1200" + + "}"; + + // Target BE + String dstJson = "{" + + "\"job_id\": 42," + + "\"finish\": {" + + " \"seg\": {\"num\": {\"5m\": 70, \"30m\": 300, \"1h\": 1250}," + + " \"size\": {\"5m\": 7000, \"30m\": 30000, \"1h\": 125000}}," + + " \"idx\": {\"num\": {\"5m\": 14, \"30m\": 60, \"1h\": 
250}," + + " \"size\": {\"5m\": 1400, \"30m\": 6000, \"1h\": 25000}}" + + "}," + + "\"fail\": {" + + " \"seg\": {\"num\": {\"5m\": 2, \"30m\": 5, \"1h\": 10}," + + " \"size\": {\"5m\": 200, \"30m\": 500, \"1h\": 1000}}," + + " \"idx\": {\"num\": {\"5m\": 0, \"30m\": 1, \"1h\": 3}," + + " \"size\": {\"5m\": 0, \"30m\": 100, \"1h\": 300}}" + + "}," + + "\"last_finish_ts\": 1100," + + "\"progress_trigger_ts\": 900" + + "}"; + + // Parse and merge source BEs + TableWarmUpWindowedStats src = TableWarmUpWindowedStats.fromJson( + JsonParser.parseString(srcJson1).getAsJsonObject()); + src.merge(TableWarmUpWindowedStats.fromJson( + JsonParser.parseString(srcJson2).getAsJsonObject())); + + // Parse target BE + TableWarmUpWindowedStats dst = TableWarmUpWindowedStats.fromJson( + JsonParser.parseString(dstJson).getAsJsonObject()); + + // Aggregate + JobWarmUpStats job = new JobWarmUpStats(); + job.mergeRequested(src); + job.mergeFinished(dst); + job.computeGap(); + + // Verify aggregated requested (50+30=80, 200+120=320, ...) 
+ Assertions.assertEquals(80, job.requestedSegmentNum5m); + Assertions.assertEquals(320, job.requestedSegmentNum30m); + Assertions.assertEquals(1300, job.requestedSegmentNum1h); + Assertions.assertEquals(8000, job.requestedSegmentSize5m); + Assertions.assertEquals(16, job.requestedIndexNum5m); + Assertions.assertEquals(1200, job.lastTriggerTs); // max(1000, 1200) + + // Verify finished + Assertions.assertEquals(70, job.finishSegmentNum5m); + Assertions.assertEquals(300, job.finishSegmentNum30m); + Assertions.assertEquals(2, job.failSegmentNum5m); + Assertions.assertEquals(1100, job.lastFinishTs); + Assertions.assertEquals(900, job.progressTriggerTs); + + // Verify gap + Assertions.assertEquals(10, job.gapSegmentNum5m); // 80 - 70 + Assertions.assertEquals(20, job.gapSegmentNum30m); // 320 - 300 + Assertions.assertEquals(50, job.gapSegmentNum1h); // 1300 - 1250 + Assertions.assertEquals(1000, job.gapSegmentSize5m); // 8000 - 7000 + Assertions.assertEquals(2, job.gapIndexNum5m); // 16 - 14 + Assertions.assertEquals(300, job.triggerGapMs); // 1200 - 900 + } + + @Test + public void testClusterLevelEventDrivenJobAggregatesStatsByJobId() { + CloudWarmUpJob job = new CloudWarmUpJob.Builder() + .setJobId(77L) + .setSrcClusterName("write_cg") + .setDstClusterName("read_cg") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + + TableWarmUpWindowedStats src = new TableWarmUpWindowedStats(); + src.requestedSegmentNum30m = 6; + src.requestedSegmentSize30m = 2048; + src.requestedIndexNum30m = 2; + src.requestedIndexSize30m = 1024; + src.lastTriggerTs = 1000; + + TableWarmUpWindowedStats dst = new TableWarmUpWindowedStats(); + dst.finishSegmentNum30m = 4; + dst.finishSegmentSize30m = 1024; + dst.finishIndexNum30m = 1; + dst.finishIndexSize30m = 512; + dst.failSegmentNum30m = 1; + dst.failSegmentSize30m = 128; + dst.lastFinishTs = 1200; + + Map srcStats = new HashMap<>(); + 
srcStats.put(77L, src); + Map dstStats = new HashMap<>(); + dstStats.put(77L, dst); + Map> clusterStats = new HashMap<>(); + clusterStats.put("write_cg", srcStats); + clusterStats.put("read_cg", dstStats); + + JobWarmUpStats stats = new CacheHotspotManager(null).aggregateStatsForJob(job, clusterStats); + + Assertions.assertEquals(6, stats.requestedSegmentNum30m); + Assertions.assertEquals(4, stats.finishSegmentNum30m); + Assertions.assertEquals(2, stats.gapSegmentNum30m); + Assertions.assertEquals(2, stats.requestedIndexNum30m); + Assertions.assertEquals(1, stats.finishIndexNum30m); + Assertions.assertEquals(1, stats.gapIndexNum30m); + Assertions.assertEquals(1536, stats.gapSegmentSize30m + stats.gapIndexSize30m); + Assertions.assertEquals(1000, stats.lastTriggerTs); + Assertions.assertEquals(1200, stats.lastFinishTs); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/cloud/catalog/CloudInstanceStatusCheckerTest.java b/fe/fe-core/src/test/java/org/apache/doris/cloud/catalog/CloudInstanceStatusCheckerTest.java new file mode 100644 index 00000000000000..ff19f67dcb64dc --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/cloud/catalog/CloudInstanceStatusCheckerTest.java @@ -0,0 +1,260 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.cloud.catalog; + +import org.apache.doris.catalog.DatabaseIf; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.StorageVaultMgr; +import org.apache.doris.catalog.TableIf; +import org.apache.doris.cloud.CacheHotspotManager; +import org.apache.doris.cloud.CloudWarmUpJob; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule; +import org.apache.doris.cloud.OnTablesFilter.TableFilterRule.RuleType; +import org.apache.doris.cloud.proto.Cloud; +import org.apache.doris.cloud.system.CloudSystemInfoService; +import org.apache.doris.common.Config; +import org.apache.doris.datasource.InternalCatalog; +import org.apache.doris.nereids.trees.plans.commands.WarmUpClusterCommand; +import org.apache.doris.persist.EditLog; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.core.LogEvent; +import org.apache.logging.log4j.core.Logger; +import org.apache.logging.log4j.core.appender.AbstractAppender; +import org.apache.logging.log4j.core.config.Property; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.mockito.Mockito; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +public class CloudInstanceStatusCheckerTest { + private String originalCloudUniqueId; + private CloudSystemInfoService cloudSystemInfoService; + private CacheHotspotManager cacheHotspotManager; + private InternalCatalog internalCatalog; + private List> databases; + private MockedStatic mockedEnv; + private CloudEnv cloudEnv; + + @BeforeEach + public void setUp() { + originalCloudUniqueId = Config.cloud_unique_id; + 
Config.cloud_unique_id = "test_cloud"; + + cloudSystemInfoService = Mockito.spy(new CloudSystemInfoService()); + cacheHotspotManager = new CacheHotspotManager(cloudSystemInfoService); + internalCatalog = Mockito.mock(InternalCatalog.class); + databases = new ArrayList<>(); + Mockito.when(internalCatalog.getAllDbs()).thenAnswer(invocation -> databases); + + cloudEnv = Mockito.mock(CloudEnv.class); + AtomicLong nextId = new AtomicLong(10000L); + Mockito.when(cloudEnv.getNextId()).thenAnswer(invocation -> nextId.incrementAndGet()); + Mockito.when(cloudEnv.getEditLog()).thenReturn(Mockito.mock(EditLog.class)); + Mockito.when(cloudEnv.getStorageVaultMgr()).thenReturn(Mockito.mock(StorageVaultMgr.class)); + Mockito.when(cloudEnv.getCacheHotspotMgr()).thenReturn(cacheHotspotManager); + Mockito.when(cloudEnv.isMaster()).thenReturn(false); + + mockedEnv = Mockito.mockStatic(Env.class); + mockedEnv.when(Env::getCurrentEnv).thenReturn(cloudEnv); + mockedEnv.when(Env::getCurrentInternalCatalog).thenReturn(internalCatalog); + mockedEnv.when(Env::getCurrentSystemInfo).thenReturn(cloudSystemInfoService); + } + + @AfterEach + public void tearDown() { + if (mockedEnv != null) { + mockedEnv.close(); + } + Config.cloud_unique_id = originalCloudUniqueId; + } + + @Test + public void testSyncInstanceCreatesVirtualComputeGroup() { + addComputeGroup("active_cg_id", "active_cg"); + addComputeGroup("standby_cg_id", "standby_cg"); + Mockito.doReturn(instanceResponseWithVirtualComputeGroup()).when(cloudSystemInfoService).getCloudInstance(); + + new CloudInstanceStatusChecker(cloudSystemInfoService).runAfterCatalogReady(); + + ComputeGroup virtualComputeGroup = cloudSystemInfoService.getComputeGroupById("vcg_id"); + Assertions.assertNotNull(virtualComputeGroup); + Assertions.assertTrue(virtualComputeGroup.isVirtual()); + Assertions.assertEquals("vcg", virtualComputeGroup.getName()); + Assertions.assertEquals(Arrays.asList("active_cg", "standby_cg"), + 
virtualComputeGroup.getSubComputeGroups()); + Assertions.assertEquals("active_cg", virtualComputeGroup.getActiveComputeGroup()); + Assertions.assertEquals("standby_cg", virtualComputeGroup.getStandbyComputeGroup()); + } + + @Test + public void testSyncInstanceCreatesVirtualComputeGroupAndCancelsTableLevelLoadEvent() throws Exception { + databases.add(mockDb("ods", mockTable(1001, "orders"))); + addComputeGroup("active_cg_id", "active_cg"); + addComputeGroup("standby_cg_id", "standby_cg"); + long tableLevelJobId = cacheHotspotManager.createJob(buildEventDrivenStmt("active_cg", "standby_cg", + new TableFilterRule(RuleType.INCLUDE, "ods.*"))); + Mockito.doReturn(instanceResponseWithVirtualComputeGroup()).when(cloudSystemInfoService).getCloudInstance(); + Mockito.when(cloudEnv.isMaster()).thenReturn(true); + + RecordingAppender appender = new RecordingAppender("vcg-create-cancel-table-warmup-test"); + Logger logger = (Logger) LogManager.getLogger(CloudInstanceStatusChecker.class); + appender.start(); + logger.addAppender(appender); + try (MockedStatic mockedCloudSystemInfoService = + Mockito.mockStatic(CloudSystemInfoService.class, Mockito.CALLS_REAL_METHODS)) { + mockedCloudSystemInfoService.when(() -> CloudSystemInfoService.updateFileCacheJobIds( + Mockito.any(ComputeGroup.class), Mockito.anyList())).thenAnswer(invocation -> null); + + new CloudInstanceStatusChecker(cloudSystemInfoService).runAfterCatalogReady(); + mockedCloudSystemInfoService.verify(() -> CloudSystemInfoService.updateFileCacheJobIds( + Mockito.any(ComputeGroup.class), Mockito.anyList())); + } finally { + logger.removeAppender(appender); + appender.stop(); + } + + ComputeGroup virtualComputeGroup = cloudSystemInfoService.getComputeGroupById("vcg_id"); + Assertions.assertNotNull(virtualComputeGroup); + Assertions.assertTrue(virtualComputeGroup.isVirtual()); + Assertions.assertFalse(virtualComputeGroup.isNeedRebuildFileCache()); + + CloudWarmUpJob tableLevelJob = 
cacheHotspotManager.getCloudWarmUpJob(tableLevelJobId); + Assertions.assertEquals(CloudWarmUpJob.JobState.CANCELLED, tableLevelJob.getJobState()); + Assertions.assertTrue(tableLevelJob.getErrMsg().contains( + "vcg cancel table-level load-event warm up job before rebuilding file cache jobs")); + Assertions.assertTrue(tableLevelJob.getErrMsg().contains("virtual compute group 'vcg'")); + + Assertions.assertEquals(3, cacheHotspotManager.getAllJobInfos(10).size()); + Assertions.assertTrue(cacheHotspotManager.getCloudWarmUpJobs().values().stream().anyMatch(job -> + job.getJobType() == CloudWarmUpJob.JobType.CLUSTER + && job.isPeriodic() + && "active_cg".equals(job.getSrcClusterName()) + && "standby_cg".equals(job.getDstClusterName()))); + Assertions.assertTrue(cacheHotspotManager.getCloudWarmUpJobs().values().stream().anyMatch(job -> + job.getJobType() == CloudWarmUpJob.JobType.CLUSTER + && job.isEventDriven() + && job.getSyncEvent() == CloudWarmUpJob.SyncEvent.LOAD + && !job.hasTableFilter() + && "active_cg".equals(job.getSrcClusterName()) + && "standby_cg".equals(job.getDstClusterName()))); + + String logs = appender.messagesAsString(); + Assertions.assertFalse(logs.contains("failed to create virtual compute group vcg"), logs); + Assertions.assertTrue(logs.contains("generate new jobIds"), logs); + } + + private void addComputeGroup(String computeGroupId, String computeGroupName) { + cloudSystemInfoService.addComputeGroup(computeGroupId, + new ComputeGroup(computeGroupId, computeGroupName, ComputeGroup.ComputeTypeEnum.COMPUTE)); + } + + private Cloud.GetInstanceResponse instanceResponseWithVirtualComputeGroup() { + Cloud.ClusterPB activeComputeGroup = computeGroup("active_cg_id", "active_cg"); + Cloud.ClusterPB standbyComputeGroup = computeGroup("standby_cg_id", "standby_cg"); + Cloud.ClusterPB virtualComputeGroup = Cloud.ClusterPB.newBuilder() + .setClusterId("vcg_id") + .setClusterName("vcg") + .setType(Cloud.ClusterPB.Type.VIRTUAL) + .addClusterNames("active_cg") + 
.addClusterNames("standby_cg") + .setClusterPolicy(Cloud.ClusterPolicy.newBuilder() + .setType(Cloud.ClusterPolicy.PolicyType.ActiveStandby) + .setActiveClusterName("active_cg") + .addStandbyClusterNames("standby_cg") + .build()) + .build(); + return Cloud.GetInstanceResponse.newBuilder() + .setStatus(Cloud.MetaServiceResponseStatus.newBuilder() + .setCode(Cloud.MetaServiceCode.OK) + .setMsg("OK") + .build()) + .setInstance(Cloud.InstanceInfoPB.newBuilder() + .setStatus(Cloud.InstanceInfoPB.Status.NORMAL) + .addClusters(activeComputeGroup) + .addClusters(standbyComputeGroup) + .addClusters(virtualComputeGroup) + .build()) + .build(); + } + + private Cloud.ClusterPB computeGroup(String computeGroupId, String computeGroupName) { + return Cloud.ClusterPB.newBuilder() + .setClusterId(computeGroupId) + .setClusterName(computeGroupName) + .setType(Cloud.ClusterPB.Type.COMPUTE) + .build(); + } + + @SuppressWarnings("unchecked") + private DatabaseIf mockDb(String name, TableIf... tables) { + DatabaseIf db = Mockito.mock(DatabaseIf.class); + Mockito.when(db.getFullName()).thenReturn(name); + HashSet tableNames = new HashSet<>(); + for (TableIf table : tables) { + tableNames.add(table.getName()); + Mockito.when(db.getTableNullable(table.getName())).thenReturn(table); + } + Mockito.when(db.getTableNamesOrEmptyWithLock()).thenReturn(tableNames); + return db; + } + + private TableIf mockTable(long id, String name) { + TableIf table = Mockito.mock(TableIf.class); + Mockito.when(table.getId()).thenReturn(id); + Mockito.when(table.getName()).thenReturn(name); + Mockito.when(table.getType()).thenReturn(TableIf.TableType.OLAP); + Mockito.when(table.isManagedTable()).thenReturn(true); + return table; + } + + private WarmUpClusterCommand buildEventDrivenStmt(String src, String dst, TableFilterRule... 
rules) { + Map properties = new HashMap<>(); + properties.put("sync_mode", "event_driven"); + properties.put("sync_event", "load"); + return new WarmUpClusterCommand(new ArrayList<>(), src, dst, false, false, + properties, Arrays.asList(rules)); + } + + private static class RecordingAppender extends AbstractAppender { + private final List messages = new ArrayList<>(); + + RecordingAppender(String name) { + super(name, null, null, true, Property.EMPTY_ARRAY); + } + + @Override + public void append(LogEvent event) { + messages.add(event.getMessage().getFormattedMessage()); + } + + String messagesAsString() { + return String.join("\n", messages); + } + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java b/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java index 164d767b66e203..0fe7d33b2e6028 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/metric/MetricsTest.java @@ -17,6 +17,8 @@ package org.apache.doris.metric; +import org.apache.doris.cloud.CloudWarmUpJob; +import org.apache.doris.cloud.JobWarmUpStats; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; import org.apache.doris.common.util.JsonUtil; @@ -33,6 +35,7 @@ import java.lang.management.GarbageCollectorMXBean; import java.lang.management.ManagementFactory; +import java.util.Collections; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; @@ -196,6 +199,163 @@ public void testVirtualComputeGroupSwitchMetricRename() { } } + @Test + public void testCloudWarmUpSyncJobMetricsReadStatsDirectlyFromJob() { + String oldCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud_unique_id"; + try { + CloudWarmUpJob job = new CloudWarmUpJob.Builder() + .setJobId(1778211593204L) + .setSrcClusterName("warmup_source") + .setDstClusterName("warmup_target") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + 
.setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + job.setJobState(CloudWarmUpJob.JobState.RUNNING); + + JobWarmUpStats stats = new JobWarmUpStats(); + stats.requestedSegmentSize5m = 104857600L; + stats.requestedSegmentSize30m = 209715200L; + stats.requestedSegmentSize1h = 314572800L; + stats.finishSegmentSize5m = 94371840L; + stats.finishSegmentSize30m = 188743680L; + stats.finishSegmentSize1h = 283115520L; + stats.requestedIndexSize5m = 8388608L; + stats.requestedIndexSize30m = 16777216L; + stats.requestedIndexSize1h = 25165824L; + stats.finishIndexSize5m = 6291456L; + stats.finishIndexSize30m = 12582912L; + stats.finishIndexSize1h = 18874368L; + stats.computeGap(); + job.setSyncStats(stats); + + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.singletonList(job)); + String metricResult = getPrometheusMetrics(); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_info" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", sync_mode=\"EVENT_DRIVEN\", " + + "sync_event=\"LOAD\", job_state=\"RUNNING\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\"} 1")); + Assert.assertFalse(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_create_time_ms")); + Assert.assertFalse(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_last_trigger_time_ms")); + Assert.assertFalse(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_stats")); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"src\", window=\"5m\"} 113246208")); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", 
side=\"dst\", window=\"5m\"} 100663296")); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"src\", window=\"30m\"} 226492416")); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"dst\", window=\"1h\"} 301989888")); + + JobWarmUpStats updatedStats = new JobWarmUpStats(); + updatedStats.requestedSegmentSize5m = 12; + updatedStats.finishSegmentSize5m = 10; + updatedStats.computeGap(); + job.setSyncStats(updatedStats); + String updatedMetricResult = getPrometheusMetrics(); + Assert.assertTrue(updatedMetricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"src\", window=\"5m\"} 12")); + Assert.assertTrue(updatedMetricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"dst\", window=\"5m\"} 10")); + + CloudWarmUpJob replayedJob = new CloudWarmUpJob.Builder() + .setJobId(1778211593204L) + .setSrcClusterName("warmup_source") + .setDstClusterName("warmup_target") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + replayedJob.setJobState(CloudWarmUpJob.JobState.RUNNING); + JobWarmUpStats replayedStats = new JobWarmUpStats(); + replayedStats.requestedSegmentSize5m = 7; + replayedStats.requestedIndexSize5m = 3; + replayedStats.computeGap(); + replayedJob.setSyncStats(replayedStats); + 
MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.singletonList(replayedJob)); + String replayedMetricResult = getPrometheusMetrics(); + Assert.assertTrue(replayedMetricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes" + + "{job_id=\"1778211593204\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\", side=\"src\", window=\"5m\"} 10")); + + replayedJob.setJobState(CloudWarmUpJob.JobState.CANCELLED); + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.singletonList(replayedJob)); + String cancelledMetricResult = getPrometheusMetrics(); + Assert.assertTrue(cancelledMetricResult.contains("job_state=\"CANCELLED\"")); + Assert.assertFalse(cancelledMetricResult.contains("job_state=\"RUNNING\"")); + Assert.assertFalse(cancelledMetricResult.contains("doris_fe_file_cache_warm_up_sync_job_size_bytes")); + } finally { + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.emptyList()); + Config.cloud_unique_id = oldCloudUniqueId; + } + } + + @Test + public void testEventDrivenCloudWarmUpSyncJobTriggerGapMetric() { + String oldCloudUniqueId = Config.cloud_unique_id; + Config.cloud_unique_id = "test_cloud_unique_id"; + try { + CloudWarmUpJob.PersistedTableFilterRule rule = new CloudWarmUpJob.PersistedTableFilterRule(); + rule.ruleType = "INCLUDE"; + rule.pattern = "db.tbl"; + CloudWarmUpJob job = new CloudWarmUpJob.Builder() + .setJobId(1778211593205L) + .setSrcClusterName("warmup_source") + .setDstClusterName("warmup_target") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .setTableFilterRules(Collections.singletonList(rule)) + .build(); + job.setJobState(CloudWarmUpJob.JobState.RUNNING); + + JobWarmUpStats stats = new JobWarmUpStats(); + stats.lastTriggerTs = 5000; + stats.progressTriggerTs = 4200; + stats.computeGap(); + job.setSyncStats(stats); + + 
MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.singletonList(job)); + String metricResult = getPrometheusMetrics(); + Assert.assertTrue(metricResult.contains("doris_fe_file_cache_warm_up_sync_job_trigger_gap_ms" + + "{job_id=\"1778211593205\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\"} 800")); + + CloudWarmUpJob clusterLevelJob = new CloudWarmUpJob.Builder() + .setJobId(1778211593206L) + .setSrcClusterName("warmup_source") + .setDstClusterName("warmup_target") + .setJobType(CloudWarmUpJob.JobType.CLUSTER) + .setSyncMode(CloudWarmUpJob.SyncMode.EVENT_DRIVEN) + .setSyncEvent(CloudWarmUpJob.SyncEvent.LOAD) + .build(); + clusterLevelJob.setJobState(CloudWarmUpJob.JobState.RUNNING); + clusterLevelJob.setSyncStats(stats); + + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.singletonList(clusterLevelJob)); + String clusterMetricResult = getPrometheusMetrics(); + Assert.assertTrue(clusterMetricResult.contains("doris_fe_file_cache_warm_up_sync_job_trigger_gap_ms" + + "{job_id=\"1778211593206\", job_type=\"CLUSTER\", src_cluster_name=\"warmup_source\", " + + "dst_cluster_name=\"warmup_target\"} 800")); + } finally { + MetricRepo.syncCloudWarmUpSyncJobMetricDefinitions(Collections.emptyList()); + Config.cloud_unique_id = oldCloudUniqueId; + } + } + + private String getPrometheusMetrics() { + MetricVisitor visitor = new PrometheusMetricVisitor(); + MetricRepo.DORIS_METRIC_REGISTER.accept(visitor); + return visitor.finish(); + } + @Test public void testGc() { PrometheusMetricVisitor visitor = new PrometheusMetricVisitor(); diff --git a/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index efd539d5e7e360..446685892295e4 100644 --- a/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 @@ -299,6 
+299,7 @@ IMMEDIATE: 'IMMEDIATE'; IN: 'IN'; INCREMENTAL: 'INCREMENTAL'; INTEGRATION: 'INTEGRATION'; +INCLUDE: 'INCLUDE'; INDEX: 'INDEX'; INDEXES: 'INDEXES'; INFILE: 'INFILE'; diff --git a/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index 1bf497f1cf9d8f..e7de219d017609 100644 --- a/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-sql-parser/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -542,6 +542,7 @@ supportedOtherStatement | WARM UP (CLUSTER | COMPUTE GROUP) destination=identifier WITH ((CLUSTER | COMPUTE GROUP) source=identifier | (warmUpItem (AND warmUpItem)*)) FORCE? + onTablesClause? properties=propertyClause? #warmUpCluster | explain? WARM UP SELECT namedExpressionSeq FROM warmUpSingleTableRef whereClause? #warmUpSelect @@ -551,7 +552,15 @@ supportedOtherStatement | START TRANSACTION (WITH CONSISTENT SNAPSHOT)? #unsupportedStartTransaction ; - warmUpItem +onTablesClause + : ON TABLES LEFT_PAREN onTablesFilterRule (COMMA onTablesFilterRule)* RIGHT_PAREN + ; + +onTablesFilterRule + : (INCLUDE | EXCLUDE) STRING_LITERAL + ; + +warmUpItem : TABLE tableName=multipartIdentifier (PARTITION partitionName=identifier)? 
; @@ -2150,6 +2159,7 @@ nonReserved | IMMEDIATE | INCREMENTAL | INTEGRATION + | INCLUDE | INDEXES | INSERT | INVERTED diff --git a/gensrc/proto/internal_service.proto b/gensrc/proto/internal_service.proto index 89a5d64976e29c..c5818339f3a66e 100644 --- a/gensrc/proto/internal_service.proto +++ b/gensrc/proto/internal_service.proto @@ -942,6 +942,8 @@ message PWarmUpRowsetRequest { optional int64 unix_ts_us = 2; optional int64 sync_wait_timeout_ms = 3; optional bool skip_existence_check = 4; + optional int64 job_id = 5; + optional int64 upstream_trigger_ts_ms = 6; } message PWarmUpRowsetResponse { diff --git a/gensrc/thrift/BackendService.thrift b/gensrc/thrift/BackendService.thrift index e9276caa42410c..2e5379bb42b256 100644 --- a/gensrc/thrift/BackendService.thrift +++ b/gensrc/thrift/BackendService.thrift @@ -223,6 +223,7 @@ struct TWarmUpTabletsRequest { 3: optional list job_metas 4: required TWarmUpTabletsRequestType type 5: optional TWarmUpEventType event + 6: optional list table_ids } struct TWarmUpTabletsResponse { diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/util/WarmupMetricsUtils.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/util/WarmupMetricsUtils.groovy new file mode 100644 index 00000000000000..aa877adb4132a5 --- /dev/null +++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/util/WarmupMetricsUtils.groovy @@ -0,0 +1,268 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.apache.doris.regression.util + +import groovy.json.JsonSlurper +import org.slf4j.Logger +import org.slf4j.LoggerFactory + +import java.util.regex.Pattern + +/** + * Utility methods for event-driven warmup regression tests. + * + * Methods that need database access accept a {@code Closure sqlRunner} + * parameter — callers pass {@code { String q -> sql(q) }} from the + * suite context. + */ +class WarmupMetricsUtils { + + static final Logger logger = LoggerFactory.getLogger(WarmupMetricsUtils.class) + + // Bvar metric names + static final String METRIC_REQUESTED = "file_cache_event_driven_warm_up_requested_segment_num" + static final String METRIC_SUBMITTED = "file_cache_event_driven_warm_up_submitted_segment_num" + static final String METRIC_FINISHED = "file_cache_event_driven_warm_up_finished_segment_num" + static final String METRIC_FAILED = "file_cache_event_driven_warm_up_failed_segment_num" + + /** + * Fetch a single bvar metric value from a BE's brpc_metrics endpoint. 
+ */ + static long getBrpcMetric(String ip, String port, String metricName) { + def url = "http://${ip}:${port}/brpc_metrics" + def text = new URL(url).text + def matcher = text =~ ~"${metricName}\\s+(\\d+)" + if (matcher.find()) { + return matcher[0][1] as long + } + throw new RuntimeException("${metricName} not found for ${ip}:${port}") + } + + static String getPrometheusMetrics(String ip, Object port) { + return new URL("http://${ip}:${port}/metrics").text + } + + static BigDecimal findPrometheusMetricValue(String metricsText, String metricName, Map labels) { + def line = metricsText.readLines().find { metricLine -> + metricLine.startsWith("${metricName}{") + && labels.every { entry -> metricLine.contains(prometheusLabel(entry.key.toString(), entry.value)) } + } + if (line == null) { + return null + } + return new BigDecimal(line.substring(line.lastIndexOf(' ') + 1).trim()) + } + + static String prometheusLabel(String key, Object value) { + def text = value == null ? "" : value.toString() + text = text.replace("\\", "\\\\").replace("\"", "\\\"").replace("\n", "\\n") + return "${key}=\"${text}\"".toString() + } + + /** + * Sum a bvar metric across all BEs in the given cluster. 
+ */ + static long getClusterMetricSum(Closure sqlRunner, String clusterName, String metricName) { + def clusterBes = getClusterBackends(sqlRunner, clusterName) + long sum = 0 + for (be in clusterBes) { + sum += getBrpcMetric(be[1].toString(), be[5].toString(), metricName) + } + return sum + } + + static List getClusterBackends(Closure sqlRunner, String clusterName) { + def backends = sqlRunner("SHOW BACKENDS") + return backends.findAll { + it[19].contains("\"compute_group_name\" : \"${clusterName}\"".toString()) + } + } + + static Map getClusterMetricValues(Closure sqlRunner, String clusterName, String metricName) { + Map values = [:] + for (be in getClusterBackends(sqlRunner, clusterName)) { + values[be[0].toString()] = getBrpcMetric(be[1].toString(), be[5].toString(), metricName) + } + return values + } + + static void clearFileCache(String ip, String httpPort) { + def response = new URL("http://${ip}:${httpPort}/api/file_cache?op=clear&sync=true").text + def json = new JsonSlurper().parseText(response) + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${httpPort} failed: ${json.status}") + } + } + + static void clearFileCacheOnAllBackends(Closure sqlRunner, long waitMs = 5000) { + for (be in sqlRunner("SHOW BACKENDS")) { + clearFileCache(be[1].toString(), be[4].toString()) + } + Thread.sleep(waitMs) + } + + static long sumProfileCounter(String profileText, String counterName) { + def matcher = profileText =~ ~"(?m)(?{@code requested} is from the SOURCE cluster; the other three from DESTINATION.

+ * + * @return Map with keys: requested, submitted, finished, failed + */ + static Map getWarmupMetrics(Closure sqlRunner, String srcCluster, String dstCluster) { + return [ + requested: getClusterMetricSum(sqlRunner, srcCluster, METRIC_REQUESTED), + submitted: getClusterMetricSum(sqlRunner, dstCluster, METRIC_SUBMITTED), + finished : getClusterMetricSum(sqlRunner, dstCluster, METRIC_FINISHED), + failed : getClusterMetricSum(sqlRunner, dstCluster, METRIC_FAILED), + ] + } + + /** + * Log and return warmup metrics. + */ + static Map logWarmupMetrics(Closure sqlRunner, String srcCluster, String dstCluster) { + def m = getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + logger.info("warmup metrics [src=${srcCluster}, dst=${dstCluster}]: " + + "requested=${m.requested}, submitted=${m.submitted}, " + + "finished=${m.finished}, failed=${m.failed}") + return m + } + + /** + * Poll until enough segments have finished warming up. + * + * @param expectedFinished absolute finished count to wait for + * @param timeoutMs polling timeout in milliseconds + * @return latest metrics snapshot + */ + static Map waitForWarmupFinish(Closure sqlRunner, String srcCluster, String dstCluster, + long expectedFinished, long timeoutMs = 60000) { + long deadline = System.currentTimeMillis() + timeoutMs + while (System.currentTimeMillis() < deadline) { + def m = getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + if (m.finished >= expectedFinished && m.finished + m.failed >= m.submitted) { + return m + } + Thread.sleep(2000) + } + logger.warn("waitForWarmupFinish timed out after ${timeoutMs}ms, " + + "expected finished >= ${expectedFinished}") + return getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + } + + /** + * Parse the MatchedTables column (index 14) from SHOW WARM UP JOB output. 
+ */ + static Set parseMatchedTables(List jobInfo) { + def raw = jobInfo[0][14]?.toString()?.trim() + if (raw == null || raw.isEmpty()) { + return [] as Set + } + return raw.split(/,\s*/).collect { it.trim() }.findAll { !it.isEmpty() }.toSet() + } + + /** + * Poll until MatchedTables contains (and excludes) the expected table names. + * + * @return last observed MatchedTables set + */ + static Set waitForMatchedTables(Closure sqlRunner, Object jobId, + Set expectedContains, + Set expectedNotContains = [] as Set, + long timeoutMs = 30000) { + long deadline = System.currentTimeMillis() + timeoutMs + Set lastMatched = [] as Set + while (System.currentTimeMillis() < deadline) { + def info = sqlRunner("SHOW WARM UP JOB WHERE ID = ${jobId}") + lastMatched = parseMatchedTables(info) + boolean allContained = expectedContains.every { lastMatched.contains(it) } + boolean noneExcluded = expectedNotContains.every { !lastMatched.contains(it) } + if (allContained && noneExcluded) { + return lastMatched + } + Thread.sleep(2000) + } + return lastMatched + } + + /** + * Parse the SyncStats column (index 15) from SHOW WARM UP JOB output. + */ + static Map parseSyncStats(List jobInfo) { + def raw = jobInfo[0][15]?.toString()?.trim() + if (raw == null || raw.isEmpty()) { + return [:] + } + return new JsonSlurper().parseText(raw) as Map + } + + /** + * Poll SHOW WARM UP JOB WHERE ID until SyncStats exists and satisfies the predicate. 
+ * + * @return last parsed SyncStats map + */ + static Map waitForJobSyncStats(Closure sqlRunner, Object jobId, Closure predicate, + long timeoutMs = 30000) { + long deadline = System.currentTimeMillis() + timeoutMs + Map lastStats = [:] + while (System.currentTimeMillis() < deadline) { + def info = sqlRunner("SHOW WARM UP JOB WHERE ID = ${jobId}") + lastStats = parseSyncStats(info) + if (!lastStats.isEmpty() && predicate(lastStats)) { + return lastStats + } + Thread.sleep(2000) + } + return lastStats + } + + /** + * Wait for warmup metrics to stabilize (no new submissions for a sustained period). + * Uses a double-check pattern: waits 5s initially, then verifies stability over 3s. + * + * @return stabilized metrics snapshot + */ + static Map waitForMetricsStable(Closure sqlRunner, String srcCluster, String dstCluster, + long timeoutMs = 30000) { + long deadline = System.currentTimeMillis() + timeoutMs + def prev = getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + Thread.sleep(5000) + while (System.currentTimeMillis() < deadline) { + def cur = getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + if (cur.submitted == prev.submitted && cur.finished == prev.finished + && cur.finished + cur.failed >= cur.submitted) { + Thread.sleep(3000) + def verify = getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + if (verify.submitted == cur.submitted && verify.finished == cur.finished) { + return verify + } + } + prev = cur + Thread.sleep(2000) + } + logger.warn("waitForMetricsStable timed out after ${timeoutMs}ms") + return getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_cancel_empty_recovery.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_cancel_empty_recovery.groovy new file mode 100644 index 00000000000000..24bc49914548e5 --- /dev/null +++ 
b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_cancel_empty_recovery.groovy @@ -0,0 +1,212 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test points covered: EX-03, EX-08. 
+suite('test_warm_up_event_on_tables_abnormal_cancel_empty_recovery', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.enableDebugPoints() + options.cloudMode = true + + def waitUntil = { String desc, long timeoutMs, Closure predicate -> + long deadline = System.currentTimeMillis() + timeoutMs + while (System.currentTimeMillis() < deadline) { + if (predicate()) { + return + } + sleep(500) + } + assert false : "Timed out waiting for ${desc}" + } + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_abnormal_cancel_empty_db" + def jobIds = [] + def targetDebugEnabled = false + + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster) + + try { + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS cancel_tbl ( + id INT, + payload STRING + ) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 4 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS fact_live ( + id INT, + amount INT + ) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + def cancelJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE '${dbName}.cancel_tbl') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << cancelJobId + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, cancelJobId, + ["${dbName}.cancel_tbl".toString()] as Set) == + ["${dbName}.cancel_tbl".toString()] as Set + + 
WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """INSERT INTO cancel_tbl VALUES + (1, 'seed_1'), (2, 'seed_2'), (3, 'seed_3'), (4, 'seed_4')""" + def initialMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + baseMetrics.finished + 1, 60000) + assert initialMetrics.failed == baseMetrics.failed : + "initial warmup should finish without failures, metrics=${initialMetrics}" + def initialCacheSize = 0L + waitUntil("initial warmup to populate target cache", 30000) { + initialCacheSize = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, + "ttl_cache_size") + return initialCacheSize > 0 + } + assert initialCacheSize > 0 : "initial warmup should populate target cache" + + def targetBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster)[0] + GetDebugPoint().enableDebugPoint(targetBe[1].toString(), targetBe[4] as int, NodeType.BE, + "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep: 10]) + targetDebugEnabled = true + + def beforeActiveLoad = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """INSERT INTO cancel_tbl VALUES + (100, 'active_100'), (101, 'active_101'), (102, 'active_102'), (103, 'active_103'), + (104, 'active_104'), (105, 'active_105'), (106, 'active_106'), (107, 'active_107')""" + waitUntil("active warmup transfer to be submitted", 20000) { + def m = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + return m.submitted > beforeActiveLoad.submitted && m.finished < m.submitted + } + + sql """CANCEL WARM UP JOB WHERE ID = ${cancelJobId}""" + waitUntil("cancel job state", 20000) { + def info = sql """SHOW WARM UP JOB WHERE ID = ${cancelJobId}""" + return info[0][3] == "CANCELLED" + } + + def afterCancelStable = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, + srcCluster, 
dstCluster, 50000) + assert afterCancelStable.submitted > beforeActiveLoad.submitted : + "active transfer should have submitted before cancel, before=${beforeActiveLoad}, after=${afterCancelStable}" + assert afterCancelStable.finished + afterCancelStable.failed >= afterCancelStable.submitted : + "active transfer should converge after cancel, metrics=${afterCancelStable}" + + def cacheAfterCancel = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, + "ttl_cache_size") + assert cacheAfterCancel >= initialCacheSize : + "cancel should not clear existing target cache, before=${initialCacheSize}, after=${cacheAfterCancel}" + + GetDebugPoint().disableDebugPoint(targetBe[1].toString(), targetBe[4] as int, NodeType.BE, + "CloudInternalServiceImpl::warm_up_rowset.download_segment") + targetDebugEnabled = false + + def beforePostCancelLoad = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, + srcCluster, dstCluster, 30000) + sql """INSERT INTO cancel_tbl VALUES (200, 'after_cancel_200'), (201, 'after_cancel_201')""" + sleep(5000) + def afterPostCancelLoad = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, + srcCluster, dstCluster, 30000) + assert afterPostCancelLoad.submitted == beforePostCancelLoad.submitted : + "cancelled job should not submit later events, before=${beforePostCancelLoad}, after=${afterPostCancelLoad}" + assert afterPostCancelLoad.finished == beforePostCancelLoad.finished : + "cancelled job should not finish later events, before=${beforePostCancelLoad}, after=${afterPostCancelLoad}" + + def emptyWindowJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE '${dbName}.fact_*') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << emptyWindowJobId + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, emptyWindowJobId, + ["${dbName}.fact_live".toString()] as Set) == + ["${dbName}.fact_live".toString()] as Set + + sql """ALTER TABLE ${dbName}.fact_live RENAME 
archive_live""" + def emptyMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, emptyWindowJobId, + [] as Set, + ["${dbName}.fact_live".toString(), "${dbName}.archive_live".toString()] as Set, + 30000) + assert emptyMatched.isEmpty() : "MatchedTables should be empty during the non-matching window: ${emptyMatched}" + def emptyJobInfo = sql """SHOW WARM UP JOB WHERE ID = ${emptyWindowJobId}""" + assert emptyJobInfo[0][3] in ["RUNNING", "PENDING"] : + "job should stay runnable when MatchedTables is empty, row=${emptyJobInfo[0]}" + + def beforeArchiveLoad = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, + srcCluster, dstCluster, 30000) + sql """INSERT INTO archive_live VALUES (1, 10), (2, 20)""" + sleep(5000) + def afterArchiveLoad = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, + srcCluster, dstCluster, 30000) + assert afterArchiveLoad.submitted == beforeArchiveLoad.submitted : + "non-matching empty-window load should not submit warmup, before=${beforeArchiveLoad}, after=${afterArchiveLoad}" + + sql """ALTER TABLE ${dbName}.archive_live RENAME fact_back""" + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, emptyWindowJobId, + ["${dbName}.fact_back".toString()] as Set) == + ["${dbName}.fact_back".toString()] as Set + + def beforeRecoveredLoad = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """INSERT INTO fact_back VALUES (3, 30), (4, 40)""" + def afterRecoveredLoad = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, + srcCluster, dstCluster, beforeRecoveredLoad.finished + 1, 60000) + assert afterRecoveredLoad.submitted > beforeRecoveredLoad.submitted : + "matching table after empty window should submit warmup, before=${beforeRecoveredLoad}, after=${afterRecoveredLoad}" + assert afterRecoveredLoad.finished > beforeRecoveredLoad.finished : + "matching table after empty window should finish warmup, before=${beforeRecoveredLoad}, after=${afterRecoveredLoad}" + } finally { + if (targetDebugEnabled) { + try { 
GetDebugPoint().clearDebugPointsForAllBEs() } catch (Exception ignored) {} + } + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS cancel_tbl""" + sql """DROP TABLE IF EXISTS fact_live""" + sql """DROP TABLE IF EXISTS archive_live""" + sql """DROP TABLE IF EXISTS fact_back""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_stats_and_failure.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_stats_and_failure.groovy new file mode 100644 index 00000000000000..8888ba88d0ca4a --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_abnormal_stats_and_failure.groovy @@ -0,0 +1,261 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test points covered: EX-05 (stats API HTTP 500, read timeout, BE down), EX-07. +suite('test_warm_up_event_on_tables_abnormal_stats_and_failure', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.enableDebugPoints() + options.cloudMode = true + + def waitUntil = { String desc, long timeoutMs, Closure predicate -> + long deadline = System.currentTimeMillis() + timeoutMs + while (System.currentTimeMillis() < deadline) { + if (predicate()) { + return + } + sleep(500) + } + assert false : "Timed out waiting for ${desc}" + } + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_abnormal_stats_fail_db" + def tableName = "abnormal_tbl" + def jobIds = [] + def statsApiDebugBe = null + def statsApiSleepBe = null + def downloadDebugBes = [] + + def rows = { int begin, int end -> + (begin.. 
0 } + } + + def statsBeforeApiError = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { it.seg_num.finish_5m > 0 && it.seg_num.fail_5m == 0 }, 30000) + logger.info("SyncStats before API error injection: ${statsBeforeApiError}") + + statsApiDebugBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster)[0] + GetDebugPoint().enableDebugPoint(statsApiDebugBe[1].toString(), statsApiDebugBe[4] as int, + NodeType.BE, "WarmUpStatsAction.handle.return_error") + def degradedInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert degradedInfo[0][3] in ["RUNNING", "PENDING"] : + "SHOW should keep the job visible while one BE stats API fails, row=${degradedInfo[0]}" + def degradedStats = WarmupMetricsUtils.parseSyncStats(degradedInfo) + logger.info("SyncStats with one BE API failure: ${degradedStats}") + assert !degradedStats.isEmpty() : "SHOW should return degraded SyncStats instead of failing" + assert degradedStats.seg_num.finish_5m > 0 : + "remaining target BE stats should still be aggregated, stats=${degradedStats}" + + GetDebugPoint().disableDebugPoint(statsApiDebugBe[1].toString(), statsApiDebugBe[4] as int, + NodeType.BE, "WarmUpStatsAction.handle.return_error") + statsApiDebugBe = null + def restoredStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { it.seg_num.finish_5m >= statsBeforeApiError.seg_num.finish_5m }, 30000) + logger.info("SyncStats after API error recovery: ${restoredStats}") + + statsApiSleepBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster)[0] + GetDebugPoint().enableDebugPoint(statsApiSleepBe[1].toString(), statsApiSleepBe[4] as int, + NodeType.BE, "WarmUpStatsAction.handle.sleep", [sleep_ms: 12000]) + long timeoutStartMs = System.currentTimeMillis() + def timeoutInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + long timeoutElapsedMs = System.currentTimeMillis() - timeoutStartMs + assert timeoutInfo[0][3] in ["RUNNING", "PENDING"] : + "SHOW should keep the job visible while one BE stats 
API times out, row=${timeoutInfo[0]}" + def timeoutStats = WarmupMetricsUtils.parseSyncStats(timeoutInfo) + logger.info("SyncStats with one BE stats API timeout: ${timeoutStats}, elapsedMs=${timeoutElapsedMs}") + assert timeoutElapsedMs < 9000 : + "FE should use a bounded timeout for BE stats API requests, elapsedMs=${timeoutElapsedMs}" + assert !timeoutStats.isEmpty() : + "SHOW should return degraded SyncStats instead of waiting for the slow BE" + assert timeoutStats.seg_num.finish_5m > 0 : + "remaining target BE stats should still be aggregated after timeout, stats=${timeoutStats}" + GetDebugPoint().disableDebugPoint(statsApiSleepBe[1].toString(), statsApiSleepBe[4] as int, + NodeType.BE, "WarmUpStatsAction.handle.sleep") + statsApiSleepBe = null + + def targetBes = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster) + for (be in targetBes) { + GetDebugPoint().enableDebugPoint(be[1].toString(), be[4] as int, NodeType.BE, + "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_error") + downloadDebugBes << be + } + + def beforeFailureMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + def beforeFailureStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { !it.isEmpty() }, 30000) + sql """INSERT INTO ${tableName} VALUES ${rows(100, 108)}""" + waitUntil("download failure metric", 60000) { + def m = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + return m.failed > beforeFailureMetrics.failed && m.finished + m.failed >= m.submitted + } + def failedMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + assert failedMetrics.failed > beforeFailureMetrics.failed : + "injected download failure should increase failed bvar, before=${beforeFailureMetrics}, after=${failedMetrics}" + + def failedStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, { + it.seg_num.fail_5m > beforeFailureStats.seg_num.fail_5m + && it.seg_num.gap_5m > 
beforeFailureStats.seg_num.gap_5m + }, 30000) + logger.info("SyncStats after injected download failure: ${failedStats}") + assert failedStats.seg_num.fail_5m > 0 : "5m fail window should expose download failure" + assert failedStats.seg_num.fail_30m > 0 : "30m fail window should expose download failure" + assert failedStats.seg_num.fail_1h > 0 : "1h fail window should expose download failure" + assert failedStats.seg_num.gap_5m > 0 : "5m gap should expose unfinished failed warmup" + assert failedStats.seg_num.gap_30m > 0 : "30m gap should expose unfinished failed warmup" + assert failedStats.seg_num.gap_1h > 0 : "1h gap should expose unfinished failed warmup" + + for (be in downloadDebugBes) { + GetDebugPoint().disableDebugPoint(be[1].toString(), be[4] as int, NodeType.BE, + "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_error") + } + downloadDebugBes.clear() + + def beforeRecoveryMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """INSERT INTO ${tableName} VALUES ${rows(200, 208)}""" + def afterRecoveryMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeRecoveryMetrics.finished + 1, 90000) + assert afterRecoveryMetrics.finished > beforeRecoveryMetrics.finished : + "warmup should recover and finish new downloads, before=${beforeRecoveryMetrics}, after=${afterRecoveryMetrics}" + assert afterRecoveryMetrics.failed == beforeRecoveryMetrics.failed : + "recovered warmup should not add new failures, before=${beforeRecoveryMetrics}, after=${afterRecoveryMetrics}" + + def recoveredStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, { + it.seg_num.finish_5m > failedStats.seg_num.finish_5m + && it.seg_num.fail_5m >= failedStats.seg_num.fail_5m + }, 30000) + logger.info("SyncStats after failure recovery: ${recoveredStats}") + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + sql """set query_freshness_tolerance_ms = 5000""" + def res = sql """SELECT count(*) 
FROM ${tableName}""" + assert res[0][0].toString() == "48" : "target query should see all rows after failure recovery: ${res}" + + def stoppedStatsBeIndex = dstBeIndexes[0] as int + def stoppedStatsBe = cluster.getBeByIndex(stoppedStatsBeIndex) + cluster.stopBackends(stoppedStatsBeIndex) + waitUntil("target BE ${stoppedStatsBe.backendId} to be marked dead", 30000) { + def row = sql("SHOW BACKENDS").find { + it[0].toString() == stoppedStatsBe.backendId.toString() + } + return row != null && row[9].toString() == "false" + } + def beDownInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert beDownInfo[0][3] in ["RUNNING", "PENDING"] : + "SHOW should keep the job visible while one target BE is down, row=${beDownInfo[0]}" + def beDownStats = WarmupMetricsUtils.parseSyncStats(beDownInfo) + logger.info("SyncStats with one target BE down: ${beDownStats}") + assert !beDownStats.isEmpty() : + "SHOW should return degraded SyncStats when one target BE is down" + assert beDownStats.seg_num.finish_5m > 0 : + "remaining target BE stats should still be aggregated when one target BE is down, stats=${beDownStats}" + } finally { + if (statsApiDebugBe != null) { + try { + GetDebugPoint().disableDebugPoint(statsApiDebugBe[1].toString(), + statsApiDebugBe[4] as int, NodeType.BE, + "WarmUpStatsAction.handle.return_error") + } catch (Exception ignored) {} + } + if (statsApiSleepBe != null) { + try { + GetDebugPoint().disableDebugPoint(statsApiSleepBe[1].toString(), + statsApiSleepBe[4] as int, NodeType.BE, + "WarmUpStatsAction.handle.sleep") + } catch (Exception ignored) {} + } + if (!downloadDebugBes.isEmpty()) { + try { GetDebugPoint().clearDebugPointsForAllBEs() } catch (Exception ignored) {} + } + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS ${tableName}""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE 
IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_canonicalization.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_canonicalization.groovy new file mode 100644 index 00000000000000..5ca684acbd5064 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_canonicalization.groovy @@ -0,0 +1,117 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.apache.doris.regression.suite.ClusterOptions +import groovy.json.JsonSlurper + +suite('test_warm_up_event_on_tables_canonicalization', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_canon_db" + def dbOther = "test_on_tables_canon_other_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbOther}""" + + sql """use ${dbName}""" + sql """CREATE TABLE orders (id INT) DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1""" + sql """CREATE TABLE tmp_staging (id INT) DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1""" + sql """use ${dbOther}""" + sql """CREATE TABLE logs (id INT) DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1""" + sql """use @${clusterName1}""" + + // Create a job with specific rule order + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*', + EXCLUDE '${dbName}.tmp_*', + INCLUDE '${dbOther}.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + jobIds << jobId + + def jobInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + def tableFilter = jobInfo[0][13] + logger.info("TableFilter: ${tableFilter}") + + // Try creating a "duplicate" with rules in different order — should fail + // because canonicalization normalizes rule order + try { 
+ sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbOther}.*', + INCLUDE '${dbName}.*', + EXCLUDE '${dbName}.tmp_*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected duplicate job error" + } catch (java.sql.SQLException e) { + logger.info("Expected error for duplicate job: ${e.getMessage()}") + assert e.getMessage().contains("already has a runnable job") + } + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS orders""" + sql """DROP TABLE IF EXISTS tmp_staging""" + } catch (Exception ignored) {} + try { + sql """use ${dbOther}""" + sql """DROP TABLE IF EXISTS logs""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbOther}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_dynamic.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_dynamic.groovy new file mode 100644 index 00000000000000..c9de7fb56ee821 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_dynamic.groovy @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +suite('test_warm_up_event_on_tables_dynamic', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_dynamic_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + + // ===== Test 1: New table auto-included after job creation ===== + logger.info("===== Test 1: New table auto-included =====") + + sql """CREATE TABLE IF NOT EXISTS fact_orders (id INT, amount DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + sql """use @${clusterName1}""" + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.fact_*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + 
jobIds << jobId + logger.info("Warm-up job ID: ${jobId}") + + sleep(3000) + + // Verify initial matched tables + def initMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.fact_orders".toString()] as Set) + logger.info("Initial MatchedTables: ${initMatched}") + assert "${dbName}.fact_orders".toString() in initMatched + + // Create a new table that matches the pattern + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS fact_sales (id INT, revenue DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + // Also create a table that does NOT match the pattern + sql """CREATE TABLE IF NOT EXISTS dim_product (id INT, name STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + // Poll until new matching table is auto-included + def matchedAfterCreate = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.fact_orders".toString(), "${dbName}.fact_sales".toString()] as Set, + ["${dbName}.dim_product".toString()] as Set) + logger.info("MatchedTables after create: ${matchedAfterCreate}") + assert "${dbName}.fact_orders".toString() in matchedAfterCreate + assert "${dbName}.fact_sales".toString() in matchedAfterCreate + assert !("${dbName}.dim_product".toString() in matchedAfterCreate) + + // Verify warmup works for the new table — with quantitative metric check + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + def numInserts = 5 + sql """use ${dbName}""" + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO fact_sales VALUES (${i}, ${i * 100.0})""" + } + + def finalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + baseMetrics.finished + numInserts) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + def reqDelta = finalMetrics.requested - baseMetrics.requested + def subDelta 
= finalMetrics.submitted - baseMetrics.submitted + def finDelta = finalMetrics.finished - baseMetrics.finished + def failDelta = finalMetrics.failed - baseMetrics.failed + logger.info("Test1 deltas: requested=${reqDelta}, submitted=${subDelta}, finished=${finDelta}, failed=${failDelta}") + assert reqDelta >= numInserts : "Expected requested >= ${numInserts}, got ${reqDelta}" + assert subDelta >= numInserts : "Expected submitted >= ${numInserts}, got ${subDelta}" + assert finDelta >= numInserts : "Expected finished >= ${numInserts}, got ${finDelta}" + assert failDelta == 0 : "Expected 0 failed, got ${failDelta}" + + // Negative proof: insert into dim_product (not matched) + def metricsBeforeDim = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, clusterName1, clusterName2) + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO dim_product VALUES (${i}, 'product_${i}')""" + } + sleep(5000) + def metricsAfterDim = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + def dimSubDelta = metricsAfterDim.submitted - metricsBeforeDim.submitted + def dimFinDelta = metricsAfterDim.finished - metricsBeforeDim.finished + assert dimSubDelta == 0 : "dim_product inserts should not trigger warmup, submitted delta=${dimSubDelta}" + assert dimFinDelta == 0 : "dim_product inserts should not trigger warmup, finished delta=${dimFinDelta}" + + // ===== Test 2: Dropped table auto-excluded ===== + logger.info("===== Test 2: Dropped table auto-excluded =====") + + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS fact_orders""" + + // Poll until dropped table is removed + def matchedAfterDrop = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.fact_sales".toString()] as Set, + ["${dbName}.fact_orders".toString()] as Set) + logger.info("MatchedTables after drop: ${matchedAfterDrop}") + assert !("${dbName}.fact_orders".toString() in matchedAfterDrop) + assert "${dbName}.fact_sales".toString() in matchedAfterDrop + + // Job should still 
be running + def jobInfoAfterDrop = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert jobInfoAfterDrop[0][3] in ["RUNNING", "PENDING"] + + // ===== Test 3: Rename table — pattern re-evaluation ===== + logger.info("===== Test 3: Rename table =====") + + // Rename fact_sales to archive_sales (no longer matches fact_*) + sql """ALTER TABLE ${dbName}.fact_sales RENAME archive_sales""" + + def matchedAfterRename = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + [] as Set, + ["${dbName}.fact_sales".toString(), "${dbName}.archive_sales".toString()] as Set) + logger.info("MatchedTables after rename to archive_sales: ${matchedAfterRename}") + assert !("${dbName}.fact_sales".toString() in matchedAfterRename) + assert !("${dbName}.archive_sales".toString() in matchedAfterRename) + + // Job still running even with no matched tables + def jobInfoAfterRename = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert jobInfoAfterRename[0][3] in ["RUNNING", "PENDING"] + + // Rename back to a matching name + sql """ALTER TABLE ${dbName}.archive_sales RENAME fact_revenue""" + + def matchedAfterRenameBack = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.fact_revenue".toString()] as Set) + logger.info("MatchedTables after rename to fact_revenue: ${matchedAfterRenameBack}") + assert "${dbName}.fact_revenue".toString() in matchedAfterRenameBack + + // Verify warmup still works after rename-back — with quantitative metric check + def metricsBeforeRenameInsert = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + def numRenameInserts = 5 + sql """use ${dbName}""" + for (int i = 0; i < numRenameInserts; i++) { + sql """INSERT INTO fact_revenue VALUES (${i + 100}, ${i * 50.0})""" + } + + def metricsAfterRenameInsert = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + metricsBeforeRenameInsert.finished + numRenameInserts) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, 
clusterName2) + + def renameReqDelta = metricsAfterRenameInsert.requested - metricsBeforeRenameInsert.requested + def renameSubDelta = metricsAfterRenameInsert.submitted - metricsBeforeRenameInsert.submitted + def renameFinDelta = metricsAfterRenameInsert.finished - metricsBeforeRenameInsert.finished + def renameFailDelta = metricsAfterRenameInsert.failed - metricsBeforeRenameInsert.failed + logger.info("Rename test deltas: requested=${renameReqDelta}, submitted=${renameSubDelta}, finished=${renameFinDelta}, failed=${renameFailDelta}") + assert renameReqDelta >= numRenameInserts : "Expected requested >= ${numRenameInserts}, got ${renameReqDelta}" + assert renameSubDelta >= numRenameInserts : "Expected submitted >= ${numRenameInserts}, got ${renameSubDelta}" + assert renameFinDelta >= numRenameInserts : "Expected finished >= ${numRenameInserts}, got ${renameFinDelta}" + assert renameFailDelta == 0 : "Expected 0 failed, got ${renameFailDelta}" + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS fact_orders""" + sql """DROP TABLE IF EXISTS fact_sales""" + sql """DROP TABLE IF EXISTS fact_revenue""" + sql """DROP TABLE IF EXISTS archive_sales""" + sql """DROP TABLE IF EXISTS dim_product""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_error_and_lifecycle.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_error_and_lifecycle.groovy new file mode 100644 index 00000000000000..34d68357ea31ef --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_error_and_lifecycle.groovy @@ -0,0 +1,387 @@ +// Licensed to the Apache Software 
Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils +import groovy.json.JsonSlurper + +// Covers ON TABLES statement validation errors, cluster-level vs table-level job conflicts, +// duplicate detection, cancel-and-recreate, and '?' wildcard matching for event-driven warm-up jobs. +suite('test_warm_up_event_on_tables_error_and_lifecycle', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_err_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS base_table (id INT, val STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + sql """use
@${clusterName1}""" + + // ===== Error Test 1: Exclude-only (no INCLUDE) ===== + logger.info("===== Error Test 1: Exclude-only =====") + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + EXCLUDE '${dbName}.tmp_*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected error for exclude-only ON TABLES" + } catch (java.sql.SQLException e) { + logger.info("Expected error: ${e.getMessage()}") + assert e.getMessage().contains("at least one INCLUDE") : "Error should mention INCLUDE requirement: ${e.getMessage()}" + } + + // ===== Error Test 2: Invalid pattern format (missing db.table) ===== + logger.info("===== Error Test 2: Invalid pattern format =====") + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE 'orders' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected error for invalid pattern format" + } catch (java.sql.SQLException e) { + logger.info("Expected error: ${e.getMessage()}") + assert e.getMessage().contains("db.table") : "Error should mention db.table format: ${e.getMessage()}" + } + + // ===== Error Test 3: ON TABLES with non-event-driven sync mode ===== + logger.info("===== Error Test 3: ON TABLES with periodic sync mode =====") + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "periodic", + "sync_interval_sec" = "10" + ) + """ + assert false : "Expected error for ON TABLES with periodic sync" + } catch (java.sql.SQLException e) { + logger.info("Expected error: ${e.getMessage()}") + assert e.getMessage().contains("event_driven") : "Error should mention event_driven requirement: ${e.getMessage()}" + } + + // ===== Error Test 4: No tables match the pattern ===== + logger.info("===== Error Test 4: No matching tables =====") + try { + sql """ + WARM
UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE 'nonexistent_db_xyz.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected error for no matching tables" + } catch (java.sql.SQLException e) { + logger.info("Expected error: ${e.getMessage()}") + def msg = e.getMessage().toLowerCase() + assert msg.contains("no tables matched") || msg.contains("no table") : "Error should indicate no tables matched: ${e.getMessage()}" + } + + // ===== Error Test 5: ON TABLES with ONCE sync mode ===== + logger.info("===== Error Test 5: ON TABLES with once sync mode =====") + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "once" + ) + """ + assert false : "Expected error for ON TABLES with once sync" + } catch (java.sql.SQLException e) { + logger.info("Expected error: ${e.getMessage()}") + assert e.getMessage().contains("event_driven") : "Error should mention event_driven: ${e.getMessage()}" + } + + // ===== Lifecycle Test 1: Cluster-level and table-level jobs are mutually exclusive ===== + logger.info("===== Lifecycle Test 1: Cross-level conflict =====") + + // Create cluster-level event-driven job + def clusterJobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def clusterJobId = clusterJobId_[0][0] + jobIds << clusterJobId + logger.info("Cluster-level job ID: ${clusterJobId}") + + // Creating a table-level load-event job for the same source and destination should fail.
+ try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.base_table' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected table-level job to conflict with existing cluster-level load-event job" + } catch (java.sql.SQLException e) { + logger.info("Expected cross-level conflict: ${e.getMessage()}") + assert e.getMessage().contains("Cannot create table-level load-event warm up job") : e.getMessage() + assert e.getMessage().contains("conflicting cluster-level load-event warm up job ${clusterJobId}") : + e.getMessage() + assert e.getMessage().contains("Cancel existing load-event warm up job ${clusterJobId}") : + e.getMessage() + } + + def clusterJobInfo = sql """SHOW WARM UP JOB WHERE ID = ${clusterJobId}""" + assert clusterJobInfo[0][13] == "" : "Cluster-level job should have empty TableFilter" + assert clusterJobInfo[0][14] == "" : "Cluster-level job should have empty MatchedTables" + + sql """CANCEL WARM UP JOB WHERE ID = ${clusterJobId}""" + def clusterCancelInfo = sql """SHOW WARM UP JOB WHERE ID = ${clusterJobId}""" + assert clusterCancelInfo[0][3] == "CANCELLED" + + // Create table-level event-driven job after cancelling the conflicting cluster-level job.
+ def tableJobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.base_table' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def tableJobId = tableJobId_[0][0] + jobIds << tableJobId + logger.info("Table-level job ID: ${tableJobId}") + + def tableJobInfo = sql """SHOW WARM UP JOB WHERE ID = ${tableJobId}""" + + // Table-level job should have non-empty TableFilter and MatchedTables + assert tableJobInfo[0][13].length() > 0 : "Table-level job should have non-empty TableFilter" + def tableJobMatched = WarmupMetricsUtils.parseMatchedTables(tableJobInfo) + assert "${dbName}.base_table".toString() in tableJobMatched : "Table-level job MatchedTables should contain base_table" + + // Creating a cluster-level load-event job should also fail while the table-level job is running. + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert false : "Expected cluster-level job to conflict with existing table-level load-event job" + } catch (java.sql.SQLException e) { + logger.info("Expected reverse cross-level conflict: ${e.getMessage()}") + assert e.getMessage().contains("Cannot create cluster-level load-event warm up job") : e.getMessage() + assert e.getMessage().contains("conflicting table-level load-event warm up job ${tableJobId}") : + e.getMessage() + assert e.getMessage().contains("Cancel existing load-event warm up job ${tableJobId}") : + e.getMessage() + } + + // ===== Lifecycle Test 2: Duplicate detection with normalized rules ===== + logger.info("===== Lifecycle Test 2: Duplicate detection =====") + + // Try creating same table-level job again + try { + sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.base_table' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + assert
false : "Expected duplicate job error" + } catch (java.sql.SQLException e) { + logger.info("Expected error for duplicate: ${e.getMessage()}") + assert e.getMessage().contains("already has a runnable job") : e.getMessage() + } + + // Different filter should succeed + def tableJobId2_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def tableJobId2 = tableJobId2_[0][0] + jobIds << tableJobId2 + logger.info("Table-level job2 ID (different filter): ${tableJobId2}") + + // ===== Lifecycle Test 3: Cancel and recreate ===== + logger.info("===== Lifecycle Test 3: Cancel and recreate =====") + + sql """CANCEL WARM UP JOB WHERE ID = ${tableJobId}""" + def cancelInfo = sql """SHOW WARM UP JOB WHERE ID = ${tableJobId}""" + assert cancelInfo[0][3] == "CANCELLED" + + // After cancelling, we should be able to create a job with the same filter + def tableJobId3_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.base_table' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def tableJobId3 = tableJobId3_[0][0] + jobIds << tableJobId3 + logger.info("Table-level job3 ID (after cancel+recreate): ${tableJobId3}") + + // Verify new job is running + def jobInfo3 = sql """SHOW WARM UP JOB WHERE ID = ${tableJobId3}""" + assert jobInfo3[0][3] in ["RUNNING", "PENDING"] : "Recreated job should be running" + + // ===== Lifecycle Test 4: ? wildcard matching with quantitative metrics ===== + logger.info("===== Lifecycle Test 4: ?
wildcard =====") + + // Cancel the jobs left running by Tests 2 and 3. The INCLUDE '${dbName}.*' job would also cover + // log_ab, and the metrics below are fetched per cluster pair (not per job), so leaving it + // running would make the log_ab negative proof meaningless. + for (jid in [tableJobId2, tableJobId3]) { + sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" + } + + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS log_a (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS log_b (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS log_ab (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + sql """use @${clusterName1}""" + def jobIdQ_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.log_?' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobIdQ = jobIdQ_[0][0] + jobIds << jobIdQ + logger.info("Wildcard ? job ID: ${jobIdQ}") + + sleep(3000) + + def jobInfoQ = sql """SHOW WARM UP JOB WHERE ID = ${jobIdQ}""" + def matchedSetQ = WarmupMetricsUtils.parseMatchedTables(jobInfoQ) + logger.info("MatchedTables for ? wildcard: ${matchedSetQ}") + assert "${dbName}.log_a".toString() in matchedSetQ : "log_a should match log_? pattern" + assert "${dbName}.log_b".toString() in matchedSetQ : "log_b should match log_? pattern" + assert !("${dbName}.log_ab".toString() in matchedSetQ) : "log_ab should NOT match log_? (? matches exactly one char)" + + // Quantitative metric verification for ?
wildcard + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + // Negative proof: insert ONLY into the non-matched table log_ab first + def numInserts = 3 + sql """use ${dbName}""" + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO log_ab VALUES (${i}, 'msg_ab_${i}')""" + } + sleep(5000) + + def metricsAfterNonMatched = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + assert metricsAfterNonMatched.submitted - baseMetrics.submitted == 0 : "log_ab inserts should not submit warmup segments" + assert metricsAfterNonMatched.finished - baseMetrics.finished == 0 : "log_ab inserts should not finish warmup segments" + + // Positive proof: insert into matched tables log_a and log_b + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO log_a VALUES (${i}, 'msg_a_${i}')""" + sql """INSERT INTO log_b VALUES (${i}, 'msg_b_${i}')""" + } + def expectedSegments = numInserts * 2 // 2 matched tables + + def finalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + metricsAfterNonMatched.finished + expectedSegments) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + def reqDelta = finalMetrics.requested - metricsAfterNonMatched.requested + def subDelta = finalMetrics.submitted - metricsAfterNonMatched.submitted + def finDelta = finalMetrics.finished - metricsAfterNonMatched.finished + def failDelta = finalMetrics.failed - metricsAfterNonMatched.failed + logger.info("?
wildcard deltas: requested=${reqDelta}, submitted=${subDelta}, finished=${finDelta}, failed=${failDelta}") + + // log_ab was shown above to trigger no warm-up; every load into log_a and log_b must be warmed + assert reqDelta >= expectedSegments : "Expected requested >= ${expectedSegments}, got ${reqDelta}" + assert subDelta >= expectedSegments : "Expected submitted >= ${expectedSegments}, got ${subDelta}" + assert finDelta >= expectedSegments : "Expected finished >= ${expectedSegments}, got ${finDelta}" + assert failDelta == 0 : "Expected 0 failed, got ${failDelta}" + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS base_table""" + sql """DROP TABLE IF EXISTS log_a""" + sql """DROP TABLE IF EXISTS log_b""" + sql """DROP TABLE IF EXISTS log_ab""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include.groovy new file mode 100644 index 00000000000000..c2b02d8085a400 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include.groovy @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils +import groovy.json.JsonSlurper + +// Covers a table-level event-driven warm-up job with a single INCLUDE wildcard rule: loads into the +// included database must be warmed, loads into another database must not. +suite('test_warm_up_event_on_tables_include', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_inc_db" + def dbExcluded = "test_on_tables_exc_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbExcluded}""" + + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS customers (id INT, name STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE VIEW IF NOT EXISTS view_orders AS SELECT id, amount FROM orders""" + + sql """use
${dbExcluded}""" + sql """CREATE TABLE IF NOT EXISTS logs (id INT, msg STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + // Create a table-level event-driven (load) job whose only rule is INCLUDE '${dbName}.*' + sql """use @${clusterName1}""" + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + jobIds << jobId + logger.info("Warm-up job ID: ${jobId}") + + sleep(3000) + + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + logger.info("Baseline metrics: ${baseMetrics}") + + // Negative proof: insert ONLY into ${dbExcluded}, which the INCLUDE '${dbName}.*' rule does not cover + def numExcludedInserts = 5 + sql """use ${dbExcluded}""" + for (int i = 0; i < numExcludedInserts; i++) { + sql """INSERT INTO logs VALUES (${i}, 'log_message_${i}')""" + } + sleep(5000) + + def metricsAfterExcluded = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + def excludedSubmittedDelta = metricsAfterExcluded.submitted - baseMetrics.submitted + def excludedFinishedDelta = metricsAfterExcluded.finished - baseMetrics.finished + assert excludedSubmittedDelta == 0 : "Excluded inserts should not submit segments, delta=${excludedSubmittedDelta}" + assert excludedFinishedDelta == 0 : "Excluded inserts should not finish segments, delta=${excludedFinishedDelta}" + + // Positive proof: insert into included db; deltas are measured from metricsAfterExcluded + def numIncludedInserts = 5 + def expectedSegments = numIncludedInserts * 2 // 2 tables: orders + customers + sql """use ${dbName}""" + for (int i = 0; i < numIncludedInserts; i++) { + sql """INSERT INTO orders VALUES (${i}, ${i * 10.5})""" + sql """INSERT INTO customers VALUES (${i}, 'customer_${i}')""" + } + + def finalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + metricsAfterExcluded.finished + expectedSegments) +
WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + def requestedDelta = finalMetrics.requested - metricsAfterExcluded.requested + def submittedDelta = finalMetrics.submitted - metricsAfterExcluded.submitted + def finishedDelta = finalMetrics.finished - metricsAfterExcluded.finished + def failedDelta = finalMetrics.failed - metricsAfterExcluded.failed + logger.info("Included warmup deltas: requested=${requestedDelta}, submitted=${submittedDelta}, finished=${finishedDelta}, failed=${failedDelta}") + assert requestedDelta >= expectedSegments : "Expected requested >= ${expectedSegments}, got ${requestedDelta}" + assert submittedDelta >= expectedSegments : "Expected submitted >= ${expectedSegments}, got ${submittedDelta}" + assert finishedDelta >= expectedSegments : "Expected finished >= ${expectedSegments}, got ${finishedDelta}" + assert failedDelta == 0 : "Expected 0 failed segments, got ${failedDelta}" + + // Verify SHOW WARM UP JOB output (column 13 is read below as the TableFilter JSON) + def jobInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert jobInfo[0][0] == jobId + assert jobInfo[0][1] == clusterName1 + assert jobInfo[0][2] == clusterName2 + assert jobInfo[0][3] in ["RUNNING", "PENDING"] + assert jobInfo[0][4] == "TABLES" + assert jobInfo[0][5] == "EVENT_DRIVEN (LOAD)" + + def tableFilter = jobInfo[0][13] + logger.info("TableFilter: ${tableFilter}") + assert tableFilter != null && tableFilter.length() > 0 + def filterJson = new JsonSlurper().parseText(tableFilter) + assert filterJson.include.contains("${dbName}.*".toString()) + assert !filterJson.containsKey("exclude") + + def matchedSet = WarmupMetricsUtils.parseMatchedTables(jobInfo) + logger.info("MatchedTables set: ${matchedSet}") + assert "${dbName}.orders".toString() in matchedSet + assert "${dbName}.customers".toString() in matchedSet + assert !matchedSet.contains("${dbName}.view_orders".toString()) + assert !matchedSet.any { it.startsWith("${dbExcluded}.") } + + } finally { + for (jid in jobIds) { + try { sql
"""CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS orders""" + sql """DROP TABLE IF EXISTS customers""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + try { + sql """use ${dbExcluded}""" + sql """DROP TABLE IF EXISTS logs""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbExcluded}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include_exclude.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include_exclude.groovy new file mode 100644 index 00000000000000..9fa2bdafc69d3b --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_include_exclude.groovy @@ -0,0 +1,153 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils +import groovy.json.JsonSlurper + +// Covers a table-level event-driven warm-up job that combines one INCLUDE wildcard with two EXCLUDE +// wildcards: loads into excluded tables must not be warmed, loads into the remaining tables must be. +suite('test_warm_up_event_on_tables_include_exclude', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_on_tables_ie_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS customers (id INT, name STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS tmp_staging (id INT, data STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS orders_bak (id INT, amount DOUBLE) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + sql """use @${clusterName1}""" + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*', + EXCLUDE '${dbName}.tmp_*', + EXCLUDE '${dbName}.*_bak' + ) + PROPERTIES ( + "sync_mode" = "event_driven", +
"sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + jobIds << jobId + logger.info("Warm-up job ID: ${jobId}") + + sleep(3000) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + // Negative proof: insert only into the tables removed by the EXCLUDE rules (tmp_staging, orders_bak) + def numExcInserts = 5 + sql """use ${dbName}""" + for (int i = 0; i < numExcInserts; i++) { + sql """INSERT INTO tmp_staging VALUES (${i}, 'staging_${i}')""" + sql """INSERT INTO orders_bak VALUES (${i}, ${i * 5.0})""" + } + sleep(5000) + + def metricsAfterExc = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + assert metricsAfterExc.submitted - baseMetrics.submitted == 0 : \ + "Excluded tables should not submit warmup segments" + assert metricsAfterExc.finished - baseMetrics.finished == 0 : \ + "Excluded tables should not finish warmup segments" + + // Positive proof: insert into included tables; deltas are measured from metricsAfterExc + def numIncInserts = 5 + def expectedSeg = numIncInserts * 2 // orders + customers + for (int i = 0; i < numIncInserts; i++) { + sql """INSERT INTO orders VALUES (${i + 100}, ${i * 20.5})""" + sql """INSERT INTO customers VALUES (${i + 100}, 'new_customer_${i}')""" + } + + def finalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + metricsAfterExc.finished + expectedSeg) + WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2) + + def reqDelta = finalMetrics.requested - metricsAfterExc.requested + def subDelta = finalMetrics.submitted - metricsAfterExc.submitted + def finDelta = finalMetrics.finished - metricsAfterExc.finished + def failDelta = finalMetrics.failed - metricsAfterExc.failed + logger.info("Included deltas: requested=${reqDelta}, submitted=${subDelta}, finished=${finDelta}, failed=${failDelta}") + assert reqDelta >= expectedSeg : "Expected requested >= ${expectedSeg}, got ${reqDelta}" + assert subDelta >= expectedSeg : "Expected submitted >= ${expectedSeg}, got ${subDelta}" + assert finDelta
>= expectedSeg : "Expected finished >= ${expectedSeg}, got ${finDelta}" + assert failDelta == 0 : "Expected 0 failed, got ${failDelta}" + + // Verify SHOW WARM UP JOB output (column 13 is parsed below as the TableFilter JSON) + def jobInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + def tableFilter = jobInfo[0][13] + logger.info("TableFilter: ${tableFilter}") + def filterJson = new JsonSlurper().parseText(tableFilter) + assert filterJson.include.contains("${dbName}.*".toString()) + assert filterJson.exclude.contains("${dbName}.*_bak".toString()) + assert filterJson.exclude.contains("${dbName}.tmp_*".toString()) + + def matchedSet = WarmupMetricsUtils.parseMatchedTables(jobInfo) + logger.info("MatchedTables set: ${matchedSet}") + assert "${dbName}.orders".toString() in matchedSet + assert "${dbName}.customers".toString() in matchedSet + assert !("${dbName}.tmp_staging".toString() in matchedSet) + assert !("${dbName}.orders_bak".toString() in matchedSet) + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS orders""" + sql """DROP TABLE IF EXISTS customers""" + sql """DROP TABLE IF EXISTS tmp_staging""" + sql """DROP TABLE IF EXISTS orders_bak""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy new file mode 100644 index 00000000000000..129451ddd862b4 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_mow_compaction.groovy @@ -0,0 +1,221 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import groovy.json.JsonSlurper +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Covers table-level event-driven warmup on a MOW table with upsert writes, target reads, and full compaction. +suite('test_warm_up_event_on_tables_mow_compaction', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=20000', + 'warm_up_rowset_sync_wait_max_timeout_ms=30000', + ] + options.cloudMode = true + options.beNum = 1 + + // Sends one HTTP request and returns the parsed JSON body; for status >= 400 the error stream is parsed instead. + def httpJson = { String method, String url, int readTimeoutMs = 180000 -> + def conn = new URL(url).openConnection() + conn.setRequestMethod(method) + conn.setConnectTimeout(10000) + conn.setReadTimeout(readTimeoutMs) + def text = conn.responseCode >= 400 ?
conn.errorStream?.text : conn.inputStream.text + assert text != null && !text.trim().isEmpty() : "empty HTTP response from ${url}" + return new JsonSlurper().parseText(text.trim()) + } + + // Requests a full compaction of one tablet; a reply of "already_exist" is accepted as well as "success". + def triggerFullCompaction = { ip, port, tabletId -> + def status = httpJson("POST", + "http://${ip}:${port}/api/compaction/run?tablet_id=${tabletId}&compact_type=full") + assert status.status.toString().toLowerCase() in ["success", "already_exist"] : + "trigger compaction failed on ${ip}:${port}, tablet=${tabletId}, status=${status}" + return status + } + + // Polls run_status once per second until run_status is falsy, failing the suite after timeoutMs. + def waitForCompactionFinish = { ip, port, tabletId, timeoutMs -> + long deadline = System.currentTimeMillis() + timeoutMs + def lastStatus = null + while (System.currentTimeMillis() < deadline) { + lastStatus = httpJson("GET", + "http://${ip}:${port}/api/compaction/run_status?tablet_id=${tabletId}", 10000) + // toString() keeps this in line with triggerFullCompaction: a missing or non-String status then + // fails the assertion with the message below instead of throwing on toLowerCase() + assert lastStatus.status.toString().toLowerCase() == "success" : + "compaction failed on ${ip}:${port}, tablet=${tabletId}, status=${lastStatus}" + if (!lastStatus.run_status) { + return lastStatus + } + sleep(1000) + } + assert false : "compaction did not finish on ${ip}:${port}, tablet=${tabletId}, last=${lastStatus}" + } + + docker(options) { + Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + def dbName = "test_on_tables_mow_compaction_db" + def tableName = "mow_tbl" + def jobIds = [] + + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster) + + try { + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS ${tableName} ( + id INT NOT NULL, + value INT, + tag STRING + ) + UNIQUE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "enable_unique_key_merge_on_write" = "true", + "file_cache_ttl_seconds" = "3600", + "disable_auto_compaction" = "true" + )""" + + def jobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES (INCLUDE
'${dbName}.${tableName}') + PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load") + """)[0][0] + jobIds << jobId + assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, + ["${dbName}.${tableName}".toString()] as Set) == + ["${dbName}.${tableName}".toString()] as Set + + WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner) + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """INSERT INTO ${tableName} VALUES (1, 10, 'a'), (2, 20, 'b'), (3, 30, 'c')""" + def afterInitialLoad = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + baseMetrics.finished + 1, 60000) + assert afterInitialLoad.finished >= baseMetrics.finished + 1 : + "initial MOW warmup should finish, metrics=${afterInitialLoad}" + assert afterInitialLoad.failed == baseMetrics.failed : + "initial MOW warmup should not fail, metrics=${afterInitialLoad}" + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + def initialRead = sql """SELECT count(*), sum(value) FROM ${tableName}""" + assert initialRead[0][0].toString() == "3" : "target initial MOW count mismatch: ${initialRead}" + assert initialRead[0][1].toString() == "60" : "target initial MOW sum mismatch: ${initialRead}" + + def beforeUpsert = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """INSERT INTO ${tableName} VALUES (2, 200, 'b2'), (3, 300, 'c2')""" + def afterUpsert = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeUpsert.finished + 1, 60000) + assert afterUpsert.finished >= beforeUpsert.finished + 1 : + "first MOW upsert warmup should finish, metrics=${afterUpsert}" + assert afterUpsert.failed == beforeUpsert.failed : + "first MOW upsert warmup should not fail, metrics=${afterUpsert}" + + def beforeSecondUpsert = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, 
dstCluster) + sql """INSERT INTO ${tableName} VALUES (2, 220, 'b3'), (4, 40, 'd')""" + def afterSecondUpsert = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeSecondUpsert.finished + 1, 60000) + assert afterSecondUpsert.finished >= beforeSecondUpsert.finished + 1 : + "second MOW upsert warmup should finish, metrics=${afterSecondUpsert}" + assert afterSecondUpsert.failed == beforeSecondUpsert.failed : + "second MOW upsert warmup should not fail, metrics=${afterSecondUpsert}" + + def tablets = sql_return_maparray """SHOW TABLETS FROM ${tableName}""" + assert tablets.size() == 1 : "${tableName} should have one tablet, tablets=${tablets}" + def tabletId = tablets[0].TabletId.toString() + def sourceBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, srcCluster)[0] + def beforeCompaction = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + + triggerFullCompaction(sourceBe[1].toString(), sourceBe[4].toString(), tabletId) + waitForCompactionFinish(sourceBe[1].toString(), sourceBe[4].toString(), tabletId, 90000) + + def afterCompaction = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforeCompaction.finished + 1, 90000) + assert afterCompaction.finished >= beforeCompaction.finished + 1 : + "MOW full compaction rowset warmup should finish, metrics=${afterCompaction}" + assert afterCompaction.failed == beforeCompaction.failed : + "MOW full compaction rowset warmup should not fail, metrics=${afterCompaction}" + + def beforePostCompactionUpsert = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + sql """INSERT INTO ${tableName} VALUES (2, 222, 'b4'), (4, 44, 'd2'), (5, 50, 'e')""" + def afterPostCompactionUpsert = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + beforePostCompactionUpsert.finished + 1, 60000) + assert afterPostCompactionUpsert.finished >= beforePostCompactionUpsert.finished + 1 : + "post-compaction MOW upsert warmup should 
finish, metrics=${afterPostCompactionUpsert}" + assert afterPostCompactionUpsert.failed == beforePostCompactionUpsert.failed : + "post-compaction MOW upsert warmup should not fail, metrics=${afterPostCompactionUpsert}" + + def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId, + { it.seg_num.finish_5m >= 5 && it.seg_num.fail_5m == 0 && it.seg_num.gap_5m == 0 }, 60000) + logger.info("MOW warmup SyncStats: ${stats}") + assert stats.seg_num.fail_5m == 0 : "MOW warmup SyncStats should have no failures: ${stats}" + assert stats.seg_num.gap_5m == 0 : "MOW warmup SyncStats should converge: ${stats}" + + sql """use @${dstCluster}""" + sql """use ${dbName}""" + profile("mow_compaction_target_profile") { + sql """set enable_profile = true""" + sql """set profile_level = 2""" + run { + def res = sql """/* mow_compaction_target_profile */ SELECT count(*), sum(value) FROM ${tableName}""" + assert res[0][0].toString() == "5" : "target final MOW count mismatch: ${res}" + assert res[0][1].toString() == "626" : "target final MOW sum mismatch: ${res}" + sleep(1000) + } + check { profileString, exception -> + if (exception != null) { + throw exception + } + assert profileString.contains("NumRemoteIOTotal") : "profile should contain file cache counters" + def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumRemoteIOTotal") + def localTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumLocalIOTotal") + logger.info("MOW profile NumRemoteIOTotal=${remoteTotal}, NumLocalIOTotal=${localTotal}") + assert remoteTotal == 0 : "warmed MOW target query should not read remote data" + assert localTotal > 0 : "warmed MOW target query should hit local file cache" + } + } + + def finalRead = sql """SELECT id, value, tag FROM ${tableName} ORDER BY id""" + assert finalRead.toString() == "[[1, 10, a], [2, 222, b4], [3, 300, c2], [4, 44, d2], [5, 50, e]]" : + "target MOW rows mismatch after upsert and full compaction: ${finalRead}" + } finally { + for (jid 
in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use @${srcCluster}""" + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS ${tableName}""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_dst.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_dst.groovy new file mode 100644 index 00000000000000..7a220876378558 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_dst.groovy @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.util.WarmupMetricsUtils
+import groovy.json.JsonSlurper
+// Scenario: two event-driven ON TABLES jobs share one source cluster: job1 (orders only) -> target1, job2 (db.*) -> target2.
+suite('test_warm_up_event_on_tables_multi_dst', 'docker') {
+    def options = new ClusterOptions()
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_warm_up_table_filter_refresh_interval_ms=1000',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+    ]
+    options.cloudMode = true
+    options.beNum = 1
+
+    docker(options) {
+        Closure sqlRunner = { String q -> sql(q) } // adapter handed to the WarmupMetricsUtils helpers so they can issue SQL
+
+        def srcCluster = "warmup_source"
+        def dstCluster1 = "warmup_target_1"
+        def dstCluster2 = "warmup_target_2"
+
+        cluster.addBackend(1, srcCluster)
+        cluster.addBackend(1, dstCluster1)
+        cluster.addBackend(1, dstCluster2)
+
+        sql """use @${srcCluster}"""
+
+        def dbName = "test_on_tables_multi_dst_db"
+        def jobIds = [] // every created job id is recorded here so the finally block can cancel it
+
+        try {
+            sql """CREATE DATABASE IF NOT EXISTS ${dbName}"""
+            sql """use ${dbName}"""
+
+            sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount DOUBLE)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+            sql """CREATE TABLE IF NOT EXISTS logs (id INT, msg STRING)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+
+            // ===== Create job1: source -> target1, only 'orders' =====
+            sql """use @${srcCluster}"""
+            def jobId1_ = sql """
+                WARM UP CLUSTER ${dstCluster1} WITH CLUSTER ${srcCluster}
+                ON TABLES (
+                    INCLUDE '${dbName}.orders'
+                )
+                PROPERTIES (
+                    "sync_mode" = "event_driven",
+                    "sync_event" = "load"
+                )
+            """
+            def jobId1 = jobId1_[0][0] // first column of the first result row is used as the job id
+            jobIds << jobId1
+            logger.info("Job1 (source -> target1, orders only): ID=${jobId1}")
+
+            // ===== Create job2: source -> target2, all tables =====
+            def jobId2_ = sql """
+                WARM UP CLUSTER ${dstCluster2} WITH CLUSTER ${srcCluster}
+                ON TABLES (
+                    INCLUDE '${dbName}.*'
+                )
+                PROPERTIES (
+                    "sync_mode" = "event_driven",
+                    "sync_event" = "load"
+                )
+            """
+            def jobId2 = jobId2_[0][0]
+            jobIds << jobId2
+            logger.info("Job2 (source -> target2, all tables): ID=${jobId2}")
+
+            sleep(3000) // NOTE(review): fixed settle delay; presumably lets the new jobs propagate, confirm it is still needed
+
+            // Verify matched tables for each job (3rd argument: tables that must match, 4th: tables that must not)
+            def matched1 = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId1,
+                    ["${dbName}.orders".toString()] as Set,
+                    ["${dbName}.logs".toString()] as Set)
+            logger.info("Job1 MatchedTables: ${matched1}")
+            assert "${dbName}.orders".toString() in matched1
+            assert !("${dbName}.logs".toString() in matched1)
+
+            def matched2 = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId2,
+                    ["${dbName}.orders".toString(), "${dbName}.logs".toString()] as Set)
+            logger.info("Job2 MatchedTables: ${matched2}")
+            assert "${dbName}.orders".toString() in matched2
+            assert "${dbName}.logs".toString() in matched2
+
+            // ===== Test 1: Insert into 'orders' — both targets should warm up =====
+            logger.info("===== Test 1: orders -> both targets =====")
+
+            def baseDst1 = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster1)
+            def baseDst2 = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster2)
+            logger.info("Baseline target1: ${baseDst1}, target2: ${baseDst2}")
+
+            def numInserts = 5
+            sql """use ${dbName}"""
+            for (int i = 0; i < numInserts; i++) {
+                sql """INSERT INTO orders VALUES (${i}, ${i * 10.5})"""
+            }
+
+            // Wait for both targets to finish (threshold = baseline finished + one per INSERT)
+            def finalDst1 = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster1,
+                    baseDst1.finished + numInserts)
+            def finalDst2 = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster2,
+                    baseDst2.finished + numInserts)
+            WarmupMetricsUtils.logWarmupMetrics(sqlRunner, srcCluster, dstCluster1)
+            WarmupMetricsUtils.logWarmupMetrics(sqlRunner, srcCluster, dstCluster2)
+
+            // Verify target1 (all checks are deltas against the pre-load baseline)
+            def dst1SubDelta = finalDst1.submitted - baseDst1.submitted
+            def dst1FinDelta = finalDst1.finished - baseDst1.finished
+            def dst1FailDelta = finalDst1.failed - baseDst1.failed
+            logger.info("Target1 deltas: submitted=${dst1SubDelta}, finished=${dst1FinDelta}, failed=${dst1FailDelta}")
+            assert dst1SubDelta >= numInserts : "Target1: expected submitted >= ${numInserts}, got ${dst1SubDelta}"
+            assert dst1FinDelta >= numInserts : "Target1: expected finished >= ${numInserts}, got ${dst1FinDelta}"
+            assert dst1FailDelta == 0 : "Target1: expected 0 failed, got ${dst1FailDelta}"
+
+            // Verify target2
+            def dst2SubDelta = finalDst2.submitted - baseDst2.submitted
+            def dst2FinDelta = finalDst2.finished - baseDst2.finished
+            def dst2FailDelta = finalDst2.failed - baseDst2.failed
+            logger.info("Target2 deltas: submitted=${dst2SubDelta}, finished=${dst2FinDelta}, failed=${dst2FailDelta}")
+            assert dst2SubDelta >= numInserts : "Target2: expected submitted >= ${numInserts}, got ${dst2SubDelta}"
+            assert dst2FinDelta >= numInserts : "Target2: expected finished >= ${numInserts}, got ${dst2FinDelta}"
+            assert dst2FailDelta == 0 : "Target2: expected 0 failed, got ${dst2FailDelta}"
+
+            // ===== Test 2: Insert into 'logs' — only target2 should warm up =====
+            logger.info("===== Test 2: logs -> only target2 =====")
+
+            // Wait for metrics to stabilize before negative proof
+            WarmupMetricsUtils.waitForMetricsStable(sqlRunner, srcCluster, dstCluster1)
+            def baseDst1ForLogs = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster1)
+            def baseDst2ForLogs = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster2)
+
+            sql """use ${dbName}"""
+            for (int i = 0; i < numInserts; i++) {
+                sql """INSERT INTO logs VALUES (${i}, 'log_${i}')"""
+            }
+
+            // Wait for target2 to finish (logs is matched by job2)
+            def finalDst2ForLogs = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster2,
+                    baseDst2ForLogs.finished + numInserts)
+            WarmupMetricsUtils.logWarmupMetrics(sqlRunner, srcCluster, dstCluster2)
+
+            // Verify target2 warmed logs
+            def dst2LogsSubDelta = finalDst2ForLogs.submitted - baseDst2ForLogs.submitted
+            def dst2LogsFinDelta = finalDst2ForLogs.finished - baseDst2ForLogs.finished
+            def dst2LogsFailDelta = finalDst2ForLogs.failed - baseDst2ForLogs.failed
+            logger.info("Target2 logs deltas: submitted=${dst2LogsSubDelta}, finished=${dst2LogsFinDelta}, failed=${dst2LogsFailDelta}")
+            assert dst2LogsSubDelta >= numInserts : "Target2: expected submitted >= ${numInserts}, got ${dst2LogsSubDelta}"
+            assert dst2LogsFinDelta >= numInserts : "Target2: expected finished >= ${numInserts}, got ${dst2LogsFinDelta}"
+            assert dst2LogsFailDelta == 0 : "Target2: expected 0 failed, got ${dst2LogsFailDelta}"
+
+            // Verify target1 did NOT warm logs (negative proof)
+            sleep(5000) // grace period: a wrongly routed warm-up would have to show up in target1's counters within this window
+            def finalDst1ForLogs = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, srcCluster, dstCluster1) // NOTE(review): relies on logWarmupMetrics returning the metrics it logs; confirm
+            def dst1LogsSubDelta = finalDst1ForLogs.submitted - baseDst1ForLogs.submitted
+            def dst1LogsFinDelta = finalDst1ForLogs.finished - baseDst1ForLogs.finished
+            logger.info("Target1 logs deltas: submitted=${dst1LogsSubDelta}, finished=${dst1LogsFinDelta}")
+            assert dst1LogsSubDelta == 0 : "Target1 should NOT warm logs, submitted delta=${dst1LogsSubDelta}"
+            assert dst1LogsFinDelta == 0 : "Target1 should NOT warm logs, finished delta=${dst1LogsFinDelta}"
+
+            // ===== Verify SHOW WARM UP JOB for both jobs =====
+            logger.info("===== Verify SHOW WARM UP JOB output =====")
+
+            def jobInfo1 = sql """SHOW WARM UP JOB WHERE ID = ${jobId1}"""
+            assert jobInfo1[0][2] == dstCluster1 // column 2 is compared against the destination cluster name
+            assert jobInfo1[0][3] in ["RUNNING", "PENDING"] // column 3 is checked as the job state
+            def filter1 = new JsonSlurper().parseText(jobInfo1[0][13]) // column 13 is parsed as the table-filter JSON
+            assert filter1.include.contains("${dbName}.orders".toString())
+            assert !filter1.containsKey("exclude") // job1 declared no EXCLUDE rule, so the key must be absent
+
+            def jobInfo2 = sql """SHOW WARM UP JOB WHERE ID = ${jobId2}"""
+            assert jobInfo2[0][2] == dstCluster2
+            assert jobInfo2[0][3] in ["RUNNING", "PENDING"]
+            def filter2 = new JsonSlurper().parseText(jobInfo2[0][13])
+            assert filter2.include.contains("${dbName}.*".toString())
+
+        } finally { // best-effort cleanup: every step swallows its own exception so later steps still run
+            for (jid in jobIds) {
+                try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {}
+            }
+            try {
+                sql """use ${dbName}"""
+                sql """DROP TABLE IF EXISTS orders"""
+                sql """DROP TABLE IF EXISTS logs"""
+            } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {}
+        }
+    }
+}
diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_include.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_include.groovy
new file mode 100644
index 00000000000000..1faccc40e10cdf
--- /dev/null
+++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_multi_include.groovy
@@ -0,0 +1,142 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.util.WarmupMetricsUtils
+// Scenario: one event-driven job with two INCLUDE rules spanning two databases; a third, unlisted table must not be warmed.
+suite('test_warm_up_event_on_tables_multi_include', 'docker') {
+    def options = new ClusterOptions()
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_warm_up_table_filter_refresh_interval_ms=1000',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+    ]
+    options.cloudMode = true
+    options.beNum = 1
+
+    docker(options) {
+        Closure sqlRunner = { String q -> sql(q) } // adapter handed to the WarmupMetricsUtils helpers so they can issue SQL
+
+        def clusterName1 = "warmup_source"
+        def clusterName2 = "warmup_target"
+
+        cluster.addBackend(1, clusterName1)
+        cluster.addBackend(1, clusterName2)
+
+        sql """use @${clusterName1}"""
+
+        def dbName = "test_on_tables_mi_db"
+        def dbOther = "test_on_tables_mi_other_db"
+        def jobIds = [] // every created job id is recorded here so the finally block can cancel it
+
+        try {
+            sql """CREATE DATABASE IF NOT EXISTS ${dbName}"""
+            sql """CREATE DATABASE IF NOT EXISTS ${dbOther}"""
+
+            sql """use ${dbName}"""
+            sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount DOUBLE)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+            sql """CREATE TABLE IF NOT EXISTS customers (id INT, name STRING)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+
+            sql """use ${dbOther}"""
+            sql """CREATE TABLE IF NOT EXISTS logs (id INT, msg STRING)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+
+            // Multiple INCLUDE: orders from dbName + logs from dbOther (but NOT customers)
+            sql """use @${clusterName1}"""
+            def jobId_ = sql """
+                WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1}
+                ON TABLES (
+                    INCLUDE '${dbName}.orders',
+                    INCLUDE '${dbOther}.logs'
+                )
+                PROPERTIES (
+                    "sync_mode" = "event_driven",
+                    "sync_event" = "load"
+                )
+            """
+            def jobId = jobId_[0][0] // first column of the first result row is used as the job id
+            jobIds << jobId
+            logger.info("Warm-up job ID: ${jobId}")
+            WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId, // wait until orders + logs are reported as matched (customers excluded) before taking the baseline
+                    ["${dbName}.orders".toString(), "${dbOther}.logs".toString()] as Set, ["${dbName}.customers".toString()] as Set)
+            sleep(3000) // NOTE(review): original fixed settle delay, kept after the explicit wait; presumably lets the job propagate, confirm
+            def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2)
+            // Insert into matched (orders, logs) and unmatched (customers)
+            def numInserts = 5
+            sql """use ${dbName}"""
+            for (int i = 0; i < numInserts; i++) {
+                sql """INSERT INTO orders VALUES (${i}, ${i * 30.0})"""
+                sql """INSERT INTO customers VALUES (${i}, 'extra_${i}')"""
+            }
+            sql """use ${dbOther}"""
+            for (int i = 0; i < numInserts; i++) {
+                sql """INSERT INTO logs VALUES (${i}, 'important_${i}')"""
+            }
+
+            // Expected: orders(5) + logs(5) = 10 segments; customers(5) NOT included
+            def expectedSeg = numInserts * 2  // orders + logs
+            def finalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2,
+                    baseMetrics.finished + expectedSeg)
+            WarmupMetricsUtils.logWarmupMetrics(sqlRunner, clusterName1, clusterName2)
+
+            def reqDelta = finalMetrics.requested - baseMetrics.requested
+            def subDelta = finalMetrics.submitted - baseMetrics.submitted
+            def finDelta = finalMetrics.finished - baseMetrics.finished
+            def failDelta = finalMetrics.failed - baseMetrics.failed
+            logger.info("Multi-include deltas: requested=${reqDelta}, submitted=${subDelta}, finished=${finDelta}, failed=${failDelta}")
+            assert reqDelta >= expectedSeg : "Expected requested >= ${expectedSeg}, got ${reqDelta}"
+            assert subDelta >= expectedSeg : "Expected submitted >= ${expectedSeg}, got ${subDelta}"
+            assert finDelta >= expectedSeg : "Expected finished >= ${expectedSeg}, got ${finDelta}"
+            // customers(5 inserts) should NOT contribute; if they did, submitted would be >= 15
+            assert subDelta < expectedSeg + numInserts : \
+                "customers should NOT be warmed (submitted=${subDelta} should be < ${expectedSeg + numInserts})"
+            assert failDelta == 0 : "Expected 0 failed, got ${failDelta}"
+
+            def jobInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" // re-read after the loads: the matched set must still exclude customers
+            def matchedSet = WarmupMetricsUtils.parseMatchedTables(jobInfo)
+            logger.info("MatchedTables set: ${matchedSet}")
+            assert "${dbName}.orders".toString() in matchedSet
+            assert "${dbOther}.logs".toString() in matchedSet
+            assert !("${dbName}.customers".toString() in matchedSet)
+
+        } finally { // best-effort cleanup: every step swallows its own exception so later steps still run
+            for (jid in jobIds) {
+                try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {}
+            }
+            try {
+                sql """use ${dbName}"""
+                sql """DROP TABLE IF EXISTS orders"""
+                sql """DROP TABLE IF EXISTS customers"""
+            } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {}
+            try {
+                sql """use ${dbOther}"""
+                sql """DROP TABLE IF EXISTS logs"""
+            } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${dbOther}""" } catch (Exception ignored) {}
+        }
+    }
+}
diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_overlap_and_mv.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_overlap_and_mv.groovy
new file mode 100644
index 00000000000000..fd55d170e483a9
--- /dev/null
+++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_overlap_and_mv.groovy
@@ -0,0 +1,332 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.util.WarmupMetricsUtils
+// Two scenarios on one docker cluster: FT-10 (overlapping table-level jobs) and FT-12 (async MV vs. rollup/sync MV matching).
+suite('test_warm_up_event_on_tables_overlap_and_mv', 'docker') {
+    def options = new ClusterOptions()
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_warm_up_table_filter_refresh_interval_ms=1000',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+    ]
+    options.cloudMode = true
+    options.beNum = 1
+
+    docker(options) {
+        Closure sqlRunner = { String q -> sql(q) } // adapter handed to the WarmupMetricsUtils helpers so they can issue SQL
+
+        def dstCluster = "warmup_target"
+
+        def clusters = sql """SHOW CLUSTERS"""
+        assert !clusters.isEmpty() : "SHOW CLUSTERS should return the default source cluster"
+        def defaultCluster = clusters.find { // pick the row whose column 1 reads "true" (NOTE(review): presumably the is-current flag; confirm)
+            it[1].toString().equalsIgnoreCase("true")
+        }
+        def srcCluster = (defaultCluster ?: clusters[0])[0].toString() // fall back to the first row; column 0 is used as the cluster name
+        logger.info("use default source cluster for overlap and mv warmup case: ${srcCluster}")
+        cluster.addBackend(1, dstCluster)
+
+        def overlapDb = "test_on_tables_overlap_extra_db"
+        def mvDb = "test_on_tables_mv_extra_db"
+        def jobIds = [] // every created job id is recorded here so the finally block can cancel it
+
+        try {
+            // FT-10: overlapping table-level jobs can coexist without duplicate target downloads.
+            sql """use @${srcCluster}"""
+            sql """CREATE DATABASE IF NOT EXISTS ${overlapDb}"""
+            sql """use ${overlapDb}"""
+            sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount INT)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+            sql """CREATE TABLE IF NOT EXISTS customers (id INT, name STRING)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+            sql """CREATE TABLE IF NOT EXISTS audit_log (id INT, msg STRING)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+
+            def ordersJobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (
+                    INCLUDE '${overlapDb}.orders'
+                )
+                PROPERTIES (
+                    "sync_mode" = "event_driven",
+                    "sync_event" = "load"
+                )
+            """)[0][0]
+            jobIds << ordersJobId
+
+            def customersJobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (
+                    INCLUDE '${overlapDb}.customers'
+                )
+                PROPERTIES (
+                    "sync_mode" = "event_driven",
+                    "sync_event" = "load"
+                )
+            """)[0][0]
+            jobIds << customersJobId
+
+            def overlapJobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (
+                    INCLUDE '${overlapDb}.*',
+                    EXCLUDE '${overlapDb}.audit_*'
+                )
+                PROPERTIES (
+                    "sync_mode" = "event_driven",
+                    "sync_event" = "load"
+                )
+            """)[0][0]
+            jobIds << overlapJobId
+            // For each job: 3rd argument lists tables that must be matched, 4th lists tables that must not be.
+            def ordersMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, ordersJobId,
+                    ["${overlapDb}.orders".toString()] as Set,
+                    ["${overlapDb}.customers".toString(), "${overlapDb}.audit_log".toString()] as Set)
+            def customersMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, customersJobId,
+                    ["${overlapDb}.customers".toString()] as Set,
+                    ["${overlapDb}.orders".toString(), "${overlapDb}.audit_log".toString()] as Set)
+            def overlapMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, overlapJobId,
+                    ["${overlapDb}.orders".toString(), "${overlapDb}.customers".toString()] as Set,
+                    ["${overlapDb}.audit_log".toString()] as Set)
+            assert ordersMatched == ["${overlapDb}.orders".toString()] as Set
+            assert customersMatched == ["${overlapDb}.customers".toString()] as Set
+            assert overlapMatched == ["${overlapDb}.customers".toString(), "${overlapDb}.orders".toString()] as Set
+            sleep(3000) // NOTE(review): fixed settle delay after the matched-table wait; presumably lets the jobs propagate, confirm
+
+            def overlapBaseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster)
+            int rowsPerTable = 4
+            for (int i = 0; i < rowsPerTable; i++) {
+                sql """INSERT INTO orders VALUES (${i}, ${i * 10})"""
+                sql """INSERT INTO customers VALUES (${i}, 'customer_${i}')"""
+                sql """INSERT INTO audit_log VALUES (${i}, 'audit_${i}')"""
+            }
+
+            int uniqueMatchedSegments = rowsPerTable * 2 // orders + customers, each load counted once on the target
+            int jobMatchedSegments = rowsPerTable * 4 // orders and customers are each matched by two jobs (their own + the overlap job)
+            def overlapFinalMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster,
+                    overlapBaseMetrics.finished + uniqueMatchedSegments)
+            def requestedDelta = overlapFinalMetrics.requested - overlapBaseMetrics.requested
+            def submittedDelta = overlapFinalMetrics.submitted - overlapBaseMetrics.submitted
+            def finishedDelta = overlapFinalMetrics.finished - overlapBaseMetrics.finished
+            def failedDelta = overlapFinalMetrics.failed - overlapBaseMetrics.failed
+            logger.info("overlap deltas requested=${requestedDelta}, submitted=${submittedDelta}, " +
+                    "finished=${finishedDelta}, failed=${failedDelta}")
+            assert requestedDelta >= jobMatchedSegments :
+                "source requested should count each matching job, expected >= ${jobMatchedSegments}, got ${requestedDelta}"
+            assert submittedDelta >= uniqueMatchedSegments :
+                "target should warm all unique matched rowsets, expected >= ${uniqueMatchedSegments}, got ${submittedDelta}"
+            assert submittedDelta <= uniqueMatchedSegments :
+                "overlap jobs should not amplify target downloads, expected <= ${uniqueMatchedSegments}, got ${submittedDelta}"
+            assert finishedDelta >= uniqueMatchedSegments :
+                "target should finish all unique matched rowsets, expected >= ${uniqueMatchedSegments}, got ${finishedDelta}"
+            assert failedDelta == 0 : "overlap jobs should not fail, got failed delta ${failedDelta}"
+            // Per-job SyncStats: each single-table job must see only its own table's loads; the overlap job sees both tables.
+            def ordersStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, ordersJobId, { stats ->
+                stats.seg_num.requested_5m >= rowsPerTable
+            })
+            def customersStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, customersJobId, { stats ->
+                stats.seg_num.requested_5m >= rowsPerTable
+            })
+            def overlapStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, overlapJobId, { stats ->
+                stats.seg_num.requested_5m >= rowsPerTable * 2
+            })
+            assert ordersStats.seg_num.requested_5m < rowsPerTable * 2 :
+                "orders-only job should not include customers/audit, stats=${ordersStats}"
+            assert customersStats.seg_num.requested_5m < rowsPerTable * 2 :
+                "customers-only job should not include orders/audit, stats=${customersStats}"
+            assert overlapStats.seg_num.requested_5m >= rowsPerTable * 2 :
+                "overlap job should include orders and customers, stats=${overlapStats}"
+
+            for (jid in [ordersJobId, customersJobId, overlapJobId]) { // cancel the FT-10 jobs before the FT-12 scenario starts
+                try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {}
+            }
+            WarmupMetricsUtils.waitForMetricsStable(sqlRunner, srcCluster, dstCluster)
+
+            // FT-12: async MV is independently matchable, while sync MV/rollup warms with the base table.
+            sql """use @${srcCluster}"""
+            sql """CREATE DATABASE IF NOT EXISTS ${mvDb}"""
+            sql """use ${mvDb}"""
+            def baseTable = "fact_rollup"
+            def rollupName = "rollup_sum"
+            def asyncMv = "mv_async_summary"
+
+            sql """CREATE TABLE IF NOT EXISTS ${baseTable} (k INT, v INT)
+                DUPLICATE KEY(k) DISTRIBUTED BY HASH(k) BUCKETS 1
+                PROPERTIES (
+                    "file_cache_ttl_seconds" = "3600",
+                    "disable_auto_compaction" = "true"
+                )"""
+            sql """INSERT INTO ${baseTable} VALUES (1, 10), (2, 20)"""
+            sql """DROP MATERIALIZED VIEW IF EXISTS ${rollupName} ON ${baseTable}"""
+            sql """CREATE MATERIALIZED VIEW ${rollupName} AS
+                SELECT k AS rollup_k, sum(v) AS rollup_total_v FROM ${baseTable} GROUP BY k"""
+            waitingMVTaskFinishedByMvName(mvDb, baseTable, rollupName)
+
+            sql """DROP MATERIALIZED VIEW IF EXISTS ${asyncMv}"""
+            sql """
+                CREATE MATERIALIZED VIEW ${asyncMv}
+                BUILD DEFERRED REFRESH COMPLETE ON MANUAL
+                DISTRIBUTED BY HASH(k) BUCKETS 1
+                PROPERTIES ('replication_num' = '1')
+                AS SELECT k, sum(v) AS total_v FROM ${baseTable} GROUP BY k
+            """
+
+            def baseJobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (
+                    INCLUDE '${mvDb}.${baseTable}'
+                )
+                PROPERTIES (
+                    "sync_mode" = "event_driven",
+                    "sync_event" = "load"
+                )
+            """)[0][0]
+            jobIds << baseJobId
+
+            def mvJobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (
+                    INCLUDE '${mvDb}.mv_*'
+                )
+                PROPERTIES (
+                    "sync_mode" = "event_driven",
+                    "sync_event" = "load"
+                )
+            """)[0][0]
+            jobIds << mvJobId
+            // The rollup name must never appear as a matched table for either job; only the base table and the async MV may.
+            def baseMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, baseJobId,
+                    ["${mvDb}.${baseTable}".toString()] as Set,
+                    ["${mvDb}.${asyncMv}".toString(), "${mvDb}.${rollupName}".toString()] as Set)
+            def mvMatched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, mvJobId,
+                    ["${mvDb}.${asyncMv}".toString()] as Set,
+                    ["${mvDb}.${baseTable}".toString(), "${mvDb}.${rollupName}".toString()] as Set)
+            assert baseMatched == ["${mvDb}.${baseTable}".toString()] as Set :
+                "base filter should match only base table, got ${baseMatched}"
+            assert mvMatched == ["${mvDb}.${asyncMv}".toString()] as Set :
+                "mv_* filter should match only async MV, got ${mvMatched}"
+            sleep(3000) // NOTE(review): fixed settle delay after the matched-table wait; presumably lets the jobs propagate, confirm
+
+            def mvBaseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster)
+            int baseInsertRows = 3
+            for (int i = 0; i < baseInsertRows; i++) {
+                sql """INSERT INTO ${baseTable} VALUES (${i + 1}, ${i + 1})"""
+            }
+
+            int expectedBaseWarmupSegments = baseInsertRows // lower bound: one per INSERT statement issued above
+            def afterBaseLoad = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster,
+                    mvBaseMetrics.finished + expectedBaseWarmupSegments)
+            def baseLoadFinishedDelta = afterBaseLoad.finished - mvBaseMetrics.finished
+            logger.info("base table load with rollup warmup finished delta: ${baseLoadFinishedDelta}")
+            assert baseLoadFinishedDelta >= expectedBaseWarmupSegments :
+                "base load should warm while rollup exists, expected >= ${expectedBaseWarmupSegments}, got ${baseLoadFinishedDelta}"
+
+            def baseJobStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, baseJobId, { stats ->
+                stats.seg_num.requested_5m >= expectedBaseWarmupSegments
+                    && stats.seg_num.finish_5m >= expectedBaseWarmupSegments
+            })
+            assert baseJobStats.seg_num.requested_5m >= expectedBaseWarmupSegments :
+                "base job should warm base table with rollup present without matching rollup as a table, stats=${baseJobStats}"
+
+            def beforeMvRefresh = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, srcCluster, dstCluster) // stable snapshot used as the baseline for the refresh
+            sql """REFRESH MATERIALIZED VIEW ${asyncMv} COMPLETE"""
+            waitingMTMVTaskFinishedByMvName(asyncMv, mvDb)
+            def afterMvRefresh = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster,
+                    beforeMvRefresh.finished + 1)
+            assert afterMvRefresh.finished > beforeMvRefresh.finished :
+                "async MV refresh should trigger event-driven warmup after mv_* job is created"
+
+            def mvJobStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, mvJobId, { stats ->
+                stats.seg_num.requested_5m >= 1 && stats.seg_num.finish_5m >= 1
+            })
+            assert mvJobStats.seg_num.requested_5m >= 1 :
+                "mv_* job should independently warm async MV rowsets, stats=${mvJobStats}"
+            // From here on the queries run on the target cluster to verify rewrite, data, and cache locality.
+            sql """use @${dstCluster}"""
+            sql """use ${mvDb}"""
+            def asyncMvRewriteQuerySql =
+                    "SELECT k, sum(v) AS total_v FROM ${baseTable} GROUP BY k ORDER BY k"
+            mv_rewrite_success(asyncMvRewriteQuerySql, asyncMv, true)
+            profile("ft12_async_mv_target_profile") {
+                sql """set enable_profile = true"""
+                sql """set profile_level = 2"""
+                run {
+                    def res = sql """/* ft12_async_mv_target_profile */ ${asyncMvRewriteQuerySql}"""
+                    assert res.collect { [it[0].toString(), it[1].toString()] } ==
+                            [["1", "11"], ["2", "22"], ["3", "3"]] : // sums of the loaded rows: 10+1, 20+2, 3
+                        "target aggregate query should be rewritten to async MV and return MV data, got ${res}"
+                    sleep(1000)
+                }
+                check { profileString, exception ->
+                    if (exception != null) {
+                        throw exception
+                    }
+                    assert profileString.contains("NumRemoteIOTotal") :
+                        "profile should contain file cache counters"
+                    def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString,
+                            "NumRemoteIOTotal")
+                    def localTotal = WarmupMetricsUtils.sumProfileCounter(profileString,
+                            "NumLocalIOTotal")
+                    logger.info("async MV target profile NumRemoteIOTotal=${remoteTotal}, " +
+                            "NumLocalIOTotal=${localTotal}")
+                    assert remoteTotal == 0 :
+                        "rewritten async MV query should not read remote data after warmup"
+                    assert localTotal > 0 :
+                        "rewritten async MV query should hit local file cache after warmup"
+                }
+            }
+
+            def rollupQuery = sql """SELECT k, sum(v) FROM ${baseTable} GROUP BY k ORDER BY k"""
+            assert rollupQuery.collect { [it[0].toString(), it[1].toString()] } ==
+                    [["1", "11"], ["2", "22"], ["3", "3"]] :
+                "target cluster should read base table with rollup data correctly, got ${rollupQuery}"
+            def asyncMvQuery = sql """SELECT k, total_v FROM ${asyncMv} ORDER BY k"""
+            assert asyncMvQuery.collect { [it[0].toString(), it[1].toString()] } ==
+                    [["1", "11"], ["2", "22"], ["3", "3"]] :
+                "target cluster should read async MV correctly, got ${asyncMvQuery}"
+
+        } finally { // best-effort cleanup: every step swallows its own exception so later steps still run
+            for (jid in jobIds) {
+                try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {}
+            }
+            try {
+                sql """use ${overlapDb}"""
+                sql """DROP TABLE IF EXISTS orders"""
+                sql """DROP TABLE IF EXISTS customers"""
+                sql """DROP TABLE IF EXISTS audit_log"""
+            } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${overlapDb}""" } catch (Exception ignored) {}
+            try {
+                sql """use @${srcCluster}"""
+                sql """use ${mvDb}"""
+                sql """DROP MATERIALIZED VIEW IF EXISTS mv_async_summary"""
+                sql """DROP MATERIALIZED VIEW IF EXISTS rollup_sum ON fact_rollup"""
+                sql """DROP TABLE IF EXISTS fact_rollup"""
+            } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${mvDb}""" } catch (Exception ignored) {}
+        }
+    }
+}
diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_show_and_cancel.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_show_and_cancel.groovy
new file mode 100644
index 00000000000000..1287e57999f8af
--- /dev/null
+++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_show_and_cancel.groovy
@@ -0,0 +1,384 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import groovy.json.JsonSlurper +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +suite('test_warm_up_event_on_tables_show_and_cancel', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + /* sqlRunner: wraps this suite's sql() in a closure so it can be handed to the WarmupMetricsUtils helpers used below. */ Closure sqlRunner = { String q -> sql(q) } + + def srcCluster = "warmup_source" + def dstCluster = "warmup_target" + + cluster.addBackend(1, srcCluster) + cluster.addBackend(1, dstCluster) + + def showDb = "test_on_tables_show_extra_db" + def cancelDb = "test_on_tables_cancel_extra_db" + def jobIds = [] + def slurper = new JsonSlurper() + + /* getJobRow: runs SHOW WARM UP JOB WHERE ID = jobId, asserts that exactly one row with 16 columns comes back, and returns that row. */ def getJobRow = { jobId -> + def rows = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert rows.size() == 1 : "expected one row for job ${jobId}, got ${rows}" + assert rows[0].size() == 16 : "SHOW WARM UP JOB should expose 16 columns, got ${rows[0].size()}" + return rows[0] + } + + /* waitForJobState: polls getJobRow about once per second until column 3 of the row (compared as a string) is contained in expectedStates, or until timeoutMs (default 60000) has elapsed. Returns the matching row. On timeout it does NOT fail: it returns the last row fetched (null if the loop never ran), so callers must assert on the state themselves if they depend on it. */ def waitForJobState = { jobId, Set expectedStates, long timeoutMs = 60000 -> + long deadline = System.currentTimeMillis() + timeoutMs + def row = null + while (System.currentTimeMillis() < deadline) { + row = getJobRow(jobId) + if (expectedStates.contains(row[3].toString())) { + return row + } + 
sleep(1000) + } + /* timed out: hand back the last observed row instead of failing */ return row + } + + /* assertEmptyNewColumns: asserts that columns 13, 14 and 15 of the row (named TableFilter, MatchedTables and SyncStats in the failure messages) stringify to the empty string; a null column also fails the assertion because null is not equal to "". */ def assertEmptyNewColumns = { row, String jobDesc -> + assert row[13]?.toString() == "" : "${jobDesc} should have empty TableFilter, row=${row}" + assert row[14]?.toString() == "" : "${jobDesc} should have empty MatchedTables, row=${row}" + assert row[15]?.toString() == "" : "${jobDesc} should have empty SyncStats, row=${row}" + } + + /* assertDetailedSyncStats: parses the row with WarmupMetricsUtils.parseSyncStats and asserts the result is non-empty, contains the detailed keys (seg_num, seg_size, idx_num, idx_size, last_trigger_ts, last_finish_ts) and contains none of the compact-summary keys (window, src_size, dst_size, gap_size). Returns the parsed stats. */ def assertDetailedSyncStats = { row, String jobDesc -> + def stats = WarmupMetricsUtils.parseSyncStats([row]) + assert !stats.isEmpty() : "${jobDesc} should have detailed SyncStats, row=${row}" + assert stats.containsKey("seg_num") : "${jobDesc} detailed SyncStats should contain seg_num: ${stats}" + assert stats.containsKey("seg_size") : "${jobDesc} detailed SyncStats should contain seg_size: ${stats}" + assert stats.containsKey("idx_num") : "${jobDesc} detailed SyncStats should contain idx_num: ${stats}" + assert stats.containsKey("idx_size") : "${jobDesc} detailed SyncStats should contain idx_size: ${stats}" + assert stats.containsKey("last_trigger_ts") : + "${jobDesc} detailed SyncStats should contain last_trigger_ts: ${stats}" + assert stats.containsKey("last_finish_ts") : + "${jobDesc} detailed SyncStats should contain last_finish_ts: ${stats}" + assert !stats.containsKey("window") : "${jobDesc} WHERE ID output should not be compact summary: ${stats}" + assert !stats.containsKey("src_size") : "${jobDesc} WHERE ID output should not be compact summary: ${stats}" + assert !stats.containsKey("dst_size") : "${jobDesc} WHERE ID output should not be compact summary: ${stats}" + assert !stats.containsKey("gap_size") : "${jobDesc} WHERE ID output should not be compact summary: ${stats}" + return stats + } + + /* assertSummarySyncStats: takes column 15 of the row, requires it to be non-blank, parses it as JSON with the shared slurper and asserts it has the compact-summary shape: window equal to "30m", string-valued src_size, dst_size and gap_size, and none of the detailed keys. Returns the parsed summary. */ def assertSummarySyncStats = { row, String jobDesc -> + def raw = row[15]?.toString()?.trim() + assert raw != null && raw.length() > 0 : "${jobDesc} should have compact SyncStats summary, row=${row}" + def stats = slurper.parseText(raw) + assert stats.window == "30m" : "${jobDesc} list output 
should use 30m summary, row=${row}" + assert stats.src_size instanceof String : "${jobDesc} summary src_size should be a string: ${stats}" + assert stats.dst_size instanceof String : "${jobDesc} summary dst_size should be a string: ${stats}" + assert stats.gap_size instanceof String : "${jobDesc} summary gap_size should be a string: ${stats}" + assert !stats.containsKey("seg_num") : "${jobDesc} list output should not include detailed seg_num" + assert !stats.containsKey("seg_size") : "${jobDesc} list output should not include detailed seg_size" + assert !stats.containsKey("idx_num") : "${jobDesc} list output should not include detailed idx_num" + assert !stats.containsKey("idx_size") : "${jobDesc} list output should not include detailed idx_size" + assert !stats.containsKey("last_trigger_ts") : + "${jobDesc} list output should not include detailed last_trigger_ts" + assert !stats.containsKey("last_finish_ts") : + "${jobDesc} list output should not include detailed last_finish_ts" + /* returned so callers can run further key checks on the parsed summary */ return stats + } + + try { + // FT-05: SHOW WARM UP JOB mixes new ON TABLES jobs with old once/periodic/table jobs. 
+ sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${showDb}""" + sql """use ${showDb}""" + sql """CREATE TABLE IF NOT EXISTS show_base (id INT, val STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """CREATE TABLE IF NOT EXISTS show_extra (id INT, val STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + sql """INSERT INTO show_base VALUES (0, 'seed')""" + + def oldTableJobId = sql("""WARM UP CLUSTER ${dstCluster} WITH TABLE show_base""")[0][0] + jobIds << oldTableJobId + def oldTableRow = waitForJobState(oldTableJobId, ["FINISHED", "RUNNING", "PENDING"] as Set) + assert oldTableRow[4] == "TABLE" : "old WITH TABLE job should be TABLE, row=${oldTableRow}" + assert oldTableRow[5].toString().startsWith("ONCE") : "old WITH TABLE job should be ONCE, row=${oldTableRow}" + assert oldTableRow[12].toString().contains("${showDb}.show_base".toString()) : + "old WITH TABLE job should show warmed table, row=${oldTableRow}" + assertEmptyNewColumns(oldTableRow, "old WITH TABLE job") + + def periodicJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + PROPERTIES ( + "sync_mode" = "periodic", + "sync_interval_sec" = "10" + ) + """)[0][0] + jobIds << periodicJobId + + def clusterJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << clusterJobId + + sleep(2000) + sql """use ${showDb}""" + sql """INSERT INTO show_base VALUES (1, 'cluster_base')""" + sql """INSERT INTO show_extra VALUES (1, 'cluster_extra')""" + + def clusterStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, clusterJobId, { stats -> + stats.seg_num.requested_5m > 0 && stats.seg_num.finish_5m > 0 + }, 8000) + assert !clusterStats.isEmpty() : "cluster-level event job should expose SyncStats" + assert 
clusterStats.seg_num.requested_5m > 0 : + "cluster event job SyncStats should observe load requests: ${clusterStats}" + assert clusterStats.seg_num.finish_5m > 0 : + "cluster event job SyncStats should observe target finishes: ${clusterStats}" + def runningClusterRow = getJobRow(clusterJobId) + def runningClusterStats = assertDetailedSyncStats(runningClusterRow, "cluster event job") + assert runningClusterStats.seg_num.requested_5m > 0 : + "cluster event job detailed SyncStats should observe load requests: ${runningClusterStats}" + assert runningClusterStats.seg_num.finish_5m > 0 : + "cluster event job detailed SyncStats should observe target finishes: ${runningClusterStats}" + def runningListRows = sql """SHOW WARM UP JOB""" + def runningClusterSummaryRow = runningListRows.find { it[0].toString() == clusterJobId.toString() } + def runningClusterSummary = assertSummarySyncStats(runningClusterSummaryRow, "cluster event job") + assert !runningClusterSummary.containsKey("data_size") : + "cluster event job summary should merge data and index sizes: ${runningClusterSummary}" + assert !runningClusterSummary.containsKey("index_size") : + "cluster event job summary should merge data and index sizes: ${runningClusterSummary}" + + try { + sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${showDb}.show_base' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """) + assert false : "Expected ON TABLES load-event job to conflict with existing cluster-level job" + } catch (java.sql.SQLException e) { + logger.info("Expected cross-level conflict: ${e.getMessage()}") + assert e.getMessage().contains("Cannot create table-level load-event warm up job") : e.getMessage() + assert e.getMessage().contains("conflicting cluster-level load-event warm up job ${clusterJobId}") : + e.getMessage() + assert e.getMessage().contains("Cancel existing load-event warm up job ${clusterJobId}") : + e.getMessage() + } + + sql 
"""CANCEL WARM UP JOB WHERE ID = ${clusterJobId}""" + def cancelledClusterRow = getJobRow(clusterJobId) + assert cancelledClusterRow[3] == "CANCELLED" : "cluster event job should be cancelled, row=${cancelledClusterRow}" + + def tableJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${showDb}.show_base' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << tableJobId + + WarmupMetricsUtils.waitForMatchedTables(sqlRunner, tableJobId, + ["${showDb}.show_base".toString()] as Set, + ["${showDb}.show_extra".toString()] as Set) + + sleep(2000) + sql """use ${showDb}""" + sql """INSERT INTO show_base VALUES (2, 'table_base')""" + sql """INSERT INTO show_extra VALUES (2, 'table_extra')""" + + def tableStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, tableJobId, { stats -> + stats.seg_num.requested_5m > 0 && stats.seg_num.finish_5m > 0 + }, 8000) + assert !tableStats.isEmpty() : "table-level event job should expose SyncStats" + assert tableStats.seg_num.requested_5m > 0 : + "table-level event job SyncStats should observe load requests: ${tableStats}" + assert tableStats.seg_num.finish_5m > 0 : + "table-level event job SyncStats should observe target finishes: ${tableStats}" + + def periodicRow = getJobRow(periodicJobId) + assert periodicRow[4] == "CLUSTER" : "periodic job type should be CLUSTER, row=${periodicRow}" + assert periodicRow[5] == "PERIODIC (10s)" : "periodic job sync mode mismatch, row=${periodicRow}" + assertEmptyNewColumns(periodicRow, "periodic cluster job") + + def clusterRow = getJobRow(clusterJobId) + assert clusterRow[3] == "CANCELLED" : "cluster event job should remain visible after cancel, row=${clusterRow}" + assert clusterRow[4] == "CLUSTER" : "cluster event job type should be CLUSTER, row=${clusterRow}" + assert clusterRow[5] == "EVENT_DRIVEN (LOAD)" : "cluster event sync mode mismatch, row=${clusterRow}" + assert clusterRow[13] == "" : 
"cluster event job should not have TableFilter, row=${clusterRow}" + assert clusterRow[14] == "" : "cluster event job should not have MatchedTables, row=${clusterRow}" + + def tableRow = getJobRow(tableJobId) + assert tableRow[4] == "TABLES" : "ON TABLES job type should be TABLES, row=${tableRow}" + assert tableRow[5] == "EVENT_DRIVEN (LOAD)" : "ON TABLES sync mode mismatch, row=${tableRow}" + def tableFilter = slurper.parseText(tableRow[13].toString()) + assert tableFilter.include == ["${showDb}.show_base".toString()] : + "table filter should show the canonical include rule, row=${tableRow}" + def matched = WarmupMetricsUtils.parseMatchedTables([tableRow]) + assert matched == ["${showDb}.show_base".toString()] as Set : + "MatchedTables should contain only show_base, got ${matched}" + def detailedTableStats = assertDetailedSyncStats(tableRow, "ON TABLES job") + assert detailedTableStats.seg_num.requested_5m > 0 : + "ON TABLES detailed SyncStats should observe load requests: ${detailedTableStats}" + assert detailedTableStats.seg_num.finish_5m > 0 : + "ON TABLES detailed SyncStats should observe target finishes: ${detailedTableStats}" + + def listRows = sql """SHOW WARM UP JOB""" + for (jobId in [oldTableJobId, periodicJobId, clusterJobId, tableJobId]) { + def row = listRows.find { it[0].toString() == jobId.toString() } + assert row != null : "SHOW WARM UP JOB should include job ${jobId}, rows=${listRows}" + assert row.size() == 16 : "SHOW WARM UP JOB list row should expose 16 columns, row=${row}" + } + def tableSummaryRow = listRows.find { it[0].toString() == tableJobId.toString() } + def tableSummary = assertSummarySyncStats(tableSummaryRow, "ON TABLES job") + assert !tableSummary.containsKey("data_size") : + "ON TABLES job summary should merge data and index sizes: ${tableSummary}" + assert !tableSummary.containsKey("index_size") : + "ON TABLES job summary should merge data and index sizes: ${tableSummary}" + + for (jid in [periodicJobId, clusterJobId, 
tableJobId]) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + + // FT-11: cancel keeps existing cache but removes the job from subsequent load triggers. + sql """use @${srcCluster}""" + sql """CREATE DATABASE IF NOT EXISTS ${cancelDb}""" + sql """use ${cancelDb}""" + sql """CREATE TABLE IF NOT EXISTS cancel_base (id INT, val STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + def cancelJobId = sql(""" + WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster} + ON TABLES ( + INCLUDE '${cancelDb}.cancel_base' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """)[0][0] + jobIds << cancelJobId + WarmupMetricsUtils.waitForMatchedTables(sqlRunner, cancelJobId, + ["${cancelDb}.cancel_base".toString()] as Set) + sleep(3000) + + def firstBaseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster) + int firstLoadRows = 4 + for (int i = 0; i < firstLoadRows; i++) { + sql """INSERT INTO cancel_base VALUES (${i}, 'before_cancel_${i}')""" + } + WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster, + firstBaseMetrics.finished + firstLoadRows) + def stableBeforeCancel = WarmupMetricsUtils.waitForMetricsStable(sqlRunner, srcCluster, dstCluster) + def targetCacheBeforeCancel = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, "ttl_cache_size") + assert targetCacheBeforeCancel > 0 : + "target cache should be populated before cancel, size=${targetCacheBeforeCancel}" + + sql """CANCEL WARM UP JOB WHERE ID = ${cancelJobId}""" + def cancelledRow = getJobRow(cancelJobId) + assert cancelledRow[3] == "CANCELLED" : "job should be CANCELLED, row=${cancelledRow}" + def targetCacheAfterCancel = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, "ttl_cache_size") + assert targetCacheAfterCancel >= targetCacheBeforeCancel : + "cancel should not evict existing cache, 
before=${targetCacheBeforeCancel}, after=${targetCacheAfterCancel}" + + int afterCancelRows = 3 + for (int i = 0; i < afterCancelRows; i++) { + sql """INSERT INTO cancel_base VALUES (${i + 100}, 'after_cancel_${i}')""" + } + sleep(8000) + def afterCancelledLoad = WarmupMetricsUtils.logWarmupMetrics(sqlRunner, srcCluster, dstCluster) + assert afterCancelledLoad.requested == stableBeforeCancel.requested : + "cancelled job should not request new segments after load" + assert afterCancelledLoad.submitted == stableBeforeCancel.submitted : + "cancelled job should not submit new segments after load" + assert afterCancelledLoad.finished == stableBeforeCancel.finished : + "cancelled job should not finish new segments after load" + def targetCacheAfterCancelledLoad = + WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster, "ttl_cache_size") + assert targetCacheAfterCancelledLoad >= targetCacheBeforeCancel : + "existing cache should remain after post-cancel load" + + sql """use @${dstCluster}""" + sql """use ${cancelDb}""" + profile("ft11_cancel_target_profile") { + sql """set enable_profile = true""" + sql """set profile_level = 2""" + run { + def expectedRows = firstLoadRows + afterCancelRows + def expectedSum = (0.. 
+ if (exception != null) { + throw exception + } + assert profileString.contains("NumRemoteIOTotal") : "profile should contain file cache counters" + def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumRemoteIOTotal") + def localTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumLocalIOTotal") + logger.info("cancel target profile NumRemoteIOTotal=${remoteTotal}, NumLocalIOTotal=${localTotal}") + assert remoteTotal > 0 : + "post-cancel target query should read remote data for segments loaded after cancel" + assert localTotal > 0 : "post-cancel target query should still hit existing warmed cache" + } + } + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${showDb}""" + sql """DROP TABLE IF EXISTS show_base""" + sql """DROP TABLE IF EXISTS show_extra""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${showDb}""" } catch (Exception ignored) {} + try { + sql """use ${cancelDb}""" + sql """DROP TABLE IF EXISTS cancel_base""" + } catch (Exception ignored) {} + try { sql """DROP DATABASE IF EXISTS ${cancelDb}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy new file mode 100644 index 00000000000000..73b4577a45472e --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_sync_stats.groovy @@ -0,0 +1,300 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils +import groovy.json.JsonSlurper + +suite('test_warm_up_event_on_tables_sync_stats', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_warm_up_table_filter_refresh_interval_ms=1000', + 'cloud_warm_up_sync_stats_refresh_interval_ms=1000', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + ] + options.cloudMode = true + options.beNum = 1 + + docker(options) { + /* sqlRunner: wraps this suite's sql() in a closure so it can be handed to the WarmupMetricsUtils helpers used below. */ Closure sqlRunner = { String q -> sql(q) } + /* fetchFeMetrics: looks up the current master FE and returns what WarmupMetricsUtils.getPrometheusMetrics yields for its host and HTTP port; the result is later split with readLines(), so it is treated as text. */ Closure fetchFeMetrics = { -> + def masterFe = cluster.getMasterFe() + WarmupMetricsUtils.getPrometheusMetrics(masterFe.host, masterFe.httpPort) + } + /* waitForWarmUpSyncJobMetrics: polls the FE metrics about once per second for up to 30 seconds. It returns as soon as doris_fe_file_cache_warm_up_sync_job_info (looked up with this job's labels plus sync_mode EVENT_DRIVEN, sync_event LOAD, job_state RUNNING) equals 1 and doris_fe_file_cache_warm_up_sync_job_size_bytes is present and greater than 0 for every side (src, dst) and window (5m, 30m, 1h) combination. If the deadline passes it fails the test and prints the last metric lines that mention this job. */ Closure waitForWarmUpSyncJobMetrics = { + Object jobId, String jobType, String srcClusterName, String dstClusterName -> + /* labels shared by the info metric and every size metric */ def commonLabels = [ + job_id: jobId.toString(), + job_type: jobType, + src_cluster_name: srcClusterName, + dst_cluster_name: dstClusterName + ] + String lastDebug = "" + long deadline = System.currentTimeMillis() + 30000 + while (System.currentTimeMillis() < deadline) { + def metricsText = fetchFeMetrics() + def infoLabels = [ + job_id: jobId.toString(), + job_type: jobType, + sync_mode: 
"EVENT_DRIVEN", + sync_event: "LOAD", + job_state: "RUNNING", + src_cluster_name: srcClusterName, + dst_cluster_name: dstClusterName + ] + def info = WarmupMetricsUtils.findPrometheusMetricValue(metricsText, + "doris_fe_file_cache_warm_up_sync_job_info", infoLabels) + /* sizeMetrics is keyed "side_window" (for example src_5m) and is only used for the success log line */ def sizeMetrics = [:] + boolean allSizeMetricsPositive = true + for (window in ["5m", "30m", "1h"]) { + for (side in ["src", "dst"]) { + def key = "${side}_${window}".toString() + sizeMetrics[key] = WarmupMetricsUtils.findPrometheusMetricValue(metricsText, + "doris_fe_file_cache_warm_up_sync_job_size_bytes", + commonLabels + [side: side, window: window]) + /* a missing or non-positive series means the FE has not reported sizes for this job yet */ if (sizeMetrics[key] == null || sizeMetrics[key] <= 0) { + allSizeMetricsPositive = false + } + } + } + + /* 1G is a Groovy BigInteger literal; the info metric must be exactly 1 */ if (info == 1G && allSizeMetricsPositive) { + logger.info("FE warm-up sync metrics for job ${jobId}: ${sizeMetrics}") + return + } + /* remember only the lines about this job's sync metrics for the timeout message */ lastDebug = metricsText.readLines() + .findAll { it.contains("file_cache_warm_up_sync_job") && it.contains("job_id=\"${jobId}\"") } + .join("\n") + sleep(1000) + } + assert false : "Timed out waiting FE warm-up sync metrics for ${jobType} job ${jobId}. 
" + + "Last matching metrics:\n${lastDebug}" + } + + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + sql """use @${clusterName1}""" + + def dbName = "test_sync_stats_db" + def jobIds = [] + + try { + sql """CREATE DATABASE IF NOT EXISTS ${dbName}""" + sql """use ${dbName}""" + sql """CREATE TABLE IF NOT EXISTS t1 (id INT, val STRING) + DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600")""" + + // Create event-driven warmup job + sql """use @${clusterName1}""" + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + ON TABLES ( + INCLUDE '${dbName}.*' + ) + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def jobId = jobId_[0][0] + jobIds << jobId + logger.info("Warm-up job ID: ${jobId}") + + /* NOTE(review): presumably gives the new job time to become active before the baseline is read; confirm against the FE refresh intervals configured above */ sleep(3000) + + // Capture baseline BEFORE inserts so we know the target + def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + logger.info("Baseline metrics: ${baseMetrics}") + + // Insert data to trigger warmup + def numInserts = 5 + sql """use ${dbName}""" + for (int i = 0; i < numInserts; i++) { + sql """INSERT INTO t1 VALUES (${i}, 'value_${i}')""" + } + + // Wait for warmup to finish using bvar metrics + def metrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + baseMetrics.finished + numInserts) + logger.info("Warmup metrics after finish: ${metrics}") + + // Compute bvar deltas (source submitted, target finished) + def submittedDelta = metrics.submitted - baseMetrics.submitted + def finishedDelta = metrics.finished - baseMetrics.finished + logger.info("Bvar deltas: submitted=${submittedDelta}, finished=${finishedDelta}") + + // Poll SHOW WARM UP JOB until windowed metrics catch up with bvar values + // (bvar::Window samples every ~1s; values need time to accumulate) + def syncStats 
= null + def syncStatsStr = "" + long deadline = System.currentTimeMillis() + 30000 + while (System.currentTimeMillis() < deadline) { + def jobInfo = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + assert jobInfo.size() > 0 : "SHOW WARM UP JOB returned no rows" + syncStatsStr = jobInfo[0][15]?.toString()?.trim() + if (syncStatsStr != null && syncStatsStr.length() > 0) { + syncStats = new JsonSlurper().parseText(syncStatsStr) + if (syncStats.seg_num.requested_5m == submittedDelta + && syncStats.seg_num.finish_5m == finishedDelta + && syncStats.seg_num.gap_5m == 0 + && syncStats.seg_num.fail_5m == 0 + && syncStats.trigger_gap_ms == 0) { + break + } + } + sleep(2000) + } + logger.info("SyncStats column: ${syncStatsStr}") + assert syncStats != null : "SyncStats should not be empty for event-driven job" + + // Verify top-level keys + assert syncStats.containsKey("seg_num") : "Missing seg_num" + assert syncStats.containsKey("seg_size") : "Missing seg_size" + assert syncStats.containsKey("idx_num") : "Missing idx_num" + assert syncStats.containsKey("idx_size") : "Missing idx_size" + assert syncStats.containsKey("last_trigger_ts") : "Missing last_trigger_ts" + assert syncStats.containsKey("last_finish_ts") : "Missing last_finish_ts" + assert syncStats.containsKey("trigger_gap_ms") : "Missing trigger_gap_ms" + assert !syncStats.containsKey("window") : "Detailed SyncStats should not be compact summary" + assert !syncStats.containsKey("src_size") : "Detailed SyncStats should not be compact summary" + assert !syncStats.containsKey("dst_size") : "Detailed SyncStats should not be compact summary" + assert !syncStats.containsKey("gap_size") : "Detailed SyncStats should not be compact summary" + + // Verify detailed stats have the expected window keys. 
+ /* assertWindowFields: asserts that group contains a key named field_window for every field in requested, finish, gap, fail and every window in 5m, 30m, 1h (twelve keys in total); groupName is used only to build the failure message. */ def assertWindowFields = { groupName, group -> + for (window in ["5m", "30m", "1h"]) { + for (field in ["requested", "finish", "gap", "fail"]) { + def key = "${field}_${window}".toString() + assert group.containsKey(key) : "Missing ${groupName}.${key}" + } + } + } + def segNum = syncStats.seg_num + assertWindowFields("seg_num", segNum) + assertWindowFields("seg_size", syncStats.seg_size) + assertWindowFields("idx_num", syncStats.idx_num) + assertWindowFields("idx_size", syncStats.idx_size) + + // Verify absolute segment counts match bvar deltas + logger.info("seg_num.requested_5m=${segNum.requested_5m}, bvar submitted delta=${submittedDelta}") + logger.info("seg_num.finish_5m=${segNum.finish_5m}, bvar finished delta=${finishedDelta}") + assert segNum.requested_5m == submittedDelta : + "seg_num.requested_5m(${segNum.requested_5m}) should equal source submitted delta(${submittedDelta})" + assert segNum.finish_5m == finishedDelta : + "seg_num.finish_5m(${segNum.finish_5m}) should equal target finished delta(${finishedDelta})" + + // Verify gap is 0 after warmup completes (all requested segments finished) + assert segNum.gap_5m == 0 : "Expected gap_5m == 0 after warmup completes, got ${segNum.gap_5m}" + assert syncStats.trigger_gap_ms == 0 : + "Expected trigger_gap_ms == 0 after warmup completes, got ${syncStats.trigger_gap_ms}, stats=${syncStats}" + + // Verify fail count is 0 + assert segNum.fail_5m == 0 : "Expected no failures, got fail_5m=${segNum.fail_5m}" + + // Verify seg_size values are human-readable strings + def segSize = syncStats.seg_size + logger.info("seg_size.requested_5m = ${segSize.requested_5m}") + assert segSize.requested_5m instanceof String : "seg_size values should be strings" + assert syncStats.idx_size.requested_5m instanceof String : "idx_size values should be strings" + assert syncStats.idx_num.requested_5m instanceof Number : "idx_num values should be numbers" + + // Verify timestamps are non-empty (warmup has occurred) + 
logger.info("last_trigger_ts = ${syncStats.last_trigger_ts}, last_finish_ts = ${syncStats.last_finish_ts}") + assert syncStats.last_trigger_ts != null && syncStats.last_trigger_ts.toString().length() > 0 : + "last_trigger_ts should be non-empty after warmup" + assert syncStats.last_finish_ts != null && syncStats.last_finish_ts.toString().length() > 0 : + "last_finish_ts should be non-empty after warmup" + + // SHOW WARM UP JOB list output should show a compact 30m summary, not the detailed SyncStats. + def allJobInfo = sql """SHOW WARM UP JOB""" + def summaryRow = allJobInfo.find { row -> row[0]?.toString() == jobId.toString() } + assert summaryRow != null : "SHOW WARM UP JOB should include job ${jobId}" + def summaryStatsStr = summaryRow[15]?.toString()?.trim() + logger.info("SyncStats summary column: ${summaryStatsStr}") + assert summaryStatsStr != null && summaryStatsStr.length() > 0 : + "SyncStats summary should not be empty for event-driven job" + def summaryStats = new JsonSlurper().parseText(summaryStatsStr) + assert summaryStats.window == "30m" : "Summary should use 30m window" + assert summaryStats.src_size instanceof String : "Summary src_size should be a string" + assert summaryStats.dst_size instanceof String : "Summary dst_size should be a string" + assert summaryStats.gap_size instanceof String : "Summary gap_size should be a string" + assert summaryStats.trigger_gap_ms == 0 : + "Summary trigger_gap_ms should be 0 after warmup completes, got ${summaryStats.trigger_gap_ms}" + assert !summaryStats.containsKey("seg_num") : "List summary should not include detailed seg_num" + assert !summaryStats.containsKey("seg_size") : "List summary should not include detailed seg_size" + assert !summaryStats.containsKey("idx_num") : "List summary should not include detailed idx_num" + assert !summaryStats.containsKey("idx_size") : "List summary should not include detailed idx_size" + assert !summaryStats.containsKey("last_trigger_ts") : "List summary should not 
include detailed timestamp" + assert !summaryStats.containsKey("last_finish_ts") : "List summary should not include detailed timestamp" + assert !summaryStats.containsKey("data_size") : "List summary should merge data and index sizes" + assert !summaryStats.containsKey("index_size") : "List summary should merge data and index sizes" + + waitForWarmUpSyncJobMetrics(jobId, "TABLES", clusterName1, clusterName2) + + sql """CANCEL WARM UP JOB WHERE ID = ${jobId}""" + sleep(1000) + + def clusterJobIdRows = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + def clusterJobId = clusterJobIdRows[0][0] + jobIds << clusterJobId + logger.info("Cluster-level warm-up job ID: ${clusterJobId}") + sleep(3000) + + def clusterBaseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, clusterName1, clusterName2) + logger.info("Cluster job baseline metrics: ${clusterBaseMetrics}") + + for (int i = numInserts; i < numInserts * 2; i++) { + sql """INSERT INTO t1 VALUES (${i}, 'value_${i}')""" + } + + def clusterMetrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, clusterName1, clusterName2, + clusterBaseMetrics.finished + numInserts) + logger.info("Cluster job warmup metrics after finish: ${clusterMetrics}") + def clusterSubmittedDelta = clusterMetrics.submitted - clusterBaseMetrics.submitted + def clusterFinishedDelta = clusterMetrics.finished - clusterBaseMetrics.finished + assert clusterSubmittedDelta > 0 : "Cluster-level job should submit source warm-up requests" + assert clusterFinishedDelta > 0 : "Cluster-level job should finish target warm-up requests" + waitForWarmUpSyncJobMetrics(clusterJobId, "CLUSTER", clusterName1, clusterName2) + + } finally { + for (jid in jobIds) { + try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {} + } + try { + sql """use ${dbName}""" + sql """DROP TABLE IF EXISTS t1""" + } catch (Exception ignored) {} + try { 
sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {} + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_cluster_change.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_cluster_change.groovy new file mode 100644 index 00000000000000..9cbeea1a99cb89 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_cluster_change.groovy @@ -0,0 +1,168 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import groovy.json.JsonSlurper +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test point covered: ST-12. 
+// Docker suite: checks how an event-driven ON TABLES warm-up job reacts to compute-group changes.
+// Altering compute-group properties must leave the job RUNNING or PENDING; renaming or dropping
+// either the source or the target group must end with the job CANCELLED and a row whose column 11
+// contains "system cancel".
+suite('test_warm_up_event_on_tables_system_cluster_change', 'docker') {
+    def options = new ClusterOptions()
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_warm_up_table_filter_refresh_interval_ms=1000',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+    ]
+    options.cloudMode = true
+    options.beNum = 1
+
+    docker(options) {
+        // Adapter handed to WarmupMetricsUtils so the helper can run SQL through this suite.
+        Closure sqlRunner = { String q -> sql(q) }
+
+        def srcCluster = "warmup_st12_source"
+        def dstCluster = "warmup_st12_target"
+        def dstCluster2 = "warmup_st12_target2"
+        def dstClusterRenamed = "warmup_st12_target_renamed"
+        def srcClusterRenamed = "warmup_st12_source_renamed"
+        def dbName = "test_on_tables_system_cluster_change_db"
+        def tableName = "base_tbl"
+        // Every job created below is recorded here so the finally block can cancel it.
+        def jobIds = []
+        def jsonSlurper = new JsonSlurper()
+        def metaService = cluster.getAllMetaservices().get(0)
+
+        // Polls SHOW CLUSTERS once per second (up to 60 tries) until the presence of clusterName
+        // (matched against column 0) equals expectedPresent; fails the test otherwise.
+        def waitForCluster = { String clusterName, boolean expectedPresent ->
+            List clusters = []
+            for (int i = 0; i < 60; i++) {
+                clusters = sql """SHOW CLUSTERS"""
+                boolean present = clusters.any { it[0].toString() == clusterName }
+                if (present == expectedPresent) {
+                    return
+                }
+                sleep(1000)
+            }
+            assert false : "cluster ${clusterName} present=${!expectedPresent} did not become ${expectedPresent}, clusters=${clusters}"
+        }
+
+        // Returns the compute_group_id field of the JSON tag reported for clusterName.
+        def getClusterId = { String clusterName ->
+            def tag = getCloudBeTagByName(clusterName)
+            return jsonSlurper.parseText(tag).compute_group_id.toString()
+        }
+
+        // Switches to clusterName and creates the database and table if they do not exist yet.
+        def prepareSourceTable = { String clusterName ->
+            sql """use @${clusterName}"""
+            sql """CREATE DATABASE IF NOT EXISTS ${dbName}"""
+            sql """use ${dbName}"""
+            sql """CREATE TABLE IF NOT EXISTS ${tableName} (
+                    id INT,
+                    val STRING
+                )
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+        }
+
+        // Creates an event-driven (load) ON TABLES warm-up job from source to target, waits until
+        // the job's matched tables equal exactly {dbName.tableName}, and returns the job id.
+        def createTableWarmupJob = { String source, String target ->
+            prepareSourceTable(source)
+            def jobId = sql("""
+                WARM UP CLUSTER ${target} WITH CLUSTER ${source}
+                ON TABLES (INCLUDE '${dbName}.${tableName}')
+                PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load")
+            """)[0][0]
+            jobIds << jobId
+            assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId,
+                    ["${dbName}.${tableName}".toString()] as Set) ==
+                    ["${dbName}.${tableName}".toString()] as Set
+            return jobId
+        }
+
+        // Returns the single SHOW WARM UP JOB row for jobId; fails if the job is not listed.
+        def showJob = { jobId ->
+            def rows = sql """SHOW WARM UP JOB WHERE ID = ${jobId}"""
+            assert !rows.isEmpty() : "warmup job ${jobId} should exist"
+            return rows[0]
+        }
+
+        // Polls the job row once per second (up to 60 tries) until column 3 is "CANCELLED" and
+        // column 11 contains "system cancel" (case-insensitive); `phase` only labels the failure.
+        // NOTE(review): column 11 is presumably the error-message column; confirm against the
+        // SHOW WARM UP JOB schema.
+        def waitForSystemCancelled = { jobId, String phase ->
+            def row = null
+            for (int i = 0; i < 60; i++) {
+                row = showJob(jobId)
+                if (row[3].toString() == "CANCELLED"
+                        && row[11].toString().toLowerCase().contains("system cancel")) {
+                    return row
+                }
+                sleep(1000)
+            }
+            assert false : "${phase}: expected system-cancelled warmup job ${jobId}, row=${row}"
+        }
+
+        try {
+            cluster.addBackend(1, srcCluster)
+            cluster.addBackend(1, dstCluster)
+            waitForCluster(srcCluster, true)
+            waitForCluster(dstCluster, true)
+
+            // Case 1: changing compute-group properties must not cancel the job.
+            def alterJobId = createTableWarmupJob(srcCluster, dstCluster)
+            sql """ALTER COMPUTE GROUP ${srcCluster} PROPERTIES ('balance_type'='without_warmup')"""
+            sql """ALTER COMPUTE GROUP ${dstCluster} PROPERTIES ('balance_type'='without_warmup')"""
+            sleep(5000)
+            def alterRow = showJob(alterJobId)
+            assert alterRow[3].toString() in ["RUNNING", "PENDING"] :
+                    "altering compute group properties should not cancel table warmup job, row=${alterRow}"
+
+            // Case 2: renaming the target group cancels the job; the name is restored afterwards.
+            sql """ALTER SYSTEM RENAME COMPUTE GROUP ${dstCluster} ${dstClusterRenamed}"""
+            waitForCluster(dstClusterRenamed, true)
+            waitForSystemCancelled(alterJobId, "target rename")
+
+            sql """ALTER SYSTEM RENAME COMPUTE GROUP ${dstClusterRenamed} ${dstCluster}"""
+            waitForCluster(dstCluster, true)
+
+            // Case 3: renaming the source group cancels a fresh job; the name is restored afterwards.
+            def sourceRenameJobId = createTableWarmupJob(srcCluster, dstCluster)
+            sql """ALTER SYSTEM RENAME COMPUTE GROUP ${srcCluster} ${srcClusterRenamed}"""
+            waitForCluster(srcClusterRenamed, true)
+            waitForSystemCancelled(sourceRenameJobId, "source rename")
+
+            sql """ALTER SYSTEM RENAME COMPUTE GROUP ${srcClusterRenamed} ${srcCluster}"""
+            waitForCluster(srcCluster, true)
+
+            // Case 4: dropping the target group cancels a fresh job.
+            def targetDropJobId = createTableWarmupJob(srcCluster, dstCluster)
+            def dstClusterId = getClusterId(dstCluster)
+            drop_cluster(dstCluster, dstClusterId, metaService)
+            waitForCluster(dstCluster, false)
+            waitForSystemCancelled(targetDropJobId, "target drop")
+
+            // Case 5: dropping the source group cancels a fresh job; a second target group is
+            // added first because dstCluster no longer exists.
+            cluster.addBackend(1, dstCluster2)
+            waitForCluster(dstCluster2, true)
+            def sourceDropJobId = createTableWarmupJob(srcCluster, dstCluster2)
+            def srcClusterId = getClusterId(srcCluster)
+            drop_cluster(srcCluster, srcClusterId, metaService)
+            waitForCluster(srcCluster, false)
+            waitForSystemCancelled(sourceDropJobId, "source drop")
+        } finally {
+            for (jid in jobIds) {
+                try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {}
+            }
+            // Best-effort cleanup. The happy path ends with srcCluster dropped, and a failure can
+            // leave a group under its renamed name, so select the first compute group that still
+            // accepts `use @...` instead of assuming srcCluster is usable.
+            for (candidate in [srcCluster, srcClusterRenamed, dstCluster2, dstCluster, dstClusterRenamed]) {
+                try {
+                    sql """use @${candidate}"""
+                    break
+                } catch (Exception ignored) {}
+            }
+            // Each drop has its own try so that one failure does not skip the next statement.
+            try { sql """DROP TABLE IF EXISTS ${dbName}.${tableName}""" } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {}
+        }
+    }
+}
diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy
new file mode 100644
index 00000000000000..fdf04d3a56f4f3
--- /dev/null
+++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_compaction_sync_wait.groovy
@@ -0,0 +1,214 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.util.NodeType
+import org.apache.doris.regression.util.WarmupMetricsUtils
+
+// Test point covered: ST-04.
+// Flow: create an event-driven ON TABLES warm-up job for compact_tbl, load rows and wait for their
+// warm-up, then run a cumulative compaction on the source BE while a slow-read debug point is
+// enabled on the target BE. The suite asserts that the target's
+// file_cache_warm_up_rowset_wait_for_compaction_num metric grows, that the matching timeout metric
+// stays unchanged, and that the target cluster returns the expected row counts.
+suite('test_warm_up_event_on_tables_system_compaction_sync_wait', 'docker') {
+    def options = new ClusterOptions()
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_warm_up_table_filter_refresh_interval_ms=1000',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+        'warm_up_rowset_slow_log_ms=1',
+        'enable_compaction_delay_commit_for_warm_up=true',
+        'warm_up_rowset_sync_wait_min_timeout_ms=20000',
+        'warm_up_rowset_sync_wait_max_timeout_ms=30000',
+    ]
+    options.enableDebugPoints()
+    options.cloudMode = true
+
+    // Polls the named brpc metric on ip:port every 500 ms until it is >= target and returns the
+    // observed value; fails the test if timeoutMs elapses first.
+    def waitForMetricAtLeast = { ip, port, metricName, target, timeoutMs ->
+        long deadline = System.currentTimeMillis() + timeoutMs
+        long last = 0
+        while (System.currentTimeMillis() < deadline) {
+            last = WarmupMetricsUtils.getBrpcMetric(ip.toString(), port.toString(), metricName)
+            if (last >= target) {
+                return last
+            }
+            sleep(500)
+        }
+        assert false : "metric ${metricName} on ${ip}:${port} did not reach ${target}, last=${last}"
+    }
+
+    // Sends an HTTP request with the given method and returns the parsed JSON body.
+    // Connect timeout is 10 s; the read timeout defaults to 180 s.
+    def httpJson = { String method, String url, int readTimeoutMs = 180000 ->
+        def conn = new URL(url).openConnection()
+        conn.setRequestMethod(method)
+        conn.setConnectTimeout(10000)
+        conn.setReadTimeout(readTimeoutMs)
+        // For responses with status >= 400 the body is read from the error stream instead.
+        def text = conn.responseCode >= 400 ? conn.errorStream?.text : conn.inputStream.text
+        assert text != null && !text.trim().isEmpty() : "empty HTTP response from ${url}"
+        // parseJson is not defined in this suite; presumably a regression-framework helper.
+        return parseJson(text.trim())
+    }
+
+    // POSTs a cumulative compaction request for tabletId and asserts the reported status is
+    // "success" or "already_exist" (case-insensitive). Returns the parsed response.
+    def triggerCumulativeCompaction = { ip, port, tabletId ->
+        def status = httpJson("POST",
+                "http://${ip}:${port}/api/compaction/run?tablet_id=${tabletId}&compact_type=cumulative")
+        assert status.status.toLowerCase() in ["success", "already_exist"] :
+                "trigger compaction failed on ${ip}:${port}, tablet=${tabletId}, status=${status}"
+        return status
+    }
+
+    // Polls the compaction run_status endpoint once per second until run_status is falsy and
+    // returns that response; fails on a non-"success" status or when timeoutMs elapses.
+    def waitForCompactionFinish = { ip, port, tabletId, timeoutMs ->
+        long deadline = System.currentTimeMillis() + timeoutMs
+        def lastStatus = null
+        while (System.currentTimeMillis() < deadline) {
+            lastStatus = httpJson("GET",
+                    "http://${ip}:${port}/api/compaction/run_status?tablet_id=${tabletId}", 10000)
+            assert lastStatus.status.toLowerCase() == "success" :
+                    "compaction failed on ${ip}:${port}, tablet=${tabletId}, status=${lastStatus}"
+            if (!lastStatus.run_status) {
+                return lastStatus
+            }
+            sleep(1000)
+        }
+        assert false : "compaction did not finish on ${ip}:${port}, tablet=${tabletId}, last=${lastStatus}"
+    }
+
+    docker(options) {
+        // Adapter handed to WarmupMetricsUtils so the helper can run SQL through this suite.
+        Closure sqlRunner = { String q -> sql(q) }
+
+        def srcCluster = "warmup_source"
+        def dstCluster = "warmup_target"
+        def dbName = "test_on_tables_system_compaction_db"
+        def jobIds = []
+        // Set once the debug point is enabled so the finally block knows to clear it.
+        def debugEnabled = false
+        def targetBe = null
+        def sourceBe = null
+        def compactionFuture = null
+        def loadCount = 8
+
+        cluster.addBackend(1, srcCluster)
+        cluster.addBackend(1, dstCluster)
+
+        try {
+            sql """use @${srcCluster}"""
+            sql """CREATE DATABASE IF NOT EXISTS ${dbName}"""
+            sql """use ${dbName}"""
+            // Auto compaction is disabled on the table; compaction is triggered over HTTP below.
+            sql """CREATE TABLE IF NOT EXISTS compact_tbl (
+                    id INT NOT NULL,
+                    payload STRING
+                )
+                UNIQUE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES (
+                    "file_cache_ttl_seconds" = "3600",
+                    "disable_auto_compaction" = "true"
+                )"""
+
+            def jobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (INCLUDE '${dbName}.compact_tbl')
+                PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load")
+            """)[0][0]
+            jobIds << jobId
+            assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId,
+                    ["${dbName}.compact_tbl".toString()] as Set) ==
+                    ["${dbName}.compact_tbl".toString()] as Set
+
+            WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner)
+            def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster)
+
+            // Phase 1: one INSERT per row, then wait until the finished metric grew by loadCount.
+            sql """use @${srcCluster}"""
+            sql """use ${dbName}"""
+            for (int i = 0; i < loadCount; i++) {
+                sql """INSERT INTO compact_tbl VALUES (${i}, 'row_${i}')"""
+            }
+            def metrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster,
+                    baseMetrics.finished + loadCount, 90000)
+            assert metrics.failed == baseMetrics.failed : "initial rowset warmup should not fail, metrics=${metrics}"
+            // NOTE(review): fixed 15 s pause with no stated reason; confirm what it waits for.
+            sleep(15000)
+
+            def tablets = sql_return_maparray """SHOW TABLETS FROM compact_tbl"""
+            assert tablets.size() == 1 : "compact_tbl should have one tablet, tablets=${tablets}"
+            def tabletId = tablets[0].TabletId.toString()
+            // Backend rows are indexed positionally: [1] is used as the host, [4] as the port for
+            // the debug-point and compaction HTTP calls, [5] as the port for brpc metrics.
+            // NOTE(review): presumably the HTTP and brpc port columns; confirm in getClusterBackends.
+            sourceBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, srcCluster)[0]
+            targetBe = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster)[0]
+            // Baselines taken on the target BE before compaction starts.
+            def beforeSubmitted = WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(),
+                    WarmupMetricsUtils.METRIC_SUBMITTED)
+            def beforeFinished = WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(),
+                    WarmupMetricsUtils.METRIC_FINISHED)
+            def beforeWaitCompaction = WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(),
+                    "file_cache_warm_up_rowset_wait_for_compaction_num")
+            def beforeWaitTimeout = WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(),
+                    "file_cache_warm_up_rowset_wait_for_compaction_timeout_num")
+
+            // Phase 2: enable the io_slow debug point on the target BE before compaction starts.
+            GetDebugPoint().enableDebugPoint(targetBe[1].toString(), targetBe[4] as int, NodeType.BE,
+                    "S3FileReader::read_at_impl.io_slow", [sleep: 10])
+            debugEnabled = true
+
+            // Trigger the compaction in the background so the main flow can observe the target BE
+            // while the compaction is still in progress.
+            compactionFuture = thread {
+                sql """use @${srcCluster}"""
+                sql """use ${dbName}"""
+                triggerCumulativeCompaction(sourceBe[1].toString(), sourceBe[4].toString(), tabletId)
+                waitForCompactionFinish(sourceBe[1].toString(), sourceBe[4].toString(), tabletId, 90000)
+            }
+
+            waitForMetricAtLeast(targetBe[1], targetBe[5],
+                    "file_cache_warm_up_rowset_wait_for_compaction_num", beforeWaitCompaction + 1, 60000)
+            assert WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(),
+                    WarmupMetricsUtils.METRIC_SUBMITTED) >= beforeSubmitted + 1 :
+                    "compaction rowset should submit one more target warmup"
+            assert WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(),
+                    WarmupMetricsUtils.METRIC_FINISHED) >= beforeFinished :
+                    "finished warmup metric should not regress"
+
+            // The target cluster must see every loaded row before the compaction thread is joined.
+            sql """use @${dstCluster}"""
+            sql """use ${dbName}"""
+            assert sql("""SELECT count(*) FROM compact_tbl""")[0][0].toString() == loadCount.toString()
+
+            // Join the compaction thread, then verify the warm-up finished without a wait timeout.
+            compactionFuture.get()
+            assert WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(),
+                    WarmupMetricsUtils.METRIC_FINISHED) >= beforeFinished + 1 :
+                    "compaction rowset warmup should finish after sync wait"
+            assert WarmupMetricsUtils.getBrpcMetric(targetBe[1].toString(), targetBe[5].toString(),
+                    "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") == beforeWaitTimeout :
+                    "compaction sync wait should not time out"
+
+            // Phase 3: one more load after compaction (a new key, so the count grows by one).
+            sql """use @${srcCluster}"""
+            sql """use ${dbName}"""
+            sql """INSERT INTO compact_tbl VALUES (${loadCount + 1}, 'after_compaction')"""
+            WarmupMetricsUtils.waitForMetricsStable(sqlRunner, srcCluster, dstCluster, 30000)
+            sql """use @${dstCluster}"""
+            sql """use ${dbName}"""
+            assert sql("""SELECT count(*) FROM compact_tbl""")[0][0].toString() == (loadCount + 1).toString()
+
+            // loadCount + 2 presumably counts the initial loads, the compaction output and the
+            // post-compaction insert; confirm against how SyncStats counts segments.
+            def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId,
+                    { it.seg_num.finish_5m >= loadCount + 2 && it.seg_num.fail_5m == 0 && it.seg_num.gap_5m == 0 },
+                    60000)
+            assert stats.seg_num.gap_5m == 0 : "compaction warmup should converge, stats=${stats}"
+        } finally {
+            // Best-effort cleanup: every step swallows its own exception.
+            if (debugEnabled) {
+                try { GetDebugPoint().clearDebugPointsForAllBEs() } catch (Exception ignored) {}
+            }
+            for (jid in jobIds) {
+                try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {}
+            }
+            try {
+                sql """use @${srcCluster}"""
+                sql """use ${dbName}"""
+                sql """DROP TABLE IF EXISTS compact_tbl"""
+            } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {}
+        }
+    }
+}
diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_e2e_multi_be.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_e2e_multi_be.groovy
new file mode 100644
index 00000000000000..1429498764f834
--- /dev/null
+++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_e2e_multi_be.groovy
@@ -0,0 +1,269 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.util.WarmupMetricsUtils
+
+// Test points covered: ST-01, ST-02, ST-10.
+// Flow: two compute groups with 3 backends each; three overlapping event-driven ON TABLES jobs
+// (orders only, customers only, and a wildcard that excludes audit_*). After loading data the
+// suite checks the warm-up metric deltas, that every target BE finished warm-up work, that a
+// target-side query reads only from local file cache, and each job's SyncStats.
+suite('test_warm_up_event_on_tables_system_e2e_multi_be', 'docker') {
+    def options = new ClusterOptions()
+    options.beNum = 1
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_tablet_rebalancer_interval_second=1',
+        'cloud_warm_up_table_filter_refresh_interval_ms=1000',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+    ]
+    options.cloudMode = true
+
+    docker(options) {
+        // Adapter handed to WarmupMetricsUtils so the helper can run SQL through this suite.
+        Closure sqlRunner = { String q -> sql(q) }
+
+        def srcCluster = "warmup_source"
+        def dstCluster = "warmup_target"
+        def dbName = "test_on_tables_system_e2e_db"
+        // Every job created below is recorded here so the finally block can cancel it.
+        def jobIds = []
+
+        cluster.addBackend(3, srcCluster)
+        cluster.addBackend(3, dstCluster)
+
+        try {
+            sql """use @${srcCluster}"""
+            sql """CREATE DATABASE IF NOT EXISTS ${dbName}"""
+            sql """use ${dbName}"""
+            sql """CREATE TABLE IF NOT EXISTS orders (id INT, amount INT)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 9
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+            sql """CREATE TABLE IF NOT EXISTS customers (id INT, name STRING)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 9
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+            sql """CREATE TABLE IF NOT EXISTS audit_log (id INT, msg STRING)
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 3
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+
+            // Three jobs with overlapping table filters on the same source/target pair.
+            def ordersJobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (INCLUDE '${dbName}.orders')
+                PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load")
+            """)[0][0]
+            jobIds << ordersJobId
+            def customersJobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (INCLUDE '${dbName}.customers')
+                PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load")
+            """)[0][0]
+            jobIds << customersJobId
+            def wildcardJobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (
+                    INCLUDE '${dbName}.*',
+                    EXCLUDE '${dbName}.audit_*'
+                )
+                PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load")
+            """)[0][0]
+            jobIds << wildcardJobId
+
+            // Each call passes two table sets and asserts the result equals the first one.
+            // NOTE(review): the second set is presumably the tables that must NOT match; confirm
+            // against WarmupMetricsUtils.waitForMatchedTables.
+            assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, ordersJobId,
+                    ["${dbName}.orders".toString()] as Set,
+                    ["${dbName}.customers".toString(), "${dbName}.audit_log".toString()] as Set) ==
+                    ["${dbName}.orders".toString()] as Set
+            assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, customersJobId,
+                    ["${dbName}.customers".toString()] as Set,
+                    ["${dbName}.orders".toString(), "${dbName}.audit_log".toString()] as Set) ==
+                    ["${dbName}.customers".toString()] as Set
+            assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, wildcardJobId,
+                    ["${dbName}.orders".toString(), "${dbName}.customers".toString()] as Set,
+                    ["${dbName}.audit_log".toString()] as Set) ==
+                    ["${dbName}.orders".toString(), "${dbName}.customers".toString()] as Set
+
+            assert WarmupMetricsUtils.getClusterBackends(sqlRunner, srcCluster).size() == 3
+            assert WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster).size() == 3
+
+            // Baselines taken after clearing the file cache and before any data is loaded.
+            WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner)
+            def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster)
+            def targetFinishedBefore = WarmupMetricsUtils.getClusterMetricValues(sqlRunner,
+                    dstCluster, WarmupMetricsUtils.METRIC_FINISHED)
+
+            sql """use @${srcCluster}"""
+            sql """use ${dbName}"""
+            int rowCount = 90
+            // 10 * (0 + 1 + ... + (rowCount - 1)).
+            long expectedOrderSum = ((long) rowCount - 1) * rowCount / 2 * 10
+            // NOTE(review): the next line is not valid Groovy as it stands, and requestedDelta,
+            // submittedDelta, finishedDelta and failedDelta are used below without a visible
+            // definition. The text between "(0.." and "= 18" appears to have been lost from this
+            // copy of the patch; restore it from the original commit before relying on this hunk.
+            def orderValues = (0..= 18 : "target should warm every bucket for orders/customers"
+            assert requestedDelta >= submittedDelta * 2 :
+                    "overlapping jobs should request the same matched rowsets independently, got requested=${requestedDelta}, submitted=${submittedDelta}"
+            assert finishedDelta == submittedDelta : "all target downloads should finish"
+            assert failedDelta == 0 : "warmup should not fail"
+
+            // Per-BE delta of the finished metric; a BE missing from the baseline counts from 0.
+            def targetFinishedAfter = WarmupMetricsUtils.getClusterMetricValues(sqlRunner,
+                    dstCluster, WarmupMetricsUtils.METRIC_FINISHED)
+            def targetFinishedDeltas = targetFinishedAfter.collectEntries {
+                [(it.key): it.value - (targetFinishedBefore[it.key] ?: 0)]
+            }
+            logger.info("target finished deltas by BE: ${targetFinishedDeltas}")
+            assert targetFinishedDeltas.size() == 3 : "target cluster should have 3 BEs"
+            assert targetFinishedDeltas.every { it.value > 0 } :
+                    "each target BE should finish warmup tasks, got ${targetFinishedDeltas}"
+
+            // Query the target cluster with profiling on and inspect the profile's IO counters.
+            sql """use @${dstCluster}"""
+            sql """use ${dbName}"""
+            profile("st01_target_profile") {
+                sql """set enable_profile = true"""
+                sql """set profile_level = 2"""
+                run {
+                    def res = sql """/* st01_target_profile */ SELECT count(*), sum(amount) FROM orders"""
+                    assert res[0][0].toString() == rowCount.toString() : "target query row count mismatch: ${res}"
+                    assert res[0][1].toString() == expectedOrderSum.toString() : "target query sum mismatch: ${res}"
+                    sleep(1000)
+                }
+                check { profileString, exception ->
+                    if (exception != null) {
+                        throw exception
+                    }
+                    assert profileString.contains("NumRemoteIOTotal") : "profile should contain file cache counters"
+                    def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumRemoteIOTotal")
+                    def localTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumLocalIOTotal")
+                    logger.info("target profile NumRemoteIOTotal=${remoteTotal}, NumLocalIOTotal=${localTotal}")
+                    assert remoteTotal == 0 : "warmed target query should not read remote data"
+                    assert localTotal > 0 : "warmed target query should hit local file cache"
+                }
+            }
+
+            // Poll ttl_cache_size every 2 s for up to 30 s until all 3 target BEs report and the
+            // sum is positive; the asserts after the loop decide the outcome.
+            def targetTtl = [:]
+            long targetTtlDeadline = System.currentTimeMillis() + 30000
+            while (System.currentTimeMillis() < targetTtlDeadline) {
+                targetTtl = WarmupMetricsUtils.getClusterMetricValues(sqlRunner, dstCluster, "ttl_cache_size")
+                if (targetTtl.size() == 3 && targetTtl.values().sum() > 0) {
+                    break
+                }
+                sleep(2000)
+            }
+            logger.info("target ttl cache by BE: ${targetTtl}")
+            assert targetTtl.size() == 3 : "target cluster should have 3 BEs"
+            assert targetTtl.values().sum() > 0 : "target cluster should own warmed cache, got ${targetTtl}"
+
+            // The same convergence predicate is applied to each of the three jobs: something was
+            // requested, finish equals requested, and both gap and fail are zero in the 5m window
+            // for segment counts and sizes. The asserts after each wait repeat the predicate so a
+            // failure names the exact condition.
+            def ordersStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, ordersJobId, { stats ->
+                stats.seg_num.requested_5m > 0
+                        && stats.seg_num.finish_5m == stats.seg_num.requested_5m
+                        && stats.seg_num.gap_5m == 0
+                        && stats.seg_num.fail_5m == 0
+                        && stats.seg_size.finish_5m == stats.seg_size.requested_5m
+                        && stats.seg_size.gap_5m == "0b"
+                        && stats.seg_size.fail_5m == "0b"
+            }, 30000)
+            logger.info("system e2e SyncStats for orders job ${ordersJobId}: ${ordersStats}")
+            assert ordersStats.seg_num.requested_5m > 0 :
+                    "orders job should have requested segments in SyncStats: ${ordersStats}"
+            assert ordersStats.seg_num.finish_5m == ordersStats.seg_num.requested_5m :
+                    "orders job should count already-warmed overlapping rowsets as finished: ${ordersStats}"
+            assert ordersStats.seg_num.gap_5m == 0 :
+                    "orders job should have no SyncStats segment gap after warmup: ${ordersStats}"
+            assert ordersStats.seg_num.fail_5m == 0 :
+                    "orders job should have no SyncStats segment failures: ${ordersStats}"
+            assert ordersStats.seg_size.finish_5m == ordersStats.seg_size.requested_5m :
+                    "orders job should count already-warmed overlapping rowset bytes as finished: ${ordersStats}"
+            assert ordersStats.seg_size.gap_5m == "0b" :
+                    "orders job should have no SyncStats size gap after warmup: ${ordersStats}"
+            assert ordersStats.seg_size.fail_5m == "0b" :
+                    "orders job should have no SyncStats size failures: ${ordersStats}"
+
+            def customersStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, customersJobId, { stats ->
+                stats.seg_num.requested_5m > 0
+                        && stats.seg_num.finish_5m == stats.seg_num.requested_5m
+                        && stats.seg_num.gap_5m == 0
+                        && stats.seg_num.fail_5m == 0
+                        && stats.seg_size.finish_5m == stats.seg_size.requested_5m
+                        && stats.seg_size.gap_5m == "0b"
+                        && stats.seg_size.fail_5m == "0b"
+            }, 30000)
+            logger.info("system e2e SyncStats for customers job ${customersJobId}: ${customersStats}")
+            assert customersStats.seg_num.requested_5m > 0 :
+                    "customers job should have requested segments in SyncStats: ${customersStats}"
+            assert customersStats.seg_num.finish_5m == customersStats.seg_num.requested_5m :
+                    "customers job should count already-warmed overlapping rowsets as finished: ${customersStats}"
+            assert customersStats.seg_num.gap_5m == 0 :
+                    "customers job should have no SyncStats segment gap after warmup: ${customersStats}"
+            assert customersStats.seg_num.fail_5m == 0 :
+                    "customers job should have no SyncStats segment failures: ${customersStats}"
+            assert customersStats.seg_size.finish_5m == customersStats.seg_size.requested_5m :
+                    "customers job should count already-warmed overlapping rowset bytes as finished: ${customersStats}"
+            assert customersStats.seg_size.gap_5m == "0b" :
+                    "customers job should have no SyncStats size gap after warmup: ${customersStats}"
+            assert customersStats.seg_size.fail_5m == "0b" :
+                    "customers job should have no SyncStats size failures: ${customersStats}"
+
+            def wildcardStats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, wildcardJobId, { stats ->
+                stats.seg_num.requested_5m > 0
+                        && stats.seg_num.finish_5m == stats.seg_num.requested_5m
+                        && stats.seg_num.gap_5m == 0
+                        && stats.seg_num.fail_5m == 0
+                        && stats.seg_size.finish_5m == stats.seg_size.requested_5m
+                        && stats.seg_size.gap_5m == "0b"
+                        && stats.seg_size.fail_5m == "0b"
+            }, 30000)
+            logger.info("system e2e SyncStats for wildcard job ${wildcardJobId}: ${wildcardStats}")
+            assert wildcardStats.seg_num.requested_5m > 0 :
+                    "wildcard job should have requested segments in SyncStats: ${wildcardStats}"
+            assert wildcardStats.seg_num.finish_5m == wildcardStats.seg_num.requested_5m :
+                    "wildcard job should count already-warmed overlapping rowsets as finished: ${wildcardStats}"
+            assert wildcardStats.seg_num.gap_5m == 0 :
+                    "wildcard job should have no SyncStats segment gap after warmup: ${wildcardStats}"
+            assert wildcardStats.seg_num.fail_5m == 0 :
+                    "wildcard job should have no SyncStats segment failures: ${wildcardStats}"
+            assert wildcardStats.seg_size.finish_5m == wildcardStats.seg_size.requested_5m :
+                    "wildcard job should count already-warmed overlapping rowset bytes as finished: ${wildcardStats}"
+            assert wildcardStats.seg_size.gap_5m == "0b" :
+                    "wildcard job should have no SyncStats size gap after warmup: ${wildcardStats}"
+            assert wildcardStats.seg_size.fail_5m == "0b" :
+                    "wildcard job should have no SyncStats size failures: ${wildcardStats}"
+            // The wildcard job matches both tables, so it must have requested at least as many
+            // segments as the two single-table jobs combined.
+            def wildcardOverlapMessage = "wildcard job should cover both overlapping tables, orders=${ordersStats}, customers=${customersStats}, wildcard=${wildcardStats}"
+            assert wildcardStats.seg_num.requested_5m >=
+                    ordersStats.seg_num.requested_5m + customersStats.seg_num.requested_5m :
+                    wildcardOverlapMessage
+        } finally {
+            // Best-effort cleanup: every step swallows its own exception.
+            for (jid in jobIds) {
+                try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {}
+            }
+            try {
+                sql """use @${srcCluster}"""
+                sql """use ${dbName}"""
+                sql """DROP TABLE IF EXISTS orders"""
+                sql """DROP TABLE IF EXISTS customers"""
+                sql """DROP TABLE IF EXISTS audit_log"""
+            } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {}
+        }
+    }
+}
diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy
new file mode 100644
index 00000000000000..a85cc48d99b1ff
--- /dev/null
+++
b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_packed_file.groovy @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test point covered: ST-06. 
+suite('test_warm_up_event_on_tables_system_packed_file', 'docker') { // Docker suite: event-driven ON TABLES warm-up with enable_packed_file turned on.
+    def options = new ClusterOptions()
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_warm_up_table_filter_refresh_interval_ms=1000',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+        'enable_packed_file=true',
+        'small_file_threshold_bytes=102400',
+        'disable_auto_compaction=true',
+    ]
+    options.cloudMode = true
+    // Test body. `cluster`, `sql`, `profile` and `logger` are not defined in this file; the regression framework supplies them.
+    docker(options) {
+        Closure sqlRunner = { String q -> sql(q) } // lets WarmupMetricsUtils issue SQL through this suite
+
+        def srcCluster = "warmup_source"
+        def dstCluster = "warmup_target"
+        def dbName = "test_on_tables_system_packed_file_db"
+        def tableName = "packed_tbl"
+        def jobIds = []
+        def loadCount = 30
+        // One backend per compute cluster: loads run on the source, the warm-up job targets the other one.
+        cluster.addBackend(1, srcCluster)
+        cluster.addBackend(1, dstCluster)
+
+        try {
+            sql """use @${srcCluster}"""
+            sql """CREATE DATABASE IF NOT EXISTS ${dbName}"""
+            sql """use ${dbName}"""
+            sql """CREATE TABLE IF NOT EXISTS ${tableName} (
+                    id INT,
+                    name STRING,
+                    payload STRING
+                )
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES (
+                    "file_cache_ttl_seconds" = "3600",
+                    "disable_auto_compaction" = "true"
+                )"""
+            // Create the event-driven job before loading; its id is remembered so `finally` can cancel it.
+            def jobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (INCLUDE '${dbName}.${tableName}')
+                PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load")
+            """)[0][0]
+            jobIds << jobId
+            assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId,
+                    ["${dbName}.${tableName}".toString()] as Set) ==
+                    ["${dbName}.${tableName}".toString()] as Set
+            // Clear caches and snapshot baselines so every assertion below works on deltas.
+            WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner)
+            def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster)
+            def basePackedFiles = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, srcCluster,
+                    "packed_file_total_small_file_num")
+            def baseTargetCacheSize = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster,
+                    "ttl_cache_size")
+            // loadCount single-row INSERTs with ids 0..loadCount-1; the assertions below expect at least one warm-up per load.
+            sql """use @${srcCluster}"""
+            sql """use ${dbName}"""
+            for (int i = 0; i < loadCount; i++) {
+                sql """INSERT INTO ${tableName} VALUES (${i}, 'packed_${i}', repeat('x', 128))"""
+            }
+            // Wait until the finished counter has grown by loadCount (timeout 90000, presumably milliseconds - confirm in WarmupMetricsUtils).
+            def metrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster,
+                    baseMetrics.finished + loadCount, 90000)
+            def requestedDelta = metrics.requested - baseMetrics.requested
+            def submittedDelta = metrics.submitted - baseMetrics.submitted
+            def finishedDelta = metrics.finished - baseMetrics.finished
+            def failedDelta = metrics.failed - baseMetrics.failed
+            logger.info("packed file bvar deltas requested=${requestedDelta}, submitted=${submittedDelta}, " +
+                    "finished=${finishedDelta}, failed=${failedDelta}")
+            assert requestedDelta >= loadCount : "source bvar should request packed small-file rowsets"
+            assert submittedDelta >= loadCount : "target bvar should submit packed small-file rowsets"
+            assert finishedDelta == submittedDelta : "target bvar should finish all submitted packed rowsets"
+            assert failedDelta == 0 : "packed-file warmup should not fail"
+            // The source must actually have gone through the packed-file write path.
+            def packedFilesDelta = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, srcCluster,
+                    "packed_file_total_small_file_num") - basePackedFiles
+            logger.info("packed_file_total_small_file_num delta=${packedFilesDelta}")
+            assert packedFilesDelta > 0 : "source cluster should write small files into packed file"
+            // The target's ttl_cache_size metric must have grown.
+            def targetCacheSizeDelta = WarmupMetricsUtils.getClusterMetricSum(sqlRunner, dstCluster,
+                    "ttl_cache_size") - baseTargetCacheSize
+            logger.info("target ttl_cache_size delta=${targetCacheSizeDelta}")
+            assert targetCacheSizeDelta > 0 : "target packed-file warmup should populate TTL file cache"
+            // Cross-check the job's own SyncStats (*_5m fields), polling for up to 60000.
+            def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId,
+                    { it.seg_num.requested_5m > 0 && it.seg_num.finish_5m >= loadCount && it.seg_num.fail_5m == 0 },
+                    60000)
+            logger.info("packed file SyncStats: ${stats}")
+            assert stats.seg_num.requested_5m > 0 : "SyncStats should observe packed small-file rowset requests"
+            assert stats.seg_num.finish_5m >= loadCount : "SyncStats should finish packed small-file rowsets"
+            assert stats.seg_num.fail_5m == 0 : "SyncStats should have no packed-file failures"
+            // Query from the target cluster and use the query profile to check that no remote reads happened.
+            sql """use @${dstCluster}"""
+            sql """use ${dbName}"""
+            profile("st06_packed_file_profile") {
+                sql """set enable_profile = true"""
+                sql """set profile_level = 2"""
+                run {
+                    def res = sql """/* st06_packed_file_profile */ SELECT count(*), sum(id) FROM ${tableName}"""
+                    assert res[0][0].toString() == loadCount.toString() : "packed table count mismatch: ${res}"
+                    assert res[0][1].toString() == (loadCount * (loadCount - 1)).intdiv(2).toString() : "packed table sum mismatch: ${res}" // sum of ids 0..loadCount-1 (435 for 30 loads)
+                    sleep(1000) // NOTE(review): presumably gives the profile time to be reported - confirm
+                }
+                check { profileString, exception ->
+                    if (exception != null) {
+                        throw exception
+                    }
+                    assert profileString.contains("NumRemoteIOTotal") : "profile should contain file cache counters"
+                    def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumRemoteIOTotal")
+                    def localTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumLocalIOTotal")
+                    logger.info("packed profile NumRemoteIOTotal=${remoteTotal}, NumLocalIOTotal=${localTotal}")
+                    assert remoteTotal == 0 : "warmed packed-file query should not read remote data"
+                    assert localTotal > 0 : "warmed packed-file query should hit local file cache"
+                }
+            }
+        } finally { // best-effort cleanup: every failure here is deliberately swallowed
+            for (jid in jobIds) {
+                try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {}
+            }
+            try {
+                sql """use @${srcCluster}"""
+                sql """use ${dbName}"""
+                sql """DROP TABLE IF EXISTS ${tableName}"""
+            } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {}
+        }
+    }
+}
diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_restart_and_resize.groovy
b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_restart_and_resize.groovy new file mode 100644 index 00000000000000..a0d09791868c19 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_restart_and_resize.groovy @@ -0,0 +1,223 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.WarmupMetricsUtils + +// Test points covered: ST-07, ST-08. 
+suite('test_warm_up_event_on_tables_system_restart_and_resize', 'docker') { // Docker suite (3 FEs): table-level warm-up across master FE restart, target BE restart and target scale-out.
+    def options = new ClusterOptions()
+    options.feNum = 3
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_tablet_rebalancer_interval_second=1',
+        'cloud_warm_up_table_filter_refresh_interval_ms=1000',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+    ]
+    options.cloudMode = true
+    // Restarts the current master FE, polls up to 30 times (1s apart) until it reports alive, then reconnects the suite.
+    def restartMasterFe = {
+        def oldMasterFe = cluster.getMasterFe()
+        cluster.restartFrontends(oldMasterFe.index)
+        boolean hasRestart = false
+        for (int i = 0; i < 30; i++) {
+            if (cluster.getFeByIndex(oldMasterFe.index).alive) {
+                hasRestart = true
+                break
+            }
+            sleep(1000)
+        }
+        assert hasRestart : "master FE did not restart"
+        context.reconnectFe()
+    }
+    // Test body. `cluster`, `context`, `sql`, `connect` and `logger` are not defined in this file; the regression framework supplies them.
+    docker(options) {
+        Closure sqlRunner = { String q -> sql(q) } // lets WarmupMetricsUtils issue SQL through this suite
+
+        def srcCluster = "warmup_source"
+        def dstCluster = "warmup_target"
+        def dbName = "test_on_tables_system_restart_resize_db"
+        def jobIds = []
+        // One backend per compute cluster; dstBeIndexes is used later to restart the target BE.
+        def srcBeIndexes = cluster.addBackend(1, srcCluster)
+        def dstBeIndexes = cluster.addBackend(1, dstCluster)
+        // Asserts that `finished` reached the expectation and that every submitted task is either finished or failed.
+        def assertWarmupReached = { Map metrics, long expectedFinished, String phase ->
+            assert metrics.finished >= expectedFinished :
+                    "${phase}: expected finished >= ${expectedFinished}, metrics=${metrics}"
+            assert metrics.finished + metrics.failed >= metrics.submitted :
+                    "${phase}: submitted warmup tasks should be terminal, metrics=${metrics}"
+        }
+        // Polls up to 30 times (1s apart) until getAllFrontends(true) returns options.feNum entries; fails the test otherwise.
+        def aliveFrontends = { String phase ->
+            def fes = []
+            for (int i = 0; i < 30; i++) {
+                fes = cluster.getAllFrontends(true)
+                if (fes.size() == options.feNum) {
+                    return fes
+                }
+                sleep(1000)
+            }
+            assert false : "${phase}: expected ${options.feNum} alive FEs, got ${fes}"
+        }
+        // Connects to each FE over JDBC and checks SHOW WARM UP JOB there; every compared column is normalised with toString().
+        def assertShowWarmupOnAllFes = { Object jobId, Set expectedTables, String phase ->
+            for (fe in aliveFrontends(phase)) {
+                def feLabel = "fe-${fe.index}"
+                def jdbcUrl = String.format(
+                        "jdbc:mysql://%s:%s/?useLocalSessionState=true&allowLoadLocalInfile=false",
+                        fe.host, fe.queryPort)
+                connect(context.config.jdbcUser, context.config.jdbcPassword, jdbcUrl) {
+                    def rows = sql """SHOW WARM UP JOB WHERE ID = ${jobId}"""
+                    assert rows.size() == 1 : "${phase}: ${feLabel} should show one warmup job, rows=${rows}"
+                    def row = rows[0]
+                    assert row[0].toString() == jobId.toString() :
+                            "${phase}: ${feLabel} job id mismatch, row=${row}"
+                    assert row[1].toString() == srcCluster :
+                            "${phase}: ${feLabel} source cluster mismatch, row=${row}"
+                    assert row[2].toString() == dstCluster :
+                            "${phase}: ${feLabel} target cluster mismatch, row=${row}"
+                    assert row[3].toString() in ["RUNNING", "PENDING"] :
+                            "${phase}: ${feLabel} job should be running or pending, row=${row}"
+                    assert row[4].toString() == "TABLES" :
+                            "${phase}: ${feLabel} job type mismatch, row=${row}"
+                    assert row[5].toString().startsWith("EVENT_DRIVEN") :
+                            "${phase}: ${feLabel} sync mode mismatch, row=${row}"
+                    def matched = WarmupMetricsUtils.parseMatchedTables(rows)
+                    assert matched.containsAll(expectedTables) :
+                            "${phase}: ${feLabel} matched tables mismatch, expected=${expectedTables}, matched=${matched}"
+                    logger.info("${phase}: SHOW WARM UP JOB on ${feLabel}(${fe.host}:${fe.queryPort}) row=${row}")
+                }
+            }
+        }
+        // Setup: create ha_tbl and a wildcard job covering every table of the database.
+        try {
+            sql """use @${srcCluster}"""
+            sql """CREATE DATABASE IF NOT EXISTS ${dbName}"""
+            sql """use ${dbName}"""
+            sql """CREATE TABLE IF NOT EXISTS ha_tbl (
+                    id INT,
+                    val STRING
+                )
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 2
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+
+            def jobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (INCLUDE '${dbName}.*')
+                PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load")
+            """)[0][0]
+            jobIds << jobId
+            assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId,
+                    ["${dbName}.ha_tbl".toString()] as Set).contains("${dbName}.ha_tbl".toString())
+            assertShowWarmupOnAllFes(jobId, ["${dbName}.ha_tbl".toString()] as Set,
+                    "after creating table-level warmup job")
+            // Phase 1: restart the master FE; loads issued afterwards must still be warmed up.
+            restartMasterFe()
+            sql """use @${srcCluster}"""
+            sql """use ${dbName}"""
+            // NOTE(review): the rows below are inserted after the FE restart although their literal says 'before_restart_'.
+            WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner)
+            def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster)
+            for (int i = 0; i < 4; i++) {
+                sql """INSERT INTO ha_tbl VALUES (${i}, 'before_restart_${i}')"""
+            }
+            def afterFeRestart = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster,
+                    baseMetrics.finished + 4, 90000)
+            assertWarmupReached(afterFeRestart, baseMetrics.finished + 4, "after master FE restart")
+            assert afterFeRestart.failed == baseMetrics.failed :
+                    "warmup should continue after master FE restart, metrics=${afterFeRestart}"
+            assertShowWarmupOnAllFes(jobId, ["${dbName}.ha_tbl".toString()] as Set,
+                    "after master FE restart")
+            // Phase 2: restart the first target BE, then load four more rows.
+            cluster.restartBackends(dstBeIndexes[0] as int)
+            sleep(5000) // NOTE(review): fixed wait, presumably for the BE to come back - confirm it is sufficient in CI
+            sql """use @${srcCluster}"""
+            sql """use ${dbName}"""
+            def beforeTargetRestartLoad = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster)
+            for (int i = 4; i < 8; i++) {
+                sql """INSERT INTO ha_tbl VALUES (${i}, 'after_target_restart_${i}')"""
+            }
+            def afterTargetRestart = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster,
+                    beforeTargetRestartLoad.finished + 4, 90000)
+            assertWarmupReached(afterTargetRestart, beforeTargetRestartLoad.finished + 4,
+                    "after target BE restart")
+            assert afterTargetRestart.failed == beforeTargetRestartLoad.failed :
+                    "warmup should continue after target BE restart, metrics=${afterTargetRestart}"
+            // Phase 3: add one more target BE and identify it by diffing the backend list (keyed on column 0).
+            def targetBeforeScale = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster)
+                    .collect { it[0].toString() } as Set
+            cluster.addBackend(1, dstCluster)
+            sleep(5000)
+            def targetAfterScale = WarmupMetricsUtils.getClusterBackends(sqlRunner, dstCluster)
+            def newTargetBes = targetAfterScale.findAll { !targetBeforeScale.contains(it[0].toString()) }
+            assert newTargetBes.size() == 1 : "expected one new target BE, before=${targetBeforeScale}, after=${targetAfterScale}"
+            def newTargetBe = newTargetBes[0]
+            // A table created after the job exists must also be matched by the wildcard pattern.
+            sql """use @${srcCluster}"""
+            sql """use ${dbName}"""
+            sql """CREATE TABLE IF NOT EXISTS scale_tbl (
+                    id INT,
+                    val STRING
+                )
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 4
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+            assert WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId,
+                    ["${dbName}.ha_tbl".toString(), "${dbName}.scale_tbl".toString()] as Set)
+                    .contains("${dbName}.scale_tbl".toString())
+            assertShowWarmupOnAllFes(jobId,
+                    ["${dbName}.ha_tbl".toString(), "${dbName}.scale_tbl".toString()] as Set,
+                    "after target scale-out table match")
+            // newTargetBe[1] / newTargetBe[5] are presumably the BE host and brpc port columns - TODO confirm against getClusterBackends.
+            def newBeFinishedBefore = WarmupMetricsUtils.getBrpcMetric(newTargetBe[1].toString(),
+                    newTargetBe[5].toString(), WarmupMetricsUtils.METRIC_FINISHED)
+            def beforeScaleLoad = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster)
+            for (int i = 0; i < 8; i++) {
+                sql """INSERT INTO scale_tbl VALUES (${i}, 'scale_${i}')"""
+            }
+            def afterScaleLoad = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster,
+                    beforeScaleLoad.finished + 8, 90000)
+            assertWarmupReached(afterScaleLoad, beforeScaleLoad.finished + 8, "after target scale-out")
+            assert afterScaleLoad.failed == beforeScaleLoad.failed :
+                    "warmup should continue after target scale-out, metrics=${afterScaleLoad}"
+            def newBeFinishedAfter = WarmupMetricsUtils.getBrpcMetric(newTargetBe[1].toString(),
+                    newTargetBe[5].toString(), WarmupMetricsUtils.METRIC_FINISHED)
+            assert newBeFinishedAfter > newBeFinishedBefore :
+                    "new target BE should participate in later table-level warmup"
+            // Both tables must return all 8 rows when queried from the target cluster.
+            sql """use @${dstCluster}"""
+            sql """use ${dbName}"""
+            assert sql("""SELECT count(*) FROM ha_tbl""")[0][0].toString() == "8"
+            assert sql("""SELECT count(*) FROM scale_tbl""")[0][0].toString() == "8"
+        } finally { // best-effort cleanup: every failure here is deliberately swallowed
+            for (jid in jobIds) {
+                try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {}
+            }
+            try {
+                sql """use @${srcCluster}"""
+                sql """use ${dbName}"""
+                sql """DROP TABLE IF EXISTS ha_tbl"""
+                sql """DROP TABLE IF EXISTS scale_tbl"""
+            } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {}
+        }
+    }
+}
diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy
new file mode 100644
index 00000000000000..75e7b5c9af8ae1
--- /dev/null
+++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/on_tables/test_warm_up_event_on_tables_system_schema_index.groovy
@@ -0,0 +1,182 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.util.WarmupMetricsUtils
+
+// Test points covered: ST-03, ST-05.
+suite('test_warm_up_event_on_tables_system_schema_index', 'docker') { // Docker suite: event-driven ON TABLES warm-up across column schema changes and for inverted-index files.
+    def options = new ClusterOptions()
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_warm_up_table_filter_refresh_interval_ms=1000',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+        'disable_auto_compaction=true',
+    ]
+    options.cloudMode = true
+    // Polls SHOW ALTER TABLE COLUMN for up to 60s; returns once no job is listed or the newest row contains FINISHED, else fails.
+    def waitLatestColumnAlter = { tableName ->
+        long deadline = System.currentTimeMillis() + 60000
+        def last = []
+        while (System.currentTimeMillis() < deadline) {
+            last = sql """SHOW ALTER TABLE COLUMN WHERE TableName = '${tableName}'
+                ORDER BY CreateTime DESC LIMIT 1"""
+            if (last.isEmpty() || last[0].toString().contains("FINISHED")) {
+                sleep(1000) // NOTE(review): extra 1s pause before returning - purpose is not visible here
+                return
+            }
+            sleep(1000)
+        }
+        assert false : "schema change on ${tableName} did not finish, last=${last}"
+    }
+    // Test body. `cluster`, `sql`, `profile` and `logger` are not defined in this file; the regression framework supplies them.
+    docker(options) {
+        Closure sqlRunner = { String q -> sql(q) } // lets WarmupMetricsUtils issue SQL through this suite
+
+        def srcCluster = "warmup_source"
+        def dstCluster = "warmup_target"
+        def dbName = "test_on_tables_system_schema_index_db"
+        def jobIds = []
+        // One backend per compute cluster: loads run on the source, the warm-up job targets the other one.
+        cluster.addBackend(1, srcCluster)
+        cluster.addBackend(1, dstCluster)
+
+        try {
+            sql """use @${srcCluster}"""
+            sql """CREATE DATABASE IF NOT EXISTS ${dbName}"""
+            sql """use ${dbName}"""
+            sql """CREATE TABLE IF NOT EXISTS schema_tbl (
+                    id INT,
+                    v INT,
+                    tag STRING
+                )
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 2
+                PROPERTIES ("file_cache_ttl_seconds" = "3600")"""
+            sql """CREATE TABLE IF NOT EXISTS idx_tbl (
+                    id INT,
+                    body STRING,
+                    city STRING,
+                    INDEX idx_body(body) USING INVERTED
+                        PROPERTIES("parser" = "english", "support_phrase" = "true")
+                )
+                DUPLICATE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 1
+                PROPERTIES (
+                    "file_cache_ttl_seconds" = "3600",
+                    "storage_format" = "V2"
+                )"""
+            // A single job with two INCLUDE patterns, one per table; its id is remembered so `finally` can cancel it.
+            def jobId = sql("""
+                WARM UP CLUSTER ${dstCluster} WITH CLUSTER ${srcCluster}
+                ON TABLES (
+                    INCLUDE '${dbName}.schema_*',
+                    INCLUDE '${dbName}.idx_*'
+                )
+                PROPERTIES ("sync_mode" = "event_driven", "sync_event" = "load")
+            """)[0][0]
+            jobIds << jobId
+            // The job must match exactly the two tables created above.
+            def matched = WarmupMetricsUtils.waitForMatchedTables(sqlRunner, jobId,
+                    ["${dbName}.schema_tbl".toString(),
+                     "${dbName}.idx_tbl".toString()] as Set)
+            assert matched == ["${dbName}.schema_tbl".toString(),
+                               "${dbName}.idx_tbl".toString()] as Set :
+                    "unexpected matched tables: ${matched}"
+            // Clear caches and snapshot baseline metrics so later assertions compare against them.
+            WarmupMetricsUtils.clearFileCacheOnAllBackends(sqlRunner)
+            def baseMetrics = WarmupMetricsUtils.getWarmupMetrics(sqlRunner, srcCluster, dstCluster)
+
+            sql """use @${srcCluster}"""
+            sql """use ${dbName}"""
+            // Interleave loads with ADD / RENAME / MODIFY / DROP COLUMN: four INSERT statements into schema_tbl in total.
+            sql """INSERT INTO schema_tbl VALUES (1, 10, 'a'), (2, 20, 'b')"""
+            sql """ALTER TABLE schema_tbl ADD COLUMN extra STRING DEFAULT 'x'"""
+            waitLatestColumnAlter("schema_tbl")
+            sql """INSERT INTO schema_tbl(id, v, tag, extra) VALUES (3, 30, 'c', 'c_extra')"""
+            sql """ALTER TABLE schema_tbl RENAME COLUMN tag label"""
+            waitLatestColumnAlter("schema_tbl")
+            sql """INSERT INTO schema_tbl(id, v, label, extra) VALUES (4, 40, 'd', 'd_extra')"""
+            sql """ALTER TABLE schema_tbl MODIFY COLUMN v BIGINT NULL"""
+            waitLatestColumnAlter("schema_tbl")
+            sql """ALTER TABLE schema_tbl DROP COLUMN extra"""
+            waitLatestColumnAlter("schema_tbl")
+            sql """INSERT INTO schema_tbl(id, v, label) VALUES (5, 50, 'e')"""
+            // One load into the inverted-index table, which brings the expected finished delta to 5.
+            sql """INSERT INTO idx_tbl VALUES
+                (1, 'quick brown fox', 'beijing'),
+                (2, 'slow yellow fox', 'shanghai'),
+                (3, 'quick blue whale', 'beijing')"""
+
+            def metrics = WarmupMetricsUtils.waitForWarmupFinish(sqlRunner, srcCluster, dstCluster,
+                    baseMetrics.finished + 5, 90000)
+            assert metrics.failed == baseMetrics.failed : "warmup should not fail, metrics=${metrics}, base=${baseMetrics}"
+            // The job's SyncStats (*_5m fields) must show segment and index files finished with no failures.
+            def stats = WarmupMetricsUtils.waitForJobSyncStats(sqlRunner, jobId,
+                    { it.seg_num.finish_5m >= 5 && it.idx_num.finish_5m > 0
+                        && it.seg_num.fail_5m == 0 && it.idx_num.fail_5m == 0 },
+                    60000)
+            logger.info("schema/index SyncStats: ${stats}")
+            assert stats.idx_num.finish_5m > 0 : "inverted index files should be warmed, stats=${stats}"
+            assert stats.idx_num.fail_5m == 0 : "inverted index warmup should not fail, stats=${stats}"
+            // Read back from the target cluster: 5 rows, sum(v) = 10 + 20 + 30 + 40 + 50.
+            sql """use @${dstCluster}"""
+            sql """use ${dbName}"""
+            def schemaRes = sql """SELECT count(*), sum(v) FROM schema_tbl"""
+            assert schemaRes[0][0].toString() == "5" : "schema table count mismatch: ${schemaRes}"
+            assert schemaRes[0][1].toString() == "150" : "schema table sum mismatch: ${schemaRes}"
+            // A MATCH_ALL query on the target must report zero remote reads for both data and inverted-index files.
+            profile("st05_inverted_index_profile") {
+                sql """set enable_profile = true"""
+                sql """set profile_level = 2"""
+                sql """set enable_common_expr_pushdown = true"""
+                run {
+                    def res = sql """/* st05_inverted_index_profile */ SELECT id FROM idx_tbl
+                        WHERE body MATCH_ALL 'quick' ORDER BY id"""
+                    assert res.collect { it[0].toString() } == ["1", "3"] : "index query mismatch: ${res}"
+                    sleep(1000) // NOTE(review): presumably gives the profile time to be reported - confirm
+                }
+                check { profileString, exception ->
+                    if (exception != null) {
+                        throw exception
+                    }
+                    assert profileString.contains("InvertedIndexNumRemoteIOTotal") :
+                            "profile should contain inverted index file cache counters"
+                    def idxRemoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString,
+                            "InvertedIndexNumRemoteIOTotal")
+                    def remoteTotal = WarmupMetricsUtils.sumProfileCounter(profileString, "NumRemoteIOTotal")
+                    logger.info("index profile remote counters: data=${remoteTotal}, inverted_index=${idxRemoteTotal}")
+                    assert idxRemoteTotal == 0 : "warmed inverted index query should not read index files remotely"
+                    assert remoteTotal == 0 : "warmed inverted index query should not read data files remotely"
+                }
+            }
+        } finally { // best-effort cleanup: every failure here is deliberately swallowed
+            for (jid in jobIds) {
+                try { sql """CANCEL WARM UP JOB WHERE ID = ${jid}""" } catch (Exception ignored) {}
+            }
+            try {
+                sql """use @${srcCluster}"""
+                sql """use ${dbName}"""
+                sql """DROP TABLE IF EXISTS schema_tbl"""
+                sql """DROP TABLE IF EXISTS idx_tbl"""
+            } catch (Exception ignored) {}
+            try { sql """DROP DATABASE IF EXISTS ${dbName}""" } catch (Exception ignored) {}
+        }
+    }
+}