From e1841cf7425ef302e27ae696e15a528d1bd7c248 Mon Sep 17 00:00:00 2001 From: Hongdan Zhu Date: Thu, 13 Mar 2025 02:03:59 -0700 Subject: [PATCH 1/6] HIVE-28755: Statistics Management Task --- .../hive/metastore/conf/MetastoreConf.java | 15 ++ .../metastore/StatisticsManagementTask.java | 162 ++++++++++++++++++ .../hive/metastore/tools/BenchmarkTool.java | 9 +- .../hive/metastore/tools/HMSBenchmarks.java | 66 +++++++ 4 files changed, 250 insertions(+), 2 deletions(-) create mode 100644 standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java index 8f8befd83480..0da9dcb8600f 100644 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java +++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java @@ -1291,6 +1291,21 @@ public enum ConfVars { "metastore.partition.management.table.pattern", "*", "Automatic partition management will look for tables using the specified table pattern"), + STATISTICS_MANAGEMENT_TASK_FREQUENCY("metastore.statistics.management.task.frequency", + "metastore.statistics.management.task.frequency", + 365, TimeUnit.DAYS, "Frequency at which timer task runs to do automatic statistics management for tables\n" + + "with table property 'statistics.auto.deletion'='true'. Statistics management include 2 configs. \n" + + "One is 'statistics.auto.deletion', and the other is 'statistics.retention.period'. \n" + + "When 'statistics.auto.deletion'='true' is set, statistics management will look for tables which their\n " + + "column statistics are over the retention period, and then delete the column stats. 
\n"), + STATISTICS_RETENTION_PERIOD("metastore.statistics.retention.period", + "metastore.statistics.retention.period", 365, TimeUnit.DAYS, "The retention period " + + "that we want to keep the stats for each table, which means if the stats are older than this period\n" + + "of time, the stats will be automatically deleted. \n"), + + STATISTICS_AUTO_DELETION("statistics.auto.deletion", "statistics.auto.deletion", true, + "Whether table/partition column statistics will be auto deleted after retention period"), + METASTORE_METADATA_TRANSFORMER_CLASS("metastore.metadata.transformer.class", "metastore.metadata.transformer.class", "org.apache.hadoop.hive.metastore.MetastoreDefaultTransformer", "Fully qualified class name for the metastore metadata transformer class \n" diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java new file mode 100644 index 000000000000..f1c3a98879b3 --- /dev/null +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.metastore; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.common.TableName; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; +import org.apache.hadoop.hive.metastore.api.Partition; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.conf.TimeValidator; +import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; +import org.apache.hadoop.hive.metastore.model.MTable; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; +import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.util.concurrent.ThreadFactoryBuilder; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +/** + * Statistics management task is primarily responsible for auto deletion of table column stats based on a certain frequency + * + * If some table column statistics are older than the period time, they should be deleted automatically + * Statistics Retention - If "partition.retention.period" table property is set with retention interval, when this + * metastore task runs 
periodically, it will drop partitions with age (creation time) greater than retention period. + * Dropping partitions after retention period will also delete the data in that partition. + * + */ +public class StatisticsManagementTask implements MetastoreTaskThread { + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); + + // global + public static final String STATISTICS_AUTO_DELETION = "statistics.auto.deletion"; + public static final String STATISTICS_RETENTION_PERIOD = "statistics.retention.period"; + + // The 2 configs for users to set in the conf + // this is an optional table property, if this property does not exist for a table, then it is not excluded + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = "statistics.auto.deletion.exclude"; + + private static final Lock lock = new ReentrantLock(); + + // these are just for testing + private static int completedAttempts; + private static int skippedAttempts; + + private Configuration conf; + + @Override + public long runFrequency(TimeUnit unit) { + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + } + + @Override + public void setConf(Configuration configuration) { + // we modify conf in setupConf(), so we make a copy + conf = new Configuration(configuration); + } + + @Override + public Configuration getConf() { + return conf; + } + + // what needs to be included in this run() method: + // get the "lastAnalyzed" information from TAB_COL_STATS and find all the tables need to be deleted + // delete all column stats + @Override + public void run() { + if (LOG.isDebugEnabled()) { + LOG.debug("Auto statistics deletion started. Cleaning up table/partition column statistics over the retention period."); + } + long retentionMillis = MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars. 
STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); + if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.STATISTICS_AUTO_DELETION)) { + LOG.info("Statistics auto deletion is set to off currently."); + return; + } + if (lock.tryLock()) { + skippedAttempts = 0; + String qualifiedTableName = null; + IMetaStoreClient msc = null; + try { + // Get retention period in conf in milliseconds; default is 365 days. + long now = System.currentTimeMillis(); + long lastAnalyzedThreshold = (now - retentionMillis) / 1000; + + // Get all databases from metastore + List databases = msc.getAllDatabases(); + RawStore ms = HMSHandler.getMSForConf(conf); + PersistenceManager pm = ((ObjectStore) ms).getPersistenceManager(); + Query q = pm.newQuery("SELECT FROM org.apache.hadoop.hive.metastore.model.MTableColumnStatistics"); + q.setFilter("lastAnalyzed < " + lastAnalyzedThreshold); + List results = (List) q.execute(); + + for (MTableColumnStatistics stat : results) { + String dbName = stat.getTable().getDatabase().getName(); + String tblName = stat.getTable().getTableName(); + Map tblParams = stat.getTable().getParameters(); + if (tblParams != null && tblParams.getOrDefault(STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY, null) != null) { + LOG.info("Skipping table {}.{} due to exclude property.", dbName, tblName); + continue; + } + /** + // if this table contains "lastAnalyzed" in table property, we process the auto stats deletion + long lastAnalyzed = stat.getLastAnalyzed(); + // lastAnalyzed is in unit seconds, switch it to milliseconds + lastAnalyzed *= 1000; + **/ + + DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); + request.setEngine("hive"); + boolean isPartitioned = stat.getTable().getPartitionKeys() != null && !stat.getTable().getPartitionKeys().isEmpty(); + // Delete table-level column statistics + if (!isPartitioned) { + request.setTableLevel(true); + } else { + request.setTableLevel(false); + } + 
msc.deleteColumnStatistics(request); + } + } catch (Exception e) { + LOG.error("Error during statistics auto deletion", e); + } + } + } + +} diff --git a/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/BenchmarkTool.java b/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/BenchmarkTool.java index 551ffabe6b9a..aac06202699d 100644 --- a/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/BenchmarkTool.java +++ b/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/BenchmarkTool.java @@ -70,6 +70,7 @@ import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkOpenTxns; import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkPartitionManagement; import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkRenameTable; +import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkStatisticsManagement; import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkTableCreate; import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkUpdatePartitionsStat; import static org.apache.hadoop.hive.metastore.tools.Util.getServerUri; @@ -303,7 +304,9 @@ private void runNonAcidBenchmarks() { .add("openTxn", () -> benchmarkOpenTxns(bench, bData, 1)) .add("PartitionManagementTask", - () -> benchmarkPartitionManagement(bench, bData, 1)); + () -> benchmarkPartitionManagement(bench, bData, 1)) + .add("StatisticsManagementTask", + () -> benchmarkStatisticsManagement(bench, bData, 1)); for (int howMany: instances) { suite.add("listTables" + '.' + howMany, @@ -345,7 +348,9 @@ private void runNonAcidBenchmarks() { .add("openTxns" + '.' + howMany, () -> benchmarkOpenTxns(bench, bData, howMany)) .add("PartitionManagementTask" + "." 
+ howMany, - () -> benchmarkPartitionManagement(bench, bData, howMany)); + () -> benchmarkPartitionManagement(bench, bData, howMany)) + .add("PartitionStatisticsTask" + "." + howMany, + () -> benchmarkStatisticsManagement(bench, bData, howMany)); } List toRun = suite.listMatching(matches, exclude); diff --git a/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java b/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java index c01200c33be5..0d9d6d2cc0aa 100644 --- a/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java +++ b/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java @@ -22,6 +22,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.metastore.PartitionManagementTask; +import org.apache.hadoop.hive.metastore.StatisticsManagementTask; import org.apache.hadoop.hive.metastore.TableType; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.Partition; @@ -42,6 +43,7 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -695,4 +697,68 @@ static DescriptiveStatistics benchmarkPartitionManagement(@NotNull MicroBenchmar } } + static DescriptiveStatistics benchmarkStatisticsManagement(@NotNull MicroBenchmark bench, + @NotNull BenchData data, + int tableCount) { + + String dbName = data.dbName + "_" + tableCount; + String tableNamePrefix = data.tableName; + final HMSClient client = data.getClient(); + final StatisticsManagementTask statsTask = 
new StatisticsManagementTask(); + final FileSystem fs; + try { + fs = FileSystem.get(client.getHadoopConf()); + client.getHadoopConf().set("hive.metastore.uris", client.getServerURI().toString()); + client.getHadoopConf().set("metastore.statistics.management.database.pattern", dbName); + statsTask.setConf(client.getHadoopConf()); + + client.createDatabase(dbName); + for (int i = 0; i < tableCount; i++) { + String tableName = tableNamePrefix + "_" + i; + Util.TableBuilder tableBuilder = new Util.TableBuilder(dbName, tableName) + .withType(TableType.MANAGED_TABLE) + .withColumns(createSchema(Arrays.asList("col1:string", "col2:int"))) + .withPartitionKeys(createSchema(Collections.singletonList("part_col"))) + .withParameter("columnStatsAccurate", "true"); + + client.createTable(tableBuilder.build()); + addManyPartitionsNoException(client, dbName, tableName, null, Collections.singletonList("part_col"), 100); + + // simulate the partitions of each table which its stats has an old "lastAnalyzed" + List partitions = client.listPartitions(dbName, tableName); + for (Partition partition : partitions) { + Map params = partition.getParameters(); + // to manually change the "lastAnalyzed" to an old time, ex. 
400 days + params.put("lastAnalyzed", String.valueOf(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(400))); + client.alterPartition(dbName, tableName, partition); + } + } + } catch (Exception e) { + throw new RuntimeException(e); + } + + Runnable preRun = () -> { + System.out.println("Preparing for benchmark..."); + }; + + try { + DescriptiveStatistics stats = bench.measure(preRun, statsTask, null); + + // check if the stats are deleted + for (int i = 0; i < tableCount; i++) { + String tableName = tableNamePrefix + "_" + i; + List partitions = client.listPartitions(dbName, tableName); + for (Partition partition : partitions) { + Map params = partition.getParameters(); + if (params.containsKey("lastAnalyzed")) { + throw new AssertionError("Partition stats not deleted for table: " + tableName); + } + } + } + return stats; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } From 94be1ff97a80c9994250899d83a9a5dab8bbb02e Mon Sep 17 00:00:00 2001 From: Hongdan Zhu Date: Thu, 4 Dec 2025 02:34:35 -0800 Subject: [PATCH 2/6] HIVE-28755: commit 2 --- .../hive/metastore/conf/MetastoreConf.java | 5 +- .../metastore/StatisticsManagementTask.java | 60 +++++-------------- 2 files changed, 19 insertions(+), 46 deletions(-) diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java index 0da9dcb8600f..71949951e24d 100644 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java +++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java @@ -1293,7 +1293,7 @@ public enum ConfVars { STATISTICS_MANAGEMENT_TASK_FREQUENCY("metastore.statistics.management.task.frequency", "metastore.statistics.management.task.frequency", - 365, TimeUnit.DAYS, "Frequency at which timer task runs to do 
automatic statistics management for tables\n" + + 7, TimeUnit.DAYS, "Frequency at which timer task runs to do automatic statistics management for tables\n" + "with table property 'statistics.auto.deletion'='true'. Statistics management include 2 configs. \n" + "One is 'statistics.auto.deletion', and the other is 'statistics.retention.period'. \n" + "When 'statistics.auto.deletion'='true' is set, statistics management will look for tables which their\n " + @@ -1541,7 +1541,8 @@ public enum ConfVars { ACID_METRICS_TASK_CLASS + "," + ACID_METRICS_LOGGER_CLASS + "," + "org.apache.hadoop.hive.metastore.HiveProtoEventsCleanerTask" + "," + "org.apache.hadoop.hive.metastore.ScheduledQueryExecutionsMaintTask" + "," - + "org.apache.hadoop.hive.metastore.ReplicationMetricsMaintTask", + + "org.apache.hadoop.hive.metastore.ReplicationMetricsMaintTask" + "," + + "org.apache.hadoop.hive.metastore.StatisticsManagementTask", "Comma separated list of tasks that will be started in separate threads. These will " + "always be started, regardless of whether the metastore is running in embedded mode " + "or in server mode. 
They must implement " + METASTORE_TASK_THREAD_CLASS), diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java index f1c3a98879b3..b6e3040bb2c8 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java @@ -18,67 +18,37 @@ package org.apache.hadoop.hive.metastore; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.common.TableName; -import org.apache.hadoop.hive.metastore.api.Database; import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; -import org.apache.hadoop.hive.metastore.api.Partition; -import org.apache.hadoop.hive.metastore.api.Table; -import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.metastore.conf.MetastoreConf; -import org.apache.hadoop.hive.metastore.conf.TimeValidator; -import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; -import org.apache.hadoop.hive.metastore.model.MTable; import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; -import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.annotations.VisibleForTesting; -import 
com.google.common.util.concurrent.ThreadFactoryBuilder; - import javax.jdo.PersistenceManager; import javax.jdo.Query; /** * Statistics management task is primarily responsible for auto deletion of table column stats based on a certain frequency * - * If some table column statistics are older than the period time, they should be deleted automatically - * Statistics Retention - If "partition.retention.period" table property is set with retention interval, when this - * metastore task runs periodically, it will drop partitions with age (creation time) greater than retention period. - * Dropping partitions after retention period will also delete the data in that partition. - * + * If some table or partition column statistics are older than the configured retention interval + * (MetastoreConf.ConfVars.STATISTICS_RETENTION_PERIOD), they are deleted when this metastore task runs periodically. */ public class StatisticsManagementTask implements MetastoreTaskThread { private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); - // global - public static final String STATISTICS_AUTO_DELETION = "statistics.auto.deletion"; - public static final String STATISTICS_RETENTION_PERIOD = "statistics.retention.period"; - // The 2 configs for users to set in the conf // this is an optional table property, if this property does not exist for a table, then it is not excluded public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = "statistics.auto.deletion.exclude"; private static final Lock lock = new ReentrantLock(); - // these are just for testing - private static int completedAttempts; - private static int skippedAttempts; - private Configuration conf; @Override @@ -111,21 +81,24 @@ public void run() { return; } if (lock.tryLock()) { - skippedAttempts = 0; - String qualifiedTableName = null; - IMetaStoreClient msc = null; + IMetaStoreClient msc; try { // Get retention period in conf in milliseconds; default is 365 days. 
long now = System.currentTimeMillis(); long lastAnalyzedThreshold = (now - retentionMillis) / 1000; + String filter = "lastAnalyzed < threshold"; + String paramStr = "long threshold"; // Get all databases from metastore - List databases = msc.getAllDatabases(); + msc = new HiveMetaStoreClient(conf); RawStore ms = HMSHandler.getMSForConf(conf); PersistenceManager pm = ((ObjectStore) ms).getPersistenceManager(); - Query q = pm.newQuery("SELECT FROM org.apache.hadoop.hive.metastore.model.MTableColumnStatistics"); - q.setFilter("lastAnalyzed < " + lastAnalyzedThreshold); - List results = (List) q.execute(); + Query q = pm.newQuery(MTableColumnStatistics.class); + q.setFilter(filter); + q.declareParameters(paramStr); + @SuppressWarnings("unchecked") + List results = + (List) q.execute(lastAnalyzedThreshold); for (MTableColumnStatistics stat : results) { String dbName = stat.getTable().getDatabase().getName(); @@ -136,12 +109,11 @@ public void run() { continue; } /** - // if this table contains "lastAnalyzed" in table property, we process the auto stats deletion - long lastAnalyzed = stat.getLastAnalyzed(); - // lastAnalyzed is in unit seconds, switch it to milliseconds - lastAnalyzed *= 1000; + * if this table contains "lastAnalyzed" in table property, we process the auto stats deletion + * long lastAnalyzed = stat.getLastAnalyzed(); + * lastAnalyzed is in unit seconds, switch it to milliseconds + * lastAnalyzed *= 1000; **/ - DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); request.setEngine("hive"); boolean isPartitioned = stat.getTable().getPartitionKeys() != null && !stat.getTable().getPartitionKeys().isEmpty(); From 1193c1119ea341df67c5d9ce95d98325257b6abc Mon Sep 17 00:00:00 2001 From: Hongdan Zhu Date: Mon, 12 Jan 2026 14:07:49 -0800 Subject: [PATCH 3/6] HIVE-28755: commit 3 after review comments --- .../metastore/StatisticsManagementTask.java | 99 ++++++++++--------- .../hive/metastore/tools/HMSBenchmarks.java | 8 +- 2 
files changed, 55 insertions(+), 52 deletions(-) diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java index b6e3040bb2c8..becb4b4cfef6 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java @@ -59,7 +59,7 @@ public long runFrequency(TimeUnit unit) { @Override public void setConf(Configuration configuration) { // we modify conf in setupConf(), so we make a copy - conf = new Configuration(configuration); + this.conf = configuration; } @Override @@ -72,62 +72,69 @@ public Configuration getConf() { // delete all column stats @Override public void run() { - if (LOG.isDebugEnabled()) { - LOG.debug("Auto statistics deletion started. Cleaning up table/partition column statistics over the retention period."); - } + LOG.debug("Auto statistics deletion started. Cleaning up table/partition column statistics over the retention period."); long retentionMillis = MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars. STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.STATISTICS_AUTO_DELETION)) { LOG.info("Statistics auto deletion is set to off currently."); return; } - if (lock.tryLock()) { - IMetaStoreClient msc; - try { - // Get retention period in conf in milliseconds; default is 365 days. 
- long now = System.currentTimeMillis(); - long lastAnalyzedThreshold = (now - retentionMillis) / 1000; - String filter = "lastAnalyzed < threshold"; - String paramStr = "long threshold"; - - // Get all databases from metastore - msc = new HiveMetaStoreClient(conf); + if (!lock.tryLock()) { + return; + } + try { + long now = System.currentTimeMillis(); + long lastAnalyzedThreshold = (now - retentionMillis) / 1000; + + String filter = "lastAnalyzed < threshold"; + String paramStr = "long threshold"; + try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { RawStore ms = HMSHandler.getMSForConf(conf); PersistenceManager pm = ((ObjectStore) ms).getPersistenceManager(); - Query q = pm.newQuery(MTableColumnStatistics.class); - q.setFilter(filter); - q.declareParameters(paramStr); - @SuppressWarnings("unchecked") - List results = - (List) q.execute(lastAnalyzedThreshold); - - for (MTableColumnStatistics stat : results) { - String dbName = stat.getTable().getDatabase().getName(); - String tblName = stat.getTable().getTableName(); - Map tblParams = stat.getTable().getParameters(); - if (tblParams != null && tblParams.getOrDefault(STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY, null) != null) { - LOG.info("Skipping table {}.{} due to exclude property.", dbName, tblName); - continue; + + Query q = null; + try { + q = pm.newQuery(MTableColumnStatistics.class); + q.setFilter(filter); + q.declareParameters(paramStr); + // only fetch required fields, avoid loading heavy MTable objects + q.setResult( + "table.database.name, " + + "table.tableName, " + + "partitionName, " + + "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")" + ); + @SuppressWarnings("unchecked") + List rows = (List) q.execute(lastAnalyzedThreshold); + + for (Object[] row : rows) { + String dbName = (String) row[0]; + String tblName = (String) row[1]; + String partName = (String) row[2]; // can be null for table-level stats + String excludeVal = (String) row[3]; // can be null + + 
// exclude check uses projected param value + if (excludeVal != null) { + LOG.info("Skipping auto deletion of stats for table {}.{} due to STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY property being set on the table.", dbName, tblName); + continue; + } + DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); + request.setEngine("hive"); + + // decide tableLevel based on whether this stat row is table-level or partition-level + // avoids loading table partition keys / MTable + request.setTableLevel(partName == null); + msc.deleteColumnStatistics(request); } - /** - * if this table contains "lastAnalyzed" in table property, we process the auto stats deletion - * long lastAnalyzed = stat.getLastAnalyzed(); - * lastAnalyzed is in unit seconds, switch it to milliseconds - * lastAnalyzed *= 1000; - **/ - DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); - request.setEngine("hive"); - boolean isPartitioned = stat.getTable().getPartitionKeys() != null && !stat.getTable().getPartitionKeys().isEmpty(); - // Delete table-level column statistics - if (!isPartitioned) { - request.setTableLevel(true); - } else { - request.setTableLevel(false); + } finally { + if (q != null) { + q.closeAll(); } - msc.deleteColumnStatistics(request); } - } catch (Exception e) { - LOG.error("Error during statistics auto deletion", e); } + } catch (Exception e) { + LOG.error("Error during statistics auto deletion", e); + } finally { + lock.unlock(); } } diff --git a/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java b/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java index 0d9d6d2cc0aa..e49c2ca7ee46 100644 --- a/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java +++ 
b/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java @@ -705,9 +705,7 @@ static DescriptiveStatistics benchmarkStatisticsManagement(@NotNull MicroBenchma String tableNamePrefix = data.tableName; final HMSClient client = data.getClient(); final StatisticsManagementTask statsTask = new StatisticsManagementTask(); - final FileSystem fs; try { - fs = FileSystem.get(client.getHadoopConf()); client.getHadoopConf().set("hive.metastore.uris", client.getServerURI().toString()); client.getHadoopConf().set("metastore.statistics.management.database.pattern", dbName); statsTask.setConf(client.getHadoopConf()); @@ -730,16 +728,14 @@ static DescriptiveStatistics benchmarkStatisticsManagement(@NotNull MicroBenchma Map params = partition.getParameters(); // to manually change the "lastAnalyzed" to an old time, ex. 400 days params.put("lastAnalyzed", String.valueOf(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(400))); - client.alterPartition(dbName, tableName, partition); } + client.alterPartitions(dbName, tableName, partitions); } } catch (Exception e) { throw new RuntimeException(e); } - Runnable preRun = () -> { - System.out.println("Preparing for benchmark..."); - }; + Runnable preRun = () -> LOG.debug("Preparing for benchmark..."); try { DescriptiveStatistics stats = bench.measure(preRun, statsTask, null); From 0d05096beec9d10c7f6147e910b9cff7da21c64b Mon Sep 17 00:00:00 2001 From: Hongdan Zhu Date: Thu, 22 Jan 2026 16:49:34 -0800 Subject: [PATCH 4/6] HIVE-28755: general tests added --- .../metastore/TestStatisticsManagement.java | 239 ++++++++++++++++++ 1 file changed, 239 insertions(+) create mode 100644 standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java 
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java new file mode 100644 index 000000000000..397455551473 --- /dev/null +++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.metastore; + +import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_CATALOG_NAME; +import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_DATABASE_NAME; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest; +import org.apache.hadoop.hive.metastore.api.ColumnStatistics; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.Database; +import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; +import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.client.builder.DatabaseBuilder; +import org.apache.hadoop.hive.metastore.client.builder.TableBuilder; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars; +import org.apache.hadoop.hive.metastore.security.HadoopThriftAuthBridge; +import org.apache.hadoop.hive.metastore.utils.TestTxnDbUtil; +import org.apache.thrift.TException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import javax.jdo.PersistenceManager; +import javax.jdo.Query; + +@Category(MetastoreUnitTest.class) 
+public class TestStatisticsManagement { + + private IMetaStoreClient client; + private Configuration conf; + + @Before + public void setUp() throws Exception { + conf = MetastoreConf.newMetastoreConf(); + MetastoreConf.setVar(conf, ConfVars.METASTORE_METADATA_TRANSFORMER_CLASS, " "); + MetaStoreTestUtils.setConfForStandloneMode(conf); + conf.setBoolean(ConfVars.MULTITHREADED.getVarname(), false); + conf.setBoolean(ConfVars.HIVE_IN_TEST.getVarname(), true); + + // enable stats auto deletion, set up a short retention so threshold check triggers easily + MetastoreConf.setBoolVar(conf, ConfVars.STATISTICS_AUTO_DELETION, true); + MetastoreConf.setTimeVar(conf, ConfVars.STATISTICS_RETENTION_PERIOD, 1, TimeUnit.DAYS); + + MetaStoreTestUtils.startMetaStoreWithRetry(HadoopThriftAuthBridge.getBridge(), conf); + TestTxnDbUtil.setConfValues(conf); + TestTxnDbUtil.prepDb(conf); + + client = new HiveMetaStoreClient(conf); + } + + @After + public void tearDown() throws Exception { + if (client != null) { + // Drop any leftover databases, similar to TestPartitionManagement.java + List dbs = client.getAllDatabases(DEFAULT_CATALOG_NAME); + for (String db : dbs) { + if (!db.equalsIgnoreCase(DEFAULT_DATABASE_NAME)) { + client.dropDatabase(DEFAULT_CATALOG_NAME, db, true, false, true); + } + } + } + try { + if (client != null) { + client.close(); + } + } finally { + client = null; + } + } + + @Test + public void testExpiredTableColStatsAreDeleted() throws Exception { + String dbName = "stats_db1"; + String tableName = "tbl1"; + createDbAndTable(dbName, tableName, false); + // create a column stats entry (table-level) + writeTableLevelColStats(dbName, tableName, "c1"); + // ensure stats exists + assertHasTableColStats(dbName, tableName, "c1"); + // make lastAnalyzed older than the threshold + makeAllTableColStatsOlderThanRetention(dbName, tableName); + + runStatisticsManagementTask(conf); + assertNoTableColStats(dbName, tableName, "c1"); + } + + @Test + public void 
testExcludedTableStatsAreNotDeleted() throws Exception { + String dbName = "stats_db2"; + String tableName = "tbl2"; + // Create a database and table that ARE excluded from auto deletion. + createDbAndTable(dbName, tableName, true); + writeTableLevelColStats(dbName, tableName, "c1"); + assertHasTableColStats(dbName, tableName, "c1"); + + // Manually set lastAnalyzed to a very old timestamp so it would normally be expired. + makeAllTableColStatsOlderThanRetention(dbName, tableName); + + runStatisticsManagementTask(conf); + + // Verify that stats are still present because the table is excluded. + assertHasTableColStats(dbName, tableName, "c1"); + } + + private void createDbAndTable(String dbName, String tableName, boolean exclude) throws Exception { + Database db; + if (!DEFAULT_DATABASE_NAME.equals(dbName)) { + db = new DatabaseBuilder() + .setName(dbName) + .setCatalogName(DEFAULT_CATALOG_NAME) + .create(client, conf); + } else { + db = client.getDatabase(DEFAULT_CATALOG_NAME, DEFAULT_DATABASE_NAME); + } + + // Build a simple test table with two columns. + TableBuilder tb = new TableBuilder() + .inDb(db) + .setTableName(tableName) + .addCol("c1", "double") + .addCol("c2", "string") + .setInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat") + .setOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"); + + Table t = tb.build(conf); + + // If requested, mark this table as excluded from automatic stats deletion. + if (exclude) { + t.getParameters().put(StatisticsManagementTask.STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY, "true"); + } + client.createTable(t); + client.flushCache(); + } + + private void writeTableLevelColStats(String db, String tbl, String col) throws TException { + // Create a stats object for one column. + ColumnStatisticsObj obj = new ColumnStatisticsObj(); + obj.setColName(col); + obj.setColType("double"); + + // Fill in minimal double-column statistics data. 
+ DoubleColumnStatsData doubleData = new DoubleColumnStatsData(); + doubleData.setNumNulls(0); + doubleData.setNumDVs(1); + doubleData.setLowValue(1.0); + doubleData.setHighValue(1.0); + + ColumnStatisticsData data = new ColumnStatisticsData(); + data.setDoubleStats(doubleData); + obj.setStatsData(data); + + ColumnStatistics cs = new ColumnStatistics(); + ColumnStatisticsDesc desc = new ColumnStatisticsDesc(true, db, tbl); + desc.setCatName("hive"); + cs.addToStatsObj(obj); + + client.updateTableColumnStatistics(cs); + } + + private void assertHasTableColStats(String db, String tbl, String col) throws TException { + List objs = client.getTableColumnStatistics(db, tbl, List.of(col), "hive"); + assertTrue("Expected stats for " + db + "." + tbl + "." + col, objs != null && !objs.isEmpty()); + } + + private void assertNoTableColStats(String db, String tbl, String col) throws TException { + try { + List objs = client.getTableColumnStatistics(db, tbl, List.of(col), "hive"); + assertTrue("Expected no stats for " + db + "." + tbl + "." + col, objs == null || objs.isEmpty()); + } catch (NoSuchObjectException e) { + // acceptable: server may throw if stats absent depending on impl + } + } + + private void makeAllTableColStatsOlderThanRetention(String db, String tbl) throws Exception { + // We update via ObjectStore/PM directly to avoid relying on params "lastAnalyzed". + RawStore ms = HMSHandler.getMSForConf(conf); + ObjectStore os = (ObjectStore) ms; + os.setConf(conf); + + // Compute an old timestamp in seconds, here we use 400 days ago. + long oldSeconds = (System.currentTimeMillis() - TimeUnit.DAYS.toMillis(400)) / 1000; + + // NOTE: exact JDO classes/field paths sometimes vary; adjust filter if needed based on MTableColumnStatistics mapping + PersistenceManager pm = os.getPersistenceManager(); + Query q = null; + try { + // Query MTableColumnStatistics rows for the target table. 
+ q = pm.newQuery(org.apache.hadoop.hive.metastore.model.MTableColumnStatistics.class); + q.setFilter("table.tableName == t && table.database.name == d"); + q.declareParameters("java.lang.String t, java.lang.String d"); + @SuppressWarnings("unchecked") + List rows = + (List) q.execute(tbl, db); + // Make all matching column stats rows to look old/expired. + for (org.apache.hadoop.hive.metastore.model.MTableColumnStatistics r : rows) { + r.setLastAnalyzed(oldSeconds); + } + pm.flush(); + } finally { + if (q != null) { + q.closeAll(); + } + pm.close(); + } + } + + private void runStatisticsManagementTask(Configuration conf) { + StatisticsManagementTask task = new StatisticsManagementTask(); + task.setConf(conf); + task.run(); + } +} From ddcf54bba31bcae4cbc9362d5e14dd765d902ff2 Mon Sep 17 00:00:00 2001 From: Hongdan Zhu Date: Wed, 6 May 2026 02:40:29 -0700 Subject: [PATCH 5/6] HIVE-28755: final changes --- .../hive/metastore/conf/MetastoreConf.java | 8 +-- .../metastore/StatisticsManagementTask.java | 54 ++++++++-------- .../metastore/TestStatisticsManagement.java | 1 + .../hive/metastore/tools/BenchmarkTool.java | 9 +-- .../hive/metastore/tools/HMSBenchmarks.java | 62 ------------------- 5 files changed, 35 insertions(+), 99 deletions(-) diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java index 71949951e24d..84bab95653eb 100644 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java +++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java @@ -1293,9 +1293,9 @@ public enum ConfVars { STATISTICS_MANAGEMENT_TASK_FREQUENCY("metastore.statistics.management.task.frequency", "metastore.statistics.management.task.frequency", - 7, TimeUnit.DAYS, "Frequency at which timer task runs 
to do automatic statistics management for tables\n" + - "with table property 'statistics.auto.deletion'='true'. Statistics management include 2 configs. \n" + - "One is 'statistics.auto.deletion', and the other is 'statistics.retention.period'. \n" + + 7, TimeUnit.DAYS, "Frequency at which timer task runs to do automatic statistics \n" + + "management for tables. Statistics management include 2 configs. \n" + + "One is 'statistics.auto.deletion', and the other is 'metastore.statistics.retention.period'. \n" + "When 'statistics.auto.deletion'='true' is set, statistics management will look for tables which their\n " + "column statistics are over the retention period, and then delete the column stats. \n"), STATISTICS_RETENTION_PERIOD("metastore.statistics.retention.period", @@ -1303,7 +1303,7 @@ public enum ConfVars { "that we want to keep the stats for each table, which means if the stats are older than this period\n" + "of time, the stats will be automatically deleted. \n"), - STATISTICS_AUTO_DELETION("statistics.auto.deletion", "statistics.auto.deletion", true, + STATISTICS_AUTO_DELETION("metastore.statistics.auto.deletion", "metastore.statistics.auto.deletion", false, "Whether table/partition column statistics will be auto deleted after retention period"), METASTORE_METADATA_TRANSFORMER_CLASS("metastore.metadata.transformer.class", "metastore.metadata.transformer.class", diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java index becb4b4cfef6..077c0aa8a44e 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.metastore; import 
java.util.List; -import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; @@ -40,7 +39,7 @@ * If some table or partition column statistics are older than the configured retention interval * (MetastoreConf.ConfVars.STATISTICS_RETENTION_PERIOD), they are deleted when this metastore task runs periodically. */ -public class StatisticsManagementTask implements MetastoreTaskThread { +public class StatisticsManagementTask extends ObjectStore implements MetastoreTaskThread { private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); // The 2 configs for users to set in the conf @@ -59,7 +58,8 @@ public long runFrequency(TimeUnit unit) { @Override public void setConf(Configuration configuration) { // we modify conf in setupConf(), so we make a copy - this.conf = configuration; + this.conf = new Configuration(configuration); + super.setConf(configuration); } @Override @@ -73,7 +73,7 @@ public Configuration getConf() { @Override public void run() { LOG.debug("Auto statistics deletion started. Cleaning up table/partition column statistics over the retention period."); - long retentionMillis = MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars. 
STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); + long retentionMillis = MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.STATISTICS_AUTO_DELETION)) { LOG.info("Statistics auto deletion is set to off currently."); return; @@ -87,16 +87,16 @@ public void run() { String filter = "lastAnalyzed < threshold"; String paramStr = "long threshold"; - try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { - RawStore ms = HMSHandler.getMSForConf(conf); - PersistenceManager pm = ((ObjectStore) ms).getPersistenceManager(); + PersistenceManager pm = getPersistenceManager(); + boolean committed = false; + openTransaction(); // open JDO transaction + try { Query q = null; try { q = pm.newQuery(MTableColumnStatistics.class); q.setFilter(filter); q.declareParameters(paramStr); - // only fetch required fields, avoid loading heavy MTable objects q.setResult( "table.database.name, " + "table.tableName, " + @@ -106,30 +106,33 @@ public void run() { @SuppressWarnings("unchecked") List rows = (List) q.execute(lastAnalyzedThreshold); - for (Object[] row : rows) { - String dbName = (String) row[0]; - String tblName = (String) row[1]; - String partName = (String) row[2]; // can be null for table-level stats - String excludeVal = (String) row[3]; // can be null - - // exclude check uses projected param value - if (excludeVal != null) { - LOG.info("Skipping auto deletion of stats for table {}.{} due to STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY property being set on the table.", dbName, tblName); - continue; + try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { + for (Object[] row : rows) { + String dbName = (String) row[0]; + String tblName = (String) row[1]; + String partName = (String) row[2]; + String excludeVal = (String) row[3]; + + if (excludeVal != null) { + LOG.info("Skipping auto deletion of stats for table {}.{} due to 
STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY property being set on the table.", dbName, tblName); + continue; + } + DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); + request.setEngine("hive"); + request.setTableLevel(partName == null); + msc.deleteColumnStatistics(request); } - DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); - request.setEngine("hive"); - - // decide tableLevel based on whether this stat row is table-level or partition-level - // avoids loading table partition keys / MTable - request.setTableLevel(partName == null); - msc.deleteColumnStatistics(request); } } finally { if (q != null) { q.closeAll(); } } + committed = commitTransaction(); + } finally { + if (!committed) { + rollbackTransaction(); + } } } catch (Exception e) { LOG.error("Error during statistics auto deletion", e); @@ -137,5 +140,4 @@ public void run() { lock.unlock(); } } - } diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java index 397455551473..e1ba5b1f84cb 100644 --- a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java +++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java @@ -179,6 +179,7 @@ private void writeTableLevelColStats(String db, String tbl, String col) throws T ColumnStatistics cs = new ColumnStatistics(); ColumnStatisticsDesc desc = new ColumnStatisticsDesc(true, db, tbl); desc.setCatName("hive"); + cs.setStatsDesc(desc); cs.addToStatsObj(obj); client.updateTableColumnStatistics(cs); diff --git a/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/BenchmarkTool.java 
b/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/BenchmarkTool.java index aac06202699d..551ffabe6b9a 100644 --- a/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/BenchmarkTool.java +++ b/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/BenchmarkTool.java @@ -70,7 +70,6 @@ import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkOpenTxns; import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkPartitionManagement; import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkRenameTable; -import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkStatisticsManagement; import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkTableCreate; import static org.apache.hadoop.hive.metastore.tools.HMSBenchmarks.benchmarkUpdatePartitionsStat; import static org.apache.hadoop.hive.metastore.tools.Util.getServerUri; @@ -304,9 +303,7 @@ private void runNonAcidBenchmarks() { .add("openTxn", () -> benchmarkOpenTxns(bench, bData, 1)) .add("PartitionManagementTask", - () -> benchmarkPartitionManagement(bench, bData, 1)) - .add("StatisticsManagementTask", - () -> benchmarkStatisticsManagement(bench, bData, 1)); + () -> benchmarkPartitionManagement(bench, bData, 1)); for (int howMany: instances) { suite.add("listTables" + '.' + howMany, @@ -348,9 +345,7 @@ private void runNonAcidBenchmarks() { .add("openTxns" + '.' + howMany, () -> benchmarkOpenTxns(bench, bData, howMany)) .add("PartitionManagementTask" + "." + howMany, - () -> benchmarkPartitionManagement(bench, bData, howMany)) - .add("PartitionStatisticsTask" + "." 
+ howMany, - () -> benchmarkStatisticsManagement(bench, bData, howMany)); + () -> benchmarkPartitionManagement(bench, bData, howMany)); } List toRun = suite.listMatching(matches, exclude); diff --git a/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java b/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java index e49c2ca7ee46..c01200c33be5 100644 --- a/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java +++ b/standalone-metastore/metastore-tools/metastore-benchmarks/src/main/java/org/apache/hadoop/hive/metastore/tools/HMSBenchmarks.java @@ -22,7 +22,6 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.metastore.PartitionManagementTask; -import org.apache.hadoop.hive.metastore.StatisticsManagementTask; import org.apache.hadoop.hive.metastore.TableType; import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.Partition; @@ -43,7 +42,6 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -697,64 +695,4 @@ static DescriptiveStatistics benchmarkPartitionManagement(@NotNull MicroBenchmar } } - static DescriptiveStatistics benchmarkStatisticsManagement(@NotNull MicroBenchmark bench, - @NotNull BenchData data, - int tableCount) { - - String dbName = data.dbName + "_" + tableCount; - String tableNamePrefix = data.tableName; - final HMSClient client = data.getClient(); - final StatisticsManagementTask statsTask = new StatisticsManagementTask(); - try { - client.getHadoopConf().set("hive.metastore.uris", 
client.getServerURI().toString()); - client.getHadoopConf().set("metastore.statistics.management.database.pattern", dbName); - statsTask.setConf(client.getHadoopConf()); - - client.createDatabase(dbName); - for (int i = 0; i < tableCount; i++) { - String tableName = tableNamePrefix + "_" + i; - Util.TableBuilder tableBuilder = new Util.TableBuilder(dbName, tableName) - .withType(TableType.MANAGED_TABLE) - .withColumns(createSchema(Arrays.asList("col1:string", "col2:int"))) - .withPartitionKeys(createSchema(Collections.singletonList("part_col"))) - .withParameter("columnStatsAccurate", "true"); - - client.createTable(tableBuilder.build()); - addManyPartitionsNoException(client, dbName, tableName, null, Collections.singletonList("part_col"), 100); - - // simulate the partitions of each table which its stats has an old "lastAnalyzed" - List partitions = client.listPartitions(dbName, tableName); - for (Partition partition : partitions) { - Map params = partition.getParameters(); - // to manually change the "lastAnalyzed" to an old time, ex. 
400 days - params.put("lastAnalyzed", String.valueOf(System.currentTimeMillis() - TimeUnit.DAYS.toMillis(400))); - } - client.alterPartitions(dbName, tableName, partitions); - } - } catch (Exception e) { - throw new RuntimeException(e); - } - - Runnable preRun = () -> LOG.debug("Preparing for benchmark..."); - - try { - DescriptiveStatistics stats = bench.measure(preRun, statsTask, null); - - // check if the stats are deleted - for (int i = 0; i < tableCount; i++) { - String tableName = tableNamePrefix + "_" + i; - List partitions = client.listPartitions(dbName, tableName); - for (Partition partition : partitions) { - Map params = partition.getParameters(); - if (params.containsKey("lastAnalyzed")) { - throw new AssertionError("Partition stats not deleted for table: " + tableName); - } - } - } - return stats; - } catch (Exception e) { - throw new RuntimeException(e); - } - } - } From 73d7606dacd0f5162cc74975c84609d00b0aa1da Mon Sep 17 00:00:00 2001 From: Hongdan Zhu Date: Thu, 7 May 2026 01:26:56 -0700 Subject: [PATCH 6/6] HIVE-28755: new changes based on reviews --- .../hive/metastore/conf/MetastoreConf.java | 15 +- .../metastore/StatisticsManagementTask.java | 175 ++++++++++++------ .../metastore/TestStatisticsManagement.java | 104 +++++++---- 3 files changed, 195 insertions(+), 99 deletions(-) diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java index 84bab95653eb..e0acafcfa7c9 100644 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java +++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java @@ -1291,19 +1291,20 @@ public enum ConfVars { "metastore.partition.management.table.pattern", "*", "Automatic partition management will look for tables using the specified table 
pattern"), - STATISTICS_MANAGEMENT_TASK_FREQUENCY("metastore.statistics.management.task.frequency", - "metastore.statistics.management.task.frequency", + COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY("metastore.column.statistics.management.task.frequency", + "metastore.column.statistics.management.task.frequency", 7, TimeUnit.DAYS, "Frequency at which timer task runs to do automatic statistics \n" + "management for tables. Statistics management include 2 configs. \n" + - "One is 'statistics.auto.deletion', and the other is 'metastore.statistics.retention.period'. \n" + - "When 'statistics.auto.deletion'='true' is set, statistics management will look for tables which their\n " + + "One is 'metastore.column.statistics.auto.deletion', and the other is 'metastore.column.statistics.retention.period'. \n" + + "When 'metastore.column.statistics.auto.deletion'='true' is set, statistics management will look for tables which their\n" + "column statistics are over the retention period, and then delete the column stats. \n"), - STATISTICS_RETENTION_PERIOD("metastore.statistics.retention.period", - "metastore.statistics.retention.period", 365, TimeUnit.DAYS, "The retention period " + + + COLUMN_STATISTICS_RETENTION_PERIOD("metastore.column.statistics.retention.period", + "metastore.column.statistics.retention.period", 365, TimeUnit.DAYS, "The retention period " + "that we want to keep the stats for each table, which means if the stats are older than this period\n" + "of time, the stats will be automatically deleted. 
\n"), - STATISTICS_AUTO_DELETION("metastore.statistics.auto.deletion", "metastore.statistics.auto.deletion", false, + COLUMN_STATISTICS_AUTO_DELETION("metastore.column.statistics.auto.deletion", "metastore.column.statistics.auto.deletion", false, "Whether table/partition column statistics will be auto deleted after retention period"), METASTORE_METADATA_TRANSFORMER_CLASS("metastore.metadata.transformer.class", "metastore.metadata.transformer.class", diff --git a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java index 077c0aa8a44e..05abf455d06a 100644 --- a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java +++ b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/StatisticsManagementTask.java @@ -26,6 +26,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.metastore.api.DeleteColumnStatisticsRequest; import org.apache.hadoop.hive.metastore.conf.MetastoreConf; +import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,30 +35,37 @@ import javax.jdo.Query; /** - * Statistics management task is primarily responsible for auto deletion of table column stats based on a certain frequency + * Statistics management task responsible for periodic auto-deletion of table and partition column + * statistics based on a configured retention interval. * - * If some table or partition column statistics are older than the configured retention interval - * (MetastoreConf.ConfVars.STATISTICS_RETENTION_PERIOD), they are deleted when this metastore task runs periodically. + *

When {@code metastore.column.statistics.auto.deletion} is enabled, this task scans + * {@code TAB_COL_STATS} and {@code PART_COL_STATS} for rows whose {@code lastAnalyzed} timestamp + * is older than {@code metastore.column.statistics.retention.period}, and deletes them. + * Individual tables may opt out by setting the table property + * {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} to any non-null value. + */ public class StatisticsManagementTask extends ObjectStore implements MetastoreTaskThread { + private static final Logger LOG = LoggerFactory.getLogger(StatisticsManagementTask.class); - // The 2 configs for users to set in the conf - // this is an optional table property, if this property does not exist for a table, then it is not excluded - public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = "statistics.auto.deletion.exclude"; + /** + * Table property key that, when present on a table, excludes it from automatic statistics + * deletion regardless of the global retention setting. 
+ */ + public static final String STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY = + "statistics.auto.deletion.exclude"; - private static final Lock lock = new ReentrantLock(); + private static final Lock LOCK = new ReentrantLock(); private Configuration conf; @Override public long runFrequency(TimeUnit unit) { - return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); + return MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_MANAGEMENT_TASK_FREQUENCY, unit); } @Override public void setConf(Configuration configuration) { - // we modify conf in setupConf(), so we make a copy this.conf = new Configuration(configuration); super.setConf(configuration); } @@ -67,66 +75,28 @@ public Configuration getConf() { return conf; } - // what needs to be included in this run() method: - // get the "lastAnalyzed" information from TAB_COL_STATS and find all the tables need to be deleted - // delete all column stats @Override public void run() { LOG.debug("Auto statistics deletion started. 
Cleaning up table/partition column statistics over the retention period."); - long retentionMillis = MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); - if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.STATISTICS_AUTO_DELETION)) { + long retentionMillis = + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, TimeUnit.MILLISECONDS); + if (retentionMillis <= 0 || !MetastoreConf.getBoolVar(conf, MetastoreConf.ConfVars.COLUMN_STATISTICS_AUTO_DELETION)) { LOG.info("Statistics auto deletion is set to off currently."); return; } - if (!lock.tryLock()) { + if (!LOCK.tryLock()) { return; } try { long now = System.currentTimeMillis(); long lastAnalyzedThreshold = (now - retentionMillis) / 1000; - - String filter = "lastAnalyzed < threshold"; - String paramStr = "long threshold"; - PersistenceManager pm = getPersistenceManager(); boolean committed = false; - openTransaction(); // open JDO transaction + openTransaction(); try { - Query q = null; - try { - q = pm.newQuery(MTableColumnStatistics.class); - q.setFilter(filter); - q.declareParameters(paramStr); - q.setResult( - "table.database.name, " + - "table.tableName, " + - "partitionName, " + - "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")" - ); - @SuppressWarnings("unchecked") - List rows = (List) q.execute(lastAnalyzedThreshold); - - try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { - for (Object[] row : rows) { - String dbName = (String) row[0]; - String tblName = (String) row[1]; - String partName = (String) row[2]; - String excludeVal = (String) row[3]; - - if (excludeVal != null) { - LOG.info("Skipping auto deletion of stats for table {}.{} due to STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY property being set on the table.", dbName, tblName); - continue; - } - DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, 
tblName); - request.setEngine("hive"); - request.setTableLevel(partName == null); - msc.deleteColumnStatistics(request); - } - } - } finally { - if (q != null) { - q.closeAll(); - } + try (IMetaStoreClient msc = new HiveMetaStoreClient(conf)) { + deleteExpiredTableColStats(pm, msc, lastAnalyzedThreshold); + deleteExpiredPartitionColStats(pm, msc, lastAnalyzedThreshold); } committed = commitTransaction(); } finally { @@ -137,7 +107,98 @@ public void run() { } catch (Exception e) { LOG.error("Error during statistics auto deletion", e); } finally { - lock.unlock(); + LOCK.unlock(); + } + } + + /** + * Deletes expired table-level column statistics from {@code TAB_COL_STATS}. + * Tables with the {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} property set are skipped. + * + * @param pm the JDO persistence manager to use for the query + * @param msc the metastore client used to issue delete requests + * @param lastAnalyzedThreshold epoch seconds; rows with lastAnalyzed below this value are expired + * @throws Exception if the JDO query or the delete request fails + */ + private void deleteExpiredTableColStats(PersistenceManager pm, IMetaStoreClient msc, + long lastAnalyzedThreshold) throws Exception { + Query tblQuery = null; + try { + tblQuery = pm.newQuery(MTableColumnStatistics.class); + tblQuery.setFilter("lastAnalyzed < threshold"); + tblQuery.declareParameters("long threshold"); + // partitionName does not exist on MTableColumnStatistics; omitted here + tblQuery.setResult( + "table.database.name, " + + "table.tableName, " + + "table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")"); + @SuppressWarnings("unchecked") + List tblRows = (List) tblQuery.execute(lastAnalyzedThreshold); + for (Object[] row : tblRows) { + String dbName = (String) row[0]; + String tblName = (String) row[1]; + String excludeVal = (String) row[2]; + if (excludeVal != null) { + LOG.info("Skipping auto deletion of table stats for {}.{} due to exclude property.", + 
dbName, tblName); + continue; + } + DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); + request.setEngine("hive"); + request.setTableLevel(true); + msc.deleteColumnStatistics(request); + } + } finally { + if (tblQuery != null) { + tblQuery.closeAll(); + } + } + } + + /** + * Deletes expired partition-level column statistics from {@code PART_COL_STATS}. + * Tables with the {@value #STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY} property set are skipped. + * + * @param pm the JDO persistence manager to use for the query + * @param msc the metastore client used to issue delete requests + * @param lastAnalyzedThreshold epoch seconds; rows with lastAnalyzed below this value are expired + * @throws Exception if the JDO query or the delete request fails + */ + private void deleteExpiredPartitionColStats(PersistenceManager pm, IMetaStoreClient msc, + long lastAnalyzedThreshold) throws Exception { + Query partQuery = null; + try { + partQuery = pm.newQuery(MPartitionColumnStatistics.class); + partQuery.setFilter("lastAnalyzed < threshold"); + partQuery.declareParameters("long threshold"); + // project via partition navigation to reach partitionName and the table exclude property + partQuery.setResult( + "partition.table.database.name, " + + "partition.table.tableName, " + + "partition.partitionName, " + + "partition.table.parameters.get(\"" + STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY + "\")"); + @SuppressWarnings("unchecked") + List partRows = (List) partQuery.execute(lastAnalyzedThreshold); + for (Object[] row : partRows) { + String dbName = (String) row[0]; + String tblName = (String) row[1]; + String partName = (String) row[2]; + String excludeVal = (String) row[3]; + if (excludeVal != null) { + LOG.info("Skipping auto deletion of partition stats for {}.{} due to exclude property.", + dbName, tblName); + continue; + } + DeleteColumnStatisticsRequest request = new DeleteColumnStatisticsRequest(dbName, tblName); + 
request.setEngine("hive"); + request.setTableLevel(false); + request.addToPart_names(partName); + msc.deleteColumnStatistics(request); + } + } finally { + if (partQuery != null) { + partQuery.closeAll(); + } } } -} +} \ No newline at end of file diff --git a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java index e1ba5b1f84cb..bb5ad9f29e5b 100644 --- a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java +++ b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/TestStatisticsManagement.java @@ -19,7 +19,6 @@ import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_CATALOG_NAME; import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_DATABASE_NAME; -import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.util.List; @@ -39,6 +38,7 @@ import org.apache.hadoop.hive.metastore.client.builder.TableBuilder; import org.apache.hadoop.hive.metastore.conf.MetastoreConf; import org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars; +import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics; import org.apache.hadoop.hive.metastore.security.HadoopThriftAuthBridge; import org.apache.hadoop.hive.metastore.utils.TestTxnDbUtil; import org.apache.thrift.TException; @@ -50,6 +50,11 @@ import javax.jdo.PersistenceManager; import javax.jdo.Query; +/** + * Unit tests for {@link StatisticsManagementTask}, verifying that expired table-level column + * statistics are deleted on schedule and that tables marked with the exclude property are left + * untouched. 
+ */ @Category(MetastoreUnitTest.class) public class TestStatisticsManagement { @@ -64,9 +69,9 @@ public void setUp() throws Exception { conf.setBoolean(ConfVars.MULTITHREADED.getVarname(), false); conf.setBoolean(ConfVars.HIVE_IN_TEST.getVarname(), true); - // enable stats auto deletion, set up a short retention so threshold check triggers easily - MetastoreConf.setBoolVar(conf, ConfVars.STATISTICS_AUTO_DELETION, true); - MetastoreConf.setTimeVar(conf, ConfVars.STATISTICS_RETENTION_PERIOD, 1, TimeUnit.DAYS); + // Enable stats auto deletion with a short retention so the threshold check triggers easily. + MetastoreConf.setBoolVar(conf, ConfVars.COLUMN_STATISTICS_AUTO_DELETION, true); + MetastoreConf.setTimeVar(conf, ConfVars.COLUMN_STATISTICS_RETENTION_PERIOD, 1, TimeUnit.DAYS); MetaStoreTestUtils.startMetaStoreWithRetry(HadoopThriftAuthBridge.getBridge(), conf); TestTxnDbUtil.setConfValues(conf); @@ -78,7 +83,7 @@ public void setUp() throws Exception { @After public void tearDown() throws Exception { if (client != null) { - // Drop any leftover databases, similar to TestPartitionManagement.java + // Drop any leftover databases, similar to TestPartitionManagement.java. 
List dbs = client.getAllDatabases(DEFAULT_CATALOG_NAME); for (String db : dbs) { if (!db.equalsIgnoreCase(DEFAULT_DATABASE_NAME)) { @@ -100,14 +105,12 @@ public void testExpiredTableColStatsAreDeleted() throws Exception { String dbName = "stats_db1"; String tableName = "tbl1"; createDbAndTable(dbName, tableName, false); - // create a column stats entry (table-level) writeTableLevelColStats(dbName, tableName, "c1"); - // ensure stats exists assertHasTableColStats(dbName, tableName, "c1"); - // make lastAnalyzed older than the threshold makeAllTableColStatsOlderThanRetention(dbName, tableName); runStatisticsManagementTask(conf); + assertNoTableColStats(dbName, tableName, "c1"); } @@ -115,20 +118,24 @@ public void testExpiredTableColStatsAreDeleted() throws Exception { public void testExcludedTableStatsAreNotDeleted() throws Exception { String dbName = "stats_db2"; String tableName = "tbl2"; - // Create a database and table that ARE excluded from auto deletion. createDbAndTable(dbName, tableName, true); writeTableLevelColStats(dbName, tableName, "c1"); assertHasTableColStats(dbName, tableName, "c1"); - - // Manually set lastAnalyzed to a very old timestamp so it would normally be expired. makeAllTableColStatsOlderThanRetention(dbName, tableName); runStatisticsManagementTask(conf); - // Verify that stats are still present because the table is excluded. + // Stats must still be present because the table is marked as excluded. assertHasTableColStats(dbName, tableName, "c1"); } + /** + * Creates a database (unless it is the default database) and a simple two-column test table. 
+ * + * @param dbName name of the database to create + * @param tableName name of the table to create + * @param exclude if {@code true}, sets the auto-deletion exclude property on the table + */ private void createDbAndTable(String dbName, String tableName, boolean exclude) throws Exception { Database db; if (!DEFAULT_DATABASE_NAME.equals(dbName)) { @@ -140,7 +147,6 @@ private void createDbAndTable(String dbName, String tableName, boolean exclude) db = client.getDatabase(DEFAULT_CATALOG_NAME, DEFAULT_DATABASE_NAME); } - // Build a simple test table with two columns. TableBuilder tb = new TableBuilder() .inDb(db) .setTableName(tableName) @@ -150,22 +156,26 @@ private void createDbAndTable(String dbName, String tableName, boolean exclude) .setOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat"); Table t = tb.build(conf); - - // If requested, mark this table as excluded from automatic stats deletion. if (exclude) { - t.getParameters().put(StatisticsManagementTask.STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY, "true"); + t.getParameters().put( + StatisticsManagementTask.STATISTICS_AUTO_DELETION_EXCLUDE_TBLPROPERTY, "true"); } client.createTable(t); client.flushCache(); } + /** + * Writes minimal table-level column statistics for the given column via the metastore client. + * + * @param db database name + * @param tbl table name + * @param col column name + */ private void writeTableLevelColStats(String db, String tbl, String col) throws TException { - // Create a stats object for one column. ColumnStatisticsObj obj = new ColumnStatisticsObj(); obj.setColName(col); obj.setColType("double"); - // Fill in minimal double-column statistics data. 
DoubleColumnStatsData doubleData = new DoubleColumnStatsData(); doubleData.setNumNulls(0); doubleData.setNumDVs(1); @@ -176,51 +186,70 @@ private void writeTableLevelColStats(String db, String tbl, String col) throws T data.setDoubleStats(doubleData); obj.setStatsData(data); - ColumnStatistics cs = new ColumnStatistics(); ColumnStatisticsDesc desc = new ColumnStatisticsDesc(true, db, tbl); desc.setCatName("hive"); + + ColumnStatistics cs = new ColumnStatistics(); cs.setStatsDesc(desc); cs.addToStatsObj(obj); client.updateTableColumnStatistics(cs); } + /** + * Asserts that at least one column statistics object exists for the specified column. + * + * @param db database name + * @param tbl table name + * @param col column name + */ private void assertHasTableColStats(String db, String tbl, String col) throws TException { List objs = client.getTableColumnStatistics(db, tbl, List.of(col), "hive"); assertTrue("Expected stats for " + db + "." + tbl + "." + col, objs != null && !objs.isEmpty()); } + /** + * Asserts that no column statistics exist for the specified column. + * A {@link NoSuchObjectException} from the server is also treated as an acceptable absence signal. + * + * @param db database name + * @param tbl table name + * @param col column name + */ private void assertNoTableColStats(String db, String tbl, String col) throws TException { try { List objs = client.getTableColumnStatistics(db, tbl, List.of(col), "hive"); assertTrue("Expected no stats for " + db + "." + tbl + "." + col, objs == null || objs.isEmpty()); } catch (NoSuchObjectException e) { - // acceptable: server may throw if stats absent depending on impl + // Acceptable: server may throw if stats are absent depending on implementation. } } + /** + * Backdates the {@code lastAnalyzed} field of all {@code MTableColumnStatistics} rows for the + * given table to 400 days ago, making them appear expired relative to any reasonable retention + * period. 
Uses a fresh {@link ObjectStore} instance to bypass the proxy wrapper returned by + * {@code HMSHandler.getMSForConf()}. + * + * @param db database name + * @param tbl table name + */ private void makeAllTableColStatsOlderThanRetention(String db, String tbl) throws Exception { - // We update via ObjectStore/PM directly to avoid relying on params "lastAnalyzed". - RawStore ms = HMSHandler.getMSForConf(conf); - ObjectStore os = (ObjectStore) ms; + // Instantiate ObjectStore directly; HMSHandler.getMSForConf() returns a proxy that + // cannot be cast to ObjectStore and does not expose getPersistenceManager(). + ObjectStore os = new ObjectStore(); os.setConf(conf); - // Compute an old timestamp in seconds, here we use 400 days ago. long oldSeconds = (System.currentTimeMillis() - TimeUnit.DAYS.toMillis(400)) / 1000; - - // NOTE: exact JDO classes/field paths sometimes vary; adjust filter if needed based on MTableColumnStatistics mapping PersistenceManager pm = os.getPersistenceManager(); Query q = null; try { - // Query MTableColumnStatistics rows for the target table. - q = pm.newQuery(org.apache.hadoop.hive.metastore.model.MTableColumnStatistics.class); + q = pm.newQuery(MTableColumnStatistics.class); q.setFilter("table.tableName == t && table.database.name == d"); q.declareParameters("java.lang.String t, java.lang.String d"); @SuppressWarnings("unchecked") - List rows = - (List) q.execute(tbl, db); - // Make all matching column stats rows to look old/expired. - for (org.apache.hadoop.hive.metastore.model.MTableColumnStatistics r : rows) { + List rows = (List) q.execute(tbl, db); + for (MTableColumnStatistics r : rows) { r.setLastAnalyzed(oldSeconds); } pm.flush(); @@ -232,9 +261,14 @@ private void makeAllTableColStatsOlderThanRetention(String db, String tbl) throw } } - private void runStatisticsManagementTask(Configuration conf) { + /** + * Instantiates and runs a {@link StatisticsManagementTask} with the given configuration. 
+ * + * @param configuration the HMS configuration to pass to the task + */ + private void runStatisticsManagementTask(Configuration configuration) { StatisticsManagementTask task = new StatisticsManagementTask(); - task.setConf(conf); + task.setConf(configuration); task.run(); } }