From fb317b68f735e4385a79c84832d07f5697500ebe Mon Sep 17 00:00:00 2001 From: rayshrey Date: Fri, 29 May 2026 20:44:00 +0530 Subject: [PATCH] Adding virtual memory pools Signed-off-by: rayshrey --- libs/arrow-spi/build.gradle | 4 +- .../opensearch/arrow/spi/NativeAllocator.java | 113 ++-- .../arrow/spi/NativeAllocatorPoolConfig.java | 3 - .../org/opensearch/arrow/spi/PoolGroup.java | 37 ++ .../spi/NativeAllocatorPoolConfigTests.java | 4 - .../arrow/allocator/ArrowBasePlugin.java | 499 +++++++-------- .../arrow/allocator/ArrowBaseStatsAction.java | 75 +++ .../arrow/allocator/ArrowNativeAllocator.java | 572 ++++++++++++------ .../allocator/NativeMemoryRebalancer.java | 235 +++++++ .../arrow/allocator/ArrowBasePluginTests.java | 363 ++--------- .../allocator/ArrowNativeAllocatorTests.java | 138 +---- .../NativeMemoryRebalancerTests.java | 152 +++++ .../arrow/flight/BackpressureProducerIT.java | 3 +- .../flight/NativeAllocatorBoundaryIT.java | 125 ++-- .../flight/NativeMemoryRebalancerIT.java | 100 +++ .../flight/UnifiedNativeMemoryStatsIT.java | 95 +++ .../transport/FlightTransportTestBase.java | 4 +- .../dataformat-native/rust/common/src/lib.rs | 1 + .../rust/common/src/memory_pool.rs | 370 +++++++++++ .../be/datafusion/DataFusionPlugin.java | 99 ++- .../be/datafusion/DatafusionSettings.java | 33 + .../DataFusionPluginSettingsTests.java | 15 +- .../datafusion/DatafusionSettingsTests.java | 2 +- .../UnifiedNativeMemoryFullStackIT.java | 84 +++ .../composite/CompositeDataFormatPlugin.java | 2 +- .../benchmark/VSRRotationBenchmark.java | 9 +- .../parquet/ParquetDataFormatPlugin.java | 61 +- .../opensearch/parquet/ParquetSettings.java | 95 +++ .../opensearch/parquet/bridge/RustBridge.java | 40 ++ .../src/main/rust/src/ffm.rs | 32 + .../src/main/rust/src/lib.rs | 1 + .../src/main/rust/src/memory.rs | 57 ++ .../src/main/rust/src/tests/mod.rs | 12 +- .../rust/src/writer_properties_builder.rs | 6 +- .../rust/tests/writer_integration_tests.rs | 6 +- 
.../ParquetDataFormatAwareEngineTests.java | 8 +- .../engine/ParquetIndexingEngineTests.java | 6 +- .../parquet/memory/ArrowBufferPoolTests.java | 4 +- .../parquet/vsr/VSRManagerTests.java | 5 +- .../opensearch/parquet/vsr/VSRPoolTests.java | 4 +- .../parquet/writer/ParquetWriterTests.java | 5 +- server/build.gradle | 1 + .../admin/cluster/node/stats/NodeStats.java | 41 +- .../cluster/node/stats/NodesStatsRequest.java | 1 - .../node/stats/TransportNodesStatsAction.java | 1 - .../stats/TransportClusterStatsAction.java | 1 - .../main/java/org/opensearch/node/Node.java | 12 +- .../java/org/opensearch/node/NodeService.java | 4 +- .../stats/NativeAllocatorPoolStats.java | 138 +++-- .../plugins/SearchBackEndPlugin.java | 39 ++ ...kendNativeMemoryStatsVersionGateTests.java | 166 ----- .../cluster/node/stats/NodeStatsTests.java | 35 +- .../cluster/stats/ClusterStatsNodesTests.java | 1 - .../stats/ClusterStatsResponseTests.java | 1 - .../opensearch/cluster/DiskUsageTests.java | 7 - .../node/NodeServiceNativeMemoryTests.java | 294 +-------- .../stats/NativeAllocatorPoolStatsTests.java | 33 +- .../MockInternalClusterInfoService.java | 1 - .../opensearch/test/InternalTestCluster.java | 1 - 59 files changed, 2598 insertions(+), 1658 deletions(-) create mode 100644 libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/PoolGroup.java create mode 100644 plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBaseStatsAction.java create mode 100644 plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/NativeMemoryRebalancer.java create mode 100644 plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/NativeMemoryRebalancerTests.java create mode 100644 plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeMemoryRebalancerIT.java create mode 100644 plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/UnifiedNativeMemoryStatsIT.java create mode 100644 
sandbox/libs/dataformat-native/rust/common/src/memory_pool.rs create mode 100644 sandbox/plugins/composite-engine/src/internalClusterTest/java/org/opensearch/composite/UnifiedNativeMemoryFullStackIT.java create mode 100644 sandbox/plugins/parquet-data-format/src/main/rust/src/memory.rs delete mode 100644 server/src/test/java/org/opensearch/action/admin/cluster/node/stats/AnalyticsBackendNativeMemoryStatsVersionGateTests.java diff --git a/libs/arrow-spi/build.gradle b/libs/arrow-spi/build.gradle index abf7eecf84c77..c1a716c6f72c4 100644 --- a/libs/arrow-spi/build.gradle +++ b/libs/arrow-spi/build.gradle @@ -11,7 +11,9 @@ apply plugin: 'opensearch.publish' dependencies { api project(':libs:opensearch-core') api project(':libs:opensearch-common') - testImplementation project(':test:framework') + testImplementation(project(':test:framework')) { + exclude group: 'org.opensearch', module: 'opensearch-arrow-spi' + } } tasks.named('forbiddenApisMain').configure { diff --git a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java index 89d6866da2c89..70776f7dcef6c 100644 --- a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java +++ b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java @@ -9,36 +9,35 @@ package org.opensearch.arrow.spi; import java.io.Closeable; +import java.util.Set; +import java.util.function.Consumer; +import java.util.function.Supplier; /** - * Arrow-agnostic interface for a hierarchical native memory allocator. + * Unified native memory allocator interface. * - *

The implementation (backed by Arrow's {@code RootAllocator}) is provided by - * a plugin. The SPI allows other subsystems to interact with the allocator - * without depending on Arrow classes. - * - *

Plugins that need Arrow allocators obtain the implementation via - * service lookup or plugin extension and call {@link #getOrCreatePool} to - * register their pool. + *

Manages memory pools under a shared budget. Each pool has a minimum + * guaranteed allocation and a maximum burst limit. Implementations may + * redistribute unused capacity across pools. * * @opensearch.api */ public interface NativeAllocator extends Closeable { /** - * Returns the named pool, creating it on first access with the given limit. - * Subsequent calls with the same name return the same pool (first-call limit wins). + * Returns the named pool, creating it on first access. + * Subsequent calls with the same name return the existing pool (first-call config wins). * - * @param poolName logical pool name (e.g., "query", "flight") - * @param limit maximum bytes this pool can allocate in aggregate + * @param poolName logical pool name + * @param min minimum guaranteed bytes + * @param max maximum bytes this pool can allocate + * @param group the group this pool belongs to for aggregated stats, or null * @return an opaque pool handle */ - PoolHandle getOrCreatePool(String poolName, long limit); + PoolHandle getOrCreatePool(String poolName, long min, long max, PoolGroup group); /** - * Updates the limit of an existing pool. Children of the pool allocator - * inherit the change automatically via Arrow's parent-cap check at - * allocation time — no notification SPI is needed. + * Updates the effective limit of an existing pool. * * @param poolName logical pool name * @param newLimit new maximum bytes for the pool @@ -46,15 +45,69 @@ public interface NativeAllocator extends Closeable { void setPoolLimit(String poolName, long newLimit); /** - * Sets the root-level memory limit for the entire allocator. + * Registers a virtual pool with initial min/max and a callback + * invoked when the pool's limit changes. 
+ * + * @param poolName logical pool name + * @param min minimum guaranteed bytes + * @param max initial maximum bytes (the pool's starting limit) + * @param group the group this pool belongs to for aggregated stats + * @param limitSetter callback invoked when the pool limit changes + * @return a handle to update stats from the native layer + */ + VirtualPoolHandle registerVirtualPool(String poolName, long min, long max, PoolGroup group, Consumer limitSetter); + + /** + * Updates the minimum guaranteed bytes for a pool. + * + * @param poolName logical pool name + * @param newMin new minimum bytes + */ + void setPoolMin(String poolName, long newMin); + + /** + * Returns all registered pool names. + */ + Set getAllPoolNames(); + + /** + * Adds a callback invoked before stats collection to refresh pool usage data. * - * @param limit new maximum bytes for the root allocator + * @param refresher runnable that updates pool stats + */ + void addStatsRefresher(Runnable refresher); + + /** + * Sets the supplier for process-wide native memory stats. + * + * @param supplier returns [allocatedBytes, residentBytes] + */ + void setNativeMemoryStatsSupplier(Supplier supplier); + + /** + * Handle for a virtual pool. Plugins update stats via this handle. */ - void setRootLimit(long limit); + interface VirtualPoolHandle { + /** + * Update the current usage stats. + * + * @param allocatedBytes current allocated bytes + * @param peakBytes peak allocated bytes + */ + void updateStats(long allocatedBytes, long peakBytes); + + /** Returns current allocated bytes. */ + long allocatedBytes(); + + /** Returns peak allocated bytes. */ + long peakBytes(); + + /** Returns current limit. */ + long limit(); + } /** - * Opaque handle to a memory pool. Plugins downcast to the concrete type - * (e.g., Arrow's {@code BufferAllocator}) in the implementation layer. + * Opaque handle to a memory pool. 
*/ interface PoolHandle { @@ -63,28 +116,20 @@ interface PoolHandle { * * @param childName name for debugging * @param childLimit maximum bytes for the child - * @return an opaque child handle (downcast to BufferAllocator in Arrow impl) + * @return a child handle */ PoolHandle newChild(String childName, long childLimit); - /** - * Returns the current allocated bytes for this pool/child. - */ + /** Returns the current allocated bytes. */ long allocatedBytes(); - /** - * Returns the peak memory allocation. - */ + /** Returns the peak memory allocation. */ long peakBytes(); - /** - * Returns the configured limit. - */ + /** Returns the configured limit. */ long limit(); - /** - * Releases this allocation handle. - */ + /** Releases this allocation handle. */ void close(); } } diff --git a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfig.java b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfig.java index 98f991cb86704..29dba48e9f165 100644 --- a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfig.java +++ b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfig.java @@ -33,9 +33,6 @@ public final class NativeAllocatorPoolConfig { /** Pool name for query-execution memory (analytics-engine fragments and per-query allocators). */ public static final String POOL_QUERY = "query"; - /** Setting key for the root allocator limit. */ - public static final String SETTING_ROOT_LIMIT = "native.allocator.root.limit"; - /** Setting key for the Flight pool minimum. */ public static final String SETTING_FLIGHT_MIN = "native.allocator.pool.flight.min"; /** Setting key for the Flight pool maximum. 
*/ diff --git a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/PoolGroup.java b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/PoolGroup.java new file mode 100644 index 0000000000000..d292b42ae595f --- /dev/null +++ b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/PoolGroup.java @@ -0,0 +1,37 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.arrow.spi; + +/** + * Groups that memory pools belong to for aggregated customer-facing stats. + * Each pool is assigned to exactly one group at registration time. + * + * @opensearch.api + */ +public enum PoolGroup { + /** Arrow Flight transport pool group. */ + TRANSPORT("transport"), + /** Query and analytics execution pool group. */ + SEARCH("search"), + /** Ingest and write path pool group. */ + INDEXING("indexing"), + /** Background merge operations pool group. */ + MERGE("merge"); + + private final String name; + + PoolGroup(String name) { + this.name = name; + } + + /** Returns the group name used in stats output. 
*/ + public String getName() { + return name; + } +} diff --git a/libs/arrow-spi/src/test/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfigTests.java b/libs/arrow-spi/src/test/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfigTests.java index a21ca8ff54943..025b41a603212 100644 --- a/libs/arrow-spi/src/test/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfigTests.java +++ b/libs/arrow-spi/src/test/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfigTests.java @@ -26,8 +26,4 @@ public void testSettingKeys() { assertEquals("native.allocator.pool.query.min", NativeAllocatorPoolConfig.SETTING_QUERY_MIN); assertEquals("native.allocator.pool.query.max", NativeAllocatorPoolConfig.SETTING_QUERY_MAX); } - - public void testRootSettingKey() { - assertEquals("native.allocator.root.limit", NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT); - } } diff --git a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java index a9bd9968b5884..ad49305b72106 100644 --- a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java +++ b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java @@ -9,11 +9,16 @@ package org.opensearch.arrow.allocator; import org.opensearch.arrow.spi.NativeAllocatorPoolConfig; +import org.opensearch.arrow.spi.PoolGroup; import org.opensearch.cluster.metadata.IndexNameExpressionResolver; +import org.opensearch.cluster.node.DiscoveryNodes; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.settings.ClusterSettings; +import org.opensearch.common.settings.IndexScopedSettings; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Settings; +import org.opensearch.common.settings.SettingsFilter; +import org.opensearch.common.util.concurrent.FutureUtils; import org.opensearch.core.common.io.stream.NamedWriteableRegistry; 
import org.opensearch.core.common.unit.ByteSizeValue; import org.opensearch.core.xcontent.NamedXContentRegistry; @@ -22,10 +27,14 @@ import org.opensearch.node.resource.tracker.ResourceTrackerSettings; import org.opensearch.plugin.stats.NativeAllocatorPoolStats; import org.opensearch.plugin.stats.NativeAllocatorStatsRegistry; +import org.opensearch.plugins.ActionPlugin; import org.opensearch.plugins.ExtensiblePlugin; import org.opensearch.plugins.Plugin; import org.opensearch.repositories.RepositoriesService; +import org.opensearch.rest.RestController; +import org.opensearch.rest.RestHandler; import org.opensearch.script.ScriptService; +import org.opensearch.threadpool.Scheduler; import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.client.Client; import org.opensearch.watcher.ResourceWatcherService; @@ -33,207 +42,133 @@ import java.io.IOException; import java.util.Collection; import java.util.List; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.TimeUnit; import java.util.function.Supplier; /** * Top-level plugin that owns the unified Arrow-backed native memory allocator. * - *

All Arrow-consuming plugins (arrow-flight-rpc, parquet-data-format) extend - * this plugin to share one {@link ArrowNativeAllocator} and its classloader. - * - *

Each pool has a min (guaranteed floor) and max (burst ceiling). The rebalancer - * ensures every pool can always allocate up to its min, and distributes unused - * capacity allowing pools to grow up to their max. + *

All Arrow-consuming plugins extend this plugin to share one + * {@link ArrowNativeAllocator} and its classloader. */ -public class ArrowBasePlugin extends Plugin implements ExtensiblePlugin { +public class ArrowBasePlugin extends Plugin implements ExtensiblePlugin, ActionPlugin { /** Creates the plugin. */ public ArrowBasePlugin() {} - /** - * Maximum bytes for the root Arrow allocator. - * - *

When unset, the default is 20% of - * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see - * {@link #deriveRootLimitDefault}. The Arrow framework gets a small fraction of the - * native budget because the dominant consumer of native memory in analytics workloads - * is the DataFusion Rust runtime (~75% of {@code node.native_memory.limit}), not Arrow. - * If AC is unconfigured (limit = 0), the default is {@link Long#MAX_VALUE}, preserving - * pre-AC behaviour. - */ - public static final Setting ROOT_LIMIT_SETTING = new Setting<>( - NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, - ArrowBasePlugin::deriveRootLimitDefault, - s -> { - long v = Long.parseLong(s); - if (v < 0) { - throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT + "] must be >= 0, got " + v); - } - return v; - }, + // ─── Settings ──────────────────────────────────────────────────────────────── + + /** Whether the NativeMemoryRebalancer is enabled. */ + public static final Setting REBALANCER_ENABLED_SETTING = Setting.boolSetting( + "native.allocator.rebalancer.enabled", + true, Setting.Property.NodeScope, Setting.Property.Dynamic ); - /** - * Computes the default for {@link #ROOT_LIMIT_SETTING} as 20% of - * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}. The Arrow framework's - * hard cap covers only Arrow allocations — DataFusion's Rust runtime is a sibling of - * Arrow root and gets the larger share of the native budget (see - * {@code DataFusionPlugin#deriveMemoryPoolLimitDefault}). - * - *

Returns the bytes-as-string representation expected by the {@link Setting} parser. - * If the AC limit is unset (== 0), the default is {@link Long#MAX_VALUE} — unbounded — - * preserving pre-AC behaviour. - */ - static String deriveRootLimitDefault(Settings settings) { - ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings); - if (nativeLimit.getBytes() <= 0) { - return Long.toString(Long.MAX_VALUE); - } - return Long.toString(nativeLimit.getBytes() * 20 / 100); - } + /** Interval in seconds between pool rebalance cycles. 0 disables rebalancing. */ + public static final Setting REBALANCE_INTERVAL_SETTING = Setting.longSetting( + "native.allocator.rebalance.interval_seconds", + 5L, + 0L, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + + /** Pool utilization above this triggers growth. */ + public static final Setting PRESSURE_THRESHOLD_SETTING = Setting.doubleSetting( + "native.allocator.rebalancer.pressure_threshold", + 0.75, + 0.0, + 1.0, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + + /** Pool utilization below this means pool can give back capacity. */ + public static final Setting IDLE_THRESHOLD_SETTING = Setting.doubleSetting( + "native.allocator.rebalancer.idle_threshold", + 0.50, + 0.0, + 1.0, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); - /** Minimum guaranteed bytes for the Flight pool. */ - public static final Setting FLIGHT_MIN_SETTING = Setting.longSetting( + /** Factor to shrink idle pools by (new limit = limit * (1 - shrink_factor)). */ + public static final Setting SHRINK_FACTOR_SETTING = Setting.doubleSetting( + "native.allocator.rebalancer.shrink_factor", + 0.10, + 0.0, + 1.0, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + + /** Minimum guaranteed bytes for the Flight pool. Default is 2% of budget. 
*/ + public static final Setting FLIGHT_MIN_SETTING = new Setting<>( NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN, - 0L, - 0L, + s -> derivePoolMinDefault(s, 2), + s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN), Setting.Property.NodeScope, Setting.Property.Dynamic ); - /** - * Maximum bytes the Flight pool can burst to. Default is 5% of - * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see - * {@link #derivePoolMaxDefault}. Falls back to {@link Long#MAX_VALUE} when AC is - * unconfigured. Matches the partitioning model documented in PR #21732. - */ + /** Maximum bytes the Flight pool can burst to. Default is 5% of budget. */ public static final Setting FLIGHT_MAX_SETTING = new Setting<>( NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, s -> derivePoolMaxDefault(s, 5), - s -> { - long v = Long.parseLong(s); - if (v < 0) { - throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX + "] must be >= 0, got " + v); - } - return v; - }, + s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX), Setting.Property.NodeScope, Setting.Property.Dynamic ); - /** Minimum guaranteed bytes for the ingest pool. */ - public static final Setting INGEST_MIN_SETTING = Setting.longSetting( + /** Minimum guaranteed bytes for the ingest pool. Default is 4% of budget. */ + public static final Setting INGEST_MIN_SETTING = new Setting<>( NativeAllocatorPoolConfig.SETTING_INGEST_MIN, - 0L, - 0L, + s -> derivePoolMinDefault(s, 4), + s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_INGEST_MIN), Setting.Property.NodeScope, Setting.Property.Dynamic ); - /** - * Maximum bytes the ingest pool can burst to. Default is 8% of - * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see - * {@link #derivePoolMaxDefault}. Falls back to {@link Long#MAX_VALUE} when AC is - * unconfigured. 
Ingest gets a larger fraction than Flight/Query because parquet VSR - * allocators dominate write-path memory usage — see partitioning model in PR #21732. - */ + /** Maximum bytes the ingest pool can burst to. Default is 8% of budget. */ public static final Setting INGEST_MAX_SETTING = new Setting<>( NativeAllocatorPoolConfig.SETTING_INGEST_MAX, s -> derivePoolMaxDefault(s, 8), - s -> { - long v = Long.parseLong(s); - if (v < 0) { - throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_INGEST_MAX + "] must be >= 0, got " + v); - } - return v; - }, + s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_INGEST_MAX), Setting.Property.NodeScope, Setting.Property.Dynamic ); - /** - * Minimum guaranteed bytes for the query pool. Honored by the rebalancer (when - * enabled) — sets a floor below which the rebalancer will not shrink the pool. - * Has no effect when rebalancing is disabled. - */ - public static final Setting QUERY_MIN_SETTING = Setting.longSetting( + /** Minimum guaranteed bytes for the query pool. Default is 2% of budget. */ + public static final Setting QUERY_MIN_SETTING = new Setting<>( NativeAllocatorPoolConfig.SETTING_QUERY_MIN, - 0L, - 0L, + s -> derivePoolMinDefault(s, 2), + s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_QUERY_MIN), Setting.Property.NodeScope, Setting.Property.Dynamic ); - /** - * Maximum bytes the query pool can allocate. Default is 5% of - * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see - * {@link #derivePoolMaxDefault}. Falls back to {@link Long#MAX_VALUE} when AC is - * unconfigured. Enforced by Arrow's child-allocator limit — analytics-engine's - * per-query allocators are children of this pool, so the sum of in-flight per-query - * allocations is capped here. - * - *

Note: each individual analytics query is also bounded by - * {@code analytics.exec.QueryContext} per-query limit (currently the constant - * {@code DEFAULT_PER_QUERY_MEMORY_LIMIT = 256 MB}). Lowering {@code QUERY_MAX} - * below {@code 256 MB × concurrent-queries} can starve queries even when each - * individual query is within its per-query limit. - */ + /** Maximum bytes the query pool can allocate. Default is 5% of budget. */ public static final Setting QUERY_MAX_SETTING = new Setting<>( NativeAllocatorPoolConfig.SETTING_QUERY_MAX, s -> derivePoolMaxDefault(s, 5), - s -> { - long v = Long.parseLong(s); - if (v < 0) { - throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_QUERY_MAX + "] must be >= 0, got " + v); - } - return v; - }, + s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_QUERY_MAX), Setting.Property.NodeScope, Setting.Property.Dynamic ); - /** - * Computes the default for a pool max as a percentage of - * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING} (the operator's - * declared off-heap budget), falling back to {@link Long#MAX_VALUE} when AC is - * unconfigured. Returns the bytes-as-string representation expected by the - * {@link Setting} parser. - * - *

Pools are anchored to {@code node.native_memory.limit} rather than to - * {@link #ROOT_LIMIT_SETTING} so the diagrammed partitioning (PR #21732) holds: - * sum of pool maxes (5+8+5 = 18% of native_memory.limit) fits within the framework - * root cap (20% of native_memory.limit) by default. Operator overrides of - * {@code root.limit} that drop it below {@code sum(pool.max)} are caught by the - * grouped validator. - * - *

The fraction is taken straight from {@code node.native_memory.limit}, not from - * {@code limit - buffer_percent}. {@code buffer_percent} is an admission-control - * throttle margin, not a framework budget reduction. - * - * @param settings node settings - * @param percent fraction of {@code node.native_memory.limit} the pool max defaults to - */ - static String derivePoolMaxDefault(Settings settings, int percent) { - ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings); - if (nativeLimit.getBytes() <= 0) { - return Long.toString(Long.MAX_VALUE); - } - long pool = Math.max(0L, nativeLimit.getBytes() * percent / 100); - return Long.toString(pool); - } - - /** Interval in seconds between pool rebalance cycles. 0 disables rebalancing. */ - public static final Setting REBALANCE_INTERVAL_SETTING = Setting.longSetting( - "native.allocator.rebalance.interval_seconds", - 0L, - 0L, - Setting.Property.NodeScope, - Setting.Property.Dynamic - ); + // ─── Instance state ────────────────────────────────────────────────────────── private volatile ArrowNativeAllocator allocator; + private volatile ScheduledExecutorService rebalancerScheduler; + private volatile ScheduledFuture rebalanceTask; + private volatile NativeMemoryRebalancer rebalancer; + + // ─── Plugin lifecycle ──────────────────────────────────────────────────────── @Override public Collection createComponents( @@ -251,12 +186,11 @@ public Collection createComponents( ) { Settings settings = environment.settings(); ClusterSettings cs = clusterService.getClusterSettings(); - ArrowNativeAllocator built = buildAllocator(settings, cs); + Supplier budgetSupplier = () -> ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(clusterService.getSettings()) + .getBytes(); + ArrowNativeAllocator built = buildAllocator(settings, cs, budgetSupplier); this.allocator = built; - // Publish a NativeAllocatorStatsRegistry alongside the allocator so the server-side - // NodeService can 
discover the supplier via pluginComponents (instanceof filter) without - // taking a compile-time dependency on this plugin. The lambda re-reads `this.allocator` - // each invocation, so after close() nulls the field, the supplier returns null cleanly. + Supplier statsSupplier = () -> { ArrowNativeAllocator a = this.allocator; return a != null ? a.stats() : null; @@ -264,96 +198,172 @@ public Collection createComponents( return List.of(built, new NativeAllocatorStatsRegistry(statsSupplier)); } + @Override + public List> getSettings() { + return List.of( + FLIGHT_MIN_SETTING, + FLIGHT_MAX_SETTING, + INGEST_MIN_SETTING, + INGEST_MAX_SETTING, + QUERY_MIN_SETTING, + QUERY_MAX_SETTING, + REBALANCE_INTERVAL_SETTING, + REBALANCER_ENABLED_SETTING, + PRESSURE_THRESHOLD_SETTING, + IDLE_THRESHOLD_SETTING, + SHRINK_FACTOR_SETTING + ); + } + + @Override + public List getRestHandlers( + Settings settings, + RestController restController, + ClusterSettings clusterSettings, + IndexScopedSettings indexScopedSettings, + SettingsFilter settingsFilter, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier nodesInCluster + ) { + Supplier statsSupplier = () -> allocator != null ? allocator.stats() : null; + return List.of(new ArrowBaseStatsAction(statsSupplier)); + } + + @Override + public void close() throws IOException { + if (rebalancerScheduler != null) { + rebalancerScheduler.shutdownNow(); + } + if (allocator != null) { + allocator.close(); + allocator = null; + } + } + + // ─── Package-private (visible for tests) ───────────────────────────────────── + /** - * Constructs the allocator and wires its pools and dynamic-update consumers from - * a pure {@code (Settings, ClusterSettings)} pair. Package-private so unit tests - * can exercise the full wiring without a heavyweight {@link ClusterService} - * fixture — mirrors the shape of {@link #registerSettingsUpdateConsumers} which - * is already test-friendly for the same reason. 
+ * Constructs the allocator and wires its pools and the rebalancer. */ - static ArrowNativeAllocator buildAllocator(Settings settings, ClusterSettings cs) { - long rootLimit = ROOT_LIMIT_SETTING.get(settings); - ArrowNativeAllocator allocator = new ArrowNativeAllocator(rootLimit); - allocator.setRebalanceInterval(REBALANCE_INTERVAL_SETTING.get(settings)); + ArrowNativeAllocator buildAllocator(Settings settings, ClusterSettings cs, Supplier budgetSupplier) { + ArrowNativeAllocator allocator = new ArrowNativeAllocator(); - // Single source of truth for cross-setting invariants — same logic runs on - // dynamic updates via the grouped consumer below. - validateUpdate(settings); + // Set budget for validation + long nativeBudget = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings).getBytes(); + if (nativeBudget > 0) { + allocator.setBudget(nativeBudget); + } + + // Validate min < max for each pool + validateMinMax(NativeAllocatorPoolConfig.POOL_FLIGHT, FLIGHT_MIN_SETTING.get(settings), FLIGHT_MAX_SETTING.get(settings)); + validateMinMax(NativeAllocatorPoolConfig.POOL_INGEST, INGEST_MIN_SETTING.get(settings), INGEST_MAX_SETTING.get(settings)); + validateMinMax(NativeAllocatorPoolConfig.POOL_QUERY, QUERY_MIN_SETTING.get(settings), QUERY_MAX_SETTING.get(settings)); + // Create pools (always start at max) allocator.getOrCreatePool( NativeAllocatorPoolConfig.POOL_FLIGHT, FLIGHT_MIN_SETTING.get(settings), - FLIGHT_MAX_SETTING.get(settings) + FLIGHT_MAX_SETTING.get(settings), + PoolGroup.TRANSPORT ); allocator.getOrCreatePool( NativeAllocatorPoolConfig.POOL_INGEST, INGEST_MIN_SETTING.get(settings), - INGEST_MAX_SETTING.get(settings) + INGEST_MAX_SETTING.get(settings), + PoolGroup.INDEXING + ); + allocator.getOrCreatePool( + NativeAllocatorPoolConfig.POOL_QUERY, + QUERY_MIN_SETTING.get(settings), + QUERY_MAX_SETTING.get(settings), + PoolGroup.SEARCH ); - allocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_QUERY, QUERY_MIN_SETTING.get(settings), 
QUERY_MAX_SETTING.get(settings)); - registerSettingsUpdateConsumers(cs, allocator); + // Register dynamic setting consumers for min/max changes + cs.addSettingsUpdateConsumer(FLIGHT_MIN_SETTING, newMin -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_FLIGHT, newMin)); + cs.addSettingsUpdateConsumer(FLIGHT_MAX_SETTING, newMax -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_FLIGHT, newMax)); + cs.addSettingsUpdateConsumer(INGEST_MIN_SETTING, newMin -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_INGEST, newMin)); + cs.addSettingsUpdateConsumer(INGEST_MAX_SETTING, newMax -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_INGEST, newMax)); + cs.addSettingsUpdateConsumer(QUERY_MIN_SETTING, newMin -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_QUERY, newMin)); + cs.addSettingsUpdateConsumer(QUERY_MAX_SETTING, newMax -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_QUERY, newMax)); + + // Register dynamic consumer for rebalancer enable/disable + cs.addSettingsUpdateConsumer(REBALANCER_ENABLED_SETTING, enabled -> { + if (enabled == false) { + cancelRebalanceTask(); + allocator.resetAllPoolsToMax(); + } else { + startRebalancer(allocator, budgetSupplier, REBALANCE_INTERVAL_SETTING.get(settings)); + } + }); + + // Set up the rebalancer if enabled + if (REBALANCER_ENABLED_SETTING.get(settings)) { + startRebalancer(allocator, budgetSupplier, REBALANCE_INTERVAL_SETTING.get(settings)); + } + + // Register dynamic consumer for interval changes + cs.addSettingsUpdateConsumer(REBALANCE_INTERVAL_SETTING, this::updateRebalanceInterval); + + // Register dynamic consumers for threshold changes + cs.addSettingsUpdateConsumer(PRESSURE_THRESHOLD_SETTING, value -> { + NativeMemoryRebalancer r = this.rebalancer; + if (r != null) r.setPressureThreshold(value); + }); + cs.addSettingsUpdateConsumer(IDLE_THRESHOLD_SETTING, value -> { + NativeMemoryRebalancer r = this.rebalancer; + if (r != null) r.setIdleThreshold(value); + }); + 
cs.addSettingsUpdateConsumer(SHRINK_FACTOR_SETTING, value -> { + NativeMemoryRebalancer r = this.rebalancer; + if (r != null) r.setShrinkFactor(value); + }); + return allocator; } - /** - * Registers cluster-settings update consumers that propagate dynamic setting changes - * into the live {@link ArrowNativeAllocator}. Package-private so unit tests can exercise - * the wiring with a real {@link ClusterSettings} instance — the test that asserts a PUT - * lands on the allocator is what catches a future regression where one of these lines - * is accidentally removed. - */ - static void registerSettingsUpdateConsumers(ClusterSettings cs, ArrowNativeAllocator allocator) { - cs.addSettingsUpdateConsumer(ROOT_LIMIT_SETTING, allocator::setRootLimit); - cs.addSettingsUpdateConsumer(REBALANCE_INTERVAL_SETTING, allocator::setRebalanceInterval); - cs.addSettingsUpdateConsumer(FLIGHT_MAX_SETTING, v -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_FLIGHT, v)); - cs.addSettingsUpdateConsumer(FLIGHT_MIN_SETTING, v -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_FLIGHT, v)); - cs.addSettingsUpdateConsumer(INGEST_MAX_SETTING, v -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_INGEST, v)); - cs.addSettingsUpdateConsumer(INGEST_MIN_SETTING, v -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_INGEST, v)); - cs.addSettingsUpdateConsumer(QUERY_MAX_SETTING, v -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_QUERY, v)); - cs.addSettingsUpdateConsumer(QUERY_MIN_SETTING, v -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_QUERY, v)); - - // Grouped validator runs across the related settings on every dynamic update so cross-setting - // invariants (sum of pool mins ≤ root, per-pool min ≤ max) are enforced post-startup. 
- cs.addSettingsUpdateConsumer(s -> {}, MIN_MAX_SETTINGS, ArrowBasePlugin::validateUpdate); - } + // ─── Private helpers ───────────────────────────────────────────────────────── - private static final List> MIN_MAX_SETTINGS = List.of( - ROOT_LIMIT_SETTING, - FLIGHT_MIN_SETTING, - FLIGHT_MAX_SETTING, - INGEST_MIN_SETTING, - INGEST_MAX_SETTING, - QUERY_MIN_SETTING, - QUERY_MAX_SETTING - ); + private synchronized void startRebalancer(ArrowNativeAllocator allocator, Supplier budgetSupplier, long intervalSeconds) { + if (rebalancer != null || rebalancerScheduler != null) return; - private static void validateUpdate(Settings settings) { - long rootLimit = ROOT_LIMIT_SETTING.get(settings); - long flightMin = FLIGHT_MIN_SETTING.get(settings); - long flightMax = FLIGHT_MAX_SETTING.get(settings); - long ingestMin = INGEST_MIN_SETTING.get(settings); - long ingestMax = INGEST_MAX_SETTING.get(settings); - long queryMin = QUERY_MIN_SETTING.get(settings); - long queryMax = QUERY_MAX_SETTING.get(settings); - validateMinMax(NativeAllocatorPoolConfig.POOL_FLIGHT, flightMin, flightMax); - validateMinMax(NativeAllocatorPoolConfig.POOL_INGEST, ingestMin, ingestMax); - validateMinMax(NativeAllocatorPoolConfig.POOL_QUERY, queryMin, queryMax); - validateMinSum(rootLimit, flightMin, ingestMin, queryMin); - } + long budget = budgetSupplier.get(); + if (budget <= 0) return; + if (intervalSeconds <= 0) return; - @Override - public List> getSettings() { - return List.of( - ROOT_LIMIT_SETTING, - FLIGHT_MIN_SETTING, - FLIGHT_MAX_SETTING, - INGEST_MIN_SETTING, - INGEST_MAX_SETTING, - QUERY_MIN_SETTING, - QUERY_MAX_SETTING, - REBALANCE_INTERVAL_SETTING + NativeMemoryRebalancer nativeRebalancer = new NativeMemoryRebalancer( + allocator, + budgetSupplier, + PRESSURE_THRESHOLD_SETTING.getDefault(Settings.EMPTY), + IDLE_THRESHOLD_SETTING.getDefault(Settings.EMPTY), + SHRINK_FACTOR_SETTING.getDefault(Settings.EMPTY) ); + this.rebalancer = nativeRebalancer; + + Scheduler.SafeScheduledThreadPoolExecutor 
executor = new Scheduler.SafeScheduledThreadPoolExecutor(1, r -> { + Thread t = new Thread(r, "native-allocator-rebalancer"); + t.setDaemon(true); + return t; + }); + executor.setRemoveOnCancelPolicy(true); + this.rebalancerScheduler = executor; + + rebalanceTask = rebalancerScheduler.scheduleAtFixedRate(nativeRebalancer, intervalSeconds, intervalSeconds, TimeUnit.SECONDS); + } + + private synchronized void cancelRebalanceTask() { + ScheduledFuture existing = rebalanceTask; + if (existing != null) { + FutureUtils.cancel(existing); + rebalanceTask = null; + } + } + + private void updateRebalanceInterval(long newInterval) { + cancelRebalanceTask(); + if (newInterval > 0 && rebalancerScheduler != null && rebalancer != null) { + rebalanceTask = rebalancerScheduler.scheduleAtFixedRate(rebalancer, newInterval, newInterval, TimeUnit.SECONDS); + } } private static void validateMinMax(String poolName, long min, long max) { @@ -362,36 +372,27 @@ private static void validateMinMax(String poolName, long min, long max) { } } - private static void validateMinSum(long rootLimit, long... mins) { - if (rootLimit == Long.MAX_VALUE) { - return; - } - long sum = 0; - for (long min : mins) { - try { - sum = Math.addExact(sum, min); - } catch (ArithmeticException overflow) { - throw new IllegalArgumentException("Sum of pool minimums overflows.", overflow); - } + static String derivePoolMaxDefault(Settings settings, int percent) { + ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings); + if (nativeLimit.getBytes() <= 0) { + return Long.toString(Long.MAX_VALUE); } - if (sum > rootLimit) { - throw new IllegalArgumentException( - "Sum of pool minimums (" - + sum - + " bytes) exceeds root limit (" - + rootLimit - + " bytes). 
" - + "Reduce pool minimums or increase " - + NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT - ); + return Long.toString(Math.max(0L, nativeLimit.getBytes() * percent / 100)); + } + + static String derivePoolMinDefault(Settings settings, int percent) { + ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings); + if (nativeLimit.getBytes() <= 0) { + return "0"; } + return Long.toString(Math.max(0L, nativeLimit.getBytes() * percent / 100)); } - @Override - public void close() throws IOException { - if (allocator != null) { - allocator.close(); - allocator = null; + private static long parseNonNegativeLong(String s, String settingName) { + long v = Long.parseLong(s); + if (v < 0) { + throw new IllegalArgumentException("Setting [" + settingName + "] must be >= 0, got " + v); } + return v; } } diff --git a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBaseStatsAction.java b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBaseStatsAction.java new file mode 100644 index 0000000000000..70928e93713a0 --- /dev/null +++ b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBaseStatsAction.java @@ -0,0 +1,75 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.arrow.allocator; + +import org.opensearch.core.rest.RestStatus; +import org.opensearch.core.xcontent.XContentBuilder; +import org.opensearch.plugin.stats.NativeAllocatorPoolStats; +import org.opensearch.rest.BaseRestHandler; +import org.opensearch.rest.BytesRestResponse; +import org.opensearch.rest.RestRequest; +import org.opensearch.transport.client.node.NodeClient; + +import java.util.List; +import java.util.function.Supplier; + +/** + * REST handler exposing per-pool native memory stats at {@code _plugins/arrow_base/stats}. 
+ */ +public class ArrowBaseStatsAction extends BaseRestHandler { + private final Supplier statsSupplier; + + /** + * Creates a new stats action. + * @param statsSupplier supplier of pool stats + */ + public ArrowBaseStatsAction(Supplier statsSupplier) { + this.statsSupplier = statsSupplier; + } + + @Override + public String getName() { + return "arrow_base_stats_action"; + } + + @Override + public List routes() { + return List.of(new Route(RestRequest.Method.GET, "_plugins/arrow_base/stats")); + } + + @Override + protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) { + return channel -> { + NativeAllocatorPoolStats stats = statsSupplier.get(); + XContentBuilder builder = channel.newBuilder(); + builder.startObject(); + builder.startObject("memory_pools"); + if (stats != null) { + builder.startObject("runtime"); + builder.field("allocated_bytes", stats.getNativeAllocatedBytes()); + builder.field("resident_bytes", stats.getNativeResidentBytes()); + builder.endObject(); + builder.startObject("pools"); + for (NativeAllocatorPoolStats.PoolStats pool : stats.getPools()) { + builder.startObject(pool.getName()); + builder.field("allocated_bytes", pool.getAllocatedBytes()); + builder.field("peak_bytes", pool.getPeakBytes()); + builder.field("limit_bytes", pool.getLimitBytes()); + builder.field("min_bytes", pool.getMinBytes()); + builder.field("group", pool.getGroup()); + builder.endObject(); + } + builder.endObject(); + } + builder.endObject(); + builder.endObject(); + channel.sendResponse(new BytesRestResponse(RestStatus.OK, builder)); + }; + } +} diff --git a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowNativeAllocator.java b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowNativeAllocator.java index 892c43d2cb2c8..9c54298b6093e 100644 --- a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowNativeAllocator.java +++ 
b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowNativeAllocator.java @@ -10,197 +10,295 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.opensearch.arrow.spi.NativeAllocator; +import org.opensearch.arrow.spi.PoolGroup; +import org.opensearch.common.settings.ClusterSettings; +import org.opensearch.common.settings.Setting; +import org.opensearch.common.settings.Settings; import org.opensearch.plugin.stats.NativeAllocatorPoolStats; import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; -import java.util.concurrent.TimeUnit; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.function.Consumer; +import java.util.function.Supplier; /** * Arrow-backed implementation of {@link NativeAllocator}. * - *

Owns a single {@link RootAllocator} for the node. All plugins that need - * Arrow buffers obtain pool handles from this class via the SPI interface. - * - *

Elastic rebalancing

- *

A background task periodically redistributes unused capacity across pools. - * Each pool has a guaranteed limit (configured via settings). When other - * pools are idle, an active pool can temporarily grow beyond its guarantee up to - * the root limit. When contention rises, pools shrink back toward their guarantee. - * This prevents idle capacity from being wasted while maintaining isolation under load. - * - *

Constructed once by {@link ArrowBasePlugin#createComponents} and exposed to - * downstream plugins via Guice and {@code PluginComponentRegistry} so consumers - * receive the instance through explicit dependency injection rather than a static - * singleton. + *

Owns a single {@link RootAllocator} (set to {@code Long.MAX_VALUE} — per-pool + * limits are the real enforcement). Manages both Arrow-backed pools and virtual pools. */ public class ArrowNativeAllocator implements NativeAllocator { + private static final Logger logger = LogManager.getLogger(ArrowNativeAllocator.class); + private final RootAllocator root; private final ConcurrentMap pools = new ConcurrentHashMap<>(); - private final ConcurrentMap poolMins = new ConcurrentHashMap<>(); - private final ConcurrentMap poolMaxes = new ConcurrentHashMap<>(); - private final ScheduledExecutorService rebalancer; - private volatile ScheduledFuture rebalanceTask; - /** - * True iff the rebalancer is configured to run periodically. Used by - * {@link #getOrCreatePool} to decide each pool's initial child-allocator - * limit: when rebalancing is enabled, pools start at {@code min} and grow - * via the next rebalance tick (preserving the original PR's - * "guarantee + burst" semantics); when rebalancing is disabled, pools - * start at {@code max} so consumers can allocate immediately without - * waiting for a tick that never comes. - */ - private volatile boolean rebalancerEnabled = false; + private final ConcurrentMap virtualPools = new ConcurrentHashMap<>(); + private final ConcurrentMap poolConfigs = new ConcurrentHashMap<>(); + private final List statsRefreshers = new CopyOnWriteArrayList<>(); + private volatile Supplier nativeMemoryStatsSupplier; + private volatile long budget = Long.MAX_VALUE; /** * Creates a new allocator with a fresh RootAllocator. 
- * - * @param rootLimit maximum bytes for the root allocator */ - public ArrowNativeAllocator(long rootLimit) { - this.root = new RootAllocator(rootLimit); - org.opensearch.threadpool.Scheduler.SafeScheduledThreadPoolExecutor executor = - new org.opensearch.threadpool.Scheduler.SafeScheduledThreadPoolExecutor(1, r -> { - Thread t = new Thread(r, "native-allocator-rebalancer"); - t.setDaemon(true); - return t; - }); - executor.setRemoveOnCancelPolicy(true); - this.rebalancer = executor; + public ArrowNativeAllocator() { + this.root = new RootAllocator(Long.MAX_VALUE); } /** - * Schedules (or reschedules) the rebalancer at the given interval. - * A value of 0 disables rebalancing. + * Sets the total native memory budget for validation. * - * @param intervalSeconds rebalance period in seconds, or 0 to disable + * @param budget node.native_memory.limit in bytes */ - public void setRebalanceInterval(long intervalSeconds) { - ScheduledFuture existing = rebalanceTask; - if (existing != null) { - org.opensearch.common.util.concurrent.FutureUtils.cancel(existing); - rebalanceTask = null; - } - rebalancerEnabled = intervalSeconds > 0; - if (rebalancerEnabled) { - rebalanceTask = rebalancer.scheduleAtFixedRate(this::rebalance, intervalSeconds, intervalSeconds, TimeUnit.SECONDS); - } + public void setBudget(long budget) { + this.budget = budget; } - @Override - public PoolHandle getOrCreatePool(String poolName, long limit) { - return getOrCreatePool(poolName, limit, limit); - } + // ─── Public / SPI methods ─────────────────────────────────────────────────── - /** - * Creates or returns a pool with min/max limits. 
- * - * @param poolName logical pool name - * @param min guaranteed minimum bytes (always available) - * @param max maximum bytes the pool can burst to - * @return the pool handle - */ - public PoolHandle getOrCreatePool(String poolName, long min, long max) { - poolMins.putIfAbsent(poolName, min); - poolMaxes.putIfAbsent(poolName, max); + @Override + public PoolHandle getOrCreatePool(String poolName, long min, long max, PoolGroup group) { + validateSumMaxesWithinBudget(poolName, max); + poolConfigs.putIfAbsent(poolName, new PoolConfig(min, max, group)); return pools.computeIfAbsent(poolName, name -> { - // Pick an initial limit that's safe for both rebalancer-on and rebalancer-off - // deployments. When rebalancing is enabled, start at min (the original PR's - // "guarantee + burst" semantics): the next rebalance tick will distribute - // headroom up to each pool's max. When rebalancing is disabled (the default), - // pools with min=0 would otherwise reject every allocation until a tick that - // never comes — start at max so consumers can allocate immediately. - long initial = rebalancerEnabled ? 
min : max; - BufferAllocator child = root.newChildAllocator(name, 0, initial); + BufferAllocator child = root.newChildAllocator(name, 0, max); return new ArrowPoolHandle(child); }); } @Override public void setPoolLimit(String poolName, long newLimit) { - ArrowPoolHandle handle = pools.get(poolName); - if (handle == null) { - throw new IllegalStateException("Pool '" + poolName + "' does not exist"); + PoolConfig config = poolConfigs.get(poolName); + if (config != null) { + config.max = newLimit; } - poolMaxes.put(poolName, newLimit); - handle.allocator.setLimit(newLimit); + ArrowPoolHandle arrowHandle = pools.get(poolName); + if (arrowHandle != null) { + arrowHandle.allocator.setLimit(newLimit); + return; + } + VirtualPoolHandleImpl vp = virtualPools.get(poolName); + if (vp != null) { + vp.setLimit(newLimit); + return; + } + throw new IllegalStateException("Pool '" + poolName + "' does not exist"); + } + + @Override + public VirtualPoolHandle registerVirtualPool(String poolName, long min, long max, PoolGroup group, Consumer limitSetter) { + if (min > max) { + throw new IllegalArgumentException("Pool '" + poolName + "' min (" + min + ") exceeds max (" + max + ")"); + } + validateSumMaxesWithinBudget(poolName, max); + VirtualPoolHandleImpl handle = new VirtualPoolHandleImpl(poolName, max, limitSetter); + VirtualPoolHandleImpl existing = virtualPools.putIfAbsent(poolName, handle); + if (existing != null || pools.containsKey(poolName)) { + virtualPools.remove(poolName, handle); + throw new IllegalStateException("Pool '" + poolName + "' already registered"); + } + poolConfigs.put(poolName, new PoolConfig(min, max, group)); + limitSetter.accept(max); + return handle; + } + + @Override + public void setPoolMin(String poolName, long newMin) { + PoolConfig config = poolConfigs.get(poolName); + if (config != null) { + config.min = newMin; + } + // Raise live limit if newMin exceeds current effective limit + ArrowPoolHandle arrowHandle = pools.get(poolName); + if (arrowHandle 
!= null) { + long max = config != null ? config.max : Long.MAX_VALUE; + long current = arrowHandle.allocator.getLimit(); + long target = Math.min(newMin, max); + if (target > current) { + arrowHandle.allocator.setLimit(target); + } + return; + } + VirtualPoolHandleImpl vp = virtualPools.get(poolName); + if (vp != null) { + long max = config != null ? config.max : Long.MAX_VALUE; + long current = vp.limit(); + long target = Math.min(newMin, max); + if (target > current) { + vp.setLimit(target); + } + } + } + + @Override + public Set getAllPoolNames() { + Set all = new HashSet<>(pools.keySet()); + all.addAll(virtualPools.keySet()); + return Collections.unmodifiableSet(all); + } + + @Override + public void addStatsRefresher(Runnable refresher) { + statsRefreshers.add(refresher); + } + + @Override + public void setNativeMemoryStatsSupplier(Supplier supplier) { + this.nativeMemoryStatsSupplier = supplier; } /** - * Updates the minimum guaranteed bytes for a pool. The new min is recorded for the - * rebalancer (which honors it as a floor on the next tick) and also pushed to the - * live {@link BufferAllocator} so the change takes effect immediately even when - * the rebalancer is disabled — the alternative was a Dynamic setting that returned - * HTTP 200 but had no observable effect. - * - *

Live propagation rules: - *

    - *
  • If {@code newMin} exceeds the pool's current limit, the limit is raised to - * {@code newMin} (capped at the configured pool max). Children of the pool - * allocator inherit the change automatically via Arrow's parent-cap check at - * allocation time, so dynamic resizes reach in-flight workloads without an - * explicit notification SPI. - *
  • If {@code newMin} is below the current limit, the limit is left alone — - * the rebalancer is the only path that shrinks live limits, so a min change - * on its own never reduces capacity in flight. - *
+ * Sets the effective (live) limit for a pool without updating the configured max. + * Used by the rebalancer to adjust pool limits dynamically. * - * @param poolName the pool name - * @param newMin new minimum bytes + * @param poolName name of the pool + * @param newLimit new effective limit in bytes */ - public void setPoolMin(String poolName, long newMin) { - ArrowPoolHandle handle = pools.get(poolName); - if (handle == null) { - throw new IllegalStateException("Pool '" + poolName + "' does not exist"); + public void setPoolEffectiveLimit(String poolName, long newLimit) { + ArrowPoolHandle arrowHandle = pools.get(poolName); + if (arrowHandle != null) { + arrowHandle.allocator.setLimit(newLimit); + return; } - poolMins.put(poolName, newMin); - long max = poolMaxes.getOrDefault(poolName, Long.MAX_VALUE); - long current = handle.allocator.getLimit(); - long target = Math.min(newMin, max); - if (target > current) { - handle.allocator.setLimit(target); + VirtualPoolHandleImpl vp = virtualPools.get(poolName); + if (vp != null) { + vp.setLimit(newLimit); + return; } + throw new IllegalStateException("Pool '" + poolName + "' does not exist"); } - @Override - public void setRootLimit(long limit) { - root.setLimit(limit); + /** + * Resets all pools to their configured max. Called when the rebalancer is disabled. + * Logs a warning for any pool that was bursting above its max. + */ + public void resetAllPoolsToMax() { + for (String name : getAllPoolNames()) { + PoolConfig config = poolConfigs.get(name); + long max = config != null ? config.max : Long.MAX_VALUE; + long current = getEffectiveLimit(name); + if (current > max) { + logger.warn( + "Pool [{}] effective limit {} exceeds max {}, resetting to max. In-flight allocations may be rejected.", + name, + current, + max + ); + } + setPoolEffectiveLimit(name, max); + } + } + + /** + * Convenience method for plugins that have Setting objects. 
Registers the virtual pool + * and auto-wires dynamic setting listeners for min/max changes. + * + * @param poolName name of the virtual pool + * @param minSetting setting for minimum bytes + * @param maxSetting setting for maximum bytes + * @param settings current node settings + * @param clusterSettings cluster settings for dynamic updates + * @param group pool group assignment + * @param limitSetter callback invoked when the pool limit changes + */ + public VirtualPoolHandle registerVirtualPool( + String poolName, + Setting minSetting, + Setting maxSetting, + Settings settings, + ClusterSettings clusterSettings, + PoolGroup group, + Consumer limitSetter + ) { + long min = minSetting.get(settings); + long max = maxSetting.get(settings); + VirtualPoolHandle handle = registerVirtualPool(poolName, min, max, group, limitSetter); + + clusterSettings.addSettingsUpdateConsumer(maxSetting, newMax -> setPoolLimit(poolName, newMax)); + clusterSettings.addSettingsUpdateConsumer(minSetting, newMin -> setPoolMin(poolName, newMin)); + + return handle; } /** - * Returns a point-in-time stats snapshot across all pools. Used by the - * {@code NativeAllocatorStatsRegistry} component published from - * {@code ArrowBasePlugin.createComponents()} and wired into {@code NodeService} to - * render allocator state under {@code _nodes/stats[/native_allocator]}. + * Returns a point-in-time stats snapshot across all pools. 
*/ public NativeAllocatorPoolStats stats() { + refreshStats(); + + long nativeAllocated = -1; + long nativeResident = -1; + Supplier supplier = this.nativeMemoryStatsSupplier; + if (supplier != null) { + try { + long[] stats = supplier.get(); + if (stats != null && stats.length >= 2) { + nativeAllocated = stats[0]; + nativeResident = stats[1]; + } + } catch (Exception e) { + // best-effort + } + } + List poolStats = new ArrayList<>(); for (var entry : pools.entrySet()) { BufferAllocator alloc = entry.getValue().allocator; + PoolConfig config = poolConfigs.get(entry.getKey()); poolStats.add( new NativeAllocatorPoolStats.PoolStats( entry.getKey(), alloc.getAllocatedMemory(), alloc.getPeakMemoryAllocation(), - alloc.getLimit() + alloc.getLimit(), + config != null && config.group != null ? config.group.getName() : null, + config != null ? config.min : 0L ) ); } - return new NativeAllocatorPoolStats(root.getAllocatedMemory(), root.getPeakMemoryAllocation(), root.getLimit(), poolStats); + for (var entry : virtualPools.entrySet()) { + VirtualPoolHandleImpl vp = entry.getValue(); + PoolConfig config = poolConfigs.get(entry.getKey()); + poolStats.add( + new NativeAllocatorPoolStats.PoolStats( + entry.getKey(), + vp.allocatedBytes(), + vp.peakBytes(), + vp.limit(), + config != null && config.group != null ? config.group.getName() : null, + config != null ? config.min : 0L + ) + ); + } + + return new NativeAllocatorPoolStats(nativeAllocated, nativeResident, poolStats); + } + + /** + * Runs all registered stats refreshers. 
+ */ + public void refreshStats() { + for (Runnable refresher : statsRefreshers) { + try { + refresher.run(); + } catch (Exception e) { + // best-effort + } + } } @Override public void close() { - rebalancer.shutdownNow(); pools.forEach((name, handle) -> { try { handle.allocator.close(); @@ -209,72 +307,23 @@ public void close() { } }); pools.clear(); - // Close any remaining child allocators (e.g., ad-hoc children created via ArrowAllocatorService) + virtualPools.clear(); for (BufferAllocator child : new ArrayList<>(root.getChildAllocators())) { try { child.close(); } catch (Exception e) { - // best-effort — log but don't block shutdown + // best-effort } } root.close(); } - /** - * Redistributes unused capacity across pools based on min/max guarantees. - * - *

Algorithm: - *

    - *
  1. Every pool is guaranteed at least its configured min
  2. - *
  3. Compute headroom = rootLimit - sum(all pool current allocations)
  4. - *
  5. Distribute headroom equally across all pools (not just active ones), capped - * at each pool's max. Distributing to all pools — including those with zero - * current allocation — avoids the dead-pool corner case where a pool with - * min = 0 starts at limit = 0, can never make its first allocation, and so - * never becomes "active" enough to receive a bonus. Pools that don't need the - * headroom stay at min naturally because their max caps the bonus.
  6. - *
  7. No pool's limit ever drops below its current allocation or its min
  8. - *
- */ - void rebalance() { - if (pools.isEmpty()) return; - - long rootLimit = root.getLimit(); - long totalAllocated = 0; - - for (Map.Entry entry : pools.entrySet()) { - totalAllocated += entry.getValue().allocator.getAllocatedMemory(); - } - - long headroom = Math.max(0, rootLimit - totalAllocated); - int poolCount = pools.size(); - long bonusPerPool = poolCount > 0 ? headroom / poolCount : 0; - - for (Map.Entry entry : pools.entrySet()) { - String name = entry.getKey(); - BufferAllocator alloc = entry.getValue().allocator; - long min = poolMins.getOrDefault(name, 0L); - long max = poolMaxes.getOrDefault(name, Long.MAX_VALUE); - long currentAllocation = alloc.getAllocatedMemory(); - - long effectiveLimit = min + bonusPerPool; - - // Cap at pool's max - effectiveLimit = Math.min(effectiveLimit, max); - // Never drop below current allocation or min - effectiveLimit = Math.max(effectiveLimit, currentAllocation); - effectiveLimit = Math.max(effectiveLimit, min); - // Never exceed root - effectiveLimit = Math.min(effectiveLimit, rootLimit); - - alloc.setLimit(effectiveLimit); - } - } + // ─── Package-private accessors (used by rebalancer and tests) ──────────────── /** * Returns the underlying Arrow allocator for a pool. * - * @param poolName name of the pool to look up + * @param poolName name of the pool */ public BufferAllocator getPoolAllocator(String poolName) { ArrowPoolHandle handle = pools.get(poolName); @@ -284,16 +333,12 @@ public BufferAllocator getPoolAllocator(String poolName) { return handle.allocator; } - /** - * Returns the root Arrow allocator. - */ + /** Returns the root Arrow allocator. */ public BufferAllocator getRootAllocator() { return root; } - /** - * Returns all registered pool names. - */ + /** Returns all registered pool names (Arrow pools only). 
*/ public Set getPoolNames() { return Collections.unmodifiableSet(pools.keySet()); } @@ -304,7 +349,8 @@ public Set getPoolNames() { * @param poolName name of the pool */ public long getPoolMin(String poolName) { - return poolMins.getOrDefault(poolName, 0L); + PoolConfig config = poolConfigs.get(poolName); + return config != null ? config.min : 0L; } /** @@ -313,7 +359,163 @@ public long getPoolMin(String poolName) { * @param poolName name of the pool */ public long getPoolMax(String poolName) { - return poolMaxes.getOrDefault(poolName, Long.MAX_VALUE); + PoolConfig config = poolConfigs.get(poolName); + return config != null ? config.max : Long.MAX_VALUE; + } + + /** + * Returns the group for a pool, or null if not assigned. + * + * @param poolName name of the pool + */ + public PoolGroup getPoolGroup(String poolName) { + PoolConfig config = poolConfigs.get(poolName); + return config != null ? config.group : null; + } + + /** + * Returns the allocated bytes for a virtual pool. + * + * @param poolName name of the virtual pool + */ + public long getVirtualPoolAllocated(String poolName) { + VirtualPoolHandleImpl vp = virtualPools.get(poolName); + return vp != null ? vp.allocatedBytes() : 0; + } + + /** + * Returns the current limit for a virtual pool. + * + * @param poolName name of the virtual pool + */ + public long getVirtualPoolLimit(String poolName) { + VirtualPoolHandleImpl vp = virtualPools.get(poolName); + return vp != null ? vp.limit() : 0; + } + + /** + * Returns the effective limit for any pool (Arrow or virtual). + * + * @param poolName name of the pool + */ + public long getEffectiveLimit(String poolName) { + ArrowPoolHandle arrowHandle = pools.get(poolName); + if (arrowHandle != null) { + return arrowHandle.allocator.getLimit(); + } + VirtualPoolHandleImpl vp = virtualPools.get(poolName); + if (vp != null) { + return vp.limit(); + } + return 0; + } + + /** + * Returns the allocated bytes for any pool (Arrow or virtual). 
+ * + * @param poolName name of the pool + */ + public long getAllocated(String poolName) { + ArrowPoolHandle arrowHandle = pools.get(poolName); + if (arrowHandle != null) { + return arrowHandle.allocator.getAllocatedMemory(); + } + VirtualPoolHandleImpl vp = virtualPools.get(poolName); + if (vp != null) { + return vp.allocatedBytes(); + } + return 0; + } + + /** Returns the native memory stats supplier. */ + public Supplier getNativeMemoryStatsSupplier() { + return nativeMemoryStatsSupplier; + } + + // ─── Private helpers ───────────────────────────────────────────────────────── + + private void validateSumMaxesWithinBudget(String newPoolName, long newPoolMax) { + if (budget == Long.MAX_VALUE || budget <= 0) { + return; + } + long sumMaxes = newPoolMax; + for (var entry : poolConfigs.entrySet()) { + if (entry.getKey().equals(newPoolName) == false) { + sumMaxes += entry.getValue().max; + } + } + if (sumMaxes > budget) { + throw new IllegalArgumentException( + "Sum of pool max limits (" + + sumMaxes + + " bytes) exceeds native memory budget (" + + budget + + " bytes). Reduce pool max settings or increase the budget." + ); + } + } + + // ─── Inner classes ─────────────────────────────────────────────────────────── + + /** + * Mutable configuration for a pool: min, max, and group. + */ + static class PoolConfig { + volatile long min; + volatile long max; + final PoolGroup group; + + PoolConfig(long min, long max, PoolGroup group) { + this.min = min; + this.max = max; + this.group = group; + } + } + + /** + * Virtual pool handle implementation. Tracks stats reported from native layer + * and delegates limit changes to the registered callback. 
+ */ + public static class VirtualPoolHandleImpl implements VirtualPoolHandle { + private final String name; + private volatile long limit; + private volatile long allocatedBytes; + private volatile long peakBytes; + private final Consumer limitSetter; + + VirtualPoolHandleImpl(String name, long limit, Consumer limitSetter) { + this.name = name; + this.limit = limit; + this.limitSetter = limitSetter; + } + + @Override + public void updateStats(long allocated, long peak) { + this.allocatedBytes = allocated; + this.peakBytes = peak; + } + + void setLimit(long newLimit) { + this.limit = newLimit; + if (limitSetter != null) { + limitSetter.accept(newLimit); + } + } + + @Override + public long allocatedBytes() { + return allocatedBytes; + } + + @Override + public long peakBytes() { + return peakBytes; + } + + @Override + public long limit() { + return limit; + } } /** diff --git a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/NativeMemoryRebalancer.java b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/NativeMemoryRebalancer.java new file mode 100644 index 0000000000000..5aefe7763e952 --- /dev/null +++ b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/NativeMemoryRebalancer.java @@ -0,0 +1,235 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.arrow.allocator; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.function.Supplier; + +/** + * Periodic rebalancer that redistributes native memory across pools. + * + *

<p>Algorithm:
+ * <ul>
+ *   <li>Pools start at their configured max on registration</li>
+ *   <li>If no pool is under pressure, return early (no-op)</li>
+ *   <li>Idle pools (utilization &lt; idle_threshold) are shrunk, never below min</li>
+ *   <li>Pressured pools (utilization &gt; pressure_threshold) receive freed capacity, can exceed max</li>
+ *   <li>Excess freed capacity is returned to idle pools proportionally</li>
+ *   <li>Invariant: sum(effective_limits) &lt;= budget at all times</li>
+ * </ul>
+ * + * @opensearch.internal + */ +public class NativeMemoryRebalancer implements Runnable { + + private static final Logger logger = LogManager.getLogger(NativeMemoryRebalancer.class); + + private final ArrowNativeAllocator allocator; + private final Supplier budgetSupplier; + + private volatile double pressureThreshold; + private volatile double idleThreshold; + private volatile double shrinkFactor; + + /** + * Creates a new rebalancer. + * + * @param allocator the allocator managing all pools + * @param budgetSupplier supplies the current budget value + * @param pressureThreshold utilization above this triggers growth (default 0.75) + * @param idleThreshold utilization below this means pool can give back capacity (default 0.50) + * @param shrinkFactor factor to shrink idle pools by — new limit = limit * (1 - shrinkFactor) (default 0.10) + */ + public NativeMemoryRebalancer( + ArrowNativeAllocator allocator, + Supplier budgetSupplier, + double pressureThreshold, + double idleThreshold, + double shrinkFactor + ) { + this.allocator = allocator; + this.budgetSupplier = budgetSupplier; + this.pressureThreshold = pressureThreshold; + this.idleThreshold = idleThreshold; + this.shrinkFactor = shrinkFactor; + } + + /** + * Updates the pressure threshold dynamically. + * + * @param value new threshold (0.0 to 1.0) + */ + public void setPressureThreshold(double value) { + this.pressureThreshold = value; + } + + /** + * Updates the idle threshold dynamically. + * + * @param value new threshold (0.0 to 1.0) + */ + public void setIdleThreshold(double value) { + this.idleThreshold = value; + } + + /** + * Updates the shrink factor dynamically. 
+ * + * @param value new factor (0.0 to 1.0) + */ + public void setShrinkFactor(double value) { + this.shrinkFactor = value; + } + + @Override + public void run() { + try { + rebalance(); + } catch (Exception e) { + logger.warn("Rebalancer tick failed", e); + } + } + + void rebalance() { + Set allPools = allocator.getAllPoolNames(); + if (allPools.isEmpty()) return; + + long budget = budgetSupplier.get(); + if (budget <= 0 || budget == Long.MAX_VALUE) return; + + // Refresh stats from native layers + allocator.refreshStats(); + + // Snapshot per-pool state + Map snapshots = new HashMap<>(); + for (String name : allPools) { + long allocated = allocator.getAllocated(name); + long effectiveLimit = allocator.getEffectiveLimit(name); + long min = allocator.getPoolMin(name); + long max = allocator.getPoolMax(name); + double utilization = effectiveLimit > 0 ? (double) allocated / effectiveLimit : 0; + snapshots.put(name, new PoolSnapshot(allocated, effectiveLimit, min, max, utilization)); + } + + // Identify pressured pools — if none, nothing to do + Map desires = new HashMap<>(); + long totalDesired = 0; + for (var entry : snapshots.entrySet()) { + PoolSnapshot s = entry.getValue(); + if (s.utilization > pressureThreshold) { + long desired = Math.max(1, (long) (s.allocated * 0.25)); + desires.put(entry.getKey(), desired); + totalDesired += desired; + } + } + if (totalDesired == 0) { + logger.debug("Rebalancer: no pools under pressure, skipping"); + return; + } + + // Shrink idle pools, floor at min + long freedCapacity = 0; + for (var entry : snapshots.entrySet()) { + PoolSnapshot s = entry.getValue(); + if (s.utilization < idleThreshold) { + long newLimit = Math.max((long) (s.effectiveLimit * (1.0 - shrinkFactor)), s.min); + newLimit = Math.max(newLimit, s.allocated); + if (newLimit < s.effectiveLimit) { + freedCapacity += s.effectiveLimit - newLimit; + allocator.setPoolEffectiveLimit(entry.getKey(), newLimit); + s.effectiveLimit = newLimit; + } + } + } + + if 
(freedCapacity == 0) { + logger.debug("Rebalancer: no capacity freed from idle pools"); + return; + } + + // Distribute freed capacity to pressured pools (can exceed max) + long totalGranted = 0; + long grantCap = Math.min(freedCapacity, totalDesired); + for (var entry : desires.entrySet()) { + String name = entry.getKey(); + long desired = entry.getValue(); + PoolSnapshot s = snapshots.get(name); + long grant = (long) ((double) grantCap * desired / totalDesired); + grant = Math.min(grant, grantCap - totalGranted); + if (grant > 0) { + try { + long newLimit = s.effectiveLimit + grant; + allocator.setPoolEffectiveLimit(name, newLimit); + totalGranted += grant; + logger.debug("Rebalancer: grew pool [{}] by {} bytes to {} (max={})", name, grant, newLimit, s.max); + } catch (Exception e) { + logger.warn(() -> new ParameterizedMessage("Rebalancer: failed to grow pool [{}]", name), e); + } + } + } + + // Return any excess freed capacity back to idle pools + long excess = freedCapacity - totalGranted; + if (excess > 0) { + returnToIdlePools(snapshots, excess); + } + } + + // ─── Private helpers ───────────────────────────────────────────────────────── + + private void returnToIdlePools(Map snapshots, long capacity) { + long totalIdleSize = 0; + for (PoolSnapshot s : snapshots.values()) { + if (s.utilization < idleThreshold) { + totalIdleSize += s.effectiveLimit; + } + } + if (totalIdleSize == 0) return; + + long totalReturned = 0; + for (var entry : snapshots.entrySet()) { + PoolSnapshot s = entry.getValue(); + if (s.utilization < idleThreshold) { + long share = (long) ((double) capacity * s.effectiveLimit / totalIdleSize); + share = Math.min(share, capacity - totalReturned); + if (share > 0) { + long newLimit = s.effectiveLimit + share; + allocator.setPoolEffectiveLimit(entry.getKey(), newLimit); + s.effectiveLimit = newLimit; + totalReturned += share; + } + } + } + } + + /** + * Point-in-time snapshot of a pool's state during one rebalance tick. 
+ */ + static class PoolSnapshot { + final long allocated; + long effectiveLimit; + final long min; + final long max; + final double utilization; + + PoolSnapshot(long allocated, long effectiveLimit, long min, long max, double utilization) { + this.allocated = allocated; + this.effectiveLimit = effectiveLimit; + this.min = min; + this.max = max; + this.utilization = utilization; + } + } +} diff --git a/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowBasePluginTests.java b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowBasePluginTests.java index ad72b70b8cbbb..df2673a5a8a7b 100644 --- a/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowBasePluginTests.java +++ b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowBasePluginTests.java @@ -8,12 +8,11 @@ package org.opensearch.arrow.allocator; -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.OutOfMemoryException; import org.opensearch.arrow.spi.NativeAllocatorPoolConfig; import org.opensearch.common.settings.ClusterSettings; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Settings; +import org.opensearch.node.resource.tracker.ResourceTrackerSettings; import org.opensearch.test.OpenSearchTestCase; import java.util.HashSet; @@ -21,90 +20,23 @@ public class ArrowBasePluginTests extends OpenSearchTestCase { - public void testDeriveRootLimitDefaultUnsetReturnsLongMaxValue() { - // Explicit 0 expresses "AC unconfigured" — default is now ram - heap, so Settings.EMPTY - // would resolve to a real value on whatever machine the test runs on. 
- Settings s = Settings.builder().put("node.native_memory.limit", "0b").build(); - assertEquals(Long.toString(Long.MAX_VALUE), ArrowBasePlugin.deriveRootLimitDefault(s)); - } - - public void testDeriveRootLimitDefaultUsesAcLimitWhenSet() { - Settings s = Settings.builder().put("node.native_memory.limit", "1gb").build(); - // ROOT_LIMIT defaults to 20% of node.native_memory.limit — the Arrow framework gets a - // small fraction of native budget; DataFusion's Rust runtime takes the larger share. - long oneGiB = 1024L * 1024 * 1024; - assertEquals(Long.toString(oneGiB * 20 / 100), ArrowBasePlugin.deriveRootLimitDefault(s)); - } - - public void testDeriveRootLimitDefaultIgnoresBufferPercent() { - // node.native_memory.buffer_percent is admission control's throttle margin, not a - // framework budget reduction. The framework default takes its 20% fraction off - // node.native_memory.limit directly so AC's safety margin sits between AC's throttle - // threshold and the framework's hard cap rather than being collapsed into the cap. - // 1000 bytes limit, 20% buffer => root.limit still 20% of 1000 = 200. - Settings s = Settings.builder().put("node.native_memory.limit", "1000b").put("node.native_memory.buffer_percent", 20).build(); - assertEquals("200", ArrowBasePlugin.deriveRootLimitDefault(s)); - } - - public void testRootLimitSettingExposesDerivedDefault() { - Settings s = Settings.builder().put("node.native_memory.limit", "10gb").build(); - // 20% of 10 GiB. 
- long expected = 10L * 1024 * 1024 * 1024 * 20 / 100; - assertEquals(Long.valueOf(expected), ArrowBasePlugin.ROOT_LIMIT_SETTING.get(s)); - } - - public void testRootLimitSettingExplicitOverridesDerived() { - Settings s = Settings.builder() - .put("node.native_memory.limit", "8gb") - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 1024L) - .build(); - assertEquals(Long.valueOf(1024L), ArrowBasePlugin.ROOT_LIMIT_SETTING.get(s)); - } - - public void testRootLimitRejectsNegative() { - Settings s = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, -1L).build(); - IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> ArrowBasePlugin.ROOT_LIMIT_SETTING.get(s)); - assertTrue(e.getMessage().contains("must be >= 0")); - } - public void testQuerySettingsExposeDefaults() { // Explicit 0 expresses "AC unconfigured" so QUERY_MAX falls back to Long.MAX_VALUE. - // Settings.EMPTY would resolve via ram - heap default to a finite, machine-dependent value. Settings s = Settings.builder().put("node.native_memory.limit", "0b").build(); assertEquals(Long.valueOf(0L), ArrowBasePlugin.QUERY_MIN_SETTING.get(s)); assertEquals(Long.valueOf(Long.MAX_VALUE), ArrowBasePlugin.QUERY_MAX_SETTING.get(s)); } - public void testFlightAndIngestMinDefaultsToZero() { - // The grouped validator (validateMinSum) treats per-pool mins as a guarantee - // floor — defaults of Long.MAX_VALUE caused the validator to reject any PUT - // that set a non-MAX root. Pool mins must default to zero so the baseline - // configuration is consistent. 
- Settings s = Settings.EMPTY; - assertEquals(Long.valueOf(0L), ArrowBasePlugin.FLIGHT_MIN_SETTING.get(s)); - assertEquals(Long.valueOf(0L), ArrowBasePlugin.INGEST_MIN_SETTING.get(s)); - } - - public void testQuerySettingsAcceptValues() { - Settings s = Settings.builder() - .put(NativeAllocatorPoolConfig.SETTING_QUERY_MIN, 100L) - .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 1000L) - .build(); - assertEquals(Long.valueOf(100L), ArrowBasePlugin.QUERY_MIN_SETTING.get(s)); - assertEquals(Long.valueOf(1000L), ArrowBasePlugin.QUERY_MAX_SETTING.get(s)); + public void testFlightAndIngestMinDerivedFromBudget() { + // With node.native_memory.limit set, mins derive as percentages + Settings s = Settings.builder().put("node.native_memory.limit", "1gb").build(); + long budget = 1024L * 1024 * 1024; + // flight min = 2% of budget, ingest min = 4% of budget + assertEquals(Long.valueOf(budget * 2 / 100), ArrowBasePlugin.FLIGHT_MIN_SETTING.get(s)); + assertEquals(Long.valueOf(budget * 4 / 100), ArrowBasePlugin.INGEST_MIN_SETTING.get(s)); } - // -- Pool max defaults derived from node.native_memory.limit ---------- - // Pool maxes anchor to the operator's off-heap budget (node.native_memory.limit), - // not to native.allocator.root.limit. This matches the PR #21732 partitioning - // diagram where pool fractions (5%/8%/5%) are of native_memory.limit. Sum of - // pool maxes (18% of native_memory.limit) fits within root.limit (20% of - // native_memory.limit) by default, leaving 2 pp headroom inside the root cap. - public void testPoolMaxDefaultsAreLongMaxValueWhenAcUnset() { - // AC explicitly unconfigured — pool maxes default to Long.MAX_VALUE (unbounded), - // preserving pre-AC behaviour. The default for node.native_memory.limit is - // 79% of (ram - heap), so to test the "unset" branch we must explicitly set it to 0. 
Settings s = Settings.builder().put("node.native_memory.limit", "0b").build(); assertEquals(Long.valueOf(Long.MAX_VALUE), ArrowBasePlugin.FLIGHT_MAX_SETTING.get(s)); assertEquals(Long.valueOf(Long.MAX_VALUE), ArrowBasePlugin.INGEST_MAX_SETTING.get(s)); @@ -112,10 +44,6 @@ public void testPoolMaxDefaultsAreLongMaxValueWhenAcUnset() { } public void testPoolMaxDefaultsScaleFromAcBudget() { - // 10 GiB native memory limit. Pool maxes per the partitioning model in PR #21732: - // FLIGHT_MAX = 5% INGEST_MAX = 8% QUERY_MAX = 5% - // Anchored to node.native_memory.limit, not to root.limit (which defaults to 20% - // of native_memory.limit) — see derivePoolMaxDefault Javadoc. Settings s = Settings.builder().put("node.native_memory.limit", "10gb").build(); long limit = 10L * 1024 * 1024 * 1024; assertEquals(Long.valueOf(limit * 5 / 100), ArrowBasePlugin.FLIGHT_MAX_SETTING.get(s)); @@ -123,51 +51,14 @@ public void testPoolMaxDefaultsScaleFromAcBudget() { assertEquals(Long.valueOf(limit * 5 / 100), ArrowBasePlugin.QUERY_MAX_SETTING.get(s)); } - public void testPoolMaxDefaultsIgnoreRootLimitOverride() { - // Pool maxes anchor to node.native_memory.limit, not to root.limit. An operator - // who overrides root.limit (e.g. to 4 GiB instead of the default 20% of - // native_memory.limit = 2 GiB) does not shrink pool defaults proportionally; - // the diagrammed partitioning of native_memory.limit holds. 
- Settings s = Settings.builder() - .put("node.native_memory.limit", "10gb") - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 4L * 1024 * 1024 * 1024) - .build(); - long limit = 10L * 1024 * 1024 * 1024; - assertEquals(Long.valueOf(limit * 5 / 100), ArrowBasePlugin.FLIGHT_MAX_SETTING.get(s)); - assertEquals(Long.valueOf(limit * 8 / 100), ArrowBasePlugin.INGEST_MAX_SETTING.get(s)); - assertEquals(Long.valueOf(limit * 5 / 100), ArrowBasePlugin.QUERY_MAX_SETTING.get(s)); - } - public void testPoolMaxDefaultsIgnoreBufferPercent() { - // node.native_memory.buffer_percent is AC's throttle margin, not a framework budget - // reduction. Pool maxes default off node.native_memory.limit directly so AC's safety - // margin sits between AC's throttle threshold and the framework's hard cap rather than - // being collapsed into the cap. - // 1000 bytes limit, 20% buffer => pool maxes are still 5/8/5% of 1000 = 50/80/50. Settings s = Settings.builder().put("node.native_memory.limit", "1000b").put("node.native_memory.buffer_percent", 20).build(); assertEquals(Long.valueOf(50L), ArrowBasePlugin.FLIGHT_MAX_SETTING.get(s)); assertEquals(Long.valueOf(80L), ArrowBasePlugin.INGEST_MAX_SETTING.get(s)); assertEquals(Long.valueOf(50L), ArrowBasePlugin.QUERY_MAX_SETTING.get(s)); } - public void testPoolMaxExplicitOverridesDerived() { - // Operator-set values must win over derived defaults. 
- Settings s = Settings.builder() - .put("node.native_memory.limit", "10gb") - .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, 7L) - .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 8L) - .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 9L) - .build(); - assertEquals(Long.valueOf(7L), ArrowBasePlugin.FLIGHT_MAX_SETTING.get(s)); - assertEquals(Long.valueOf(8L), ArrowBasePlugin.INGEST_MAX_SETTING.get(s)); - assertEquals(Long.valueOf(9L), ArrowBasePlugin.QUERY_MAX_SETTING.get(s)); - } - public void testPoolMaxRejectsNegative() { - // Negative pool max is rejected at parse time, mirroring ROOT_LIMIT_SETTING. - // Each pool's parser has its own message so we exercise all three to lock down - // the per-pool error contract (and keep coverage honest on what is otherwise - // boilerplate-but-distinct branches). Settings flight = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, -1L).build(); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> ArrowBasePlugin.FLIGHT_MAX_SETTING.get(flight)); assertTrue(e.getMessage().contains("must be >= 0")); @@ -188,253 +79,75 @@ public void testPoolMaxRejectsNegative() { } // ----------------------------------------------------------------- - // End-to-end wiring tests — verify that Setting.Property.Dynamic settings - // actually flow through to the live allocator. These guard against the - // "dynamic in name only" failure mode where a setting parses, the validator - // runs, the cluster-state update succeeds, and the runtime component - // silently does nothing because the addSettingsUpdateConsumer line was - // never registered. Bare-setter unit tests do not catch this; tests must - // drive a real ClusterSettings#applySettings round-trip. 
+ // End-to-end wiring tests // ----------------------------------------------------------------- - /** - * Builds a {@link ClusterSettings} preloaded with all of {@link ArrowBasePlugin}'s - * settings, mirroring what {@code SettingsModule} does at node startup. Returns the - * fresh allocator with the framework's pools created and consumers registered - * — the same wiring path {@code createComponents} runs. - */ - private static ArrowNativeAllocator newWiredAllocator(Settings nodeSettings, ClusterSettings cs) { - long rootLimit = ArrowBasePlugin.ROOT_LIMIT_SETTING.get(nodeSettings); - ArrowNativeAllocator allocator = new ArrowNativeAllocator(rootLimit); - allocator.setRebalanceInterval(ArrowBasePlugin.REBALANCE_INTERVAL_SETTING.get(nodeSettings)); - allocator.getOrCreatePool( - NativeAllocatorPoolConfig.POOL_FLIGHT, - ArrowBasePlugin.FLIGHT_MIN_SETTING.get(nodeSettings), - ArrowBasePlugin.FLIGHT_MAX_SETTING.get(nodeSettings) - ); - allocator.getOrCreatePool( - NativeAllocatorPoolConfig.POOL_INGEST, - ArrowBasePlugin.INGEST_MIN_SETTING.get(nodeSettings), - ArrowBasePlugin.INGEST_MAX_SETTING.get(nodeSettings) - ); - allocator.getOrCreatePool( - NativeAllocatorPoolConfig.POOL_QUERY, - ArrowBasePlugin.QUERY_MIN_SETTING.get(nodeSettings), - ArrowBasePlugin.QUERY_MAX_SETTING.get(nodeSettings) - ); - ArrowBasePlugin.registerSettingsUpdateConsumers(cs, allocator); - return allocator; - } - private static ClusterSettings newClusterSettings(Settings nodeSettings) { Set> registered = new HashSet<>(); registered.addAll(new ArrowBasePlugin().getSettings()); return new ClusterSettings(nodeSettings, registered); } - public void testBuildAllocatorWiresAllPoolsAndSettingsConsumers() { - // Verifies the full createComponents code path — the helper extracted from - // createComponents builds the allocator, creates all three pools (FLIGHT, INGEST, - // QUERY), and registers the cluster-settings update consumers. 
We bypass the - // heavyweight ClusterService fixture and inject a real ClusterSettings directly, - // which is what production wiring also passes through to buildAllocator after - // unpacking the createComponents arguments. + public void testBuildAllocatorWiresAllPools() throws Exception { Settings nodeSettings = Settings.builder() - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024) + .put("node.native_memory.limit", "10gb") .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, 1L * 1024 * 1024 * 1024) .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 2L * 1024 * 1024 * 1024) .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 1L * 1024 * 1024 * 1024) + .put("native.allocator.rebalancer.enabled", false) .build(); ClusterSettings cs = newClusterSettings(nodeSettings); - ArrowNativeAllocator allocator = ArrowBasePlugin.buildAllocator(nodeSettings, cs); + ArrowBasePlugin plugin = new ArrowBasePlugin(); + long budget = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(nodeSettings).getBytes(); + ArrowNativeAllocator allocator = plugin.buildAllocator(nodeSettings, cs, () -> budget); try { - // All three pools created. Set poolNames = allocator.getPoolNames(); assertEquals("buildAllocator must register exactly the framework's three pools", 3, poolNames.size()); assertTrue(poolNames.contains(NativeAllocatorPoolConfig.POOL_FLIGHT)); assertTrue(poolNames.contains(NativeAllocatorPoolConfig.POOL_INGEST)); assertTrue(poolNames.contains(NativeAllocatorPoolConfig.POOL_QUERY)); - // Pool maxes match the operator-set values (rebalancer disabled by default, + // Pool maxes match the operator-set values (rebalancer disabled, // so initial limit == max). 
assertEquals(1L * 1024 * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_FLIGHT).getLimit()); assertEquals(2L * 1024 * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_INGEST).getLimit()); assertEquals(1L * 1024 * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY).getLimit()); - - // Cluster-settings update consumers are registered: a PUT to a pool max must - // propagate to the live allocator. - cs.applySettings( - Settings.builder() - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024) - .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 4L * 1024 * 1024 * 1024) - .build() - ); - assertEquals( - "buildAllocator must wire the INGEST_MAX cluster-settings consumer", - 4L * 1024 * 1024 * 1024, - allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_INGEST).getLimit() - ); - } finally { - allocator.close(); - } - } - - public void testQueryMaxClusterSettingPropagatesToAllocator() { - // The full wired path: node starts at default settings, plugin registers - // consumers, operator PUTs a new max via _cluster/settings. 
- Settings nodeSettings = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024).build(); - ClusterSettings cs = newClusterSettings(nodeSettings); - ArrowNativeAllocator allocator = newWiredAllocator(nodeSettings, cs); - try { - cs.applySettings( - Settings.builder() - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024) - .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 1024L * 1024 * 1024) - .build() - ); - assertEquals( - "PUT to query max must update the live BufferAllocator limit", - 1024L * 1024 * 1024, - allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY).getLimit() - ); - assertEquals(1024L * 1024 * 1024, allocator.getPoolMax(NativeAllocatorPoolConfig.POOL_QUERY)); - } finally { - allocator.close(); - } - } - - public void testFlightMinClusterSettingPropagatesToAllocator() { - // Min is the regression-prone path: prior to the live-propagation fix, - // setPoolMin only updated the poolMins map and operators got HTTP 200 with - // no observable behavior change. - Settings nodeSettings = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024).build(); - ClusterSettings cs = newClusterSettings(nodeSettings); - ArrowNativeAllocator allocator = newWiredAllocator(nodeSettings, cs); - try { - // Pool starts at max (rebalancer disabled by default), so a min PUT below - // the current limit is a no-op on the live limit but updates poolMins. - // Use a min ABOVE the current limit to force the live raise path. 
- cs.applySettings( - Settings.builder() - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024) - .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, 4L * 1024 * 1024 * 1024) - .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN, 2L * 1024 * 1024 * 1024) - .build() - ); - assertEquals( - "PUT to flight min must update the recorded min for the rebalancer", - 2L * 1024 * 1024 * 1024, - allocator.getPoolMin(NativeAllocatorPoolConfig.POOL_FLIGHT) - ); - assertTrue( - "PUT to flight min must raise the live BufferAllocator limit when min exceeds current", - allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_FLIGHT).getLimit() >= 2L * 1024 * 1024 * 1024 - ); - } finally { - allocator.close(); - } - } - - public void testRootLimitClusterSettingPropagatesToAllocator() { - Settings nodeSettings = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024).build(); - ClusterSettings cs = newClusterSettings(nodeSettings); - ArrowNativeAllocator allocator = newWiredAllocator(nodeSettings, cs); - try { - cs.applySettings(Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 16L * 1024 * 1024 * 1024).build()); - assertEquals( - "PUT to root limit must update the RootAllocator's limit", - 16L * 1024 * 1024 * 1024, - allocator.getRootAllocator().getLimit() - ); } finally { allocator.close(); + plugin.close(); } } - public void testValidatorRejectsSumOfMinsExceedingRoot() { - // The cross-setting grouped validator must reject PUTs that would over- - // subscribe the root. Test the rejection path end-to-end through ClusterSettings. - // Set node.native_memory.limit=0b explicitly so pool maxes default to Long.MAX_VALUE - // — minmax path firing first. 
+ public void testBuildAllocatorWithRebalancerPoolsStartAtMax() throws Exception { Settings nodeSettings = Settings.builder() - .put("node.native_memory.limit", "0b") - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 10L * 1024 * 1024 * 1024) + .put("node.native_memory.limit", "1gb") + .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN, 10L * 1024 * 1024) + .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, 200L * 1024 * 1024) + .put(NativeAllocatorPoolConfig.SETTING_INGEST_MIN, 20L * 1024 * 1024) + .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 200L * 1024 * 1024) + .put(NativeAllocatorPoolConfig.SETTING_QUERY_MIN, 10L * 1024 * 1024) + .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 200L * 1024 * 1024) + .put("native.allocator.rebalancer.enabled", true) + .put("native.allocator.rebalance.interval_seconds", 5L) .build(); ClusterSettings cs = newClusterSettings(nodeSettings); - ArrowNativeAllocator allocator = newWiredAllocator(nodeSettings, cs); + + ArrowBasePlugin plugin = new ArrowBasePlugin(); + long budget2 = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(nodeSettings).getBytes(); + ArrowNativeAllocator allocator = plugin.buildAllocator(nodeSettings, cs, () -> budget2); try { - // root=10gb, flight_min=6gb, ingest_min=6gb => sum_mins=12gb > root=10gb. 
- IllegalArgumentException e = expectThrows( - IllegalArgumentException.class, - () -> cs.applySettings( - Settings.builder() - .put("node.native_memory.limit", "0b") - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 10L * 1024 * 1024 * 1024) - .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN, 6L * 1024 * 1024 * 1024) - .put(NativeAllocatorPoolConfig.SETTING_INGEST_MIN, 6L * 1024 * 1024 * 1024) - .build() - ) - ); - assertTrue( - "expected sum-exceeds-root in error, got: " + e.getMessage(), - e.getMessage().contains("exceeds root limit") || e.getMessage().contains("Sum of pool minimums") - ); + // Pools always start at max regardless of rebalancer state + assertEquals(200L * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_FLIGHT).getLimit()); + assertEquals(200L * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_INGEST).getLimit()); + assertEquals(200L * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY).getLimit()); } finally { allocator.close(); + plugin.close(); } } - public void testChildAllocatorInheritsParentCapAfterPoolLimitUpdate() { - // Sanity check for the AnalyticsSearchService / FlightTransport pattern: - // when a consumer creates a child of the framework's pool with Long.MAX_VALUE - // limit, a PUT to the pool's max takes effect on the child's allocations - // automatically via Arrow's parent-cap check at allocateBytes — no listener needed. - // - // The contract we rely on (Arrow Accountant.allocate, lines 191-203 in 18.3.0): - // when the child's reservation is exhausted, it calls parent.allocate(...) which - // checks the parent's allocationLimit on every allocation. Setting the child's own - // limit to Long.MAX_VALUE means the child has no own-cap on top of the parent's; - // setLimit on the parent is observed atomically by all subsequent allocations - // through any descendant. 
- Settings nodeSettings = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024).build(); - ClusterSettings cs = newClusterSettings(nodeSettings); - ArrowNativeAllocator allocator = newWiredAllocator(nodeSettings, cs); - try { - BufferAllocator queryPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY); - BufferAllocator child = queryPool.newChildAllocator("consumer", 0, Long.MAX_VALUE); - try { - // Step 1: a small allocation through the child succeeds with the original pool max. - try (var buf = child.buffer(1024)) { - assertEquals("child accounting reflects allocation", 1024L, child.getAllocatedMemory()); - assertEquals("parent pool sees child allocation", 1024L, queryPool.getAllocatedMemory()); - } - - // Step 2: PUT a small pool max via cluster settings. - cs.applySettings( - Settings.builder() - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024) - .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 1L * 1024 * 1024) // 1 MB - .build() - ); - assertEquals("pool's own limit reflects the PUT", 1L * 1024 * 1024, queryPool.getLimit()); - assertEquals("child's own limit is intentionally uncapped", Long.MAX_VALUE, child.getLimit()); - - // Step 3: allocations within the new parent cap still work. - try (var withinCap = child.buffer(512 * 1024)) { // 512 KB, under 1 MB cap - assertEquals(512L * 1024, child.getAllocatedMemory()); - } - - // Step 4: allocation exceeding the new parent cap fails — this is the - // behavior the deleted listener pattern was emulating, now provided - // natively by Arrow's parent-cap check. 
- expectThrows(OutOfMemoryException.class, () -> child.buffer(2L * 1024 * 1024)); // 2 MB, over 1 MB cap - } finally { - child.close(); - } - } finally { - allocator.close(); - } + public void testRebalanceIntervalSettingDefault() { + assertEquals(Long.valueOf(5L), ArrowBasePlugin.REBALANCE_INTERVAL_SETTING.get(Settings.EMPTY)); } } diff --git a/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowNativeAllocatorTests.java b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowNativeAllocatorTests.java index dc10a93bd6b74..fae60d010e5ed 100644 --- a/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowNativeAllocatorTests.java +++ b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowNativeAllocatorTests.java @@ -10,6 +10,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.opensearch.arrow.spi.NativeAllocator; +import org.opensearch.arrow.spi.PoolGroup; import org.opensearch.plugin.stats.NativeAllocatorPoolStats; import org.opensearch.test.OpenSearchTestCase; @@ -20,7 +21,7 @@ public class ArrowNativeAllocatorTests extends OpenSearchTestCase { @Override public void setUp() throws Exception { super.setUp(); - allocator = new ArrowNativeAllocator(1024L * 1024 * 1024); // 1 GB for tests + allocator = new ArrowNativeAllocator(); } @Override @@ -30,21 +31,21 @@ public void tearDown() throws Exception { } public void testCreatePool() { - NativeAllocator.PoolHandle handle = allocator.getOrCreatePool("test-pool", 100 * 1024 * 1024); + NativeAllocator.PoolHandle handle = allocator.getOrCreatePool("test-pool", 0L, 100 * 1024 * 1024, null); assertNotNull(handle); assertEquals(100 * 1024 * 1024, handle.limit()); assertEquals(0, handle.allocatedBytes()); } public void testGetOrCreatePoolIdempotent() { - NativeAllocator.PoolHandle first = allocator.getOrCreatePool("idempotent", 50 * 1024 * 1024); - NativeAllocator.PoolHandle second = allocator.getOrCreatePool("idempotent", 999 * 1024 * 1024); + 
NativeAllocator.PoolHandle first = allocator.getOrCreatePool("idempotent", 0L, 50 * 1024 * 1024, null); + NativeAllocator.PoolHandle second = allocator.getOrCreatePool("idempotent", 0L, 999 * 1024 * 1024, null); assertSame(first, second); assertEquals(50 * 1024 * 1024, second.limit()); } public void testPoolChildAllocation() { - allocator.getOrCreatePool("parent", 200 * 1024 * 1024); + allocator.getOrCreatePool("parent", 0L, 200 * 1024 * 1024, null); BufferAllocator child = allocator.getPoolAllocator("parent").newChildAllocator("child-1", 0, 50 * 1024 * 1024); try { child.buffer(1024).close(); @@ -55,7 +56,7 @@ public void testPoolChildAllocation() { } public void testSetPoolLimit() { - allocator.getOrCreatePool("resizable", 100 * 1024 * 1024); + allocator.getOrCreatePool("resizable", 0L, 100 * 1024 * 1024, null); allocator.setPoolLimit("resizable", 200 * 1024 * 1024); assertEquals(200 * 1024 * 1024, allocator.getPoolAllocator("resizable").getLimit()); } @@ -68,147 +69,54 @@ public void testGetPoolAllocatorNonExistent() { expectThrows(IllegalStateException.class, () -> allocator.getPoolAllocator("ghost")); } - public void testSetRootLimit() { - allocator.setRootLimit(512 * 1024 * 1024); - assertEquals(512 * 1024 * 1024, allocator.getRootAllocator().getLimit()); - } - public void testStats() { - allocator.getOrCreatePool("stats-pool", 64 * 1024 * 1024); + allocator.getOrCreatePool("stats-pool", 0L, 64 * 1024 * 1024, PoolGroup.SEARCH); NativeAllocatorPoolStats stats = allocator.stats(); assertNotNull(stats); - assertEquals(1024 * 1024 * 1024, stats.getRootLimitBytes()); - assertEquals(0, stats.getRootAllocatedBytes()); + assertEquals(-1, stats.getNativeAllocatedBytes()); + assertEquals(-1, stats.getNativeResidentBytes()); assertEquals(1, stats.getPools().size()); NativeAllocatorPoolStats.PoolStats poolStats = stats.getPools().get(0); assertEquals("stats-pool", poolStats.getName()); assertEquals(64 * 1024 * 1024, poolStats.getLimitBytes()); assertEquals(0, 
poolStats.getAllocatedBytes()); - // child_count is no longer rendered in stats; getPoolAllocator(...).getChildAllocators() - // is the runtime accessor for that detail if needed. } public void testStatsMultiplePools() { - allocator.getOrCreatePool("pool-a", 100 * 1024 * 1024); - allocator.getOrCreatePool("pool-b", 200 * 1024 * 1024); + allocator.getOrCreatePool("pool-a", 0L, 100 * 1024 * 1024, null); + allocator.getOrCreatePool("pool-b", 0L, 200 * 1024 * 1024, null); NativeAllocatorPoolStats stats = allocator.stats(); assertEquals(2, stats.getPools().size()); } public void testGetPoolNames() { - allocator.getOrCreatePool("alpha", 10 * 1024 * 1024); - allocator.getOrCreatePool("beta", 20 * 1024 * 1024); + allocator.getOrCreatePool("alpha", 0L, 10 * 1024 * 1024, null); + allocator.getOrCreatePool("beta", 0L, 20 * 1024 * 1024, null); assertTrue(allocator.getPoolNames().contains("alpha")); assertTrue(allocator.getPoolNames().contains("beta")); assertEquals(2, allocator.getPoolNames().size()); } - public void testRebalanceDistributesHeadroomToAllPools() { - allocator.setRootLimit(100 * 1024 * 1024); - allocator.getOrCreatePool("active", 10 * 1024 * 1024, 100 * 1024 * 1024); - allocator.getOrCreatePool("idle", 10 * 1024 * 1024, 100 * 1024 * 1024); - - // Simulate activity: allocate in "active" pool. - BufferAllocator activeAlloc = allocator.getPoolAllocator("active"); - BufferAllocator child = activeAlloc.newChildAllocator("worker", 0, 100 * 1024 * 1024); - var buf = child.buffer(5 * 1024 * 1024); - - try { - allocator.rebalance(); - - // Active pool gets bonus headroom on top of its min. 
- long activeLimit = activeAlloc.getLimit(); - assertTrue("Active pool limit should exceed min after rebalance, got " + activeLimit, activeLimit > 10 * 1024 * 1024); - - // Idle pool also receives headroom: distributing to all pools (not just - // currently-active ones) avoids the dead-pool corner case where a pool - // with min = 0 starts at limit = 0 and can never make a first allocation. - // Idle pools that don't end up needing the headroom return it on the next - // tick once they remain at zero allocation. - long idleLimit = allocator.getPoolAllocator("idle").getLimit(); - assertTrue("Idle pool should also receive headroom, got " + idleLimit, idleLimit > 10 * 1024 * 1024); - } finally { - buf.close(); - child.close(); - } - } - - public void testRebalanceLetsZeroMinPoolAllocate() { - // Regression test: under the previous "active pools only" rebalance algorithm, - // a pool with min = 0 would start at limit = 0 (rebalancer-on path), be unable - // to allocate, never become "active", and so never receive a bonus — permanently - // dead. Distributing headroom across all pools fixes the chicken-and-egg. 
- allocator.setRebalanceInterval(60); - allocator.setRootLimit(100 * 1024 * 1024); - allocator.getOrCreatePool("zero-min", 0L, 100 * 1024 * 1024); - try { - allocator.rebalance(); - BufferAllocator pool = allocator.getPoolAllocator("zero-min"); - assertTrue("Zero-min pool should receive headroom, got " + pool.getLimit(), pool.getLimit() > 0); - } finally { - allocator.setRebalanceInterval(0); - } - } - - public void testRebalanceNeverDropsBelowCurrentAllocation() { - allocator.setRootLimit(50 * 1024 * 1024); - allocator.getOrCreatePool("busy", 10 * 1024 * 1024); - - BufferAllocator pool = allocator.getPoolAllocator("busy"); - BufferAllocator child = pool.newChildAllocator("w", 0, 10 * 1024 * 1024); - var buf = child.buffer(8 * 1024 * 1024); // 8 MB allocated - - try { - allocator.rebalance(); - assertTrue("Limit should never drop below current allocation", pool.getLimit() >= pool.getAllocatedMemory()); - } finally { - buf.close(); - child.close(); - } - } - - public void testRebalanceWithNoPools() { - // Should not throw - allocator.rebalance(); - } - - public void testInitialLimitIsMaxWhenRebalancerDisabled() { - // Default tearDown allocator has rebalancer disabled (interval=0). - NativeAllocator.PoolHandle handle = allocator.getOrCreatePool("burst", 10 * 1024 * 1024, 100 * 1024 * 1024); - // With the rebalancer off, pools must start at their max so consumers can allocate - // immediately. Otherwise default-configured pools (min=0) would reject everything. + public void testInitialLimitIsMax() { + NativeAllocator.PoolHandle handle = allocator.getOrCreatePool("burst", 10 * 1024 * 1024, 100 * 1024 * 1024, null); assertEquals(100 * 1024 * 1024, handle.limit()); } - public void testInitialLimitIsMinWhenRebalancerEnabled() { - // Enabling the rebalancer reverts to the original "guarantee + burst" semantics: - // pools start at min and grow via the next rebalance tick. 
- allocator.setRebalanceInterval(60); // any positive value enables the flag - NativeAllocator.PoolHandle handle = allocator.getOrCreatePool("guaranteed", 10 * 1024 * 1024, 100 * 1024 * 1024); - assertEquals(10 * 1024 * 1024, handle.limit()); - // Disable so subsequent tests aren't affected by the scheduled task. - allocator.setRebalanceInterval(0); - } - public void testCloseReleasesAllPools() { - allocator.getOrCreatePool("close-test", 10 * 1024 * 1024); + allocator.getOrCreatePool("close-test", 0L, 10 * 1024 * 1024, null); allocator.close(); assertTrue(allocator.getPoolNames().isEmpty()); // Recreate for tearDown - allocator = new ArrowNativeAllocator(1024L * 1024 * 1024); + allocator = new ArrowNativeAllocator(); } - public void testSetPoolMinRaisesLiveLimitWhenRebalancerOff() { - // setPoolMin must affect the live BufferAllocator immediately, not just the - // poolMins map. Otherwise it's a Dynamic setting that returns HTTP 200 and - // does nothing observable until the operator also enables the rebalancer. - allocator.setRootLimit(100 * 1024 * 1024); - allocator.getOrCreatePool("p", 0L, 100 * 1024 * 1024); + public void testSetPoolMinRaisesLiveLimitWhenNeeded() { + allocator.getOrCreatePool("p", 0L, 100 * 1024 * 1024, null); BufferAllocator pool = allocator.getPoolAllocator("p"); long startLimit = pool.getLimit(); @@ -222,11 +130,7 @@ public void testSetPoolMinRaisesLiveLimitWhenRebalancerOff() { } public void testSetPoolMinDoesNotShrinkLiveLimit() { - // Dropping the min must not shrink an in-flight pool — the rebalancer is the - // only path that reduces limits, so a min change on its own should never - // reclaim capacity. 
- allocator.setRootLimit(100 * 1024 * 1024); - allocator.getOrCreatePool("p", 0L, 100 * 1024 * 1024); + allocator.getOrCreatePool("p", 0L, 100 * 1024 * 1024, null); BufferAllocator pool = allocator.getPoolAllocator("p"); long startLimit = pool.getLimit(); diff --git a/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/NativeMemoryRebalancerTests.java b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/NativeMemoryRebalancerTests.java new file mode 100644 index 0000000000000..b92a477a67ea3 --- /dev/null +++ b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/NativeMemoryRebalancerTests.java @@ -0,0 +1,152 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.arrow.allocator; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.ArrayList; +import java.util.List; + +public class NativeMemoryRebalancerTests extends OpenSearchTestCase { + + private static final long MB = 1024L * 1024; + private static final long BUDGET = 100 * MB; + + private ArrowNativeAllocator allocator; + private NativeMemoryRebalancer rebalancer; + + @Override + public void setUp() throws Exception { + super.setUp(); + allocator = new ArrowNativeAllocator(); + allocator.setBudget(BUDGET); + rebalancer = new NativeMemoryRebalancer(allocator, () -> BUDGET, 0.75, 0.50, 0.10); + } + + @Override + public void tearDown() throws Exception { + allocator.close(); + super.tearDown(); + } + + public void testPoolsStartAtMax() { + allocator.getOrCreatePool("a", 5 * MB, 40 * MB, null); + allocator.getOrCreatePool("b", 10 * MB, 50 * MB, null); + + assertEquals(40 * MB, allocator.getPoolAllocator("a").getLimit()); + assertEquals(50 * MB, allocator.getPoolAllocator("b").getLimit()); + 
} + + public void testShrinksIdlePool() { + allocator.getOrCreatePool("idle", 5 * MB, 50 * MB, null); + allocator.getOrCreatePool("pressured", 5 * MB, 50 * MB, null); + + BufferAllocator pressuredPool = allocator.getPoolAllocator("pressured"); + ArrowBuf buf = pressuredPool.buffer((long) (50 * MB * 0.8)); + + try { + long idleLimitBefore = allocator.getPoolAllocator("idle").getLimit(); + rebalancer.rebalance(); + long idleLimitAfter = allocator.getPoolAllocator("idle").getLimit(); + assertTrue("Idle pool should shrink, was " + idleLimitBefore + " now " + idleLimitAfter, idleLimitAfter < idleLimitBefore); + } finally { + buf.close(); + } + } + + public void testGrowsPressuredPoolAboveMax() { + allocator.getOrCreatePool("idle", 5 * MB, 50 * MB, null); + allocator.getOrCreatePool("pressured", 5 * MB, 20 * MB, null); + + BufferAllocator pressuredPool = allocator.getPoolAllocator("pressured"); + ArrowBuf buf = pressuredPool.buffer((long) (20 * MB * 0.8)); + + try { + rebalancer.rebalance(); + long pressuredLimit = pressuredPool.getLimit(); + assertTrue("Pressured pool should grow above max (20MB), got " + pressuredLimit, pressuredLimit > 20 * MB); + } finally { + buf.close(); + } + } + + public void testNeverDropsBelowMin() { + allocator.getOrCreatePool("floored", 10 * MB, 50 * MB, null); + allocator.getOrCreatePool("pressured", 5 * MB, 50 * MB, null); + + BufferAllocator pressuredPool = allocator.getPoolAllocator("pressured"); + ArrowBuf buf = pressuredPool.buffer((long) (50 * MB * 0.8)); + + try { + for (int i = 0; i < 20; i++) { + rebalancer.rebalance(); + } + long flooredLimit = allocator.getPoolAllocator("floored").getLimit(); + assertTrue("Pool limit (" + flooredLimit + ") should not drop below min (10MB)", flooredLimit >= 10 * MB); + } finally { + buf.close(); + } + } + + public void testNoActionWhenNoPressure() { + allocator.getOrCreatePool("a", 5 * MB, 50 * MB, null); + allocator.getOrCreatePool("b", 5 * MB, 50 * MB, null); + + long limitA = 
allocator.getPoolAllocator("a").getLimit(); + long limitB = allocator.getPoolAllocator("b").getLimit(); + + rebalancer.rebalance(); + + assertEquals(limitA, allocator.getPoolAllocator("a").getLimit()); + assertEquals(limitB, allocator.getPoolAllocator("b").getLimit()); + } + + public void testResetAllPoolsToMax() { + allocator.getOrCreatePool("a", 5 * MB, 40 * MB, null); + allocator.getOrCreatePool("b", 5 * MB, 50 * MB, null); + + BufferAllocator bPool = allocator.getPoolAllocator("b"); + ArrowBuf buf = bPool.buffer((long) (50 * MB * 0.8)); + rebalancer.rebalance(); + buf.close(); + + allocator.resetAllPoolsToMax(); + + assertEquals(40 * MB, allocator.getPoolAllocator("a").getLimit()); + assertEquals(50 * MB, allocator.getPoolAllocator("b").getLimit()); + } + + public void testSumLimitsNeverExceedsBudget() { + allocator.getOrCreatePool("p1", 5 * MB, 30 * MB, null); + allocator.getOrCreatePool("p2", 5 * MB, 30 * MB, null); + allocator.getOrCreatePool("p3", 5 * MB, 30 * MB, null); + + List bufs = new ArrayList<>(); + for (String name : new String[] { "p1", "p2", "p3" }) { + BufferAllocator pool = allocator.getPoolAllocator(name); + bufs.add(pool.buffer((long) (pool.getLimit() * 0.8))); + } + + try { + for (int i = 0; i < 10; i++) { + rebalancer.rebalance(); + } + + long sumLimits = 0; + for (String name : new String[] { "p1", "p2", "p3" }) { + sumLimits += allocator.getPoolAllocator(name).getLimit(); + } + assertTrue("Sum of limits (" + sumLimits + ") should not exceed budget (" + BUDGET + ")", sumLimits <= BUDGET); + } finally { + bufs.forEach(ArrowBuf::close); + } + } +} diff --git a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/BackpressureProducerIT.java b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/BackpressureProducerIT.java index ff7f1359df089..a32a0ffe8bc58 100644 --- a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/BackpressureProducerIT.java +++ 
b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/BackpressureProducerIT.java @@ -122,9 +122,10 @@ protected Settings nodeSettings(int nodeOrdinal) { return Settings.builder() .put(super.nodeSettings(nodeOrdinal)) .put("node.native_memory.limit", "512mb") - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 256 * MB) .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, FLIGHT_POOL_CAP_BYTES) + .put(NativeAllocatorPoolConfig.SETTING_INGEST_MIN, 8 * MB) .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 16 * MB) + .put(NativeAllocatorPoolConfig.SETTING_QUERY_MIN, 8 * MB) .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 16 * MB) .put(ServerConfig.FLIGHT_OUTBOUND_BUFFER_THRESHOLD.getKey(), new ByteSizeValue(GRPC_THRESHOLD_BYTES, ByteSizeUnit.BYTES)) .build(); diff --git a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeAllocatorBoundaryIT.java b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeAllocatorBoundaryIT.java index 5633cfa429c5d..d849458c0b048 100644 --- a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeAllocatorBoundaryIT.java +++ b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeAllocatorBoundaryIT.java @@ -12,7 +12,6 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; -import org.opensearch.action.admin.cluster.settings.ClusterUpdateSettingsResponse; import org.opensearch.arrow.allocator.ArrowBasePlugin; import org.opensearch.arrow.allocator.ArrowNativeAllocator; import org.opensearch.arrow.spi.NativeAllocatorPoolConfig; @@ -31,27 +30,17 @@ * *

Boots a single-node cluster with tight memory settings, then exercises * the actual Arrow allocation path to verify that the framework's - * configured caps are enforced at allocation time (not just at config-parse - * time). Complements unit-level tests in {@code ArrowBasePluginTests} by - * verifying that the production wiring (Guice -> ArrowNativeAllocator -> - * Arrow's RootAllocator chain) honors the caps end-to-end. - * - *

Each test sets explicit byte limits and allocates real - * {@link org.apache.arrow.memory.ArrowBuf} buffers, asserting either - * successful allocation or {@link OutOfMemoryException} based on whether - * the request fits within the configured cap. + * configured caps are enforced at allocation time. */ @ThreadLeakScope(ThreadLeakScope.Scope.NONE) @OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, minNumDataNodes = 1, maxNumDataNodes = 1) public class NativeAllocatorBoundaryIT extends OpenSearchIntegTestCase { - /** 1 MiB. Chosen small enough that tests run fast but large enough that - * Arrow's internal accounting doesn't round it away. */ + /** 1 MiB. */ private static final long MB = 1024L * 1024; - /** Cap large enough for the framework's own bookkeeping but small enough - * to trigger OOM well before exhausting host memory. */ - private static final long ROOT_CAP_BYTES = 16 * MB; + /** Per-pool cap for tests. */ + private static final long POOL_CAP_BYTES = 16 * MB; @Override protected Collection> nodePlugins() { @@ -60,127 +49,83 @@ protected Collection> nodePlugins() { @Override protected Settings nodeSettings(int nodeOrdinal) { - // Set node.native_memory.limit explicitly so framework defaults derive - // from a known value rather than the (machine-dependent) ram-heap default. - // ROOT_LIMIT and pool maxes are then overridden per-test via cluster - // settings PUT or directly via this node-settings layer. return Settings.builder() .put(super.nodeSettings(nodeOrdinal)) .put("node.native_memory.limit", "256mb") - // Tight root cap: 16 MiB total Arrow framework budget. - .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, ROOT_CAP_BYTES) - // Per-pool maxes set generously so per-pool caps don't trip - // before root.limit. Tests targeting per-pool caps override below. 
- .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, ROOT_CAP_BYTES) - .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, ROOT_CAP_BYTES) - .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, ROOT_CAP_BYTES) + .put("native.allocator.rebalancer.enabled", false) + // Per-pool maxes set to a known value for testing. + .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, POOL_CAP_BYTES) + .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, POOL_CAP_BYTES) + .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, POOL_CAP_BYTES) .build(); } /** - * Verifies that {@code parquet.native.pool.query.max} caps allocations - * through the QUERY pool: a buffer request exceeding the per-pool cap - * throws {@link OutOfMemoryException} even when root has headroom. + * Verifies that per-pool max caps allocations through the QUERY pool. */ public void testPoolMaxRejectsAllocationsBeyondCap() { - // Tighten QUERY pool to 4 MiB while leaving root at 16 MiB. - long poolCap = 4 * MB; - ClusterUpdateSettingsResponse resp = client().admin() - .cluster() - .prepareUpdateSettings() - .setTransientSettings(Settings.builder().put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, poolCap)) - .get(); - assertTrue("PUT to query.max must succeed", resp.isAcknowledged()); - ArrowNativeAllocator allocator = internalCluster().getInstance(ArrowNativeAllocator.class); BufferAllocator queryPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY); - assertThat("pool's live limit reflects the PUT", queryPool.getLimit(), is(poolCap)); + assertThat("pool's live limit reflects the configured max", queryPool.getLimit(), is(POOL_CAP_BYTES)); // Sub-cap allocation succeeds. try (var withinCap = queryPool.buffer(2 * MB)) { assertThat(queryPool.getAllocatedMemory(), greaterThanOrEqualTo(2 * MB)); } - // Cap+1 allocation fails — Arrow's parent-cap check at allocateBytes - // walks queryPool's allocationLimit and rejects. 
- expectThrows(OutOfMemoryException.class, () -> queryPool.buffer(8 * MB)); + // Cap+1 allocation fails. + expectThrows(OutOfMemoryException.class, () -> queryPool.buffer(POOL_CAP_BYTES + MB)); } /** - * Verifies that {@code native.allocator.root.limit} caps allocations - * across all pools combined: when the sum of in-flight pool allocations - * approaches the root cap, the next allocation is rejected at the root - * level even if each individual pool's max would allow it. + * Verifies that per-pool limits cap allocations: when one pool is full, + * allocations through it fail even if other pools have headroom. */ - public void testRootLimitRejectsAllocationsBeyondCap() { + public void testPoolLimitRejectsAllocationsBeyondCap() { ArrowNativeAllocator allocator = internalCluster().getInstance(ArrowNativeAllocator.class); BufferAllocator flightPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_FLIGHT); BufferAllocator queryPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY); - BufferAllocator root = allocator.getRootAllocator(); - - // Sanity-check setup: confirm the live limits match nodeSettings. - // If these fail, the test setup is wrong and the body's expectations are - // meaningless — surface the misconfiguration instead of misleading failures below. - assertThat("root.limit must match nodeSettings", root.getLimit(), is(ROOT_CAP_BYTES)); - assertThat("flight.max must match nodeSettings", flightPool.getLimit(), is(ROOT_CAP_BYTES)); - assertThat("query.max must match nodeSettings", queryPool.getLimit(), is(ROOT_CAP_BYTES)); - - // Hold 8 MiB through the FLIGHT pool. With root at 16 MiB this leaves 8 MiB - // headroom across the root. (Power-of-2 sizes avoid Arrow's chunked-allocation - // rounding surprises; e.g. a 12 MiB request actually consumes 16 MiB.) 
+ + assertThat("flight.max must match nodeSettings", flightPool.getLimit(), is(POOL_CAP_BYTES)); + assertThat("query.max must match nodeSettings", queryPool.getLimit(), is(POOL_CAP_BYTES)); + + // Hold 8 MiB through the FLIGHT pool. try (var flightHold = flightPool.buffer(8 * MB)) { assertThat("FLIGHT pool reflects 8MB allocation", flightPool.getAllocatedMemory(), is(8L * MB)); - assertThat("root reflects 8MB allocation", root.getAllocatedMemory(), is(8L * MB)); - // A 4 MiB allocation through QUERY succeeds (within remaining root headroom). + // A 4 MiB allocation through QUERY succeeds (within its own pool cap). try (var queryFit = queryPool.buffer(4 * MB)) { - assertThat(allocator.getRootAllocator().getAllocatedMemory(), is(12L * MB)); + assertThat(queryPool.getAllocatedMemory(), greaterThanOrEqualTo(4 * MB)); } - // An 8 MiB allocation through QUERY would push the root past the 16 MiB cap - // (8 MiB FLIGHT + 8 MiB QUERY). Arrow's parent-cap check at allocateBytes - // walks queryPool -> root and rejects with OOM, even though QUERY's own - // (16 MiB) max would individually allow it. - expectThrows(OutOfMemoryException.class, () -> queryPool.buffer(16 * MB)); + // An allocation exceeding the QUERY pool's own cap fails. + expectThrows(OutOfMemoryException.class, () -> queryPool.buffer(POOL_CAP_BYTES + MB)); } } /** - * Verifies that a dynamic PUT to a pool's max takes effect on - * subsequent allocations through descendants of that pool. This is the - * behavior the deleted {@code NativeAllocatorListener} SPI was emulating; - * it is now provided natively by Arrow's parent-cap check at allocateBytes. + * Verifies that setPoolLimit dynamically adjusts the pool cap and + * subsequent allocations respect the new limit. 
*/ - public void testDynamicPoolResizeAffectsInFlightAllocations() { + public void testSetPoolLimitAffectsInFlightAllocations() { ArrowNativeAllocator allocator = internalCluster().getInstance(ArrowNativeAllocator.class); BufferAllocator queryPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY); - // Step 1: create a child allocator at Long.MAX_VALUE — the AnalyticsSearchService / - // DefaultPlanExecutor pattern. The child intentionally has no own-cap; it relies - // on the parent pool's allocationLimit at allocation time. try (BufferAllocator child = queryPool.newChildAllocator("boundary-it-child", 0, Long.MAX_VALUE)) { - // Step 2: a small buffer through the child succeeds with the initial pool max. + // A small buffer through the child succeeds with the initial pool max. try (var buf = child.buffer(2 * MB)) { assertThat(child.getAllocatedMemory(), greaterThanOrEqualTo(2 * MB)); } - // Step 3: PUT a tighter pool max via cluster settings. + // Programmatically tighten the pool limit. long newPoolCap = 1 * MB; - ClusterUpdateSettingsResponse resp = client().admin() - .cluster() - .prepareUpdateSettings() - .setTransientSettings(Settings.builder().put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, newPoolCap)) - .get(); - assertTrue("PUT to query.max must succeed", resp.isAcknowledged()); - assertThat("pool's own limit reflects the PUT", queryPool.getLimit(), is(newPoolCap)); - assertThat("child's own limit is intentionally uncapped", child.getLimit(), is(Long.MAX_VALUE)); - - // Step 4: an allocation that fit before the resize now exceeds the parent cap. - // Arrow's parent-cap check at allocateBytes walks queryPool.allocationLimit - // and rejects — no listener machinery needed. + allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_QUERY, newPoolCap); + assertThat("pool's own limit reflects the update", queryPool.getLimit(), is(newPoolCap)); + + // An allocation that fit before the resize now exceeds the parent cap. 
expectThrows(OutOfMemoryException.class, () -> child.buffer(2 * MB)); - // Step 5: an allocation under the new cap still succeeds. + // An allocation under the new cap still succeeds. try (var smallBuf = child.buffer(512 * 1024)) { assertThat(child.getAllocatedMemory(), greaterThanOrEqualTo(512L * 1024)); } diff --git a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeMemoryRebalancerIT.java b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeMemoryRebalancerIT.java new file mode 100644 index 0000000000000..e28e373fe69a9 --- /dev/null +++ b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeMemoryRebalancerIT.java @@ -0,0 +1,100 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.arrow.flight; + +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.opensearch.arrow.allocator.ArrowBasePlugin; +import org.opensearch.arrow.allocator.ArrowNativeAllocator; +import org.opensearch.arrow.spi.NativeAllocatorPoolConfig; +import org.opensearch.common.settings.Settings; +import org.opensearch.plugins.Plugin; +import org.opensearch.test.OpenSearchIntegTestCase; + +import java.util.Collection; +import java.util.List; + +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +/** + * Integration test for the NativeMemoryRebalancer. + * + *

Boots a single-node cluster with the rebalancer enabled and verifies that + * pools start at their max and the rebalancer shrinks idle pools / grows pressured ones. + */ +@ThreadLeakScope(ThreadLeakScope.Scope.NONE) +@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, minNumDataNodes = 1, maxNumDataNodes = 1) +public class NativeMemoryRebalancerIT extends OpenSearchIntegTestCase { + + private static final long MB = 1024L * 1024; + + @Override + protected Collection> nodePlugins() { + return List.of(ArrowBasePlugin.class); + } + + @Override + protected Settings nodeSettings(int nodeOrdinal) { + return Settings.builder() + .put(super.nodeSettings(nodeOrdinal)) + .put("native.allocator.rebalancer.enabled", true) + .put("native.allocator.rebalance.interval_seconds", 1) + .put("node.native_memory.limit", "1gb") + .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN, 5 * MB) + .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, 200 * MB) + .put(NativeAllocatorPoolConfig.SETTING_INGEST_MIN, 5 * MB) + .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 200 * MB) + .put(NativeAllocatorPoolConfig.SETTING_QUERY_MIN, 5 * MB) + .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 200 * MB) + .build(); + } + + /** + * Verifies that the ingest pool's configured max is 200MB and that its live limit stays within [min, max]. + * The rebalancer runs on a 1s interval here, so the live limit may already be below max when this test reads it. + */ + public void testPoolsStartAtMax() { + ArrowNativeAllocator allocator = internalCluster().getInstance(ArrowNativeAllocator.class); + BufferAllocator ingestPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_INGEST); + + // The rebalancer may have already run (1s interval), so the pool may have shrunk. + // Verify it's at least at min and the configured max is correct.
+ long max = allocator.getPoolMax(NativeAllocatorPoolConfig.POOL_INGEST); + assertEquals("Ingest pool max should be configured at 200MB", 200 * MB, max); + // Pool limit should be between min and max (rebalancer may have shrunk it) + long limit = ingestPool.getLimit(); + assertThat("Ingest pool limit should be >= min", limit, org.hamcrest.Matchers.greaterThanOrEqualTo(5 * MB)); + assertThat("Ingest pool limit should be <= max", limit, lessThanOrEqualTo(200 * MB)); + } + + /** + * Verifies that an idle pool shrinks after rebalancer ticks when another pool is pressured. + */ + public void testIdlePoolShrinksWhenOtherPressured() throws Exception { + ArrowNativeAllocator allocator = internalCluster().getInstance(ArrowNativeAllocator.class); + BufferAllocator ingestPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_INGEST); + BufferAllocator flightPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_FLIGHT); + + // Allocate > 75% of ingest pool to create pressure + long toAllocate = (long) (ingestPool.getLimit() * 0.8); + ArrowBuf buf = ingestPool.buffer(toAllocate); + + try { + // Flight pool is idle — wait for rebalancer to shrink it + assertBusy(() -> { + long flightLimit = flightPool.getLimit(); + assertThat("Flight pool should shrink when idle", flightLimit, org.hamcrest.Matchers.lessThan(200 * MB)); + }); + } finally { + buf.close(); + } + } +} diff --git a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/UnifiedNativeMemoryStatsIT.java b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/UnifiedNativeMemoryStatsIT.java new file mode 100644 index 0000000000000..fd0e165103e86 --- /dev/null +++ b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/UnifiedNativeMemoryStatsIT.java @@ -0,0 +1,95 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the 
Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.arrow.flight;
+
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
+
+import org.opensearch.action.admin.cluster.node.stats.NodesStatsRequest;
+import org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse;
+import org.opensearch.arrow.allocator.ArrowBasePlugin;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.plugin.stats.NativeAllocatorPoolStats;
+import org.opensearch.plugins.Plugin;
+import org.opensearch.test.OpenSearchIntegTestCase;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
+import static org.hamcrest.Matchers.hasItems;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.notNullValue;
+
+/**
+ * Integration test verifying the unified native memory stats endpoint.
+ * Boots a single-node cluster with ArrowBasePlugin and confirms that
+ * all registered pools (Arrow + virtual) appear in _nodes/stats/native_memory.
+ */
+@ThreadLeakScope(ThreadLeakScope.Scope.NONE)
+@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, minNumDataNodes = 1, maxNumDataNodes = 1)
+public class UnifiedNativeMemoryStatsIT extends OpenSearchIntegTestCase {
+
+    @Override
+    protected Collection<Class<? extends Plugin>> nodePlugins() {
+        return List.of(ArrowBasePlugin.class);
+    }
+
+    @Override
+    protected Settings nodeSettings(int nodeOrdinal) {
+        return Settings.builder().put(super.nodeSettings(nodeOrdinal)).put("node.native_memory.limit", "1gb").build();
+    }
+
+    /**
+     * Verifies that the Arrow pools (flight, ingest, query) are visible in
+     * _nodes/stats/native_memory with correct structure.
+     */
+    public void testArrowPoolsVisibleInStats() {
+        NodesStatsResponse response = client().admin()
+            .cluster()
+            .prepareNodesStats()
+            .addMetric(NodesStatsRequest.Metric.NATIVE_MEMORY.metricName())
+            .get();
+
+        assertThat(response.getNodes().isEmpty(), is(false));
+        NativeAllocatorPoolStats stats = response.getNodes().get(0).getNativeAllocatorStats();
+        assertThat("native_memory stats should be present", stats, notNullValue());
+
+        // Dump the stats for debugging
+        StringBuilder sb = new StringBuilder();
+        sb.append("nativeAllocated=").append(stats.getNativeAllocatedBytes());
+        sb.append(", nativeResident=").append(stats.getNativeResidentBytes());
+        sb.append(", pools=[");
+        for (NativeAllocatorPoolStats.PoolStats p : stats.getPools()) {
+            sb.append(p.getName())
+                .append("(alloc=")
+                .append(p.getAllocatedBytes())
+                .append(",peak=")
+                .append(p.getPeakBytes())
+                .append(",limit=")
+                .append(p.getLimitBytes())
+                .append(") ");
+        }
+        sb.append("]");
+        logger.info("=== NATIVE_MEMORY STATS: {} ===", sb);
+
+        // All Arrow pools should be present
+        Set<String> poolNames = stats.getPools().stream().map(NativeAllocatorPoolStats.PoolStats::getName).collect(Collectors.toSet());
+        assertThat(poolNames, hasItems("flight", "ingest", "query"));
+
+        // Each pool should have limit > 0 (derived from 1gb native_memory.limit)
+        for (NativeAllocatorPoolStats.PoolStats pool : stats.getPools()) {
+            assertThat("Pool '" + pool.getName() + "' should have limit > 0", pool.getLimitBytes(), greaterThan(0L));
+            assertThat("Pool '" + pool.getName() + "' allocated should be >= 0", pool.getAllocatedBytes(), greaterThanOrEqualTo(0L));
+        }
+    }
+
+}
diff --git a/plugins/arrow-flight-rpc/src/test/java/org/opensearch/arrow/flight/transport/FlightTransportTestBase.java b/plugins/arrow-flight-rpc/src/test/java/org/opensearch/arrow/flight/transport/FlightTransportTestBase.java index f5eacc771be92..add8e0a2bbfee 100644 --- 
a/plugins/arrow-flight-rpc/src/test/java/org/opensearch/arrow/flight/transport/FlightTransportTestBase.java +++ b/plugins/arrow-flight-rpc/src/test/java/org/opensearch/arrow/flight/transport/FlightTransportTestBase.java @@ -91,8 +91,8 @@ public void setUp() throws Exception { // FlightTransport sources its allocator from the framework's FLIGHT pool. Construct one // here so the test has a usable allocator; tearDown closes it. - nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE); - nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_FLIGHT, 0L, Long.MAX_VALUE); + nativeAllocator = new ArrowNativeAllocator(); + nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_FLIGHT, 0L, Long.MAX_VALUE, null); flightTransport = new FlightTransport( settings, diff --git a/sandbox/libs/dataformat-native/rust/common/src/lib.rs b/sandbox/libs/dataformat-native/rust/common/src/lib.rs index 0f4b8c132407f..c44fa871c4fb3 100644 --- a/sandbox/libs/dataformat-native/rust/common/src/lib.rs +++ b/sandbox/libs/dataformat-native/rust/common/src/lib.rs @@ -11,6 +11,7 @@ pub mod error; pub mod logger; pub mod allocator; +pub mod memory_pool; // Re-export the proc macro so plugins use `#[native_bridge_common::ffm_safe]` pub use native_bridge_macros::ffm_safe; diff --git a/sandbox/libs/dataformat-native/rust/common/src/memory_pool.rs b/sandbox/libs/dataformat-native/rust/common/src/memory_pool.rs new file mode 100644 index 0000000000000..7fc5c7c83a20c --- /dev/null +++ b/sandbox/libs/dataformat-native/rust/common/src/memory_pool.rs @@ -0,0 +1,370 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +//! Memory pool for tracking native memory usage across write and merge operations. +//! +//! Provides an atomic counter with a configurable limit. Operations that allocate +//! 
significant memory call `try_grow` before allocating and `shrink` after freeing. +//! The pool rejects allocations that would exceed the configured limit. +//! +//! `MemoryReservation` is an RAII handle that automatically returns memory to the +//! pool on drop, preventing leaks even on error paths. + +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Condvar, Mutex}; +use std::time::Duration; +use std::fmt; + +/// Default timeout for blocking wait (300 seconds). +pub const DEFAULT_WAIT_TIMEOUT: Duration = Duration::from_secs(300); + +/// Merge operations can wait longer (600 seconds). +pub const MERGE_WAIT_TIMEOUT: Duration = Duration::from_secs(600); + +/// Controls whether an allocation blocks or rejects immediately. +#[derive(Debug, Clone)] +pub enum PoolBehavior { + /// Block until memory is available, up to the given timeout. + Wait(Duration), + /// Fail immediately if pool is full. + Reject, +} + +/// Error returned when a pool cannot satisfy an allocation request. +#[derive(Debug, Clone)] +pub struct PoolExhausted { + pub pool_name: &'static str, + pub requested: usize, + pub used: usize, + pub limit: usize, +} + +impl fmt::Display for PoolExhausted { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "[{}] memory limit exceeded: requested {} bytes, used {}, limit {}", + self.pool_name, self.requested, self.used, self.limit + ) + } +} + +impl std::error::Error for PoolExhausted {} + +/// Error returned when wait_and_grow times out. 
+#[derive(Debug, Clone)] +pub struct PoolTimeout { + pub pool_name: &'static str, + pub requested: usize, + pub used: usize, + pub limit: usize, + pub waited: Duration, +} + +impl fmt::Display for PoolTimeout { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "[{}] timed out waiting for {} bytes after {:?} (used: {}, limit: {})", + self.pool_name, self.requested, self.waited, self.used, self.limit + ) + } +} + +impl std::error::Error for PoolTimeout {} + +/// A node-level memory pool backed by an atomic counter with blocking wait support. +pub struct MemoryPool { + name: &'static str, + used: AtomicUsize, + limit: AtomicUsize, + peak: AtomicUsize, + notify: Condvar, + notify_lock: Mutex<()>, +} + +impl fmt::Debug for MemoryPool { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MemoryPool") + .field("name", &self.name) + .field("used", &self.used.load(Ordering::Relaxed)) + .field("limit", &self.limit.load(Ordering::Relaxed)) + .field("peak", &self.peak.load(Ordering::Relaxed)) + .finish() + } +} + +impl MemoryPool { + /// Create a new pool. `limit = 0` means unlimited. + pub fn new(name: &'static str, limit: usize) -> Self { + Self { + name, + used: AtomicUsize::new(0), + limit: AtomicUsize::new(limit), + peak: AtomicUsize::new(0), + notify: Condvar::new(), + notify_lock: Mutex::new(()), + } + } + + /// Attempt to reserve `bytes`. Returns error if it would exceed the limit. 
+    pub fn try_grow(&self, bytes: usize) -> Result<(), PoolExhausted> {
+        // A zero-byte request always succeeds and leaves the counters untouched.
+        if bytes == 0 {
+            return Ok(());
+        }
+        // The limit is read once, before the compare-and-swap loop below.
+        let limit = self.limit.load(Ordering::Relaxed);
+        let result = self.used.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |used| {
+            // `checked_add` turns arithmetic overflow into a rejection instead of wrapping.
+            let new_used = used.checked_add(bytes)?;
+            // `limit == 0` skips the check, i.e. the pool is unlimited.
+            if limit > 0 && new_used > limit {
+                None
+            } else {
+                Some(new_used)
+            }
+        });
+
+        match result {
+            Ok(old) => {
+                // `fetch_update` yields the previous value, so the new usage is `old + bytes`.
+                self.peak.fetch_max(old + bytes, Ordering::Relaxed);
+                Ok(())
+            }
+            Err(_) => Err(PoolExhausted {
+                pool_name: self.name,
+                requested: bytes,
+                // Re-read for the report; it may differ from the value that caused the rejection.
+                used: self.used.load(Ordering::Relaxed),
+                limit,
+            }),
+        }
+    }
+
+    /// Blocks until `bytes` can be reserved, or timeout expires.
+    ///
+    /// Tries `try_grow` once up front. If that fails, it waits on the pool's
+    /// condition variable and retries after every wake-up until `timeout` has
+    /// elapsed, at which point it returns `PoolTimeout` with the usage and
+    /// limit observed at that moment.
+    pub fn wait_and_grow(&self, bytes: usize, timeout: Duration) -> Result<(), PoolTimeout> {
+        if bytes == 0 {
+            return Ok(());
+        }
+        // Fast path: no waiting when the reservation fits right away.
+        if self.try_grow(bytes).is_ok() {
+            return Ok(());
+        }
+
+        let start = std::time::Instant::now();
+        loop {
+            let elapsed = start.elapsed();
+            if elapsed >= timeout {
+                let used = self.used.load(Ordering::Relaxed);
+                let limit = self.limit.load(Ordering::Relaxed);
+                return Err(PoolTimeout {
+                    pool_name: self.name,
+                    requested: bytes,
+                    used,
+                    limit,
+                    waited: elapsed,
+                });
+            }
+
+            // Cannot underflow: `elapsed < timeout` was established just above.
+            let remaining = timeout - elapsed;
+            let guard = self.notify_lock.lock().unwrap();
+            // Each wait is capped at one second, so the reservation is re-attempted at
+            // least that often even when no notification arrives. The wait result
+            // (notified vs. timed out) is ignored; the retry below decides.
+            // NOTE(review): the failed `try_grow` is not performed while holding
+            // `notify_lock`, and `shrink`/`set_limit` call `notify_all` without taking
+            // it, so a notification sent between the failed attempt and this wait is
+            // presumably missed and only recovered by the one-second cap. Confirm that
+            // this latency is acceptable.
+            let _ = self.notify.wait_timeout(guard, remaining.min(Duration::from_secs(1))).unwrap();
+
+            if self.try_grow(bytes).is_ok() {
+                return Ok(());
+            }
+        }
+    }
+
+    /// Infallible grow — use when the allocation has already happened.
+    pub fn grow(&self, bytes: usize) {
+        if bytes == 0 {
+            return;
+        }
+        // No limit check here: usage may exceed `limit` after this call.
+        let new_used = self.used.fetch_add(bytes, Ordering::Relaxed) + bytes;
+        self.peak.fetch_max(new_used, Ordering::Relaxed);
+    }
+
+    /// Release `bytes` back to the pool. Notifies any waiting threads.
+    pub fn shrink(&self, bytes: usize) {
+        if bytes == 0 {
+            return;
+        }
+        // `saturating_sub` clamps at zero, so releasing more than is tracked cannot
+        // underflow. The closure always returns `Some`, so the `unwrap` cannot panic.
+        self.used
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |current| {
+                Some(current.saturating_sub(bytes))
+            })
+            .unwrap();
+        self.notify.notify_all();
+    }
+
+    /// Bytes currently reserved from the pool.
+    pub fn used(&self) -> usize {
+        self.used.load(Ordering::Relaxed)
+    }
+
+    /// Highest usage recorded by the grow paths.
+    pub fn peak(&self) -> usize {
+        self.peak.load(Ordering::Relaxed)
+    }
+
+    /// Current limit in bytes; `try_grow` treats `0` as unlimited.
+    pub fn limit(&self) -> usize {
+        self.limit.load(Ordering::Relaxed)
+    }
+
+    /// Name given to the pool at construction.
+    pub fn name(&self) -> &'static str {
+        self.name
+    }
+
+    /// Atomically update the limit. Called by the Java rebalancer.
+    pub fn set_limit(&self, new_limit: usize) {
+        self.limit.store(new_limit, Ordering::Release);
+        // Wake waiters — new limit might allow blocked allocations
+        self.notify.notify_all();
+    }
+}
+
+/// RAII handle that tracks a portion of memory reserved from a [`MemoryPool`].
+/// Automatically releases all held memory on drop.
+pub struct MemoryReservation {
+    /// Pool this reservation draws from.
+    pool: Arc<MemoryPool>,
+    /// Label identifying the holder of this reservation.
+    consumer: &'static str,
+    /// Bytes currently held by this reservation.
+    size: usize,
+    /// Whether `request` blocks or fails immediately when the pool is full.
+    behavior: PoolBehavior,
+}
+
+impl MemoryReservation {
+    /// Create an empty reservation (size 0) against `pool`.
+    pub fn new(pool: &Arc<MemoryPool>, consumer: &'static str, behavior: PoolBehavior) -> Self {
+        Self {
+            pool: Arc::clone(pool),
+            consumer,
+            size: 0,
+            behavior,
+        }
+    }
+
+    /// Grow based on the reservation's behavior: block (Wait) or reject (Reject).
+    ///
+    /// The reservation's size only increases when the pool accepted the request.
+    // NOTE(review): the trait-object type inside `Box<...>` was lost from this patch
+    // text; `dyn std::error::Error` is the minimal type that accepts both
+    // `PoolExhausted` and `PoolTimeout`. Confirm whether `+ Send + Sync` is required.
+    pub fn request(&mut self, bytes: usize) -> Result<(), Box<dyn std::error::Error>> {
+        match &self.behavior {
+            PoolBehavior::Reject => {
+                self.pool.try_grow(bytes)?;
+                self.size += bytes;
+                Ok(())
+            }
+            PoolBehavior::Wait(timeout) => {
+                self.pool.wait_and_grow(bytes, *timeout)?;
+                self.size += bytes;
+                Ok(())
+            }
+        }
+    }
+
+    /// Infallible grow.
+    pub fn grow(&mut self, bytes: usize) {
+        self.pool.grow(bytes);
+        self.size += bytes;
+    }
+
+    /// Release `bytes` from this reservation.
+    pub fn shrink(&mut self, bytes: usize) {
+        // Clamp to what this reservation holds so it never releases more than it reserved.
+        let actual = bytes.min(self.size);
+        self.pool.shrink(actual);
+        self.size -= actual;
+    }
+
+    /// Release all memory back to the pool.
+ pub fn free(&mut self) -> usize { + let s = self.size; + if s > 0 { + self.pool.shrink(s); + self.size = 0; + } + s + } + + pub fn size(&self) -> usize { + self.size + } + + pub fn consumer(&self) -> &'static str { + self.consumer + } + + /// Create a sibling reservation from the same pool with a different consumer name. + pub fn child(&self, consumer: &'static str) -> Self { + Self { + pool: Arc::clone(&self.pool), + consumer, + size: 0, + behavior: self.behavior.clone(), + } + } +} + +impl Drop for MemoryReservation { + fn drop(&mut self) { + if self.size > 0 { + self.pool.shrink(self.size); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_try_grow_within_limit() { + let pool = Arc::new(MemoryPool::new("test", 1024)); + let mut res = MemoryReservation::new(&pool, "writer", PoolBehavior::Reject); + assert!(res.request(512).is_ok()); + assert_eq!(res.size(), 512); + assert_eq!(pool.used(), 512); + } + + #[test] + fn test_try_grow_exceeds_limit() { + let pool = Arc::new(MemoryPool::new("test", 1024)); + let mut res = MemoryReservation::new(&pool, "writer", PoolBehavior::Reject); + assert!(res.request(2048).is_err()); + assert_eq!(res.size(), 0); + assert_eq!(pool.used(), 0); + } + + #[test] + fn test_drop_releases_memory() { + let pool = Arc::new(MemoryPool::new("test", 1024)); + { + let mut res = MemoryReservation::new(&pool, "writer", PoolBehavior::Reject); + res.request(500).unwrap(); + assert_eq!(pool.used(), 500); + } + assert_eq!(pool.used(), 0); + } + + #[test] + fn test_set_limit_allows_growth() { + let pool = Arc::new(MemoryPool::new("test", 100)); + let mut res = MemoryReservation::new(&pool, "writer", PoolBehavior::Reject); + assert!(res.request(200).is_err()); + pool.set_limit(500); + assert!(res.request(200).is_ok()); + } + + #[test] + fn test_peak_tracking() { + let pool = Arc::new(MemoryPool::new("test", 1024)); + let mut res = MemoryReservation::new(&pool, "writer", PoolBehavior::Reject); + res.request(800).unwrap(); + 
res.shrink(500); + assert_eq!(pool.peak(), 800); + assert_eq!(pool.used(), 300); + } + + #[test] + fn test_child_reservation() { + let pool = Arc::new(MemoryPool::new("test", 1024)); + let res = MemoryReservation::new(&pool, "parent", PoolBehavior::Reject); + let mut child = res.child("child"); + child.request(100).unwrap(); + assert_eq!(child.consumer(), "child"); + assert_eq!(pool.used(), 100); + } +} diff --git a/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionPlugin.java b/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionPlugin.java index d2cc0417ee59c..a92d6365c2050 100644 --- a/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionPlugin.java +++ b/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionPlugin.java @@ -12,11 +12,15 @@ import org.apache.logging.log4j.Logger; import org.opensearch.analytics.spi.AnalyticsSearchBackendPlugin; import org.opensearch.analytics.spi.QueryExecutionMetrics; +import org.opensearch.arrow.allocator.ArrowNativeAllocator; +import org.opensearch.arrow.spi.NativeAllocator; +import org.opensearch.arrow.spi.PoolGroup; import org.opensearch.be.datafusion.action.DataFusionStatsAction; import org.opensearch.be.datafusion.nativelib.NativeBridge; import org.opensearch.cluster.metadata.IndexNameExpressionResolver; import org.opensearch.cluster.node.DiscoveryNodes; import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.Nullable; import org.opensearch.common.settings.ClusterSettings; import org.opensearch.common.settings.IndexScopedSettings; import org.opensearch.common.settings.Setting; @@ -86,7 +90,7 @@ public class DataFusionPlugin extends Plugin * ({@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}), which is * the same off-heap budget admission control throttles against. 
The DataFusion Rust * runtime is the dominant native-memory consumer for analytics workloads (see PR #21732 - * partitioning model), so the default takes 75% of {@code node.native_memory.limit}. + * partitioning model), so the default takes 74% of {@code node.native_memory.limit}. * If the AC limit is unset (== 0), the default is {@link Long#MAX_VALUE} — unbounded — to * preserve pre-AC behaviour rather than make up a number from JVM heap (which is a * separate, already-allocated region with no relation to native-memory sizing). @@ -109,7 +113,7 @@ public class DataFusionPlugin extends Plugin ); /** - * Computes the default for {@link #DATAFUSION_MEMORY_POOL_LIMIT} as 75% of + * Computes the default for {@link #DATAFUSION_MEMORY_POOL_LIMIT} as 74% of * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}, falling back to * {@link Long#MAX_VALUE} when AC is unconfigured. * @@ -125,10 +129,10 @@ static String deriveMemoryPoolLimitDefault(Settings settings) { if (nativeLimit.getBytes() <= 0) { return Long.toString(Long.MAX_VALUE); } - // 75% of node.native_memory.limit. DataFusion is the dominant native consumer for + // 74% of node.native_memory.limit. DataFusion is the dominant native consumer for // analytics workloads; operators tune via the dynamic setting once they characterize // their workload. 
- long pool = Math.max(0L, nativeLimit.getBytes() * 75 / 100); + long pool = Math.max(0L, nativeLimit.getBytes() * 74 / 100); return Long.toString(pool); } @@ -331,6 +335,39 @@ public Collection createComponents( IndexNameExpressionResolver indexNameExpressionResolver, Supplier repositoriesServiceSupplier, DataFormatRegistry dataFormatRegistry + ) { + return createComponents( + client, + clusterService, + threadPool, + resourceWatcherService, + scriptService, + xContentRegistry, + environment, + nodeEnvironment, + namedWriteableRegistry, + indexNameExpressionResolver, + repositoriesServiceSupplier, + dataFormatRegistry, + null + ); + } + + @Override + public Collection createComponents( + Client client, + ClusterService clusterService, + ThreadPool threadPool, + ResourceWatcherService resourceWatcherService, + ScriptService scriptService, + NamedXContentRegistry xContentRegistry, + Environment environment, + NodeEnvironment nodeEnvironment, + NamedWriteableRegistry namedWriteableRegistry, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier repositoriesServiceSupplier, + DataFormatRegistry dataFormatRegistry, + @Nullable NativeAllocator nativeAllocator ) { this.dataFormatRegistry = dataFormatRegistry; this.clusterService = clusterService; @@ -350,13 +387,8 @@ public Collection createComponents( dataFusionService.start(); logger.debug("DataFusion plugin initialized — memory pool {}B, spill limit {}B", memoryPoolLimit, spillMemoryLimit); - // Wire the dynamic memory pool limit setting to the native runtime so updates via the - // cluster settings API take effect without restarting the node. The framework's - // parquet.native.pool.datafusion.{min,max} controls the Java-side Arrow pool that - // sources the per-query allocators handed to DataFusion; this setting controls the - // Rust runtime's internal MemoryPool used by query execution. They're separate - // accounting layers — operators tune them independently. 
- clusterService.getClusterSettings().addSettingsUpdateConsumer(DATAFUSION_MEMORY_POOL_LIMIT, this::updateMemoryPoolLimit); + // Wire the dynamic spill limit setting to the native runtime so updates via the + // cluster settings API take effect without restarting the node. clusterService.getClusterSettings().addSettingsUpdateConsumer(DATAFUSION_SPILL_MEMORY_LIMIT, this::updateSpillMemoryLimit); clusterService.getClusterSettings().addSettingsUpdateConsumer(DATAFUSION_MIN_TARGET_PARTITIONS, this::updateMinTargetPartitions); clusterService.getClusterSettings() @@ -382,19 +414,48 @@ public Collection createComponents( this.datafusionSettings = new DatafusionSettings(clusterService); - // Expose per-task native-memory usage to search backpressure. The tracker calls - // this supplier once per refresh (invoked by the backpressure service at the top of - // doRun() and nodeStats()), snapshotting all live queries in one FFM call. Per-task - // evaluation then reads from the tracker's cached map — no FFM call per task. - // - // The OpenSearch task id is used as the DataFusion context_id at query launch - // (see ShardScanInstructionHandler / DatafusionSearchExecEngine), so the map is - // already keyed by Task#getId on the consumer side. + // Expose per-task native-memory usage to search backpressure. 
NativeMemoryUsageTracker.setSnapshotSupplier(this::currentBytesByTaskId); NativeMemoryUsageTracker.setNativeMemoryBudgetSupplier(() -> DATAFUSION_MEMORY_POOL_LIMIT.get(clusterService.getSettings())); this.substraitExtensions = loadSubstraitExtensions(); + // Register with the unified allocator if available + if (nativeAllocator != null) { + ClusterSettings clusterSettings = clusterService.getClusterSettings(); + ArrowNativeAllocator arrowAllocator = (ArrowNativeAllocator) nativeAllocator; + + NativeAllocator.VirtualPoolHandle dfPool = arrowAllocator.registerVirtualPool( + DatafusionSettings.POOL_DATAFUSION, + DatafusionSettings.DATAFUSION_MEMORY_POOL_MIN.get(settings), + DATAFUSION_MEMORY_POOL_LIMIT.get(settings), + PoolGroup.SEARCH, + this::updateMemoryPoolLimit + ); + + arrowAllocator.addStatsRefresher(() -> { + if (dataFusionService != null) { + long usage = dataFusionService.getMemoryPoolUsage(); + dfPool.updateStats(usage, usage); + } + }); + + arrowAllocator.setNativeMemoryStatsSupplier(() -> { + AnalyticsBackendNativeMemoryStats s = NativeMemoryFetcher.fetch(); + return new long[] { s.getAllocatedBytes(), s.getResidentBytes() }; + }); + + // Wire dynamic setting consumers for pool min/max + clusterSettings.addSettingsUpdateConsumer( + DATAFUSION_MEMORY_POOL_LIMIT, + newMax -> arrowAllocator.setPoolLimit(DatafusionSettings.POOL_DATAFUSION, newMax) + ); + clusterSettings.addSettingsUpdateConsumer( + DatafusionSettings.DATAFUSION_MEMORY_POOL_MIN, + newMin -> arrowAllocator.setPoolMin(DatafusionSettings.POOL_DATAFUSION, newMin) + ); + } + return Collections.singletonList(dataFusionService); } diff --git a/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DatafusionSettings.java b/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DatafusionSettings.java index cf3e9bbd817d9..5521489b57ab3 100644 --- 
a/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DatafusionSettings.java +++ b/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DatafusionSettings.java @@ -14,6 +14,8 @@ import org.opensearch.common.settings.ClusterSettings; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Settings; +import org.opensearch.core.common.unit.ByteSizeValue; +import org.opensearch.node.resource.tracker.ResourceTrackerSettings; import org.opensearch.search.SearchService; import java.util.List; @@ -32,6 +34,8 @@ @ExperimentalApi public final class DatafusionSettings { + public static final String POOL_DATAFUSION = "datafusion"; + // ── New indexed query settings ── /** Number of rows per batch in the indexed query execution path. */ @@ -164,6 +168,21 @@ public final class DatafusionSettings { // ── Concurrency gate settings ── + /** Minimum guaranteed bytes for the DataFusion memory pool. Default is half of datafusion max (37% of budget). */ + public static final Setting DATAFUSION_MEMORY_POOL_MIN = new Setting<>( + "datafusion.memory_pool_min_bytes", + s -> derivePoolMinDefault(s, 37), + s -> { + long v = Long.parseLong(s); + if (v < 0) { + throw new IllegalArgumentException("Setting [datafusion.memory_pool_min_bytes] must be >= 0, got " + v); + } + return v; + }, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + /** Datanode concurrency gate multiplier: max concurrent partition-equivalents = cpu_threads × multiplier. */ public static final Setting CONCURRENCY_DATANODE_MULTIPLIER = Setting.doubleSetting( "datafusion.concurrency.datanode_multiplier", @@ -223,6 +242,19 @@ public final class DatafusionSettings { Setting.Property.Dynamic ); + /** + * Computes the default for a pool min as a percentage of + * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}. + * Returns 0 when AC is unconfigured. 
+ */ + static String derivePoolMinDefault(Settings settings, int percent) { + ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings); + if (nativeLimit.getBytes() <= 0) { + return "0"; + } + return Long.toString(Math.max(0L, nativeLimit.getBytes() * percent / 100)); + } + // ── All settings registered by the plugin ── public static final List> ALL_SETTINGS = List.of( @@ -237,6 +269,7 @@ public final class DatafusionSettings { DataFusionPlugin.DATAFUSION_MEMORY_GUARD_ADMISSION_REJECT_THRESHOLD, DataFusionPlugin.DATAFUSION_MEMORY_GUARD_EXECUTION_SPILL_THRESHOLD, DataFusionPlugin.DATAFUSION_MEMORY_GUARD_EXECUTION_CRITICAL_THRESHOLD, + DATAFUSION_MEMORY_POOL_MIN, // Cache settings — metadata and statistics cache configuration CacheSettings.METADATA_CACHE_SIZE_LIMIT, diff --git a/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DataFusionPluginSettingsTests.java b/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DataFusionPluginSettingsTests.java index aa1847ec84500..38b9ceecdc9eb 100644 --- a/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DataFusionPluginSettingsTests.java +++ b/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DataFusionPluginSettingsTests.java @@ -28,6 +28,7 @@ public void testMemoryPoolLimitIsDynamic() { "datafusion.memory_pool_limit_bytes must be dynamic to support runtime updates", DataFusionPlugin.DATAFUSION_MEMORY_POOL_LIMIT.isDynamic() ); + assertTrue("datafusion.memory_pool_limit_bytes must have node scope", DataFusionPlugin.DATAFUSION_MEMORY_POOL_LIMIT.hasNodeScope()); } public void testSpillMemoryLimitIsDynamic() { @@ -111,7 +112,7 @@ public void testGetSettingsReturnsAllIndexedSettings() { public void testGetSettingsReturnsTotalExpectedCount() { try (DataFusionPlugin plugin = new DataFusionPlugin()) { List> settings = plugin.getSettings(); - 
assertEquals(25, settings.size()); + assertEquals(26, settings.size()); } catch (Exception e) { throw new AssertionError(e); } @@ -134,11 +135,11 @@ public void testDeriveMemoryPoolLimitDefaultUnsetReturnsLongMaxValue() { } public void testDeriveMemoryPoolLimitDefaultUsesNativeMemoryLimit() { - // 10 GiB native memory limit — default takes 75% straight from limit, not + // 10 GiB native memory limit — default takes 74% straight from limit, not // from limit - buffer_percent (which is AC's throttle margin, not a framework - // budget reduction). 75% of 10 GiB. + // budget reduction). 74% of 10 GiB. Settings s = Settings.builder().put("node.native_memory.limit", "10gb").build(); - long expected = (10L * 1024 * 1024 * 1024) * 75 / 100; + long expected = (10L * 1024 * 1024 * 1024) * 74 / 100; assertEquals(Long.toString(expected), DataFusionPlugin.deriveMemoryPoolLimitDefault(s)); } @@ -146,14 +147,14 @@ public void testDeriveMemoryPoolLimitDefaultIgnoresBufferPercent() { // node.native_memory.buffer_percent is AC's throttle margin. The framework default // takes its fraction off node.native_memory.limit directly so the buffer can sit // between AC's throttle threshold and the framework's hard cap. - // 1000 bytes limit, 20% buffer => pool max still 75% of 1000 = 750. + // 1000 bytes limit, 20% buffer => pool max still 74% of 1000 = 740. 
Settings s = Settings.builder().put("node.native_memory.limit", "1000b").put("node.native_memory.buffer_percent", 20).build(); - assertEquals("750", DataFusionPlugin.deriveMemoryPoolLimitDefault(s)); + assertEquals("740", DataFusionPlugin.deriveMemoryPoolLimitDefault(s)); } public void testMemoryPoolLimitSettingExposesDerivedDefault() { Settings s = Settings.builder().put("node.native_memory.limit", "10gb").build(); - long expected = (10L * 1024 * 1024 * 1024) * 75 / 100; + long expected = (10L * 1024 * 1024 * 1024) * 74 / 100; assertEquals(Long.valueOf(expected), DataFusionPlugin.DATAFUSION_MEMORY_POOL_LIMIT.get(s)); } diff --git a/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DatafusionSettingsTests.java b/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DatafusionSettingsTests.java index 3dccb2479dfea..22dc688fa7e05 100644 --- a/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DatafusionSettingsTests.java +++ b/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DatafusionSettingsTests.java @@ -69,7 +69,7 @@ public void testMaxCollectorParallelismSettingDefinition() { } public void testAllSettingsContainsAllExpectedSettings() { - assertEquals(25, DatafusionSettings.ALL_SETTINGS.size()); + assertEquals(26, DatafusionSettings.ALL_SETTINGS.size()); assertTrue(DatafusionSettings.ALL_SETTINGS.contains(DataFusionPlugin.DATAFUSION_REDUCE_TARGET_PARTITIONS)); assertTrue(DatafusionSettings.ALL_SETTINGS.contains(DatafusionSettings.INDEXED_BATCH_SIZE)); assertTrue(DatafusionSettings.ALL_SETTINGS.contains(DatafusionSettings.INDEXED_PARQUET_PUSHDOWN_FILTERS)); diff --git a/sandbox/plugins/composite-engine/src/internalClusterTest/java/org/opensearch/composite/UnifiedNativeMemoryFullStackIT.java b/sandbox/plugins/composite-engine/src/internalClusterTest/java/org/opensearch/composite/UnifiedNativeMemoryFullStackIT.java new 
file mode 100644 index 0000000000000..5678d0e08c3e3 --- /dev/null +++ b/sandbox/plugins/composite-engine/src/internalClusterTest/java/org/opensearch/composite/UnifiedNativeMemoryFullStackIT.java @@ -0,0 +1,84 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.composite; + +import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope; + +import org.opensearch.action.admin.cluster.node.stats.NodesStatsRequest; +import org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse; +import org.opensearch.arrow.allocator.ArrowBasePlugin; +import org.opensearch.be.datafusion.DataFusionPlugin; +import org.opensearch.be.lucene.LucenePlugin; +import org.opensearch.common.settings.Settings; +import org.opensearch.parquet.ParquetDataFormatPlugin; +import org.opensearch.plugin.stats.NativeAllocatorPoolStats; +import org.opensearch.plugins.Plugin; +import org.opensearch.test.OpenSearchIntegTestCase; + +import java.util.Arrays; +import java.util.Collection; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.hasItems; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.notNullValue; + +/** + * Full-stack IT verifying all 6 pools appear in _nodes/stats/native_memory. 
+ */
+@ThreadLeakScope(ThreadLeakScope.Scope.NONE)
+@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, minNumDataNodes = 1, maxNumDataNodes = 1)
+public class UnifiedNativeMemoryFullStackIT extends OpenSearchIntegTestCase {
+
+    @Override
+    protected Collection<Class<? extends Plugin>> nodePlugins() {
+        return Arrays.asList(
+            ArrowBasePlugin.class,
+            ParquetDataFormatPlugin.class,
+            CompositeDataFormatPlugin.class,
+            LucenePlugin.class,
+            DataFusionPlugin.class
+        );
+    }
+
+    @Override
+    protected Settings nodeSettings(int nodeOrdinal) {
+        return Settings.builder().put(super.nodeSettings(nodeOrdinal)).put("node.native_memory.limit", "2gb").build();
+    }
+
+    public void testAllSixPoolsVisibleInStats() {
+        NodesStatsResponse response = client().admin()
+            .cluster()
+            .prepareNodesStats()
+            .addMetric(NodesStatsRequest.Metric.NATIVE_MEMORY.metricName())
+            .get();
+
+        assertThat(response.getNodes().isEmpty(), is(false));
+        NativeAllocatorPoolStats stats = response.getNodes().get(0).getNativeAllocatorStats();
+        assertThat("native_memory stats should be present", stats, notNullValue());
+
+        // All 6 pools should be present
+        Set<String> poolNames = stats.getPools().stream().map(NativeAllocatorPoolStats.PoolStats::getName).collect(Collectors.toSet());
+        assertThat(poolNames, hasItems("flight", "ingest", "query", "datafusion", "write", "merge"));
+
+        // Each pool should have limit > 0
+        for (NativeAllocatorPoolStats.PoolStats pool : stats.getPools()) {
+            assertThat("Pool '" + pool.getName() + "' limit should be > 0", pool.getLimitBytes(), greaterThan(0L));
+            assertThat("Pool '" + pool.getName() + "' allocated should be >= 0", pool.getAllocatedBytes(), greaterThanOrEqualTo(0L));
+        }
+
+        // Native memory stats (jemalloc) should be available since DataFusion plugin sets the supplier
+        assertThat("native allocated_bytes should be > 0", stats.getNativeAllocatedBytes(), greaterThan(0L));
+        assertThat("native resident_bytes should be > 0", stats.getNativeResidentBytes(), greaterThan(0L));
+    }
+
+}
diff --git a/sandbox/plugins/composite-engine/src/main/java/org/opensearch/composite/CompositeDataFormatPlugin.java b/sandbox/plugins/composite-engine/src/main/java/org/opensearch/composite/CompositeDataFormatPlugin.java index f1757189361ad..315cc5b78fd05 100644 --- a/sandbox/plugins/composite-engine/src/main/java/org/opensearch/composite/CompositeDataFormatPlugin.java +++ b/sandbox/plugins/composite-engine/src/main/java/org/opensearch/composite/CompositeDataFormatPlugin.java @@ -163,7 +163,7 @@ public class CompositeDataFormatPlugin extends Plugin implements DataFormatPlugi */ public static final Setting MERGE_ON_REFRESH_MAX_SIZE = Setting.byteSizeSetting( "index.composite.merge_on_refresh_max_size", - new ByteSizeValue(10, ByteSizeUnit.MB), + new ByteSizeValue(0, ByteSizeUnit.MB), Setting.Property.IndexScope, Setting.Property.Dynamic ); diff --git a/sandbox/plugins/parquet-data-format/benchmarks/src/main/java/org/opensearch/parquet/benchmark/VSRRotationBenchmark.java b/sandbox/plugins/parquet-data-format/benchmarks/src/main/java/org/opensearch/parquet/benchmark/VSRRotationBenchmark.java index 8a65d5fe52734..0986a8790f2e5 100644 --- a/sandbox/plugins/parquet-data-format/benchmarks/src/main/java/org/opensearch/parquet/benchmark/VSRRotationBenchmark.java +++ b/sandbox/plugins/parquet-data-format/benchmarks/src/main/java/org/opensearch/parquet/benchmark/VSRRotationBenchmark.java @@ -11,6 +11,9 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.opensearch.Version; +import org.opensearch.arrow.allocator.ArrowNativeAllocator; +import org.opensearch.arrow.spi.NativeAllocatorPoolConfig; +import org.opensearch.arrow.spi.PoolGroup; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.settings.Settings; import org.opensearch.index.IndexSettings; @@ -79,7 +82,7 @@ public class VSRRotationBenchmark { private ThreadPool threadPool; private ArrowBufferPool 
bufferPool; - private org.opensearch.arrow.allocator.ArrowNativeAllocator nativeAllocator; + private ArrowNativeAllocator nativeAllocator; private Schema schema; private List fieldTypes; private VSRManager vsrManager; @@ -126,8 +129,8 @@ public void setupTrial() { @Setup(Level.Invocation) public void setup() throws IOException { - nativeAllocator = new org.opensearch.arrow.allocator.ArrowNativeAllocator(Long.MAX_VALUE); - nativeAllocator.getOrCreatePool(org.opensearch.arrow.spi.NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE); + nativeAllocator = new ArrowNativeAllocator(); + nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, PoolGroup.INDEXING); bufferPool = new ArrowBufferPool(Settings.EMPTY, nativeAllocator); filePath = Path.of(System.getProperty("java.io.tmpdir"), "benchmark_vsr_" + System.nanoTime() + ".parquet").toString(); Settings idxSettings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT).build(); diff --git a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetDataFormatPlugin.java b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetDataFormatPlugin.java index aa38e5a2f9455..d3a815888e5f7 100644 --- a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetDataFormatPlugin.java +++ b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetDataFormatPlugin.java @@ -9,8 +9,11 @@ package org.opensearch.parquet; import org.opensearch.arrow.allocator.ArrowNativeAllocator; +import org.opensearch.arrow.spi.NativeAllocator; +import org.opensearch.arrow.spi.PoolGroup; import org.opensearch.cluster.metadata.IndexNameExpressionResolver; import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.settings.ClusterSettings; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Settings; import 
org.opensearch.common.util.concurrent.OpenSearchExecutors; @@ -28,6 +31,7 @@ import org.opensearch.index.engine.dataformat.IndexingExecutionEngine; import org.opensearch.index.engine.dataformat.StoreStrategy; import org.opensearch.index.store.PrecomputedChecksumStrategy; +import org.opensearch.parquet.bridge.RustBridge; import org.opensearch.parquet.engine.ParquetDataFormat; import org.opensearch.parquet.engine.ParquetIndexingEngine; import org.opensearch.parquet.fields.ArrowSchemaBuilder; @@ -100,8 +104,61 @@ public Collection createComponents( ) { this.settings = clusterService.getSettings(); this.threadPool = threadPool; - this.nativeAllocator = pluginComponentRegistry.getComponent(ArrowNativeAllocator.class) - .orElseThrow(() -> new IllegalStateException("ArrowNativeAllocator not available; arrow-base plugin must be installed")); + this.nativeAllocator = pluginComponentRegistry.getComponent(ArrowNativeAllocator.class).orElse(null); + + // Initialize native write/merge memory pools + long writeMax = ParquetSettings.WRITE_POOL_MAX.get(this.settings); + long mergeMax = ParquetSettings.MERGE_POOL_MAX.get(this.settings); + RustBridge.initMemoryPools(writeMax, mergeMax); + + // Register virtual pools if allocator is available (arrow-base loaded) + if (nativeAllocator != null) { + NativeAllocator.VirtualPoolHandle writePool = nativeAllocator.registerVirtualPool( + ParquetSettings.POOL_WRITE, + ParquetSettings.WRITE_POOL_MIN.get(this.settings), + writeMax, + PoolGroup.INDEXING, + newLimit -> RustBridge.setWritePoolLimit(newLimit) + ); + NativeAllocator.VirtualPoolHandle mergePool = nativeAllocator.registerVirtualPool( + ParquetSettings.POOL_MERGE, + ParquetSettings.MERGE_POOL_MIN.get(this.settings), + mergeMax, + PoolGroup.MERGE, + newLimit -> RustBridge.setMergePoolLimit(newLimit) + ); + + // Wire dynamic setting consumers via allocator + ClusterSettings cs = clusterService.getClusterSettings(); + cs.addSettingsUpdateConsumer( + ParquetSettings.WRITE_POOL_MAX, + 
newMax -> nativeAllocator.setPoolLimit(ParquetSettings.POOL_WRITE, newMax) + ); + cs.addSettingsUpdateConsumer( + ParquetSettings.WRITE_POOL_MIN, + newMin -> nativeAllocator.setPoolMin(ParquetSettings.POOL_WRITE, newMin) + ); + cs.addSettingsUpdateConsumer( + ParquetSettings.MERGE_POOL_MAX, + newMax -> nativeAllocator.setPoolLimit(ParquetSettings.POOL_MERGE, newMax) + ); + cs.addSettingsUpdateConsumer( + ParquetSettings.MERGE_POOL_MIN, + newMin -> nativeAllocator.setPoolMin(ParquetSettings.POOL_MERGE, newMin) + ); + + nativeAllocator.addStatsRefresher(() -> { + long[] s = RustBridge.getPoolStats(); + writePool.updateStats(s[1], s[2]); + mergePool.updateStats(s[4], s[5]); + }); + } else { + // No allocator — wire dynamic consumers directly to Rust pools + ClusterSettings cs = clusterService.getClusterSettings(); + cs.addSettingsUpdateConsumer(ParquetSettings.WRITE_POOL_MAX, newMax -> RustBridge.setWritePoolLimit(newMax)); + cs.addSettingsUpdateConsumer(ParquetSettings.MERGE_POOL_MAX, newMax -> RustBridge.setMergePoolLimit(newMax)); + } + return Collections.emptyList(); } diff --git a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetSettings.java b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetSettings.java index 643b9809f0367..af1e85193edc2 100644 --- a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetSettings.java +++ b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetSettings.java @@ -18,6 +18,7 @@ import org.opensearch.common.settings.Settings; import org.opensearch.core.common.unit.ByteSizeUnit; import org.opensearch.core.common.unit.ByteSizeValue; +import org.opensearch.node.resource.tracker.ResourceTrackerSettings; import java.util.Collections; import java.util.HashMap; @@ -34,6 +35,9 @@ public final class ParquetSettings { private ParquetSettings() {} + public static final String POOL_WRITE = "write"; + public static final String 
POOL_MERGE = "merge"; + public static final String DEFAULT_MAX_NATIVE_ALLOCATION = "10%"; public static final int DEFAULT_MAX_ROWS_PER_VSR = 65536; @@ -168,6 +172,93 @@ private ParquetSettings() {} Setting.Property.NodeScope ); + /** Minimum guaranteed bytes for the native write pool. Default is 2% of node.native_memory.limit (0 when that limit is unset). */ + public static final Setting WRITE_POOL_MIN = new Setting<>( + "parquet.native.pool.write.min", + s -> derivePoolMinDefault(s, 2), + s -> { + long v = Long.parseLong(s); + if (v < 0) { + throw new IllegalArgumentException("Setting [parquet.native.pool.write.min] must be >= 0, got " + v); + } + return v; + }, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + + /** Maximum bytes the native write pool can burst to. Default is 5% of node.native_memory.limit. */ + public static final Setting WRITE_POOL_MAX = new Setting<>( + "parquet.native.pool.write.max", + s -> derivePoolMaxDefault(s, 5), + s -> { + long v = Long.parseLong(s); + if (v < 0) { + throw new IllegalArgumentException("Setting [parquet.native.pool.write.max] must be >= 0, got " + v); + } + return v; + }, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + + /** Minimum guaranteed bytes for the native merge pool. Default is 1% of node.native_memory.limit (0 when that limit is unset). */ + public static final Setting MERGE_POOL_MIN = new Setting<>( + "parquet.native.pool.merge.min", + s -> derivePoolMinDefault(s, 1), + s -> { + long v = Long.parseLong(s); + if (v < 0) { + throw new IllegalArgumentException("Setting [parquet.native.pool.merge.min] must be >= 0, got " + v); + } + return v; + }, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + + /** Maximum bytes the native merge pool can burst to. Default is 3% of node.native_memory.limit.
*/ + public static final Setting MERGE_POOL_MAX = new Setting<>( + "parquet.native.pool.merge.max", + s -> derivePoolMaxDefault(s, 3), + s -> { + long v = Long.parseLong(s); + if (v < 0) { + throw new IllegalArgumentException("Setting [parquet.native.pool.merge.max] must be >= 0, got " + v); + } + return v; + }, + Setting.Property.NodeScope, + Setting.Property.Dynamic + ); + + /** + * Computes the default for a pool max as a percentage of + * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}. + * Falls back to {@link Long#MAX_VALUE} when that limit is unset (<= 0 bytes). + */ + static String derivePoolMaxDefault(Settings settings, int percent) { + ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings); + if (nativeLimit.getBytes() <= 0) { + return Long.toString(Long.MAX_VALUE); + } + long pool = Math.max(0L, nativeLimit.getBytes() * percent / 100); + return Long.toString(pool); + } + + /** + * Computes the default for a pool min as a percentage of + * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}. + * Returns 0 when that limit is unset (<= 0 bytes), unlike max which returns Long.MAX_VALUE.
+ */ + static String derivePoolMinDefault(Settings settings, int percent) { + ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings); + if (nativeLimit.getBytes() <= 0) { + return "0"; + } + return Long.toString(Math.max(0L, nativeLimit.getBytes() * percent / 100)); + } + public static final Set VALID_ENCODINGS = Set.of( "PLAIN", "RLE", @@ -666,6 +757,10 @@ public static List> getSettings() { MERGE_BATCH_SIZE, MERGE_RAYON_THREADS, MERGE_IO_THREADS, + WRITE_POOL_MIN, + WRITE_POOL_MAX, + MERGE_POOL_MIN, + MERGE_POOL_MAX, ENCODING_FIELD_SETTING, ENCODING_VALUE_SETTING, COMPRESSION_FIELD_SETTING, diff --git a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/bridge/RustBridge.java b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/bridge/RustBridge.java index e59a3549a0dd1..d3f99e1aa1a1f 100644 --- a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/bridge/RustBridge.java +++ b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/bridge/RustBridge.java @@ -45,6 +45,10 @@ public class RustBridge { private static final MethodHandle FREE_MERGE_RESULT; private static final MethodHandle READ_AS_JSON; private static final MethodHandle FREE_ROW_ID_MAPPING; + private static final MethodHandle INIT_MEMORY_POOLS; + private static final MethodHandle SET_WRITE_POOL_LIMIT; + private static final MethodHandle SET_MERGE_POOL_LIMIT; + private static final MethodHandle GET_POOL_STATS; static { SymbolLookup lib = NativeLibraryLoader.symbolLookup(); @@ -251,6 +255,22 @@ public class RustBridge { ValueLayout.JAVA_LONG // mapping_len ) ); + INIT_MEMORY_POOLS = linker.downcallHandle( + lib.find("parquet_init_memory_pools").orElseThrow(), + FunctionDescriptor.ofVoid(ValueLayout.JAVA_LONG, ValueLayout.JAVA_LONG) + ); + SET_WRITE_POOL_LIMIT = linker.downcallHandle( + lib.find("parquet_set_write_pool_limit").orElseThrow(), + 
FunctionDescriptor.ofVoid(ValueLayout.JAVA_LONG) + ); + SET_MERGE_POOL_LIMIT = linker.downcallHandle( + lib.find("parquet_set_merge_pool_limit").orElseThrow(), + FunctionDescriptor.ofVoid(ValueLayout.JAVA_LONG) + ); + GET_POOL_STATS = linker.downcallHandle( + lib.find("parquet_get_pool_stats").orElseThrow(), + FunctionDescriptor.ofVoid(ValueLayout.ADDRESS) + ); } public static void initLogger() {} @@ -688,5 +708,25 @@ private static LongMapArrays toLongMapArrays(NativeCall call, Map return new LongMapArrays(call.strArray(keys), seg); } + public static void initMemoryPools(long writeLimit, long mergeLimit) { + NativeCall.invokeVoid(INIT_MEMORY_POOLS, writeLimit, mergeLimit); + } + + public static void setWritePoolLimit(long newLimit) { + NativeCall.invokeVoid(SET_WRITE_POOL_LIMIT, newLimit); + } + + public static void setMergePoolLimit(long newLimit) { + NativeCall.invokeVoid(SET_MERGE_POOL_LIMIT, newLimit); + } + + public static long[] getPoolStats() { + try (var call = new NativeCall()) { + var buf = call.buf(6 * 8); + NativeCall.invokeVoid(GET_POOL_STATS, buf); + return buf.toArray(ValueLayout.JAVA_LONG); + } + } + private RustBridge() {} } diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/src/ffm.rs b/sandbox/plugins/parquet-data-format/src/main/rust/src/ffm.rs index e96d6b70d5b00..9a150f9fe1972 100644 --- a/sandbox/plugins/parquet-data-format/src/main/rust/src/ffm.rs +++ b/sandbox/plugins/parquet-data-format/src/main/rust/src/ffm.rs @@ -694,3 +694,35 @@ pub unsafe extern "C" fn parquet_free_row_id_mapping( let _ = Box::from_raw(slice::from_raw_parts_mut(mapping_ptr as *mut i64, mapping_len as usize)); } } + +// --------------------------------------------------------------------------- +// Memory pool management (Phase 1 stubs) +// --------------------------------------------------------------------------- + +/// Initialize write and merge memory pool counters. 
+#[no_mangle] +pub extern "C" fn parquet_init_memory_pools(write_limit: i64, merge_limit: i64) { + crate::memory::init_pools(write_limit as usize, merge_limit as usize); +} + +/// Set write pool limit. Called by Java rebalancer via FFM. +#[no_mangle] +pub extern "C" fn parquet_set_write_pool_limit(new_limit: i64) { + crate::memory::set_write_limit(new_limit as usize); +} + +/// Set merge pool limit. Called by Java rebalancer via FFM. +#[no_mangle] +pub extern "C" fn parquet_set_merge_pool_limit(new_limit: i64) { + crate::memory::set_merge_limit(new_limit as usize); +} + +/// Get pool stats: writes 6 i64s to out_buf, which must be non-null and point to at least 6 writable i64 slots. +/// Layout: [write_limit, write_used, write_peak, merge_limit, merge_used, merge_peak] +#[no_mangle] +pub unsafe extern "C" fn parquet_get_pool_stats(out_buf: *mut i64) { + let stats = crate::memory::get_stats(); + for (i, val) in stats.iter().enumerate() { + *out_buf.add(i) = *val as i64; + } +} diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/src/lib.rs b/sandbox/plugins/parquet-data-format/src/main/rust/src/lib.rs index 2ce15506f12c4..9a2fac354e97c 100644 --- a/sandbox/plugins/parquet-data-format/src/main/rust/src/lib.rs +++ b/sandbox/plugins/parquet-data-format/src/main/rust/src/lib.rs @@ -14,6 +14,7 @@ mod tests; pub mod writer; pub mod ffm; +pub mod memory; pub mod native_settings; pub mod field_config; pub mod writer_properties_builder; diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/src/memory.rs b/sandbox/plugins/parquet-data-format/src/main/rust/src/memory.rs new file mode 100644 index 0000000000000..d88ee47f8e831 --- /dev/null +++ b/sandbox/plugins/parquet-data-format/src/main/rust/src/memory.rs @@ -0,0 +1,57 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +//! Write and merge memory pools backed by `native_bridge_common::memory_pool::MemoryPool`.
+ +use std::sync::{Arc, OnceLock}; + +use native_bridge_common::memory_pool::MemoryPool; + +static WRITE_POOL: OnceLock> = OnceLock::new(); +static MERGE_POOL: OnceLock> = OnceLock::new(); + +/// Initialize write and merge pools. Expected to be called once from Java; if a pool already exists, later calls leave it and its limit unchanged. +pub fn init_pools(write_limit: usize, merge_limit: usize) { + WRITE_POOL.get_or_init(|| Arc::new(MemoryPool::new("write", write_limit))); + MERGE_POOL.get_or_init(|| Arc::new(MemoryPool::new("merge", merge_limit))); +} + +/// Returns the write pool, or panics if not initialized. +pub fn write_pool() -> &'static Arc { + WRITE_POOL.get().expect("write pool not initialized") +} + +/// Returns the merge pool, or panics if not initialized. +pub fn merge_pool() -> &'static Arc { + MERGE_POOL.get().expect("merge pool not initialized") +} + +pub fn set_write_limit(v: usize) { + if let Some(p) = WRITE_POOL.get() { + p.set_limit(v); + } +} + +pub fn set_merge_limit(v: usize) { + if let Some(p) = MERGE_POOL.get() { + p.set_limit(v); + } +} + +/// Returns [write_limit, write_used, write_peak, merge_limit, merge_used, merge_peak]; a pool that has not been initialized reports zeros.
+pub fn get_stats() -> [usize; 6] { + let w = WRITE_POOL + .get() + .map(|p| (p.limit(), p.used(), p.peak())) + .unwrap_or((0, 0, 0)); + let m = MERGE_POOL + .get() + .map(|p| (p.limit(), p.used(), p.peak())) + .unwrap_or((0, 0, 0)); + [w.0, w.1, w.2, m.0, m.1, m.2] +} diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/src/tests/mod.rs b/sandbox/plugins/parquet-data-format/src/main/rust/src/tests/mod.rs index 1e03fd66ce905..b04b4beb0c3a2 100644 --- a/sandbox/plugins/parquet-data-format/src/main/rust/src/tests/mod.rs +++ b/sandbox/plugins/parquet-data-format/src/main/rust/src/tests/mod.rs @@ -3295,9 +3295,9 @@ fn test_writer_properties_defaults_single_chunk() { assert!(matches!(compression, parquet::basic::Compression::LZ4_RAW), "Default compression should be LZ4_RAW, got: {:?}", compression); - // Default bloom filter is enabled - assert!(has_bloom_filter_in_parquet(&filename), - "Default bloom_filter_enabled should be true"); + // Default bloom filter is disabled + assert!(!has_bloom_filter_in_parquet(&filename), + "Default bloom_filter_enabled should be false"); // Format version always stamped let format_version = read_format_version_from_parquet(&filename); @@ -3339,9 +3339,9 @@ fn test_writer_properties_defaults_multi_chunk() { assert!(matches!(compression, parquet::basic::Compression::LZ4_RAW), "Default compression should be LZ4_RAW in multi-chunk path, got: {:?}", compression); - // Default bloom filter is enabled - assert!(has_bloom_filter_in_parquet(&filename), - "Default bloom_filter_enabled should be true in multi-chunk path"); + // Default bloom filter is disabled + assert!(!has_bloom_filter_in_parquet(&filename), + "Default bloom_filter_enabled should be false in multi-chunk path"); // Format version always stamped let format_version = read_format_version_from_parquet(&filename); diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/src/writer_properties_builder.rs 
b/sandbox/plugins/parquet-data-format/src/main/rust/src/writer_properties_builder.rs index d3954369b2d46..89417020918a1 100644 --- a/sandbox/plugins/parquet-data-format/src/main/rust/src/writer_properties_builder.rs +++ b/sandbox/plugins/parquet-data-format/src/main/rust/src/writer_properties_builder.rs @@ -1043,7 +1043,8 @@ mod tests { #[test] fn test_build_stamps_format_version() { let config = NativeSettings::default(); - let props = WriterPropertiesBuilder::build(&config); + let schema = ArrowSchema::new(Vec::::new()); + let props = WriterPropertiesBuilder::build(&config, &schema).expect("build failed"); let kv = props.key_value_metadata().expect("KV metadata missing"); let found = kv.iter().find(|k| k.key == FORMAT_VERSION_KEY); let entry = found.expect("format_version KV entry missing"); @@ -1053,7 +1054,8 @@ mod tests { #[test] fn test_build_with_generation_stamps_both() { let config = NativeSettings::default(); - let props = WriterPropertiesBuilder::build_with_generation(&config, Some(42)); + let schema = ArrowSchema::new(Vec::::new()); + let props = WriterPropertiesBuilder::build_with_generation(&config, Some(42), &schema).expect("build failed"); let kv = props.key_value_metadata().expect("KV metadata missing"); let has_format = kv.iter().any(|k| k.key == FORMAT_VERSION_KEY && k.value.as_deref() == Some(FORMAT_VERSION)); let has_gen = kv.iter().any(|k| k.key == WRITER_GENERATION_KEY && k.value.as_deref() == Some("42")); diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/tests/writer_integration_tests.rs b/sandbox/plugins/parquet-data-format/src/main/rust/tests/writer_integration_tests.rs index ca74236316ea5..18f56283abede 100644 --- a/sandbox/plugins/parquet-data-format/src/main/rust/tests/writer_integration_tests.rs +++ b/sandbox/plugins/parquet-data-format/src/main/rust/tests/writer_integration_tests.rs @@ -33,8 +33,8 @@ fn test_complete_writer_lifecycle() { assert!(file_path.metadata().unwrap().len() > 0); let read_metadata = 
NativeParquetWriter::get_file_metadata(filename.clone()).unwrap(); - assert_eq!(read_metadata.num_rows(), metadata.metadata.file_metadata().num_rows()); - assert_eq!(read_metadata.version(), metadata.metadata.file_metadata().version()); + assert_eq!(read_metadata.file_metadata().num_rows(), metadata.metadata.file_metadata().num_rows()); + assert_eq!(read_metadata.file_metadata().version(), metadata.metadata.file_metadata().version()); } #[test] @@ -231,7 +231,7 @@ fn test_ipc_staging_sorted_writer_integration() { assert_eq!(ids, vec![10, 20, 30, 40, 50, 60]); let read_metadata = NativeParquetWriter::get_file_metadata(filename).unwrap(); - assert_eq!(read_metadata.num_rows(), 6); + assert_eq!(read_metadata.file_metadata().num_rows(), 6); cleanup_ffi_schema(schema_ptr); } diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetDataFormatAwareEngineTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetDataFormatAwareEngineTests.java index 609a3a9e6c4c9..358d244221c95 100644 --- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetDataFormatAwareEngineTests.java +++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetDataFormatAwareEngineTests.java @@ -13,6 +13,8 @@ import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.lucene.search.Query; +import org.opensearch.arrow.allocator.ArrowNativeAllocator; +import org.opensearch.arrow.spi.NativeAllocatorPoolConfig; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.network.InetAddresses; import org.opensearch.common.settings.Settings; @@ -115,14 +117,14 @@ public Query termQuery(Object value, QueryShardContext context) { }; private Schema schema; - private org.opensearch.arrow.allocator.ArrowNativeAllocator nativeAllocator; + private ArrowNativeAllocator 
nativeAllocator; @Override public void setUp() throws Exception { super.setUp(); RustBridge.initLogger(); - nativeAllocator = new org.opensearch.arrow.allocator.ArrowNativeAllocator(Long.MAX_VALUE); - nativeAllocator.getOrCreatePool(org.opensearch.arrow.spi.NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE); + nativeAllocator = new ArrowNativeAllocator(); + nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null); schema = buildSchema(); } diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetIndexingEngineTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetIndexingEngineTests.java index 54c783e2ac6de..1b6b7f414955e 100644 --- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetIndexingEngineTests.java +++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetIndexingEngineTests.java @@ -58,7 +58,7 @@ public class ParquetIndexingEngineTests extends OpenSearchTestCase { - private org.opensearch.arrow.allocator.ArrowNativeAllocator nativeAllocator; + private ArrowNativeAllocator nativeAllocator; private MappedFieldType idField; private MappedFieldType nameField; private MappedFieldType scoreField; @@ -71,8 +71,8 @@ public class ParquetIndexingEngineTests extends OpenSearchTestCase { public void setUp() throws Exception { super.setUp(); RustBridge.initLogger(); - nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE); - nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE); + nativeAllocator = new ArrowNativeAllocator(); + nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null); idField = new NumberFieldMapper.NumberFieldType("id", NumberFieldMapper.NumberType.INTEGER); nameField = new KeywordFieldMapper.KeywordFieldType("name"); scoreField = new 
NumberFieldMapper.NumberFieldType("score", NumberFieldMapper.NumberType.LONG); diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/memory/ArrowBufferPoolTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/memory/ArrowBufferPoolTests.java index 9e18ad0e79b6a..d242572838129 100644 --- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/memory/ArrowBufferPoolTests.java +++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/memory/ArrowBufferPoolTests.java @@ -24,8 +24,8 @@ public void setUp() throws Exception { super.setUp(); // Each test gets its own allocator with the standard pools pre-created. // Production code receives this via dependency injection; tests build it explicitly. - nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE); - nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE); + nativeAllocator = new ArrowNativeAllocator(); + nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null); } @Override diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRManagerTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRManagerTests.java index c9584463dc6ad..bb7a9bae9f834 100644 --- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRManagerTests.java +++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRManagerTests.java @@ -15,6 +15,7 @@ import org.apache.arrow.vector.types.pojo.Schema; import org.opensearch.Version; import org.opensearch.arrow.allocator.ArrowNativeAllocator; +import org.opensearch.arrow.spi.NativeAllocatorPoolConfig; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.settings.Settings; import org.opensearch.index.IndexSettings; @@ -50,8 +51,8 @@ public class VSRManagerTests extends 
OpenSearchTestCase { public void setUp() throws Exception { super.setUp(); RustBridge.initLogger(); - nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE); - nativeAllocator.getOrCreatePool(org.opensearch.arrow.spi.NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE); + nativeAllocator = new ArrowNativeAllocator(); + nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null); bufferPool = new ArrowBufferPool(Settings.EMPTY, nativeAllocator); schema = new Schema(List.of(new Field("val", FieldType.nullable(new ArrowType.Int(32, true)), null))); Settings indexSettingsBuilder = Settings.builder() diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRPoolTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRPoolTests.java index 8cb2626921fad..80e7af76f4b4b 100644 --- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRPoolTests.java +++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRPoolTests.java @@ -30,8 +30,8 @@ public class VSRPoolTests extends OpenSearchTestCase { @Override public void setUp() throws Exception { super.setUp(); - nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE); - nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE); + nativeAllocator = new ArrowNativeAllocator(); + nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null); bufferPool = new ArrowBufferPool(Settings.EMPTY, nativeAllocator); schema = new Schema(List.of(new Field("val", FieldType.nullable(new ArrowType.Int(32, true)), null))); } diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/writer/ParquetWriterTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/writer/ParquetWriterTests.java index 09bf28a908441..fd76ff204cc9a 100644 --- 
a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/writer/ParquetWriterTests.java +++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/writer/ParquetWriterTests.java @@ -12,6 +12,7 @@ import org.apache.arrow.vector.types.pojo.Schema; import org.opensearch.Version; import org.opensearch.arrow.allocator.ArrowNativeAllocator; +import org.opensearch.arrow.spi.NativeAllocatorPoolConfig; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.settings.Settings; import org.opensearch.index.IndexSettings; @@ -55,8 +56,8 @@ public class ParquetWriterTests extends OpenSearchTestCase { public void setUp() throws Exception { super.setUp(); RustBridge.initLogger(); - nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE); - nativeAllocator.getOrCreatePool(org.opensearch.arrow.spi.NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE); + nativeAllocator = new ArrowNativeAllocator(); + nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null); bufferPool = new ArrowBufferPool(Settings.EMPTY, nativeAllocator); idField = new NumberFieldMapper.NumberFieldType("id", NumberFieldMapper.NumberType.INTEGER); nameField = new KeywordFieldMapper.KeywordFieldType("name"); diff --git a/server/build.gradle b/server/build.gradle index bd14f4b6606d3..ad38c8b21ed02 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -77,6 +77,7 @@ dependencies { compileOnly project(":libs:agent-sm:bootstrap") compileOnly project(':libs:opensearch-plugin-classloader') + api project(":libs:opensearch-arrow-spi") testRuntimeOnly project(':libs:opensearch-plugin-classloader') api libs.bundles.lucene diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java index 7eece7a11595e..3ae869f42d766 100644 --- 
a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java +++ b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java @@ -60,7 +60,6 @@ import org.opensearch.node.AdaptiveSelectionStats; import org.opensearch.node.NodesResourceUsageStats; import org.opensearch.node.remotestore.RemoteStoreNodeStats; -import org.opensearch.plugin.stats.AnalyticsBackendNativeMemoryStats; import org.opensearch.plugin.stats.NativeAllocatorPoolStats; import org.opensearch.plugins.BlockCacheStats; import org.opensearch.ratelimitting.admissioncontrol.stats.AdmissionControlStats; @@ -189,9 +188,6 @@ public class NodeStats extends BaseNodeResponse implements ToXContentFragment { */ private long totalEstimatedNativeBytes; - @Nullable - private AnalyticsBackendNativeMemoryStats nativeMemoryStats; - public NodeStats(StreamInput in) throws IOException { super(in); timestamp = in.readVLong(); @@ -290,11 +286,6 @@ public NodeStats(StreamInput in) throws IOException { } else { nativeAllocatorStats = null; } - if (in.getVersion().onOrAfter(Version.V_3_7_0)) { - nativeMemoryStats = in.readOptionalWriteable(AnalyticsBackendNativeMemoryStats::new); - } else { - nativeMemoryStats = null; - } if (in.getVersion().onOrAfter(Version.V_3_7_0)) { totalEstimatedNativeBytes = in.readLong(); } else { @@ -336,7 +327,6 @@ public NodeStats( @Nullable NodeCacheStats nodeCacheStats, @Nullable RemoteStoreNodeStats remoteStoreNodeStats, @Nullable NativeAllocatorPoolStats nativeAllocatorStats, - @Nullable AnalyticsBackendNativeMemoryStats nativeMemoryStats, long totalEstimatedNativeBytes ) { super(node); @@ -372,7 +362,6 @@ public NodeStats( this.nodeCacheStats = nodeCacheStats; this.remoteStoreNodeStats = remoteStoreNodeStats; this.nativeAllocatorStats = nativeAllocatorStats; - this.nativeMemoryStats = nativeMemoryStats; this.totalEstimatedNativeBytes = totalEstimatedNativeBytes; } @@ -568,14 +557,6 @@ public long getTotalEstimatedNativeBytes() { return 
totalEstimatedNativeBytes; } - /** - * Returns the analytics backend native memory stats, or {@code null} if not available. - */ - @Nullable - public AnalyticsBackendNativeMemoryStats getAnalyticsBackendNativeMemoryStats() { - return nativeMemoryStats; - } - @Override public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); @@ -644,9 +625,6 @@ public void writeTo(StreamOutput out) throws IOException { if (out.getVersion().onOrAfter(Version.V_3_7_0)) { out.writeOptionalWriteable(nativeAllocatorStats); } - if (out.getVersion().onOrAfter(Version.V_3_7_0)) { - out.writeOptionalWriteable(nativeMemoryStats); - } if (out.getVersion().onOrAfter(Version.V_3_7_0)) { out.writeLong(totalEstimatedNativeBytes); } @@ -771,17 +749,20 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws getRemoteStoreNodeStats().toXContent(builder, params); } // total_estimated_bytes ≈ RssAnon - JVM heap committed - JVM non-heap committed. - // Always emit so operators see the per-node value even when no plugin contributes - // an inner stats block. The value is captured on the data node in NodeService.stats() - // and serialized; the coordinator never re-reads its own OsProbe here. + // native_memory: unified view of all native memory pools and jemalloc stats. + // NativeAllocatorPoolStats now includes jemalloc allocated/resident + all pools. 
builder.startObject("native_memory"); builder.field("total_estimated_bytes", totalEstimatedNativeBytes); - if (getAnalyticsBackendNativeMemoryStats() != null) { - getAnalyticsBackendNativeMemoryStats().toXContent(builder, params); - } if (getNativeAllocatorStats() != null) { - builder.startObject("native_allocator"); - getNativeAllocatorStats().toXContent(builder, params); + NativeAllocatorPoolStats stats = getNativeAllocatorStats(); + builder.startObject("runtime"); + builder.field("allocated_bytes", stats.getNativeAllocatedBytes()); + builder.field("resident_bytes", stats.getNativeResidentBytes()); + builder.endObject(); + builder.startObject("memory_pools"); + for (var entry : stats.getGroupedStats().entrySet()) { + entry.getValue().toXContent(builder, params); + } builder.endObject(); } builder.endObject(); diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java index 80ef0b6cc6d8e..544caa0b6da78 100644 --- a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java +++ b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java @@ -226,7 +226,6 @@ public enum Metric { ADMISSION_CONTROL("admission_control"), CACHE_STATS("caches"), REMOTE_STORE("remote_store"), - NATIVE_ALLOCATOR("native_allocator"), NATIVE_MEMORY("native_memory"); private String metricName; diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java index 64b0fee32408b..40e8788fc2238 100644 --- a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java +++ b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java @@ -132,7 +132,6 @@ protected NodeStats 
nodeOperation(NodeStatsRequest nodeStatsRequest) { NodesStatsRequest.Metric.ADMISSION_CONTROL.containedIn(metrics), NodesStatsRequest.Metric.CACHE_STATS.containedIn(metrics), NodesStatsRequest.Metric.REMOTE_STORE.containedIn(metrics), - NodesStatsRequest.Metric.NATIVE_ALLOCATOR.containedIn(metrics), NodesStatsRequest.Metric.NATIVE_MEMORY.containedIn(metrics) ); } diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java b/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java index 1d39f635606d7..c8d06034e6fdf 100644 --- a/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java +++ b/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java @@ -201,7 +201,6 @@ protected ClusterStatsNodeResponse nodeOperation(ClusterStatsNodeRequest nodeReq false, false, false, - false, false ); List shardsStats = new ArrayList<>(); diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index 8321ab127bb0a..06596fecbfd6d 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -56,6 +56,7 @@ import org.opensearch.action.search.StreamSearchTransportService; import org.opensearch.action.support.TransportAction; import org.opensearch.action.update.UpdateHelper; +import org.opensearch.arrow.spi.NativeAllocator; import org.opensearch.bootstrap.BootstrapCheck; import org.opensearch.bootstrap.BootstrapContext; import org.opensearch.cluster.ClusterInfoService; @@ -1205,6 +1206,14 @@ protected Node(final Environment initialEnvironment, Collection clas // Add the telemetryAwarePlugin components to the existing pluginComponents collection. 
pluginComponents.addAll(telemetryAwarePluginComponents); + // Extract the NativeAllocator instance (published by ArrowBasePlugin in phase 1) + // so it can be passed to SearchBackEndPlugin.createComponents for virtual pool registration. + final NativeAllocator nativeAllocator = pluginComponents.stream() + .filter(c -> c instanceof NativeAllocator) + .map(c -> (NativeAllocator) c) + .findFirst() + .orElse(null); + @SuppressWarnings("rawtypes") Collection searchBackEndPluginComponents = pluginsService.filterPlugins(SearchBackEndPlugin.class) .stream() @@ -1221,7 +1230,8 @@ protected Node(final Environment initialEnvironment, Collection clas namedWriteableRegistry, clusterModule.getIndexNameExpressionResolver(), repositoriesServiceReference::get, - dataFormatRegistry + dataFormatRegistry, + nativeAllocator ).stream() ) .collect(Collectors.toList()); diff --git a/server/src/main/java/org/opensearch/node/NodeService.java b/server/src/main/java/org/opensearch/node/NodeService.java index 3a7aa0ee0b5dd..1ab06c811b8d3 100644 --- a/server/src/main/java/org/opensearch/node/NodeService.java +++ b/server/src/main/java/org/opensearch/node/NodeService.java @@ -263,7 +263,6 @@ public NodeStats stats( boolean admissionControl, boolean cacheService, boolean remoteStoreNodeStats, - boolean nativeAllocator, boolean nativeMemory ) { // for indices stats we want to include previous allocated shards stats as well (it will @@ -301,8 +300,7 @@ public NodeStats stats( admissionControl ? this.admissionControlService.stats() : null, cacheService ? this.cacheService.stats(indices) : null, remoteStoreNodeStats ? new RemoteStoreNodeStats() : null, - nativeAllocator ? collectNativeAllocatorStats() : null, - nativeMemory ? monitorService.memoryReportingService().nativeStats() : null, + nativeMemory ? collectNativeAllocatorStats() : null, // Always capture the process-level native memory estimate on this data node. 
// Serialized over the wire so the coordinator renders the source node's value, // not its own. Returns -1 on non-Linux platforms or when /proc/self/status is diff --git a/server/src/main/java/org/opensearch/plugin/stats/NativeAllocatorPoolStats.java b/server/src/main/java/org/opensearch/plugin/stats/NativeAllocatorPoolStats.java index 1f27e44d9423d..be8251038a60a 100644 --- a/server/src/main/java/org/opensearch/plugin/stats/NativeAllocatorPoolStats.java +++ b/server/src/main/java/org/opensearch/plugin/stats/NativeAllocatorPoolStats.java @@ -17,57 +17,46 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; /** - * Point-in-time snapshot of native allocator pool statistics for a node. + * Point-in-time snapshot of native memory statistics for a node. * - *

Arrow-agnostic POJO. The plugin that owns the allocator (e.g. {@code arrow-base}) - * constructs instances of this class and exposes them through a - * {@link NativeAllocatorStatsRegistry} component returned from its - * {@code createComponents()}. Server is the type's home so that the cross-module - * dependency from {@code :server} to {@code :libs:opensearch-arrow-spi} is unnecessary, - * mirroring the placement of {@link AnalyticsBackendNativeMemoryStats}. + *

Includes process-wide native memory stats (allocated/resident from jemalloc) + * and per-pool stats for all registered pools (Arrow and virtual). * - *

Renders as the inner body of the {@code native_allocator} object inside - * {@code _nodes/stats[/native_allocator]} — the caller ({@code NodeStats.toXContent}) - * is responsible for opening the {@code native_allocator} wrapper. Each pool exposes - * {@code allocated_bytes}, {@code peak_bytes}, and {@code limit_bytes}; root exposes - * the same. + *

Renders as the body of the {@code native_memory} object inside + * {@code _nodes/stats/native_memory}. * * @opensearch.api */ public class NativeAllocatorPoolStats implements Writeable, ToXContentFragment { - private final long rootAllocatedBytes; - private final long rootPeakBytes; - private final long rootLimitBytes; + private final long nativeAllocatedBytes; + private final long nativeResidentBytes; private final List pools; /** - * Creates a new stats snapshot from the given values. + * Creates a new stats snapshot. * - * @param rootAllocatedBytes current bytes allocated by the root - * @param rootPeakBytes peak bytes ever allocated by the root since process start - * @param rootLimitBytes configured root limit - * @param pools per-pool stats + * @param nativeAllocatedBytes process-wide native allocated bytes (jemalloc), -1 if unavailable + * @param nativeResidentBytes process-wide native resident bytes (jemalloc RSS), -1 if unavailable + * @param pools per-pool stats (Arrow + virtual) */ - public NativeAllocatorPoolStats(long rootAllocatedBytes, long rootPeakBytes, long rootLimitBytes, List pools) { - this.rootAllocatedBytes = rootAllocatedBytes; - this.rootPeakBytes = rootPeakBytes; - this.rootLimitBytes = rootLimitBytes; + public NativeAllocatorPoolStats(long nativeAllocatedBytes, long nativeResidentBytes, List pools) { + this.nativeAllocatedBytes = nativeAllocatedBytes; + this.nativeResidentBytes = nativeResidentBytes; this.pools = Collections.unmodifiableList(pools); } /** * Deserializes from stream. 
- * - * @param in the stream input */ public NativeAllocatorPoolStats(StreamInput in) throws IOException { - this.rootAllocatedBytes = in.readVLong(); - this.rootPeakBytes = in.readVLong(); - this.rootLimitBytes = in.readVLong(); + this.nativeAllocatedBytes = in.readLong(); + this.nativeResidentBytes = in.readLong(); int count = in.readVInt(); List list = new ArrayList<>(count); for (int i = 0; i < count; i++) { @@ -78,9 +67,8 @@ public NativeAllocatorPoolStats(StreamInput in) throws IOException { @Override public void writeTo(StreamOutput out) throws IOException { - out.writeVLong(rootAllocatedBytes); - out.writeVLong(rootPeakBytes); - out.writeVLong(rootLimitBytes); + out.writeLong(nativeAllocatedBytes); + out.writeLong(nativeResidentBytes); out.writeVInt(pools.size()); for (PoolStats pool : pools) { pool.writeTo(out); @@ -89,11 +77,8 @@ public void writeTo(StreamOutput out) throws IOException { @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { - builder.startObject("root"); - builder.field("allocated_bytes", rootAllocatedBytes); - builder.field("peak_bytes", rootPeakBytes); - builder.field("limit_bytes", rootLimitBytes); - builder.endObject(); + builder.field("allocated_bytes", nativeAllocatedBytes); + builder.field("resident_bytes", nativeResidentBytes); builder.startObject("pools"); for (PoolStats pool : pools) { @@ -103,19 +88,14 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws return builder; } - /** Returns the root allocator's currently allocated bytes. */ - public long getRootAllocatedBytes() { - return rootAllocatedBytes; - } - - /** Returns the root allocator's peak allocated bytes since process start. */ - public long getRootPeakBytes() { - return rootPeakBytes; + /** Returns process-wide native allocated bytes, or -1 if unavailable. 
*/ + public long getNativeAllocatedBytes() { + return nativeAllocatedBytes; } - /** Returns the root allocator's configured limit in bytes. */ - public long getRootLimitBytes() { - return rootLimitBytes; + /** Returns process-wide native resident bytes (RSS), or -1 if unavailable. */ + public long getNativeResidentBytes() { + return nativeResidentBytes; } /** Returns the per-pool statistics. */ @@ -123,6 +103,26 @@ public List getPools() { return pools; } + /** Returns stats grouped by pool group. Pools without a group use their name as the key. */ + public Map getGroupedStats() { + // [allocated, peak, limit] — peak uses max (highest watermark) rather than sum + // because individual pool peaks are not additive (they occur at different times). + Map grouped = new LinkedHashMap<>(); + for (PoolStats pool : pools) { + String g = pool.getGroup() != null ? pool.getGroup() : pool.getName(); + grouped.merge( + g, + new long[] { pool.getAllocatedBytes(), pool.getPeakBytes(), pool.getLimitBytes() }, + (a, b) -> new long[] { a[0] + b[0], Math.max(a[1], b[1]), a[2] + b[2] } + ); + } + Map result = new LinkedHashMap<>(); + for (var e : grouped.entrySet()) { + result.put(e.getKey(), new PoolStats(e.getKey(), e.getValue()[0], e.getValue()[1], e.getValue()[2])); + } + return result; + } + /** * Per-pool statistics snapshot. */ @@ -132,32 +132,29 @@ public static class PoolStats implements Writeable, ToXContentFragment { private final long allocatedBytes; private final long peakBytes; private final long limitBytes; + private final String group; + private final long minBytes; - /** - * Creates a new pool stats snapshot. 
- * - * @param name pool name - * @param allocatedBytes current allocated bytes - * @param peakBytes peak bytes ever allocated since process start - * @param limitBytes configured limit - */ public PoolStats(String name, long allocatedBytes, long peakBytes, long limitBytes) { + this(name, allocatedBytes, peakBytes, limitBytes, null, 0L); + } + + public PoolStats(String name, long allocatedBytes, long peakBytes, long limitBytes, String group, long minBytes) { this.name = name; this.allocatedBytes = allocatedBytes; this.peakBytes = peakBytes; this.limitBytes = limitBytes; + this.group = group; + this.minBytes = minBytes; } - /** - * Deserializes from stream. - * - * @param in the stream input - */ public PoolStats(StreamInput in) throws IOException { this.name = in.readString(); this.allocatedBytes = in.readVLong(); this.peakBytes = in.readVLong(); this.limitBytes = in.readVLong(); + this.group = in.readOptionalString(); + this.minBytes = in.readVLong(); } @Override @@ -166,36 +163,45 @@ public void writeTo(StreamOutput out) throws IOException { out.writeVLong(allocatedBytes); out.writeVLong(peakBytes); out.writeVLong(limitBytes); + out.writeOptionalString(group); + out.writeVLong(minBytes); } @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(name); builder.field("allocated_bytes", allocatedBytes); - builder.field("peak_bytes", peakBytes); builder.field("limit_bytes", limitBytes); + builder.field("min_bytes", minBytes); + if (group != null) { + builder.field("group", group); + } builder.endObject(); return builder; } - /** Returns the pool name. */ public String getName() { return name; } - /** Returns the currently allocated bytes. */ public long getAllocatedBytes() { return allocatedBytes; } - /** Returns the peak allocated bytes since process start. */ public long getPeakBytes() { return peakBytes; } - /** Returns the configured limit in bytes. 
*/ public long getLimitBytes() { return limitBytes; } + + public String getGroup() { + return group; + } + + public long getMinBytes() { + return minBytes; + } } } diff --git a/server/src/main/java/org/opensearch/plugins/SearchBackEndPlugin.java b/server/src/main/java/org/opensearch/plugins/SearchBackEndPlugin.java index 1761a0c35d9b7..c103c7dff10a2 100644 --- a/server/src/main/java/org/opensearch/plugins/SearchBackEndPlugin.java +++ b/server/src/main/java/org/opensearch/plugins/SearchBackEndPlugin.java @@ -8,6 +8,7 @@ package org.opensearch.plugins; +import org.opensearch.arrow.spi.NativeAllocator; import org.opensearch.cluster.metadata.IndexNameExpressionResolver; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.Nullable; @@ -105,6 +106,44 @@ default Collection createComponents( return Collections.emptyList(); } + /** + * Extended variant that also receives the unified native memory allocator. + * Plugins that need to register virtual pools (e.g., DataFusion) override this method. + * The default delegates to the original method for backwards compatibility. 
+ * + * @param nativeAllocator the unified native allocator, or null if arrow-base is not installed + */ + default Collection createComponents( + Client client, + ClusterService clusterService, + ThreadPool threadPool, + ResourceWatcherService resourceWatcherService, + ScriptService scriptService, + NamedXContentRegistry xContentRegistry, + Environment environment, + NodeEnvironment nodeEnvironment, + NamedWriteableRegistry namedWriteableRegistry, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier repositoriesServiceSupplier, + DataFormatRegistry dataFormatRegistry, + @Nullable NativeAllocator nativeAllocator + ) { + return createComponents( + client, + clusterService, + threadPool, + resourceWatcherService, + scriptService, + xContentRegistry, + environment, + nodeEnvironment, + namedWriteableRegistry, + indexNameExpressionResolver, + repositoriesServiceSupplier, + dataFormatRegistry + ); + } + /** * Returns a supplier for native task cancellation stats, or {@code null} if not available. *

diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/AnalyticsBackendNativeMemoryStatsVersionGateTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/AnalyticsBackendNativeMemoryStatsVersionGateTests.java deleted file mode 100644 index c2dc2424d74ab..0000000000000 --- a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/AnalyticsBackendNativeMemoryStatsVersionGateTests.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.action.admin.cluster.node.stats; - -import org.opensearch.Version; -import org.opensearch.cluster.node.DiscoveryNode; -import org.opensearch.common.io.stream.BytesStreamOutput; -import org.opensearch.core.common.io.stream.StreamInput; -import org.opensearch.plugin.stats.AnalyticsBackendNativeMemoryStats; -import org.opensearch.test.OpenSearchTestCase; - -import java.io.IOException; - -import static java.util.Collections.emptyMap; -import static java.util.Collections.emptySet; - -/** - * Property-based tests for version-gated serialization of {@link AnalyticsBackendNativeMemoryStats} - * within {@link NodeStats}. - * - * Verifies that when NodeStats containing a non-null AnalyticsBackendNativeMemoryStats is serialized - * to a stream with a version older than V_3_7_0, the deserialized NodeStats has - * nativeMemoryStats == null. Conversely, when serialized to V_3_7_0 or later, the - * AnalyticsBackendNativeMemoryStats is preserved. - */ -public class AnalyticsBackendNativeMemoryStatsVersionGateTests extends OpenSearchTestCase { - - /** - * Property 3: Version-gated serialization preserves null for old versions. 
- * - * For any NodeStats containing a non-null AnalyticsBackendNativeMemoryStats, serializing to a stream - * with version older than the native-memory support version (V_3_7_0) and then - * deserializing SHALL yield a NodeStats with nativeMemoryStats == null. - * - * Validates: Requirements 3.4, 3.5 - */ - public void testVersionGatedSerializationOmitsAnalyticsBackendNativeMemoryStatsForOldVersions() throws IOException { - for (int i = 0; i < 100; i++) { - long allocatedBytes = randomLongBetween(Long.MIN_VALUE, Long.MAX_VALUE); - long residentBytes = randomLongBetween(Long.MIN_VALUE, Long.MAX_VALUE); - AnalyticsBackendNativeMemoryStats nativeMemoryStats = new AnalyticsBackendNativeMemoryStats(allocatedBytes, residentBytes); - - NodeStats nodeStats = createNodeStatsWithNativeMemory(nativeMemoryStats); - - // Serialize with a version older than V_3_7_0 - try (BytesStreamOutput out = new BytesStreamOutput()) { - out.setVersion(Version.V_2_18_0); - nodeStats.writeTo(out); - - try (StreamInput in = out.bytes().streamInput()) { - in.setVersion(Version.V_2_18_0); - NodeStats deserialized = new NodeStats(in); - - assertNull( - "nativeMemoryStats should be null when deserialized from version < V_3_7_0, " - + "iteration " - + i - + " with values [" - + allocatedBytes - + ", " - + residentBytes - + "]", - deserialized.getAnalyticsBackendNativeMemoryStats() - ); - } - } - } - } - - /** - * Positive case: Version-gated serialization preserves AnalyticsBackendNativeMemoryStats for V_3_7_0+. - * - * For any NodeStats containing a non-null AnalyticsBackendNativeMemoryStats, serializing to a stream - * with version V_3_7_0 or later and then deserializing SHALL yield a NodeStats with - * nativeMemoryStats containing the original values. 
- * - * Validates: Requirements 3.4, 3.5 - */ - public void testVersionGatedSerializationPreservesAnalyticsBackendNativeMemoryStatsForCurrentVersion() throws IOException { - for (int i = 0; i < 100; i++) { - long allocatedBytes = randomLongBetween(Long.MIN_VALUE, Long.MAX_VALUE); - long residentBytes = randomLongBetween(Long.MIN_VALUE, Long.MAX_VALUE); - AnalyticsBackendNativeMemoryStats nativeMemoryStats = new AnalyticsBackendNativeMemoryStats(allocatedBytes, residentBytes); - - NodeStats nodeStats = createNodeStatsWithNativeMemory(nativeMemoryStats); - - // Serialize with V_3_7_0 (the version that introduced native memory support) - try (BytesStreamOutput out = new BytesStreamOutput()) { - out.setVersion(Version.V_3_7_0); - nodeStats.writeTo(out); - - try (StreamInput in = out.bytes().streamInput()) { - in.setVersion(Version.V_3_7_0); - NodeStats deserialized = new NodeStats(in); - - assertNotNull( - "nativeMemoryStats should be non-null when deserialized from version >= V_3_7_0, " + "iteration " + i, - deserialized.getAnalyticsBackendNativeMemoryStats() - ); - assertEquals( - "allocatedBytes mismatch on iteration " + i, - allocatedBytes, - deserialized.getAnalyticsBackendNativeMemoryStats().getAllocatedBytes() - ); - assertEquals( - "residentBytes mismatch on iteration " + i, - residentBytes, - deserialized.getAnalyticsBackendNativeMemoryStats().getResidentBytes() - ); - } - } - } - } - - /** - * Creates a minimal NodeStats with the given AnalyticsBackendNativeMemoryStats and all other fields null. - * Uses the current version for the DiscoveryNode. 
- */ - private NodeStats createNodeStatsWithNativeMemory(AnalyticsBackendNativeMemoryStats nativeMemoryStats) { - DiscoveryNode node = new DiscoveryNode("test_node", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT); - - return new NodeStats( - node, - System.currentTimeMillis(), - null, // indices - null, // os - null, // process - null, // jvm - null, // threadPool - null, // fs - null, // transport - null, // http - null, // breaker - null, // scriptStats - null, // discoveryStats - null, // ingestStats - null, // adaptiveSelectionStats - null, // resourceUsageStats - null, // scriptCacheStats - null, // indexingPressureStats - null, // shardIndexingPressureStats - null, // searchBackpressureStats - null, // clusterManagerThrottlingStats - null, // weightedRoutingStats - null, // fileCacheStats - null, // fileCacheOnlyStats - null, // blockCacheOnlyStats - null, // taskCancellationStats - null, // searchPipelineStats - null, // segmentReplicationRejectionStats - null, // repositoriesStats - null, // admissionControlStats - null, // nodeCacheStats - null, // remoteStoreNodeStats - null, // nativeAllocator - nativeMemoryStats, - -1L // totalEstimatedNativeBytes - ); - } -} diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java index 05f27c3f98562..7d9f77f1d0bd8 100644 --- a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java +++ b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java @@ -1056,7 +1056,6 @@ public long getLastSuccessfulFetchOfPinnedTimestamps() { nodeCacheStats, remoteStoreNodeStats, null, - null, -1L ); } @@ -1524,7 +1523,6 @@ public void testNativeAllocatorStatsBwcEmptyOnOldVersion() throws IOException { NativeAllocatorPoolStats stats = new NativeAllocatorPoolStats( 1024L, 2048L, - 8192L, List.of(new 
NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L)) ); DiscoveryNode node = new DiscoveryNode("node1", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT); @@ -1554,7 +1552,6 @@ public void testNativeAllocatorStatsRoundTripCurrentVersion() throws IOException NativeAllocatorPoolStats stats = new NativeAllocatorPoolStats( 1024L, 2048L, - 8192L, List.of( new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L), new NativeAllocatorPoolStats.PoolStats("ingest", 200L, 400L, 4096L), @@ -1572,9 +1569,8 @@ public void testNativeAllocatorStatsRoundTripCurrentVersion() throws IOException NodeStats roundtripped = new NodeStats(in); NativeAllocatorPoolStats decoded = roundtripped.getNativeAllocatorStats(); assertNotNull("native allocator stats must round-trip on current wire version", decoded); - assertEquals(1024L, decoded.getRootAllocatedBytes()); - assertEquals(2048L, decoded.getRootPeakBytes()); - assertEquals(8192L, decoded.getRootLimitBytes()); + assertEquals(1024L, decoded.getNativeAllocatedBytes()); + assertEquals(2048L, decoded.getNativeResidentBytes()); assertEquals(3, decoded.getPools().size()); assertEquals("flight", decoded.getPools().get(0).getName()); assertEquals(100L, decoded.getPools().get(0).getAllocatedBytes()); @@ -1586,15 +1582,13 @@ public void testNativeAllocatorStatsRoundTripCurrentVersion() throws IOException /** * Renders {@code NodeStats.toXContent} when {@code nativeAllocatorStats} is non-null and - * asserts the JSON shape: a top-level {@code native_memory.native_allocator} block with - * the SPI's inner {@code root}/{@code pools.} structure. Covers the conditional - * branch in {@code NodeStats.toXContent} that opens the {@code native_allocator} wrapper. + * asserts the JSON shape: a top-level {@code native_memory} block with + * {@code runtime.allocated_bytes}/{@code runtime.resident_bytes} and grouped {@code memory_pools}. 
*/ public void testNativeAllocatorStatsXContentRendersInsideNativeMemory() throws IOException { NativeAllocatorPoolStats stats = new NativeAllocatorPoolStats( 1024L, 2048L, - 8192L, List.of(new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L)) ); DiscoveryNode node = new DiscoveryNode("node1", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT); @@ -1608,20 +1602,22 @@ public void testNativeAllocatorStatsXContentRendersInsideNativeMemory() throws I @SuppressWarnings("unchecked") Map nativeMemory = (Map) root.get("native_memory"); assertNotNull("native_memory wrapper must be opened when allocator stats are present", nativeMemory); + + // Runtime stats are nested under "runtime" @SuppressWarnings("unchecked") - Map nativeAllocator = (Map) nativeMemory.get("native_allocator"); - assertNotNull("native_allocator block must be present", nativeAllocator); - @SuppressWarnings("unchecked") - Map rootBlock = (Map) nativeAllocator.get("root"); - assertEquals(1024L, ((Number) rootBlock.get("allocated_bytes")).longValue()); - assertEquals(2048L, ((Number) rootBlock.get("peak_bytes")).longValue()); - assertEquals(8192L, ((Number) rootBlock.get("limit_bytes")).longValue()); + Map runtime = (Map) nativeMemory.get("runtime"); + assertNotNull("runtime block must be present", runtime); + assertEquals(1024L, ((Number) runtime.get("allocated_bytes")).longValue()); + assertEquals(2048L, ((Number) runtime.get("resident_bytes")).longValue()); + + // Pools are grouped under "memory_pools" @SuppressWarnings("unchecked") - Map pools = (Map) nativeAllocator.get("pools"); + Map pools = (Map) nativeMemory.get("memory_pools"); + assertNotNull("memory_pools block must be present", pools); @SuppressWarnings("unchecked") Map flight = (Map) pools.get("flight"); + assertNotNull("flight pool must be present in memory_pools", flight); assertEquals(100L, ((Number) flight.get("allocated_bytes")).longValue()); - assertEquals(200L, ((Number) 
flight.get("peak_bytes")).longValue()); assertEquals(2048L, ((Number) flight.get("limit_bytes")).longValue()); } @@ -1704,7 +1700,6 @@ private static NodeStats newNodeStatsWithNativeAllocator( null, // nodeCacheStats null, nativeAllocatorStats, - null, totalEstimatedNativeBytes ); } diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsNodesTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsNodesTests.java index 9820102840829..fcdd26912848f 100644 --- a/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsNodesTests.java +++ b/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsNodesTests.java @@ -354,7 +354,6 @@ private ClusterStatsNodeResponse createClusterStatsNodeResponse( null, null, null, // nativeAllocator - null, -1L // totalEstimatedNativeBytes ); if (defaultBehavior) { diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsResponseTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsResponseTests.java index b1ae92df3793c..7ff9dd3d6a89e 100644 --- a/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsResponseTests.java +++ b/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsResponseTests.java @@ -226,7 +226,6 @@ private ClusterStatsNodeResponse createClusterStatsNodeResponse(DiscoveryNode no null, null, null, // nativeAllocator - null, -1L // totalEstimatedNativeBytes ); return new ClusterStatsNodeResponse(node, null, nodeInfo, nodeStats, shardStats); diff --git a/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java b/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java index ec689d6554b33..2bf4b1fcd8b94 100644 --- a/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java +++ b/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java @@ -216,7 +216,6 @@ public void testFillDiskUsage() { null, 
null, null, // nativeAllocator - null, -1L // totalEstimatedNativeBytes ), new NodeStats( @@ -253,7 +252,6 @@ public void testFillDiskUsage() { null, null, null, // nativeAllocator - null, -1L // totalEstimatedNativeBytes ), new NodeStats( @@ -290,7 +288,6 @@ public void testFillDiskUsage() { null, null, null, // nativeAllocator - null, -1L // totalEstimatedNativeBytes ) ); @@ -358,7 +355,6 @@ public void testFillDiskUsageSomeInvalidValues() { null, null, null, // nativeAllocator - null, -1L // totalEstimatedNativeBytes ), new NodeStats( @@ -395,7 +391,6 @@ public void testFillDiskUsageSomeInvalidValues() { null, null, null, // nativeAllocator - null, -1L // totalEstimatedNativeBytes ), new NodeStats( @@ -432,7 +427,6 @@ public void testFillDiskUsageSomeInvalidValues() { null, null, null, // nativeAllocator - null, -1L // totalEstimatedNativeBytes ) ); @@ -529,7 +523,6 @@ private NodeStats makeNodeStatsWithResourceUsage(DiscoveryNode node, NodesResour null, null, null, - null, -1L ); diff --git a/server/src/test/java/org/opensearch/node/NodeServiceNativeMemoryTests.java b/server/src/test/java/org/opensearch/node/NodeServiceNativeMemoryTests.java index 2598165dc6dcc..e69eefd2be669 100644 --- a/server/src/test/java/org/opensearch/node/NodeServiceNativeMemoryTests.java +++ b/server/src/test/java/org/opensearch/node/NodeServiceNativeMemoryTests.java @@ -16,19 +16,13 @@ import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.settings.Settings; import org.opensearch.common.settings.SettingsFilter; -import org.opensearch.common.xcontent.XContentHelper; -import org.opensearch.common.xcontent.json.JsonXContent; import org.opensearch.core.indices.breaker.CircuitBreakerService; -import org.opensearch.core.xcontent.ToXContent; -import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.discovery.Discovery; import org.opensearch.index.IndexingPressureService; import org.opensearch.index.SegmentReplicationStatsTracker; import 
org.opensearch.indices.IndicesService; import org.opensearch.ingest.IngestService; import org.opensearch.monitor.MonitorService; -import org.opensearch.monitor.memory.MemoryReportingService; -import org.opensearch.plugin.stats.AnalyticsBackendNativeMemoryStats; import org.opensearch.plugin.stats.NativeAllocatorPoolStats; import org.opensearch.plugins.PluginsService; import org.opensearch.ratelimitting.admissioncontrol.AdmissionControlService; @@ -43,7 +37,6 @@ import java.util.Collections; import java.util.List; -import java.util.Map; import java.util.function.Supplier; import static org.mockito.Mockito.mock; @@ -52,38 +45,21 @@ /** * Unit tests for NodeService native memory stats delegation logic. *

- * Validates that NodeService correctly delegates to the native memory stats + * Validates that NodeService correctly delegates to the native allocator stats * supplier when nativeMemory=true and the supplier is non-null, * and returns null otherwise. */ public class NodeServiceNativeMemoryTests extends OpenSearchTestCase { - private NodeService createNodeService(AnalyticsBackendNativeMemoryStats nativeStats) { - return createNodeService(nativeStats, null); - } - - private NodeService createNodeService( - AnalyticsBackendNativeMemoryStats nativeStats, - Supplier nativeAllocatorStatsSupplier - ) { + private NodeService createNodeService(Supplier nativeAllocatorStatsSupplier) { TransportService transportService = mock(TransportService.class); DiscoveryNode localNode = new DiscoveryNode("test_node", buildNewFakeTransportAddress(), Version.CURRENT); when(transportService.getLocalNode()).thenReturn(localNode); - ClusterService clusterService = mock(ClusterService.class); - IngestService ingestService = mock(IngestService.class); - SearchPipelineService searchPipelineService = mock(SearchPipelineService.class); - - MemoryReportingService memoryReportingService = mock(MemoryReportingService.class); - when(memoryReportingService.nativeStats()).thenReturn(nativeStats); - - MonitorService monitorService = mock(MonitorService.class); - when(monitorService.memoryReportingService()).thenReturn(memoryReportingService); - return new NodeService( Settings.EMPTY, mock(ThreadPool.class), - monitorService, + mock(MonitorService.class), mock(Discovery.class), transportService, mock(IndicesService.class), @@ -91,16 +67,16 @@ private NodeService createNodeService( mock(CircuitBreakerService.class), mock(ScriptService.class), null, // httpServerTransport - ingestService, - clusterService, + mock(IngestService.class), + mock(ClusterService.class), new SettingsFilter(Collections.emptyList()), null, // responseCollectorService - not needed when adaptiveSelection=false 
mock(SearchTransportService.class), mock(IndexingPressureService.class), null, // aggregationUsageService mock(SearchBackpressureService.class), - searchPipelineService, - null, // fileCache + mock(SearchPipelineService.class), + null, // nodeCacheService mock(TaskCancellationMonitoringService.class), null, // resourceUsageCollectorService mock(SegmentReplicationStatsTracker.class), @@ -115,10 +91,13 @@ private NodeService createNodeService( * Tests that stats() with nativeMemory=true and a non-null supplier * returns the stats from the supplier. */ - public void testStatsWithNativeMemoryTrueAndServicePresent() { - AnalyticsBackendNativeMemoryStats expectedStats = new AnalyticsBackendNativeMemoryStats(1024L, 2048L); - - NodeService nodeService = createNodeService(expectedStats); + public void testStatsWithNativeMemoryTrueAndSupplierPresent() { + NativeAllocatorPoolStats expected = new NativeAllocatorPoolStats( + 1024L, + 2048L, + List.of(new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L)) + ); + NodeService nodeService = createNodeService(() -> expected); NodeStats nodeStats = nodeService.stats( CommonStatsFlags.NONE, @@ -150,21 +129,18 @@ public void testStatsWithNativeMemoryTrueAndServicePresent() { false, // admissionControl false, // cacheService false, // remoteStoreNodeStats - false, // nativeAllocator true // nativeMemory ); - assertNotNull(nodeStats.getAnalyticsBackendNativeMemoryStats()); - assertSame(expectedStats, nodeStats.getAnalyticsBackendNativeMemoryStats()); - assertEquals(1024L, nodeStats.getAnalyticsBackendNativeMemoryStats().getAllocatedBytes()); - assertEquals(2048L, nodeStats.getAnalyticsBackendNativeMemoryStats().getResidentBytes()); + assertNotNull("nativeAllocatorStats should be present when supplier returns non-null", nodeStats.getNativeAllocatorStats()); + assertSame(expected, nodeStats.getNativeAllocatorStats()); } /** - * Tests that stats() with nativeMemory=true and a null supplier - * returns null for the 
nativeMemoryStats field. + * Tests that stats() with nativeMemory=true and no supplier + * returns null for the nativeAllocatorStats field. */ - public void testStatsWithNativeMemoryTrueAndNullService() { + public void testStatsWithNativeMemoryTrueAndNoSupplier() { NodeService nodeService = createNodeService(null); NodeStats nodeStats = nodeService.stats( @@ -197,21 +173,23 @@ public void testStatsWithNativeMemoryTrueAndNullService() { false, // admissionControl false, // cacheService false, // remoteStoreNodeStats - false, // nativeAllocator true // nativeMemory ); - assertNull(nodeStats.getAnalyticsBackendNativeMemoryStats()); + assertNull("nativeAllocatorStats should be null when no supplier registered", nodeStats.getNativeAllocatorStats()); } /** * Tests that stats() with nativeMemory=false returns null for the - * nativeMemoryStats field regardless of whether the supplier is present. + * nativeAllocatorStats field regardless of whether the supplier is present. */ public void testStatsWithNativeMemoryFalse() { - AnalyticsBackendNativeMemoryStats expectedStats = new AnalyticsBackendNativeMemoryStats(4096L, 8192L); - - NodeService nodeService = createNodeService(expectedStats); + NativeAllocatorPoolStats expected = new NativeAllocatorPoolStats( + 4096L, + 8192L, + List.of(new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L)) + ); + NodeService nodeService = createNodeService(() -> expected); NodeStats nodeStats = nodeService.stats( CommonStatsFlags.NONE, @@ -243,225 +221,9 @@ public void testStatsWithNativeMemoryFalse() { false, // admissionControl false, // cacheService false, // remoteStoreNodeStats - false, // nativeAllocator - false // nativeMemory - ); - - assertNull(nodeStats.getAnalyticsBackendNativeMemoryStats()); - } - - /** - * Integration test: verifies that the _nodes/stats/native_memory response format - * contains the expected "native_memory" object with "allocated_bytes" and "resident_bytes" fields. 
- * This ensures the response format is unchanged after the refactor. - */ - @SuppressWarnings("unchecked") - public void testNativeMemoryResponseFormatUnchanged() throws Exception { - AnalyticsBackendNativeMemoryStats expectedStats = new AnalyticsBackendNativeMemoryStats(123456789L, 987654321L); - - NodeService nodeService = createNodeService(expectedStats); - - NodeStats nodeStats = nodeService.stats( - CommonStatsFlags.NONE, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, // fileCacheDetailed - false, - false, - false, - false, - false, - false, - false, - false, - false, // nativeAllocator - true // nativeMemory - ); - - assertNotNull("nativeMemoryStats should be present", nodeStats.getAnalyticsBackendNativeMemoryStats()); - - // Render the parent NodeStats to JSON — NodeStats now opens the `native_memory` - // wrapper, emits `total_estimated_bytes` from OsProbe, then delegates to - // AnalyticsBackendNativeMemoryStats which renders only the `analytics_backend` block. 
- XContentBuilder builder = JsonXContent.contentBuilder(); - builder.startObject(); - nodeStats.toXContent(builder, ToXContent.EMPTY_PARAMS); - builder.endObject(); - String json = builder.toString(); - - Map root = XContentHelper.convertToMap(JsonXContent.jsonXContent, json, false); - - // Verify "native_memory" object is present - assertTrue("Response should contain 'native_memory' key", root.containsKey("native_memory")); - - @SuppressWarnings("unchecked") - Map nativeMemory = (Map) root.get("native_memory"); - assertNotNull("native_memory object should not be null", nativeMemory); - - // Verify nested "analytics_backend" with correct values - assertTrue("native_memory should contain 'analytics_backend'", nativeMemory.containsKey("analytics_backend")); - @SuppressWarnings("unchecked") - Map analyticsBackend = (Map) nativeMemory.get("analytics_backend"); - assertEquals(123456789L, ((Number) analyticsBackend.get("allocated_bytes")).longValue()); - assertEquals(987654321L, ((Number) analyticsBackend.get("resident_bytes")).longValue()); - } - - /** - * Integration test: verifies that when native stats are unavailable (null supplier), - * the response omits the native_memory object entirely. - */ - public void testNativeMemoryOmittedWhenUnavailable() throws Exception { - NodeService nodeService = createNodeService(null); - - NodeStats nodeStats = nodeService.stats( - CommonStatsFlags.NONE, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, // fileCacheDetailed - false, - false, - false, - false, - false, - false, - false, - false, - false, // nativeAllocator - true // nativeMemory - ); - - assertNull("nativeMemoryStats should be null when supplier is null", nodeStats.getAnalyticsBackendNativeMemoryStats()); - } - - /** - * Tests that {@code stats(... 
nativeAllocator=true ...)} invokes the constructor-injected - * {@code Supplier} and surfaces its return value on - * {@link NodeStats#getNativeAllocatorStats()}. Covers the supplier-invocation branch in - * {@code collectNativeAllocatorStats}. - */ - public void testStatsWithNativeAllocatorTrueAndSupplierPresent() { - NativeAllocatorPoolStats expected = new NativeAllocatorPoolStats( - 1024L, - 2048L, - 8192L, - List.of(new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L)) - ); - NodeService nodeService = createNodeService(null, () -> expected); - - NodeStats nodeStats = nodeService.stats( - CommonStatsFlags.NONE, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - true, // nativeAllocator false // nativeMemory ); - assertNotNull("nativeAllocatorStats should be present when supplier returns non-null", nodeStats.getNativeAllocatorStats()); - assertSame(expected, nodeStats.getNativeAllocatorStats()); - } - - /** - * Tests that {@code stats(... nativeAllocator=true ...)} returns {@code null} for the - * allocator stats when no supplier was injected at construction. - */ - public void testStatsWithNativeAllocatorTrueAndNoSupplier() { - NodeService nodeService = createNodeService(null); - // No supplier passed to the factory — defaults to null. 
- - NodeStats nodeStats = nodeService.stats( - CommonStatsFlags.NONE, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - false, - true, // nativeAllocator - false // nativeMemory - ); - - assertNull("nativeAllocatorStats should be null when no supplier registered", nodeStats.getNativeAllocatorStats()); + assertNull("nativeAllocatorStats should be null when nativeMemory=false", nodeStats.getNativeAllocatorStats()); } } diff --git a/server/src/test/java/org/opensearch/plugin/stats/NativeAllocatorPoolStatsTests.java b/server/src/test/java/org/opensearch/plugin/stats/NativeAllocatorPoolStatsTests.java index 8257f58f0466c..a49c784a34e51 100644 --- a/server/src/test/java/org/opensearch/plugin/stats/NativeAllocatorPoolStatsTests.java +++ b/server/src/test/java/org/opensearch/plugin/stats/NativeAllocatorPoolStatsTests.java @@ -25,7 +25,7 @@ public void testSerializationRoundTrip() throws IOException { new NativeAllocatorPoolStats.PoolStats("flight", 1000, 2000, 3000), new NativeAllocatorPoolStats.PoolStats("query", 4000, 5000, 6000) ); - NativeAllocatorPoolStats original = new NativeAllocatorPoolStats(10000, 20000, 30000, pools); + NativeAllocatorPoolStats original = new NativeAllocatorPoolStats(10000, 20000, pools); BytesStreamOutput out = new BytesStreamOutput(); original.writeTo(out); @@ -33,9 +33,8 @@ public void testSerializationRoundTrip() throws IOException { StreamInput in = out.bytes().streamInput(); NativeAllocatorPoolStats deserialized = new NativeAllocatorPoolStats(in); - assertEquals(original.getRootAllocatedBytes(), deserialized.getRootAllocatedBytes()); - assertEquals(original.getRootPeakBytes(), deserialized.getRootPeakBytes()); - assertEquals(original.getRootLimitBytes(), deserialized.getRootLimitBytes()); + assertEquals(original.getNativeAllocatedBytes(), 
deserialized.getNativeAllocatedBytes()); + assertEquals(original.getNativeResidentBytes(), deserialized.getNativeResidentBytes()); assertEquals(original.getPools().size(), deserialized.getPools().size()); for (int i = 0; i < pools.size(); i++) { @@ -49,7 +48,7 @@ public void testSerializationRoundTrip() throws IOException { } public void testEmptyPoolsSerialization() throws IOException { - NativeAllocatorPoolStats original = new NativeAllocatorPoolStats(0, 0, 16000000000L, List.of()); + NativeAllocatorPoolStats original = new NativeAllocatorPoolStats(-1, -1, List.of()); BytesStreamOutput out = new BytesStreamOutput(); original.writeTo(out); @@ -57,23 +56,16 @@ public void testEmptyPoolsSerialization() throws IOException { StreamInput in = out.bytes().streamInput(); NativeAllocatorPoolStats deserialized = new NativeAllocatorPoolStats(in); - assertEquals(0, deserialized.getRootAllocatedBytes()); - assertEquals(0, deserialized.getRootPeakBytes()); - assertEquals(16000000000L, deserialized.getRootLimitBytes()); + assertEquals(-1, deserialized.getNativeAllocatedBytes()); + assertEquals(-1, deserialized.getNativeResidentBytes()); assertTrue(deserialized.getPools().isEmpty()); } - /** - * Asserts the JSON shape: {@code root}/{@code pools.} blocks expose - * {@code allocated_bytes}, {@code peak_bytes}, and {@code limit_bytes}. Caller is - * responsible for the outer {@code native_allocator} wrapper, so this test does - * not expect it. 
- */ public void testToXContent() throws IOException { List pools = List.of( new NativeAllocatorPoolStats.PoolStats("flight", 1024, 1048576, 2147483648L) ); - NativeAllocatorPoolStats stats = new NativeAllocatorPoolStats(4096, 8192, 17179869184L, pools); + NativeAllocatorPoolStats stats = new NativeAllocatorPoolStats(4096, 8192, pools); XContentBuilder builder = JsonXContent.contentBuilder(); builder.startObject(); @@ -81,17 +73,12 @@ public void testToXContent() throws IOException { builder.endObject(); String json = builder.toString(); - assertTrue(json.contains("\"root\"")); + assertTrue(json.contains("\"allocated_bytes\"")); + assertTrue(json.contains("\"resident_bytes\"")); assertTrue(json.contains("\"pools\"")); assertTrue(json.contains("\"flight\"")); - assertTrue(json.contains("\"allocated_bytes\"")); - assertTrue(json.contains("\"peak_bytes\"")); assertTrue(json.contains("\"limit_bytes\"")); - - // Removed fields must NOT appear in the JSON. - assertFalse("child_count was dropped from the stats shape", json.contains("\"child_count\"")); - assertFalse("human-readable byte string was dropped", json.contains("\"allocated\":")); - assertFalse("human-readable byte string was dropped", json.contains("\"limit\":")); + assertFalse("root object should not exist", json.contains("\"root\"")); } public void testPoolStatsSerializationRoundTrip() throws IOException { diff --git a/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java b/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java index 9576112b8b12b..bd2842cdaa20d 100644 --- a/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java +++ b/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java @@ -154,7 +154,6 @@ List adjustNodesStats(List nodesStats) { nodeStats.getNodeCacheStats(), nodeStats.getRemoteStoreNodeStats(), nodeStats.getNativeAllocatorStats(), - 
nodeStats.getAnalyticsBackendNativeMemoryStats(), nodeStats.getTotalEstimatedNativeBytes() ); }).collect(Collectors.toList()); diff --git a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java index e01f4d651d979..e92c1c6402a9f 100644 --- a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java +++ b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java @@ -2709,7 +2709,6 @@ public void ensureEstimatedStats() { false, false, false, - false, false ); assertThat(