diff --git a/libs/arrow-spi/build.gradle b/libs/arrow-spi/build.gradle
index abf7eecf84c77..c1a716c6f72c4 100644
--- a/libs/arrow-spi/build.gradle
+++ b/libs/arrow-spi/build.gradle
@@ -11,7 +11,9 @@ apply plugin: 'opensearch.publish'
dependencies {
api project(':libs:opensearch-core')
api project(':libs:opensearch-common')
- testImplementation project(':test:framework')
+ testImplementation(project(':test:framework')) {
+ exclude group: 'org.opensearch', module: 'opensearch-arrow-spi'
+ }
}
tasks.named('forbiddenApisMain').configure {
diff --git a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java
index 89d6866da2c89..70776f7dcef6c 100644
--- a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java
+++ b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocator.java
@@ -9,36 +9,35 @@
package org.opensearch.arrow.spi;
import java.io.Closeable;
+import java.util.Set;
+import java.util.function.Consumer;
+import java.util.function.Supplier;
/**
- * Arrow-agnostic interface for a hierarchical native memory allocator.
+ * Unified native memory allocator interface.
*
- * <p>The implementation (backed by Arrow's {@code RootAllocator}) is provided by
- * a plugin. The SPI allows other subsystems to interact with the allocator
- * without depending on Arrow classes.
- *
- * <p>Plugins that need Arrow allocators obtain the implementation via
- * service lookup or plugin extension and call {@link #getOrCreatePool} to
- * register their pool.
+ * <p>Manages memory pools under a shared budget. Each pool has a minimum
+ * guaranteed allocation and a maximum burst limit. Implementations may
+ * redistribute unused capacity across pools.
*
* @opensearch.api
*/
public interface NativeAllocator extends Closeable {
/**
- * Returns the named pool, creating it on first access with the given limit.
- * Subsequent calls with the same name return the same pool (first-call limit wins).
+ * Returns the named pool, creating it on first access.
+ * Subsequent calls with the same name return the existing pool (first-call config wins).
*
- * @param poolName logical pool name (e.g., "query", "flight")
- * @param limit maximum bytes this pool can allocate in aggregate
+ * @param poolName logical pool name
+ * @param min minimum guaranteed bytes
+ * @param max maximum bytes this pool can allocate
+ * @param group the group this pool belongs to for aggregated stats, or null
* @return an opaque pool handle
*/
- PoolHandle getOrCreatePool(String poolName, long limit);
+ PoolHandle getOrCreatePool(String poolName, long min, long max, PoolGroup group);
/**
- * Updates the limit of an existing pool. Children of the pool allocator
- * inherit the change automatically via Arrow's parent-cap check at
- * allocation time — no notification SPI is needed.
+ * Updates the effective limit of an existing pool.
*
* @param poolName logical pool name
* @param newLimit new maximum bytes for the pool
@@ -46,15 +45,69 @@ public interface NativeAllocator extends Closeable {
void setPoolLimit(String poolName, long newLimit);
/**
- * Sets the root-level memory limit for the entire allocator.
+ * Registers a virtual pool with initial min/max and a callback
+ * invoked when the pool's limit changes.
+ *
+ * @param poolName logical pool name
+ * @param min minimum guaranteed bytes
+ * @param max initial maximum bytes (the pool's starting limit)
+ * @param group the group this pool belongs to for aggregated stats
+ * @param limitSetter callback invoked when the pool limit changes
+ * @return a handle to update stats from the native layer
+ */
+ VirtualPoolHandle registerVirtualPool(String poolName, long min, long max, PoolGroup group, Consumer limitSetter);
+
+ /**
+ * Updates the minimum guaranteed bytes for a pool.
+ *
+ * @param poolName logical pool name
+ * @param newMin new minimum bytes
+ */
+ void setPoolMin(String poolName, long newMin);
+
+ /**
+ * Returns all registered pool names.
+ */
+ Set getAllPoolNames();
+
+ /**
+ * Adds a callback invoked before stats collection to refresh pool usage data.
*
- * @param limit new maximum bytes for the root allocator
+ * @param refresher runnable that updates pool stats
+ */
+ void addStatsRefresher(Runnable refresher);
+
+ /**
+ * Sets the supplier for process-wide native memory stats.
+ *
+ * @param supplier returns [allocatedBytes, residentBytes]
+ */
+ void setNativeMemoryStatsSupplier(Supplier supplier);
+
+ /**
+ * Handle for a virtual pool. Plugins update stats via this handle.
*/
- void setRootLimit(long limit);
+ interface VirtualPoolHandle {
+ /**
+ * Update the current usage stats.
+ *
+ * @param allocatedBytes current allocated bytes
+ * @param peakBytes peak allocated bytes
+ */
+ void updateStats(long allocatedBytes, long peakBytes);
+
+ /** Returns current allocated bytes. */
+ long allocatedBytes();
+
+ /** Returns peak allocated bytes. */
+ long peakBytes();
+
+ /** Returns current limit. */
+ long limit();
+ }
/**
- * Opaque handle to a memory pool. Plugins downcast to the concrete type
- * (e.g., Arrow's {@code BufferAllocator}) in the implementation layer.
+ * Opaque handle to a memory pool.
*/
interface PoolHandle {
@@ -63,28 +116,20 @@ interface PoolHandle {
*
* @param childName name for debugging
* @param childLimit maximum bytes for the child
- * @return an opaque child handle (downcast to BufferAllocator in Arrow impl)
+ * @return a child handle
*/
PoolHandle newChild(String childName, long childLimit);
- /**
- * Returns the current allocated bytes for this pool/child.
- */
+ /** Returns the current allocated bytes. */
long allocatedBytes();
- /**
- * Returns the peak memory allocation.
- */
+ /** Returns the peak memory allocation. */
long peakBytes();
- /**
- * Returns the configured limit.
- */
+ /** Returns the configured limit. */
long limit();
- /**
- * Releases this allocation handle.
- */
+ /** Releases this allocation handle. */
void close();
}
}
diff --git a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfig.java b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfig.java
index 98f991cb86704..29dba48e9f165 100644
--- a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfig.java
+++ b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfig.java
@@ -33,9 +33,6 @@ public final class NativeAllocatorPoolConfig {
/** Pool name for query-execution memory (analytics-engine fragments and per-query allocators). */
public static final String POOL_QUERY = "query";
- /** Setting key for the root allocator limit. */
- public static final String SETTING_ROOT_LIMIT = "native.allocator.root.limit";
-
/** Setting key for the Flight pool minimum. */
public static final String SETTING_FLIGHT_MIN = "native.allocator.pool.flight.min";
/** Setting key for the Flight pool maximum. */
diff --git a/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/PoolGroup.java b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/PoolGroup.java
new file mode 100644
index 0000000000000..d292b42ae595f
--- /dev/null
+++ b/libs/arrow-spi/src/main/java/org/opensearch/arrow/spi/PoolGroup.java
@@ -0,0 +1,37 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.arrow.spi;
+
+/**
+ * Groups that memory pools belong to for aggregated customer-facing stats.
+ * Each pool is assigned to exactly one group at registration time.
+ *
+ * @opensearch.api
+ */
+public enum PoolGroup {
+ /** Arrow Flight transport pool group. */
+ TRANSPORT("transport"),
+ /** Query and analytics execution pool group. */
+ SEARCH("search"),
+ /** Ingest and write path pool group. */
+ INDEXING("indexing"),
+ /** Background merge operations pool group. */
+ MERGE("merge");
+
+ private final String name;
+
+ PoolGroup(String name) {
+ this.name = name;
+ }
+
+ /** Returns the lower-case group label used in stats output (e.g. {@code "transport"}); this is not the constant identifier returned by {@link #name()}. */
+ public String getName() {
+ return name;
+ }
+}
diff --git a/libs/arrow-spi/src/test/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfigTests.java b/libs/arrow-spi/src/test/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfigTests.java
index a21ca8ff54943..025b41a603212 100644
--- a/libs/arrow-spi/src/test/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfigTests.java
+++ b/libs/arrow-spi/src/test/java/org/opensearch/arrow/spi/NativeAllocatorPoolConfigTests.java
@@ -26,8 +26,4 @@ public void testSettingKeys() {
assertEquals("native.allocator.pool.query.min", NativeAllocatorPoolConfig.SETTING_QUERY_MIN);
assertEquals("native.allocator.pool.query.max", NativeAllocatorPoolConfig.SETTING_QUERY_MAX);
}
-
- public void testRootSettingKey() {
- assertEquals("native.allocator.root.limit", NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT);
- }
}
diff --git a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java
index a9bd9968b5884..ad49305b72106 100644
--- a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java
+++ b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBasePlugin.java
@@ -9,11 +9,16 @@
package org.opensearch.arrow.allocator;
import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
+import org.opensearch.arrow.spi.PoolGroup;
import org.opensearch.cluster.metadata.IndexNameExpressionResolver;
+import org.opensearch.cluster.node.DiscoveryNodes;
import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.settings.ClusterSettings;
+import org.opensearch.common.settings.IndexScopedSettings;
import org.opensearch.common.settings.Setting;
import org.opensearch.common.settings.Settings;
+import org.opensearch.common.settings.SettingsFilter;
+import org.opensearch.common.util.concurrent.FutureUtils;
import org.opensearch.core.common.io.stream.NamedWriteableRegistry;
import org.opensearch.core.common.unit.ByteSizeValue;
import org.opensearch.core.xcontent.NamedXContentRegistry;
@@ -22,10 +27,14 @@
import org.opensearch.node.resource.tracker.ResourceTrackerSettings;
import org.opensearch.plugin.stats.NativeAllocatorPoolStats;
import org.opensearch.plugin.stats.NativeAllocatorStatsRegistry;
+import org.opensearch.plugins.ActionPlugin;
import org.opensearch.plugins.ExtensiblePlugin;
import org.opensearch.plugins.Plugin;
import org.opensearch.repositories.RepositoriesService;
+import org.opensearch.rest.RestController;
+import org.opensearch.rest.RestHandler;
import org.opensearch.script.ScriptService;
+import org.opensearch.threadpool.Scheduler;
import org.opensearch.threadpool.ThreadPool;
import org.opensearch.transport.client.Client;
import org.opensearch.watcher.ResourceWatcherService;
@@ -33,207 +42,133 @@
import java.io.IOException;
import java.util.Collection;
import java.util.List;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
/**
* Top-level plugin that owns the unified Arrow-backed native memory allocator.
*
- * All Arrow-consuming plugins (arrow-flight-rpc, parquet-data-format) extend
- * this plugin to share one {@link ArrowNativeAllocator} and its classloader.
- *
- * <p>Each pool has a min (guaranteed floor) and max (burst ceiling). The rebalancer
- * ensures every pool can always allocate up to its min, and distributes unused
- * capacity allowing pools to grow up to their max.
+ * <p>All Arrow-consuming plugins extend this plugin to share one
+ * {@link ArrowNativeAllocator} and its classloader.
*/
-public class ArrowBasePlugin extends Plugin implements ExtensiblePlugin {
+public class ArrowBasePlugin extends Plugin implements ExtensiblePlugin, ActionPlugin {
/** Creates the plugin. */
public ArrowBasePlugin() {}
- /**
- * Maximum bytes for the root Arrow allocator.
- *
- * <p>When unset, the default is 20% of
- * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see
- * {@link #deriveRootLimitDefault}. The Arrow framework gets a small fraction of the
- * native budget because the dominant consumer of native memory in analytics workloads
- * is the DataFusion Rust runtime (~75% of {@code node.native_memory.limit}), not Arrow.
- * If AC is unconfigured (limit = 0), the default is {@link Long#MAX_VALUE}, preserving
- * pre-AC behaviour.
- */
- public static final Setting ROOT_LIMIT_SETTING = new Setting<>(
- NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT,
- ArrowBasePlugin::deriveRootLimitDefault,
- s -> {
- long v = Long.parseLong(s);
- if (v < 0) {
- throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT + "] must be >= 0, got " + v);
- }
- return v;
- },
+ // ─── Settings ────────────────────────────────────────────────────────────────
+
+ /** Whether the NativeMemoryRebalancer is enabled. */
+ public static final Setting REBALANCER_ENABLED_SETTING = Setting.boolSetting(
+ "native.allocator.rebalancer.enabled",
+ true,
Setting.Property.NodeScope,
Setting.Property.Dynamic
);
- /**
- * Computes the default for {@link #ROOT_LIMIT_SETTING} as 20% of
- * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}. The Arrow framework's
- * hard cap covers only Arrow allocations — DataFusion's Rust runtime is a sibling of
- * Arrow root and gets the larger share of the native budget (see
- * {@code DataFusionPlugin#deriveMemoryPoolLimitDefault}).
- *
- * Returns the bytes-as-string representation expected by the {@link Setting} parser.
- * If the AC limit is unset (== 0), the default is {@link Long#MAX_VALUE} — unbounded —
- * preserving pre-AC behaviour.
- */
- static String deriveRootLimitDefault(Settings settings) {
- ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
- if (nativeLimit.getBytes() <= 0) {
- return Long.toString(Long.MAX_VALUE);
- }
- return Long.toString(nativeLimit.getBytes() * 20 / 100);
- }
+ /** Interval in seconds between pool rebalance cycles. 0 disables rebalancing. */
+ public static final Setting REBALANCE_INTERVAL_SETTING = Setting.longSetting(
+ "native.allocator.rebalance.interval_seconds",
+ 5L,
+ 0L,
+ Setting.Property.NodeScope,
+ Setting.Property.Dynamic
+ );
+
+ /** Pool utilization above this triggers growth. */
+ public static final Setting PRESSURE_THRESHOLD_SETTING = Setting.doubleSetting(
+ "native.allocator.rebalancer.pressure_threshold",
+ 0.75,
+ 0.0,
+ 1.0,
+ Setting.Property.NodeScope,
+ Setting.Property.Dynamic
+ );
+
+ /** Pool utilization below this means pool can give back capacity. */
+ public static final Setting IDLE_THRESHOLD_SETTING = Setting.doubleSetting(
+ "native.allocator.rebalancer.idle_threshold",
+ 0.50,
+ 0.0,
+ 1.0,
+ Setting.Property.NodeScope,
+ Setting.Property.Dynamic
+ );
- /** Minimum guaranteed bytes for the Flight pool. */
- public static final Setting FLIGHT_MIN_SETTING = Setting.longSetting(
+ /** Factor to shrink idle pools by (new limit = limit * (1 - shrink_factor)). */
+ public static final Setting SHRINK_FACTOR_SETTING = Setting.doubleSetting(
+ "native.allocator.rebalancer.shrink_factor",
+ 0.10,
+ 0.0,
+ 1.0,
+ Setting.Property.NodeScope,
+ Setting.Property.Dynamic
+ );
+
+ /** Minimum guaranteed bytes for the Flight pool. Default is 2% of budget. */
+ public static final Setting FLIGHT_MIN_SETTING = new Setting<>(
NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN,
- 0L,
- 0L,
+ s -> derivePoolMinDefault(s, 2),
+ s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN),
Setting.Property.NodeScope,
Setting.Property.Dynamic
);
- /**
- * Maximum bytes the Flight pool can burst to. Default is 5% of
- * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see
- * {@link #derivePoolMaxDefault}. Falls back to {@link Long#MAX_VALUE} when AC is
- * unconfigured. Matches the partitioning model documented in PR #21732.
- */
+ /** Maximum bytes the Flight pool can burst to. Default is 5% of budget. */
public static final Setting FLIGHT_MAX_SETTING = new Setting<>(
NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX,
s -> derivePoolMaxDefault(s, 5),
- s -> {
- long v = Long.parseLong(s);
- if (v < 0) {
- throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX + "] must be >= 0, got " + v);
- }
- return v;
- },
+ s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX),
Setting.Property.NodeScope,
Setting.Property.Dynamic
);
- /** Minimum guaranteed bytes for the ingest pool. */
- public static final Setting INGEST_MIN_SETTING = Setting.longSetting(
+ /** Minimum guaranteed bytes for the ingest pool. Default is 4% of budget. */
+ public static final Setting INGEST_MIN_SETTING = new Setting<>(
NativeAllocatorPoolConfig.SETTING_INGEST_MIN,
- 0L,
- 0L,
+ s -> derivePoolMinDefault(s, 4),
+ s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_INGEST_MIN),
Setting.Property.NodeScope,
Setting.Property.Dynamic
);
- /**
- * Maximum bytes the ingest pool can burst to. Default is 8% of
- * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see
- * {@link #derivePoolMaxDefault}. Falls back to {@link Long#MAX_VALUE} when AC is
- * unconfigured. Ingest gets a larger fraction than Flight/Query because parquet VSR
- * allocators dominate write-path memory usage — see partitioning model in PR #21732.
- */
+ /** Maximum bytes the ingest pool can burst to. Default is 8% of budget. */
public static final Setting INGEST_MAX_SETTING = new Setting<>(
NativeAllocatorPoolConfig.SETTING_INGEST_MAX,
s -> derivePoolMaxDefault(s, 8),
- s -> {
- long v = Long.parseLong(s);
- if (v < 0) {
- throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_INGEST_MAX + "] must be >= 0, got " + v);
- }
- return v;
- },
+ s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_INGEST_MAX),
Setting.Property.NodeScope,
Setting.Property.Dynamic
);
- /**
- * Minimum guaranteed bytes for the query pool. Honored by the rebalancer (when
- * enabled) — sets a floor below which the rebalancer will not shrink the pool.
- * Has no effect when rebalancing is disabled.
- */
- public static final Setting QUERY_MIN_SETTING = Setting.longSetting(
+ /** Minimum guaranteed bytes for the query pool. Default is 2% of budget. */
+ public static final Setting QUERY_MIN_SETTING = new Setting<>(
NativeAllocatorPoolConfig.SETTING_QUERY_MIN,
- 0L,
- 0L,
+ s -> derivePoolMinDefault(s, 2),
+ s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_QUERY_MIN),
Setting.Property.NodeScope,
Setting.Property.Dynamic
);
- /**
- * Maximum bytes the query pool can allocate. Default is 5% of
- * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}; see
- * {@link #derivePoolMaxDefault}. Falls back to {@link Long#MAX_VALUE} when AC is
- * unconfigured. Enforced by Arrow's child-allocator limit — analytics-engine's
- * per-query allocators are children of this pool, so the sum of in-flight per-query
- * allocations is capped here.
- *
- * Note: each individual analytics query is also bounded by
- * {@code analytics.exec.QueryContext} per-query limit (currently the constant
- * {@code DEFAULT_PER_QUERY_MEMORY_LIMIT = 256 MB}). Lowering {@code QUERY_MAX}
- * below {@code 256 MB × concurrent-queries} can starve queries even when each
- * individual query is within its per-query limit.
- */
+ /** Maximum bytes the query pool can allocate. Default is 5% of budget. */
public static final Setting QUERY_MAX_SETTING = new Setting<>(
NativeAllocatorPoolConfig.SETTING_QUERY_MAX,
s -> derivePoolMaxDefault(s, 5),
- s -> {
- long v = Long.parseLong(s);
- if (v < 0) {
- throw new IllegalArgumentException("Setting [" + NativeAllocatorPoolConfig.SETTING_QUERY_MAX + "] must be >= 0, got " + v);
- }
- return v;
- },
+ s -> parseNonNegativeLong(s, NativeAllocatorPoolConfig.SETTING_QUERY_MAX),
Setting.Property.NodeScope,
Setting.Property.Dynamic
);
- /**
- * Computes the default for a pool max as a percentage of
- * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING} (the operator's
- * declared off-heap budget), falling back to {@link Long#MAX_VALUE} when AC is
- * unconfigured. Returns the bytes-as-string representation expected by the
- * {@link Setting} parser.
- *
- * Pools are anchored to {@code node.native_memory.limit} rather than to
- * {@link #ROOT_LIMIT_SETTING} so the diagrammed partitioning (PR #21732) holds:
- * sum of pool maxes (5+8+5 = 18% of native_memory.limit) fits within the framework
- * root cap (20% of native_memory.limit) by default. Operator overrides of
- * {@code root.limit} that drop it below {@code sum(pool.max)} are caught by the
- * grouped validator.
- *
- * <p>The fraction is taken straight from {@code node.native_memory.limit}, not from
- * {@code limit - buffer_percent}. {@code buffer_percent} is an admission-control
- * throttle margin, not a framework budget reduction.
- *
- * @param settings node settings
- * @param percent fraction of {@code node.native_memory.limit} the pool max defaults to
- */
- static String derivePoolMaxDefault(Settings settings, int percent) {
- ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
- if (nativeLimit.getBytes() <= 0) {
- return Long.toString(Long.MAX_VALUE);
- }
- long pool = Math.max(0L, nativeLimit.getBytes() * percent / 100);
- return Long.toString(pool);
- }
-
- /** Interval in seconds between pool rebalance cycles. 0 disables rebalancing. */
- public static final Setting REBALANCE_INTERVAL_SETTING = Setting.longSetting(
- "native.allocator.rebalance.interval_seconds",
- 0L,
- 0L,
- Setting.Property.NodeScope,
- Setting.Property.Dynamic
- );
+ // ─── Instance state ──────────────────────────────────────────────────────────
private volatile ArrowNativeAllocator allocator;
+ private volatile ScheduledExecutorService rebalancerScheduler;
+ private volatile ScheduledFuture> rebalanceTask;
+ private volatile NativeMemoryRebalancer rebalancer;
+
+ // ─── Plugin lifecycle ────────────────────────────────────────────────────────
@Override
public Collection createComponents(
@@ -251,12 +186,11 @@ public Collection createComponents(
) {
Settings settings = environment.settings();
ClusterSettings cs = clusterService.getClusterSettings();
- ArrowNativeAllocator built = buildAllocator(settings, cs);
+ Supplier budgetSupplier = () -> ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(clusterService.getSettings())
+ .getBytes();
+ ArrowNativeAllocator built = buildAllocator(settings, cs, budgetSupplier);
this.allocator = built;
- // Publish a NativeAllocatorStatsRegistry alongside the allocator so the server-side
- // NodeService can discover the supplier via pluginComponents (instanceof filter) without
- // taking a compile-time dependency on this plugin. The lambda re-reads `this.allocator`
- // each invocation, so after close() nulls the field, the supplier returns null cleanly.
+
Supplier statsSupplier = () -> {
ArrowNativeAllocator a = this.allocator;
return a != null ? a.stats() : null;
@@ -264,96 +198,172 @@ public Collection createComponents(
return List.of(built, new NativeAllocatorStatsRegistry(statsSupplier));
}
+ @Override
+ public List> getSettings() {
+ return List.of(
+ FLIGHT_MIN_SETTING,
+ FLIGHT_MAX_SETTING,
+ INGEST_MIN_SETTING,
+ INGEST_MAX_SETTING,
+ QUERY_MIN_SETTING,
+ QUERY_MAX_SETTING,
+ REBALANCE_INTERVAL_SETTING,
+ REBALANCER_ENABLED_SETTING,
+ PRESSURE_THRESHOLD_SETTING,
+ IDLE_THRESHOLD_SETTING,
+ SHRINK_FACTOR_SETTING
+ );
+ }
+
+ /**
+ * Returns a single {@code ArrowBaseStatsAction} handler backed by a supplier of the
+ * allocator's current stats.
+ *
+ * <p>The supplier copies the volatile {@code allocator} field into a local before the null
+ * check, so a concurrent {@link #close()} that clears the field between the check and the
+ * call cannot cause a {@code NullPointerException}. Once the allocator is gone the supplier
+ * yields {@code null}, the same way the supplier built in {@code createComponents} does.
+ */
+ @Override
+ public List getRestHandlers(
+ Settings settings,
+ RestController restController,
+ ClusterSettings clusterSettings,
+ IndexScopedSettings indexScopedSettings,
+ SettingsFilter settingsFilter,
+ IndexNameExpressionResolver indexNameExpressionResolver,
+ Supplier nodesInCluster
+ ) {
+ Supplier statsSupplier = () -> {
+ // Single read of the volatile field; close() may null it at any time.
+ ArrowNativeAllocator a = this.allocator;
+ return a != null ? a.stats() : null;
+ };
+ return List.of(new ArrowBaseStatsAction(statsSupplier));
+ }
+
+ /** Shuts down the rebalancer executor, if one was created, and then closes the allocator. */
+ @Override
+ public void close() throws IOException {
+ final ScheduledExecutorService scheduler = rebalancerScheduler;
+ if (scheduler != null) {
+ scheduler.shutdownNow();
+ }
+ final ArrowNativeAllocator current = allocator;
+ if (current == null) {
+ return;
+ }
+ current.close();
+ allocator = null;
+ }
+
+ // ─── Package-private (visible for tests) ─────────────────────────────────────
+
/**
- * Constructs the allocator and wires its pools and dynamic-update consumers from
- * a pure {@code (Settings, ClusterSettings)} pair. Package-private so unit tests
- * can exercise the full wiring without a heavyweight {@link ClusterService}
- * fixture — mirrors the shape of {@link #registerSettingsUpdateConsumers} which
- * is already test-friendly for the same reason.
+ * Constructs the allocator and wires its pools and the rebalancer.
*/
- static ArrowNativeAllocator buildAllocator(Settings settings, ClusterSettings cs) {
- long rootLimit = ROOT_LIMIT_SETTING.get(settings);
- ArrowNativeAllocator allocator = new ArrowNativeAllocator(rootLimit);
- allocator.setRebalanceInterval(REBALANCE_INTERVAL_SETTING.get(settings));
+ ArrowNativeAllocator buildAllocator(Settings settings, ClusterSettings cs, Supplier budgetSupplier) {
+ ArrowNativeAllocator allocator = new ArrowNativeAllocator();
- // Single source of truth for cross-setting invariants — same logic runs on
- // dynamic updates via the grouped consumer below.
- validateUpdate(settings);
+ // Set budget for validation
+ long nativeBudget = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings).getBytes();
+ if (nativeBudget > 0) {
+ allocator.setBudget(nativeBudget);
+ }
+
+ // Validate min < max for each pool
+ validateMinMax(NativeAllocatorPoolConfig.POOL_FLIGHT, FLIGHT_MIN_SETTING.get(settings), FLIGHT_MAX_SETTING.get(settings));
+ validateMinMax(NativeAllocatorPoolConfig.POOL_INGEST, INGEST_MIN_SETTING.get(settings), INGEST_MAX_SETTING.get(settings));
+ validateMinMax(NativeAllocatorPoolConfig.POOL_QUERY, QUERY_MIN_SETTING.get(settings), QUERY_MAX_SETTING.get(settings));
+ // Create pools (always start at max)
allocator.getOrCreatePool(
NativeAllocatorPoolConfig.POOL_FLIGHT,
FLIGHT_MIN_SETTING.get(settings),
- FLIGHT_MAX_SETTING.get(settings)
+ FLIGHT_MAX_SETTING.get(settings),
+ PoolGroup.TRANSPORT
);
allocator.getOrCreatePool(
NativeAllocatorPoolConfig.POOL_INGEST,
INGEST_MIN_SETTING.get(settings),
- INGEST_MAX_SETTING.get(settings)
+ INGEST_MAX_SETTING.get(settings),
+ PoolGroup.INDEXING
+ );
+ allocator.getOrCreatePool(
+ NativeAllocatorPoolConfig.POOL_QUERY,
+ QUERY_MIN_SETTING.get(settings),
+ QUERY_MAX_SETTING.get(settings),
+ PoolGroup.SEARCH
);
- allocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_QUERY, QUERY_MIN_SETTING.get(settings), QUERY_MAX_SETTING.get(settings));
- registerSettingsUpdateConsumers(cs, allocator);
+ // Register dynamic setting consumers for min/max changes
+ cs.addSettingsUpdateConsumer(FLIGHT_MIN_SETTING, newMin -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_FLIGHT, newMin));
+ cs.addSettingsUpdateConsumer(FLIGHT_MAX_SETTING, newMax -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_FLIGHT, newMax));
+ cs.addSettingsUpdateConsumer(INGEST_MIN_SETTING, newMin -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_INGEST, newMin));
+ cs.addSettingsUpdateConsumer(INGEST_MAX_SETTING, newMax -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_INGEST, newMax));
+ cs.addSettingsUpdateConsumer(QUERY_MIN_SETTING, newMin -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_QUERY, newMin));
+ cs.addSettingsUpdateConsumer(QUERY_MAX_SETTING, newMax -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_QUERY, newMax));
+
+ // Register dynamic consumer for rebalancer enable/disable
+ cs.addSettingsUpdateConsumer(REBALANCER_ENABLED_SETTING, enabled -> {
+ if (enabled == false) {
+ cancelRebalanceTask();
+ allocator.resetAllPoolsToMax();
+ } else {
+ startRebalancer(allocator, budgetSupplier, REBALANCE_INTERVAL_SETTING.get(settings));
+ }
+ });
+
+ // Set up the rebalancer if enabled
+ if (REBALANCER_ENABLED_SETTING.get(settings)) {
+ startRebalancer(allocator, budgetSupplier, REBALANCE_INTERVAL_SETTING.get(settings));
+ }
+
+ // Register dynamic consumer for interval changes
+ cs.addSettingsUpdateConsumer(REBALANCE_INTERVAL_SETTING, this::updateRebalanceInterval);
+
+ // Register dynamic consumers for threshold changes
+ cs.addSettingsUpdateConsumer(PRESSURE_THRESHOLD_SETTING, value -> {
+ NativeMemoryRebalancer r = this.rebalancer;
+ if (r != null) r.setPressureThreshold(value);
+ });
+ cs.addSettingsUpdateConsumer(IDLE_THRESHOLD_SETTING, value -> {
+ NativeMemoryRebalancer r = this.rebalancer;
+ if (r != null) r.setIdleThreshold(value);
+ });
+ cs.addSettingsUpdateConsumer(SHRINK_FACTOR_SETTING, value -> {
+ NativeMemoryRebalancer r = this.rebalancer;
+ if (r != null) r.setShrinkFactor(value);
+ });
+
return allocator;
}
- /**
- * Registers cluster-settings update consumers that propagate dynamic setting changes
- * into the live {@link ArrowNativeAllocator}. Package-private so unit tests can exercise
- * the wiring with a real {@link ClusterSettings} instance — the test that asserts a PUT
- * lands on the allocator is what catches a future regression where one of these lines
- * is accidentally removed.
- */
- static void registerSettingsUpdateConsumers(ClusterSettings cs, ArrowNativeAllocator allocator) {
- cs.addSettingsUpdateConsumer(ROOT_LIMIT_SETTING, allocator::setRootLimit);
- cs.addSettingsUpdateConsumer(REBALANCE_INTERVAL_SETTING, allocator::setRebalanceInterval);
- cs.addSettingsUpdateConsumer(FLIGHT_MAX_SETTING, v -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_FLIGHT, v));
- cs.addSettingsUpdateConsumer(FLIGHT_MIN_SETTING, v -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_FLIGHT, v));
- cs.addSettingsUpdateConsumer(INGEST_MAX_SETTING, v -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_INGEST, v));
- cs.addSettingsUpdateConsumer(INGEST_MIN_SETTING, v -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_INGEST, v));
- cs.addSettingsUpdateConsumer(QUERY_MAX_SETTING, v -> allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_QUERY, v));
- cs.addSettingsUpdateConsumer(QUERY_MIN_SETTING, v -> allocator.setPoolMin(NativeAllocatorPoolConfig.POOL_QUERY, v));
-
- // Grouped validator runs across the related settings on every dynamic update so cross-setting
- // invariants (sum of pool mins ≤ root, per-pool min ≤ max) are enforced post-startup.
- cs.addSettingsUpdateConsumer(s -> {}, MIN_MAX_SETTINGS, ArrowBasePlugin::validateUpdate);
- }
+ // ─── Private helpers ─────────────────────────────────────────────────────────
- private static final List> MIN_MAX_SETTINGS = List.of(
- ROOT_LIMIT_SETTING,
- FLIGHT_MIN_SETTING,
- FLIGHT_MAX_SETTING,
- INGEST_MIN_SETTING,
- INGEST_MAX_SETTING,
- QUERY_MIN_SETTING,
- QUERY_MAX_SETTING
- );
+ /**
+ * Creates the rebalancer and its single-thread scheduler on first use and schedules the
+ * periodic rebalance task. When both already exist but no task is scheduled (the state
+ * {@code cancelRebalanceTask()} leaves behind when the rebalancer is disabled), only the
+ * periodic task is rescheduled; without that, re-enabling the rebalancer would be a
+ * silent no-op because of the "already created" guard.
+ *
+ * @param allocator allocator handed to the new {@code NativeMemoryRebalancer}
+ * @param budgetSupplier supplies the native memory budget; nothing is created while it returns a value {@code <= 0}
+ * @param intervalSeconds delay between rebalance runs; nothing is scheduled while it is {@code <= 0}
+ */
+ private synchronized void startRebalancer(ArrowNativeAllocator allocator, Supplier budgetSupplier, long intervalSeconds) {
+ if (rebalancer != null || rebalancerScheduler != null) {
+ // Already built. A previous cancelRebalanceTask() leaves rebalanceTask == null, so restart
+ // the periodic task here. An executor shut down by close() rejects new tasks, so skip it.
+ if (rebalanceTask == null
+ && intervalSeconds > 0
+ && rebalancer != null
+ && rebalancerScheduler != null
+ && rebalancerScheduler.isShutdown() == false) {
+ rebalanceTask = rebalancerScheduler.scheduleAtFixedRate(rebalancer, intervalSeconds, intervalSeconds, TimeUnit.SECONDS);
+ }
+ return;
+ }
- private static void validateUpdate(Settings settings) {
- long rootLimit = ROOT_LIMIT_SETTING.get(settings);
- long flightMin = FLIGHT_MIN_SETTING.get(settings);
- long flightMax = FLIGHT_MAX_SETTING.get(settings);
- long ingestMin = INGEST_MIN_SETTING.get(settings);
- long ingestMax = INGEST_MAX_SETTING.get(settings);
- long queryMin = QUERY_MIN_SETTING.get(settings);
- long queryMax = QUERY_MAX_SETTING.get(settings);
- validateMinMax(NativeAllocatorPoolConfig.POOL_FLIGHT, flightMin, flightMax);
- validateMinMax(NativeAllocatorPoolConfig.POOL_INGEST, ingestMin, ingestMax);
- validateMinMax(NativeAllocatorPoolConfig.POOL_QUERY, queryMin, queryMax);
- validateMinSum(rootLimit, flightMin, ingestMin, queryMin);
- }
+ long budget = budgetSupplier.get();
+ if (budget <= 0) return;
+ if (intervalSeconds <= 0) return;
- @Override
- public List> getSettings() {
- return List.of(
- ROOT_LIMIT_SETTING,
- FLIGHT_MIN_SETTING,
- FLIGHT_MAX_SETTING,
- INGEST_MIN_SETTING,
- INGEST_MAX_SETTING,
- QUERY_MIN_SETTING,
- QUERY_MAX_SETTING,
- REBALANCE_INTERVAL_SETTING
+ // NOTE(review): the thresholds below come from the settings' built-in defaults (Settings.EMPTY),
+ // so values configured on the node are not applied at start-up; the update consumers only fire
+ // on later changes. Confirm whether the node settings should be read here instead.
+ NativeMemoryRebalancer nativeRebalancer = new NativeMemoryRebalancer(
+ allocator,
+ budgetSupplier,
+ PRESSURE_THRESHOLD_SETTING.getDefault(Settings.EMPTY),
+ IDLE_THRESHOLD_SETTING.getDefault(Settings.EMPTY),
+ SHRINK_FACTOR_SETTING.getDefault(Settings.EMPTY)
);
+ this.rebalancer = nativeRebalancer;
+
+ // Dedicated daemon thread so the periodic task never keeps the JVM alive.
+ Scheduler.SafeScheduledThreadPoolExecutor executor = new Scheduler.SafeScheduledThreadPoolExecutor(1, r -> {
+ Thread t = new Thread(r, "native-allocator-rebalancer");
+ t.setDaemon(true);
+ return t;
+ });
+ executor.setRemoveOnCancelPolicy(true);
+ this.rebalancerScheduler = executor;
+
+ rebalanceTask = rebalancerScheduler.scheduleAtFixedRate(nativeRebalancer, intervalSeconds, intervalSeconds, TimeUnit.SECONDS);
+ }
+
+ private synchronized void cancelRebalanceTask() {
+ ScheduledFuture> existing = rebalanceTask;
+ if (existing != null) {
+ FutureUtils.cancel(existing);
+ rebalanceTask = null;
+ }
+ }
+
+ /**
+ * Applies a new rebalance interval: cancels the current periodic task and, for a positive
+ * interval, schedules a new one on the existing scheduler.
+ *
+ * <p>Synchronized on the same monitor as {@code startRebalancer} and
+ * {@code cancelRebalanceTask}, so the cancel-then-reschedule sequence cannot interleave
+ * with them and leave an untracked second task running.
+ *
+ * @param newInterval new interval in seconds; {@code 0} leaves rebalancing cancelled
+ */
+ private synchronized void updateRebalanceInterval(long newInterval) {
+ cancelRebalanceTask();
+ // NOTE(review): this reschedules even when the rebalancer was disabled through
+ // native.allocator.rebalancer.enabled (disabling only cancels the task), and it does nothing
+ // when the rebalancer was never created (e.g. interval 0 at start-up) — confirm both are intended.
+ // An executor shut down by close() would reject the task, so it is skipped.
+ if (newInterval > 0 && rebalancerScheduler != null && rebalancerScheduler.isShutdown() == false && rebalancer != null) {
+ rebalanceTask = rebalancerScheduler.scheduleAtFixedRate(rebalancer, newInterval, newInterval, TimeUnit.SECONDS);
+ }
+ }
private static void validateMinMax(String poolName, long min, long max) {
@@ -362,36 +372,27 @@ private static void validateMinMax(String poolName, long min, long max) {
}
}
- private static void validateMinSum(long rootLimit, long... mins) {
- if (rootLimit == Long.MAX_VALUE) {
- return;
- }
- long sum = 0;
- for (long min : mins) {
- try {
- sum = Math.addExact(sum, min);
- } catch (ArithmeticException overflow) {
- throw new IllegalArgumentException("Sum of pool minimums overflows.", overflow);
- }
+ static String derivePoolMaxDefault(Settings settings, int percent) {
+ ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
+ if (nativeLimit.getBytes() <= 0) {
+ return Long.toString(Long.MAX_VALUE);
}
- if (sum > rootLimit) {
- throw new IllegalArgumentException(
- "Sum of pool minimums ("
- + sum
- + " bytes) exceeds root limit ("
- + rootLimit
- + " bytes). "
- + "Reduce pool minimums or increase "
- + NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT
- );
+ return Long.toString(Math.max(0L, nativeLimit.getBytes() * percent / 100));
+ }
+
+ static String derivePoolMinDefault(Settings settings, int percent) {
+ ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
+ if (nativeLimit.getBytes() <= 0) {
+ return "0";
}
+ return Long.toString(Math.max(0L, nativeLimit.getBytes() * percent / 100));
}
- @Override
- public void close() throws IOException {
- if (allocator != null) {
- allocator.close();
- allocator = null;
+ private static long parseNonNegativeLong(String s, String settingName) {
+ long v = Long.parseLong(s);
+ if (v < 0) {
+ throw new IllegalArgumentException("Setting [" + settingName + "] must be >= 0, got " + v);
}
+ return v;
}
}
diff --git a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBaseStatsAction.java b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBaseStatsAction.java
new file mode 100644
index 0000000000000..70928e93713a0
--- /dev/null
+++ b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowBaseStatsAction.java
@@ -0,0 +1,75 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.arrow.allocator;
+
+import org.opensearch.core.rest.RestStatus;
+import org.opensearch.core.xcontent.XContentBuilder;
+import org.opensearch.plugin.stats.NativeAllocatorPoolStats;
+import org.opensearch.rest.BaseRestHandler;
+import org.opensearch.rest.BytesRestResponse;
+import org.opensearch.rest.RestRequest;
+import org.opensearch.transport.client.node.NodeClient;
+
+import java.util.List;
+import java.util.function.Supplier;
+
+/**
+ * REST handler exposing per-pool native memory stats at {@code _plugins/arrow_base/stats}.
+ */
+public class ArrowBaseStatsAction extends BaseRestHandler {
+ private final Supplier<NativeAllocatorPoolStats> statsSupplier;
+
+ /**
+ * Creates a new stats action.
+ * @param statsSupplier supplier of pool stats; a {@code null} result renders an empty {@code memory_pools} object
+ */
+ public ArrowBaseStatsAction(Supplier<NativeAllocatorPoolStats> statsSupplier) {
+ this.statsSupplier = statsSupplier;
+ }
+
+ @Override
+ public String getName() {
+ return "arrow_base_stats_action";
+ }
+
+ @Override
+ public List<Route> routes() {
+ return List.of(new Route(RestRequest.Method.GET, "_plugins/arrow_base/stats"));
+ }
+
+ @Override
+ protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) {
+ return channel -> {
+ NativeAllocatorPoolStats stats = statsSupplier.get(); // fetched inside the consumer, i.e. when the response is built
+ XContentBuilder builder = channel.newBuilder();
+ builder.startObject();
+ builder.startObject("memory_pools");
+ if (stats != null) { // without stats, "memory_pools" stays an empty object
+ builder.startObject("runtime");
+ builder.field("allocated_bytes", stats.getNativeAllocatedBytes());
+ builder.field("resident_bytes", stats.getNativeResidentBytes());
+ builder.endObject();
+ builder.startObject("pools"); // one nested object per pool, keyed by pool name
+ for (NativeAllocatorPoolStats.PoolStats pool : stats.getPools()) {
+ builder.startObject(pool.getName());
+ builder.field("allocated_bytes", pool.getAllocatedBytes());
+ builder.field("peak_bytes", pool.getPeakBytes());
+ builder.field("limit_bytes", pool.getLimitBytes());
+ builder.field("min_bytes", pool.getMinBytes());
+ builder.field("group", pool.getGroup());
+ builder.endObject();
+ }
+ builder.endObject(); // closes "pools"
+ }
+ builder.endObject(); // closes "memory_pools"
+ builder.endObject(); // closes the root object
+ channel.sendResponse(new BytesRestResponse(RestStatus.OK, builder));
+ };
+ }
+}
diff --git a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowNativeAllocator.java b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowNativeAllocator.java
index 892c43d2cb2c8..9c54298b6093e 100644
--- a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowNativeAllocator.java
+++ b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/ArrowNativeAllocator.java
@@ -10,197 +10,295 @@
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
import org.opensearch.arrow.spi.NativeAllocator;
+import org.opensearch.arrow.spi.PoolGroup;
+import org.opensearch.common.settings.ClusterSettings;
+import org.opensearch.common.settings.Setting;
+import org.opensearch.common.settings.Settings;
import org.opensearch.plugin.stats.NativeAllocatorPoolStats;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.HashSet;
import java.util.List;
-import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.ScheduledFuture;
-import java.util.concurrent.TimeUnit;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.function.Consumer;
+import java.util.function.Supplier;
/**
* Arrow-backed implementation of {@link NativeAllocator}.
*
- * Owns a single {@link RootAllocator} for the node. All plugins that need
- * Arrow buffers obtain pool handles from this class via the SPI interface.
- *
- *
Elastic rebalancing
- * A background task periodically redistributes unused capacity across pools.
- * Each pool has a guaranteed limit (configured via settings). When other
- * pools are idle, an active pool can temporarily grow beyond its guarantee up to
- * the root limit. When contention rises, pools shrink back toward their guarantee.
- * This prevents idle capacity from being wasted while maintaining isolation under load.
- *
- *
Constructed once by {@link ArrowBasePlugin#createComponents} and exposed to
- * downstream plugins via Guice and {@code PluginComponentRegistry} so consumers
- * receive the instance through explicit dependency injection rather than a static
- * singleton.
+ * <p>Owns a single {@link RootAllocator} (set to {@code Long.MAX_VALUE} — per-pool
+ * limits are the real enforcement). Manages both Arrow-backed pools and virtual pools.
*/
public class ArrowNativeAllocator implements NativeAllocator {
+ private static final Logger logger = LogManager.getLogger(ArrowNativeAllocator.class);
+
private final RootAllocator root;
private final ConcurrentMap pools = new ConcurrentHashMap<>();
- private final ConcurrentMap poolMins = new ConcurrentHashMap<>();
- private final ConcurrentMap poolMaxes = new ConcurrentHashMap<>();
- private final ScheduledExecutorService rebalancer;
- private volatile ScheduledFuture> rebalanceTask;
- /**
- * True iff the rebalancer is configured to run periodically. Used by
- * {@link #getOrCreatePool} to decide each pool's initial child-allocator
- * limit: when rebalancing is enabled, pools start at {@code min} and grow
- * via the next rebalance tick (preserving the original PR's
- * "guarantee + burst" semantics); when rebalancing is disabled, pools
- * start at {@code max} so consumers can allocate immediately without
- * waiting for a tick that never comes.
- */
- private volatile boolean rebalancerEnabled = false;
+ private final ConcurrentMap virtualPools = new ConcurrentHashMap<>();
+ private final ConcurrentMap poolConfigs = new ConcurrentHashMap<>();
+ private final List statsRefreshers = new CopyOnWriteArrayList<>();
+ private volatile Supplier nativeMemoryStatsSupplier;
+ private volatile long budget = Long.MAX_VALUE;
/**
* Creates a new allocator with a fresh RootAllocator.
- *
- * @param rootLimit maximum bytes for the root allocator
*/
- public ArrowNativeAllocator(long rootLimit) {
- this.root = new RootAllocator(rootLimit);
- org.opensearch.threadpool.Scheduler.SafeScheduledThreadPoolExecutor executor =
- new org.opensearch.threadpool.Scheduler.SafeScheduledThreadPoolExecutor(1, r -> {
- Thread t = new Thread(r, "native-allocator-rebalancer");
- t.setDaemon(true);
- return t;
- });
- executor.setRemoveOnCancelPolicy(true);
- this.rebalancer = executor;
+ public ArrowNativeAllocator() {
+ this.root = new RootAllocator(Long.MAX_VALUE);
}
/**
- * Schedules (or reschedules) the rebalancer at the given interval.
- * A value of 0 disables rebalancing.
+ * Sets the total native memory budget for validation.
*
- * @param intervalSeconds rebalance period in seconds, or 0 to disable
+ * @param budget node.native_memory.limit in bytes
*/
- public void setRebalanceInterval(long intervalSeconds) {
- ScheduledFuture> existing = rebalanceTask;
- if (existing != null) {
- org.opensearch.common.util.concurrent.FutureUtils.cancel(existing);
- rebalanceTask = null;
- }
- rebalancerEnabled = intervalSeconds > 0;
- if (rebalancerEnabled) {
- rebalanceTask = rebalancer.scheduleAtFixedRate(this::rebalance, intervalSeconds, intervalSeconds, TimeUnit.SECONDS);
- }
+ public void setBudget(long budget) {
+ this.budget = budget;
}
- @Override
- public PoolHandle getOrCreatePool(String poolName, long limit) {
- return getOrCreatePool(poolName, limit, limit);
- }
+ // ─── Public / SPI methods ───────────────────────────────────────────────────
- /**
- * Creates or returns a pool with min/max limits.
- *
- * @param poolName logical pool name
- * @param min guaranteed minimum bytes (always available)
- * @param max maximum bytes the pool can burst to
- * @return the pool handle
- */
- public PoolHandle getOrCreatePool(String poolName, long min, long max) {
- poolMins.putIfAbsent(poolName, min);
- poolMaxes.putIfAbsent(poolName, max);
+ @Override
+ public PoolHandle getOrCreatePool(String poolName, long min, long max, PoolGroup group) {
+ validateSumMaxesWithinBudget(poolName, max);
+ poolConfigs.putIfAbsent(poolName, new PoolConfig(min, max, group));
return pools.computeIfAbsent(poolName, name -> {
- // Pick an initial limit that's safe for both rebalancer-on and rebalancer-off
- // deployments. When rebalancing is enabled, start at min (the original PR's
- // "guarantee + burst" semantics): the next rebalance tick will distribute
- // headroom up to each pool's max. When rebalancing is disabled (the default),
- // pools with min=0 would otherwise reject every allocation until a tick that
- // never comes — start at max so consumers can allocate immediately.
- long initial = rebalancerEnabled ? min : max;
- BufferAllocator child = root.newChildAllocator(name, 0, initial);
+ BufferAllocator child = root.newChildAllocator(name, 0, max);
return new ArrowPoolHandle(child);
});
}
@Override
public void setPoolLimit(String poolName, long newLimit) {
- ArrowPoolHandle handle = pools.get(poolName);
- if (handle == null) {
- throw new IllegalStateException("Pool '" + poolName + "' does not exist");
+ PoolConfig config = poolConfigs.get(poolName);
+ if (config != null) {
+ config.max = newLimit;
}
- poolMaxes.put(poolName, newLimit);
- handle.allocator.setLimit(newLimit);
+ ArrowPoolHandle arrowHandle = pools.get(poolName);
+ if (arrowHandle != null) {
+ arrowHandle.allocator.setLimit(newLimit);
+ return;
+ }
+ VirtualPoolHandleImpl vp = virtualPools.get(poolName);
+ if (vp != null) {
+ vp.setLimit(newLimit);
+ return;
+ }
+ throw new IllegalStateException("Pool '" + poolName + "' does not exist");
+ }
+
+ @Override
+ public VirtualPoolHandle registerVirtualPool(String poolName, long min, long max, PoolGroup group, Consumer<Long> limitSetter) {
+ if (min > max) {
+ throw new IllegalArgumentException("Pool '" + poolName + "' min (" + min + ") exceeds max (" + max + ")");
+ }
+ validateSumMaxesWithinBudget(poolName, max); // throws if the configured maxes would exceed the budget
+ VirtualPoolHandleImpl handle = new VirtualPoolHandleImpl(poolName, max, limitSetter);
+ VirtualPoolHandleImpl existing = virtualPools.putIfAbsent(poolName, handle);
+ if (existing != null || pools.containsKey(poolName)) { // a name must be unique across virtual and Arrow pools
+ virtualPools.remove(poolName, handle); // removes only our own insert; an earlier registration is left intact
+ throw new IllegalStateException("Pool '" + poolName + "' already registered");
+ }
+ poolConfigs.put(poolName, new PoolConfig(min, max, group));
+ handle.setLimit(max); // pushes the initial limit through the callback; setLimit skips a null limitSetter instead of throwing
+ return handle;
+ }
+
+ @Override
+ public void setPoolMin(String poolName, long newMin) {
+ // Record the new minimum in the pool config, when one exists, before
+ // looking at the live limit.
+ final PoolConfig cfg = poolConfigs.get(poolName);
+ if (cfg != null) {
+ cfg.min = newMin;
+ }
+
+ // A name resolves to an Arrow pool first, then to a virtual pool.
+ final ArrowPoolHandle arrow = pools.get(poolName);
+ final VirtualPoolHandleImpl virtual = arrow == null ? virtualPools.get(poolName) : null;
+ if (arrow == null && virtual == null) {
+ return; // unknown pool: there is no live limit to adjust
+ }
+
+ // The live limit is only ever raised here, and never past the configured max.
+ final long ceiling = cfg == null ? Long.MAX_VALUE : cfg.max;
+ final long raisedLimit = Math.min(newMin, ceiling);
+ if (arrow != null) {
+ if (raisedLimit > arrow.allocator.getLimit()) {
+ arrow.allocator.setLimit(raisedLimit);
+ }
+ } else if (raisedLimit > virtual.limit()) {
+ virtual.setLimit(raisedLimit);
+ }
+ }
+
+ @Override
+ public Set getAllPoolNames() {
+ Set all = new HashSet<>(pools.keySet());
+ all.addAll(virtualPools.keySet());
+ return Collections.unmodifiableSet(all);
+ }
+
+ @Override
+ public void addStatsRefresher(Runnable refresher) {
+ statsRefreshers.add(refresher);
+ }
+
+ @Override
+ public void setNativeMemoryStatsSupplier(Supplier supplier) {
+ this.nativeMemoryStatsSupplier = supplier;
}
/**
- * Updates the minimum guaranteed bytes for a pool. The new min is recorded for the
- * rebalancer (which honors it as a floor on the next tick) and also pushed to the
- * live {@link BufferAllocator} so the change takes effect immediately even when
- * the rebalancer is disabled — the alternative was a Dynamic setting that returned
- * HTTP 200 but had no observable effect.
- *
- * Live propagation rules:
- *
- * If {@code newMin} exceeds the pool's current limit, the limit is raised to
- * {@code newMin} (capped at the configured pool max). Children of the pool
- * allocator inherit the change automatically via Arrow's parent-cap check at
- * allocation time, so dynamic resizes reach in-flight workloads without an
- * explicit notification SPI.
- * If {@code newMin} is below the current limit, the limit is left alone —
- * the rebalancer is the only path that shrinks live limits, so a min change
- * on its own never reduces capacity in flight.
- *
+ * Sets the effective (live) limit for a pool without updating the configured max.
+ * Used by the rebalancer to adjust pool limits dynamically.
*
- * @param poolName the pool name
- * @param newMin new minimum bytes
+ * @param poolName name of the pool
+ * @param newLimit new effective limit in bytes
*/
- public void setPoolMin(String poolName, long newMin) {
- ArrowPoolHandle handle = pools.get(poolName);
- if (handle == null) {
- throw new IllegalStateException("Pool '" + poolName + "' does not exist");
+ public void setPoolEffectiveLimit(String poolName, long newLimit) {
+ ArrowPoolHandle arrowHandle = pools.get(poolName);
+ if (arrowHandle != null) {
+ arrowHandle.allocator.setLimit(newLimit);
+ return;
}
- poolMins.put(poolName, newMin);
- long max = poolMaxes.getOrDefault(poolName, Long.MAX_VALUE);
- long current = handle.allocator.getLimit();
- long target = Math.min(newMin, max);
- if (target > current) {
- handle.allocator.setLimit(target);
+ VirtualPoolHandleImpl vp = virtualPools.get(poolName);
+ if (vp != null) {
+ vp.setLimit(newLimit);
+ return;
}
+ throw new IllegalStateException("Pool '" + poolName + "' does not exist");
}
- @Override
- public void setRootLimit(long limit) {
- root.setLimit(limit);
+ /**
+ * Resets all pools to their configured max. Called when the rebalancer is disabled.
+ * Logs a warning for any pool that was bursting above its max.
+ */
+ public void resetAllPoolsToMax() {
+ for (String name : getAllPoolNames()) {
+ PoolConfig config = poolConfigs.get(name);
+ long max = config != null ? config.max : Long.MAX_VALUE;
+ long current = getEffectiveLimit(name);
+ if (current > max) {
+ logger.warn(
+ "Pool [{}] effective limit {} exceeds max {}, resetting to max. In-flight allocations may be rejected.",
+ name,
+ current,
+ max
+ );
+ }
+ setPoolEffectiveLimit(name, max);
+ }
+ }
+
+ /**
+ * Convenience method for plugins that have Setting objects. Registers the virtual pool
+ * and auto-wires dynamic setting listeners for min/max changes.
+ *
+ * @param poolName name of the virtual pool
+ * @param minSetting setting for minimum bytes
+ * @param maxSetting setting for maximum bytes
+ * @param settings current node settings
+ * @param clusterSettings cluster settings for dynamic updates
+ * @param group pool group assignment
+ * @param limitSetter callback invoked when the pool limit changes
+ */
+ public VirtualPoolHandle registerVirtualPool(
+ String poolName,
+ Setting minSetting,
+ Setting maxSetting,
+ Settings settings,
+ ClusterSettings clusterSettings,
+ PoolGroup group,
+ Consumer limitSetter
+ ) {
+ long min = minSetting.get(settings);
+ long max = maxSetting.get(settings);
+ VirtualPoolHandle handle = registerVirtualPool(poolName, min, max, group, limitSetter);
+
+ clusterSettings.addSettingsUpdateConsumer(maxSetting, newMax -> setPoolLimit(poolName, newMax));
+ clusterSettings.addSettingsUpdateConsumer(minSetting, newMin -> setPoolMin(poolName, newMin));
+
+ return handle;
}
/**
- * Returns a point-in-time stats snapshot across all pools. Used by the
- * {@code NativeAllocatorStatsRegistry} component published from
- * {@code ArrowBasePlugin.createComponents()} and wired into {@code NodeService} to
- * render allocator state under {@code _nodes/stats[/native_allocator]}.
+ * Returns a point-in-time stats snapshot across all pools.
*/
public NativeAllocatorPoolStats stats() {
+ refreshStats();
+
+ long nativeAllocated = -1;
+ long nativeResident = -1;
+ Supplier supplier = this.nativeMemoryStatsSupplier;
+ if (supplier != null) {
+ try {
+ long[] stats = supplier.get();
+ if (stats != null && stats.length >= 2) {
+ nativeAllocated = stats[0];
+ nativeResident = stats[1];
+ }
+ } catch (Exception e) {
+ // best-effort
+ }
+ }
+
List poolStats = new ArrayList<>();
for (var entry : pools.entrySet()) {
BufferAllocator alloc = entry.getValue().allocator;
+ PoolConfig config = poolConfigs.get(entry.getKey());
poolStats.add(
new NativeAllocatorPoolStats.PoolStats(
entry.getKey(),
alloc.getAllocatedMemory(),
alloc.getPeakMemoryAllocation(),
- alloc.getLimit()
+ alloc.getLimit(),
+ config != null && config.group != null ? config.group.getName() : null,
+ config != null ? config.min : 0L
)
);
}
- return new NativeAllocatorPoolStats(root.getAllocatedMemory(), root.getPeakMemoryAllocation(), root.getLimit(), poolStats);
+ for (var entry : virtualPools.entrySet()) {
+ VirtualPoolHandleImpl vp = entry.getValue();
+ PoolConfig config = poolConfigs.get(entry.getKey());
+ poolStats.add(
+ new NativeAllocatorPoolStats.PoolStats(
+ entry.getKey(),
+ vp.allocatedBytes(),
+ vp.peakBytes(),
+ vp.limit(),
+ config != null && config.group != null ? config.group.getName() : null,
+ config != null ? config.min : 0L
+ )
+ );
+ }
+
+ return new NativeAllocatorPoolStats(nativeAllocated, nativeResident, poolStats);
+ }
+
+ /**
+ * Runs all registered stats refreshers; a failing refresher is logged at debug level and the remaining ones still run.
+ */
+ public void refreshStats() {
+ for (Runnable refresher : statsRefreshers) {
+ try {
+ refresher.run();
+ } catch (Exception e) {
+ logger.debug("Stats refresher failed; continuing with the remaining refreshers", e); // best-effort, but no longer silent
+ }
+ }
}
@Override
public void close() {
- rebalancer.shutdownNow();
pools.forEach((name, handle) -> {
try {
handle.allocator.close();
@@ -209,72 +307,23 @@ public void close() {
}
});
pools.clear();
- // Close any remaining child allocators (e.g., ad-hoc children created via ArrowAllocatorService)
+ virtualPools.clear();
for (BufferAllocator child : new ArrayList<>(root.getChildAllocators())) {
try {
child.close();
} catch (Exception e) {
- // best-effort — log but don't block shutdown
+ // best-effort
}
}
root.close();
}
- /**
- * Redistributes unused capacity across pools based on min/max guarantees.
- *
- * Algorithm:
- *
- * Every pool is guaranteed at least its configured min
- * Compute headroom = rootLimit - sum(all pool current allocations)
- * Distribute headroom equally across all pools (not just active ones), capped
- * at each pool's max. Distributing to all pools — including those with zero
- * current allocation — avoids the dead-pool corner case where a pool with
- * min = 0 starts at limit = 0, can never make its first allocation, and so
- * never becomes "active" enough to receive a bonus. Pools that don't need the
- * headroom stay at min naturally because their max caps the bonus.
- * No pool's limit ever drops below its current allocation or its min
- *
- */
- void rebalance() {
- if (pools.isEmpty()) return;
-
- long rootLimit = root.getLimit();
- long totalAllocated = 0;
-
- for (Map.Entry entry : pools.entrySet()) {
- totalAllocated += entry.getValue().allocator.getAllocatedMemory();
- }
-
- long headroom = Math.max(0, rootLimit - totalAllocated);
- int poolCount = pools.size();
- long bonusPerPool = poolCount > 0 ? headroom / poolCount : 0;
-
- for (Map.Entry entry : pools.entrySet()) {
- String name = entry.getKey();
- BufferAllocator alloc = entry.getValue().allocator;
- long min = poolMins.getOrDefault(name, 0L);
- long max = poolMaxes.getOrDefault(name, Long.MAX_VALUE);
- long currentAllocation = alloc.getAllocatedMemory();
-
- long effectiveLimit = min + bonusPerPool;
-
- // Cap at pool's max
- effectiveLimit = Math.min(effectiveLimit, max);
- // Never drop below current allocation or min
- effectiveLimit = Math.max(effectiveLimit, currentAllocation);
- effectiveLimit = Math.max(effectiveLimit, min);
- // Never exceed root
- effectiveLimit = Math.min(effectiveLimit, rootLimit);
-
- alloc.setLimit(effectiveLimit);
- }
- }
+ // ─── Accessors (used by the rebalancer and tests) ────────────────────────────
/**
* Returns the underlying Arrow allocator for a pool.
*
- * @param poolName name of the pool to look up
+ * @param poolName name of the pool
*/
public BufferAllocator getPoolAllocator(String poolName) {
ArrowPoolHandle handle = pools.get(poolName);
@@ -284,16 +333,12 @@ public BufferAllocator getPoolAllocator(String poolName) {
return handle.allocator;
}
- /**
- * Returns the root Arrow allocator.
- */
+ /** Returns the root Arrow allocator. */
public BufferAllocator getRootAllocator() {
return root;
}
- /**
- * Returns all registered pool names.
- */
+ /** Returns all registered pool names (Arrow pools only). */
public Set getPoolNames() {
return Collections.unmodifiableSet(pools.keySet());
}
@@ -304,7 +349,8 @@ public Set getPoolNames() {
* @param poolName name of the pool
*/
public long getPoolMin(String poolName) {
- return poolMins.getOrDefault(poolName, 0L);
+ PoolConfig config = poolConfigs.get(poolName);
+ return config != null ? config.min : 0L;
}
/**
@@ -313,7 +359,163 @@ public long getPoolMin(String poolName) {
* @param poolName name of the pool
*/
public long getPoolMax(String poolName) {
- return poolMaxes.getOrDefault(poolName, Long.MAX_VALUE);
+ PoolConfig config = poolConfigs.get(poolName);
+ return config != null ? config.max : Long.MAX_VALUE;
+ }
+
+ /**
+ * Returns the group for a pool, or null if not assigned.
+ *
+ * @param poolName name of the pool
+ */
+ public PoolGroup getPoolGroup(String poolName) {
+ PoolConfig config = poolConfigs.get(poolName);
+ return config != null ? config.group : null;
+ }
+
+ /**
+ * Returns the allocated bytes for a virtual pool.
+ *
+ * @param poolName name of the virtual pool
+ */
+ public long getVirtualPoolAllocated(String poolName) {
+ VirtualPoolHandleImpl vp = virtualPools.get(poolName);
+ return vp != null ? vp.allocatedBytes() : 0;
+ }
+
+ /**
+ * Returns the current limit for a virtual pool.
+ *
+ * @param poolName name of the virtual pool
+ */
+ public long getVirtualPoolLimit(String poolName) {
+ VirtualPoolHandleImpl vp = virtualPools.get(poolName);
+ return vp != null ? vp.limit() : 0;
+ }
+
+ /**
+ * Returns the effective limit for any pool (Arrow or virtual).
+ *
+ * @param poolName name of the pool
+ */
+ public long getEffectiveLimit(String poolName) {
+ ArrowPoolHandle arrowHandle = pools.get(poolName);
+ if (arrowHandle != null) {
+ return arrowHandle.allocator.getLimit();
+ }
+ VirtualPoolHandleImpl vp = virtualPools.get(poolName);
+ if (vp != null) {
+ return vp.limit();
+ }
+ return 0;
+ }
+
+ /**
+ * Returns the allocated bytes for any pool (Arrow or virtual).
+ *
+ * @param poolName name of the pool
+ */
+ public long getAllocated(String poolName) {
+ ArrowPoolHandle arrowHandle = pools.get(poolName);
+ if (arrowHandle != null) {
+ return arrowHandle.allocator.getAllocatedMemory();
+ }
+ VirtualPoolHandleImpl vp = virtualPools.get(poolName);
+ if (vp != null) {
+ return vp.allocatedBytes();
+ }
+ return 0;
+ }
+
+ /** Returns the native memory stats supplier. */
+ public Supplier getNativeMemoryStatsSupplier() {
+ return nativeMemoryStatsSupplier;
+ }
+
+ // ─── Private helpers ─────────────────────────────────────────────────────────
+
+ private void validateSumMaxesWithinBudget(String newPoolName, long newPoolMax) {
+ final long limit = budget; // single volatile read so the check and the message use the same value
+ if (limit == Long.MAX_VALUE || limit <= 0) return; // no finite budget configured: nothing to validate against
+ long sumMaxes = newPoolMax;
+ for (var entry : poolConfigs.entrySet()) {
+ if (entry.getKey().equals(newPoolName) == false) { // the named pool is counted once, via newPoolMax
+ long otherMax = entry.getValue().max;
+ sumMaxes = otherMax > Long.MAX_VALUE - sumMaxes ? Long.MAX_VALUE : sumMaxes + otherMax; // saturate instead of wrapping negative
+ }
+ }
+ if (sumMaxes > limit) {
+ throw new IllegalArgumentException(
+ "Sum of pool max limits ("
+ + sumMaxes
+ + " bytes) exceeds native memory budget ("
+ + limit
+ + " bytes). Reduce pool max settings or increase the budget."
+ );
+ }
+ }
+
+ // ─── Inner classes ───────────────────────────────────────────────────────────
+
+ /**
+ * Mutable configuration for a pool: min, max, and group.
+ */
+ static class PoolConfig {
+ volatile long min;
+ volatile long max;
+ final PoolGroup group;
+
+ PoolConfig(long min, long max, PoolGroup group) {
+ this.min = min;
+ this.max = max;
+ this.group = group;
+ }
+ }
+
+ /**
+ * Virtual pool handle implementation. Tracks stats reported from native layer
+ * and delegates limit changes to the registered callback.
+ */
+ public static class VirtualPoolHandleImpl implements VirtualPoolHandle {
+ private final String name;
+ private volatile long limit;
+ private volatile long allocatedBytes;
+ private volatile long peakBytes;
+ private final Consumer limitSetter;
+
+ VirtualPoolHandleImpl(String name, long limit, Consumer limitSetter) {
+ this.name = name;
+ this.limit = limit;
+ this.limitSetter = limitSetter;
+ }
+
+ @Override
+ public void updateStats(long allocated, long peak) {
+ this.allocatedBytes = allocated;
+ this.peakBytes = peak;
+ }
+
+ void setLimit(long newLimit) {
+ this.limit = newLimit;
+ if (limitSetter != null) {
+ limitSetter.accept(newLimit);
+ }
+ }
+
+ @Override
+ public long allocatedBytes() {
+ return allocatedBytes;
+ }
+
+ @Override
+ public long peakBytes() {
+ return peakBytes;
+ }
+
+ @Override
+ public long limit() {
+ return limit;
+ }
}
/**
diff --git a/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/NativeMemoryRebalancer.java b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/NativeMemoryRebalancer.java
new file mode 100644
index 0000000000000..5aefe7763e952
--- /dev/null
+++ b/plugins/arrow-base/src/main/java/org/opensearch/arrow/allocator/NativeMemoryRebalancer.java
@@ -0,0 +1,235 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.arrow.allocator;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.logging.log4j.message.ParameterizedMessage;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Supplier;
+
+/**
+ * Periodic rebalancer that redistributes native memory across pools.
+ *
+ * <p>Algorithm:
+ * <ol>
+ * <li>Pools start at their configured max on registration</li>
+ * <li>If no pool is under pressure, return early (no-op)</li>
+ * <li>Idle pools (utilization &lt; idle_threshold) are shrunk, never below min or their current allocation</li>
+ * <li>Pressured pools (utilization &gt; pressure_threshold) receive freed capacity, can exceed max</li>
+ * <li>Excess freed capacity is returned to idle pools proportionally</li>
+ * <li>Invariant: sum(effective_limits) &lt;= budget at all times</li>
+ * </ol>
+ *
+ * @opensearch.internal
+ */
+public class NativeMemoryRebalancer implements Runnable {
+
+ private static final Logger logger = LogManager.getLogger(NativeMemoryRebalancer.class);
+
+ private final ArrowNativeAllocator allocator;
+ private final Supplier budgetSupplier;
+
+ private volatile double pressureThreshold;
+ private volatile double idleThreshold;
+ private volatile double shrinkFactor;
+
+ /**
+ * Creates a new rebalancer.
+ *
+ * @param allocator the allocator managing all pools
+ * @param budgetSupplier supplies the current budget value
+ * @param pressureThreshold utilization above this triggers growth (default 0.75)
+ * @param idleThreshold utilization below this means pool can give back capacity (default 0.50)
+ * @param shrinkFactor factor to shrink idle pools by — new limit = limit * (1 - shrinkFactor) (default 0.10)
+ */
+ public NativeMemoryRebalancer(
+ ArrowNativeAllocator allocator,
+ Supplier budgetSupplier,
+ double pressureThreshold,
+ double idleThreshold,
+ double shrinkFactor
+ ) {
+ this.allocator = allocator;
+ this.budgetSupplier = budgetSupplier;
+ this.pressureThreshold = pressureThreshold;
+ this.idleThreshold = idleThreshold;
+ this.shrinkFactor = shrinkFactor;
+ }
+
+ /**
+ * Updates the pressure threshold dynamically.
+ *
+ * @param value new threshold (0.0 to 1.0)
+ */
+ public void setPressureThreshold(double value) {
+ this.pressureThreshold = value;
+ }
+
+ /**
+ * Updates the idle threshold dynamically.
+ *
+ * @param value new threshold (0.0 to 1.0)
+ */
+ public void setIdleThreshold(double value) {
+ this.idleThreshold = value;
+ }
+
+ /**
+ * Updates the shrink factor dynamically.
+ *
+ * @param value new factor (0.0 to 1.0)
+ */
+ public void setShrinkFactor(double value) {
+ this.shrinkFactor = value;
+ }
+
+ @Override
+ public void run() {
+ try {
+ rebalance();
+ } catch (Exception e) {
+ logger.warn("Rebalancer tick failed", e);
+ }
+ }
+
+ void rebalance() {
+ Set allPools = allocator.getAllPoolNames();
+ if (allPools.isEmpty()) return;
+
+ long budget = budgetSupplier.get();
+ if (budget <= 0 || budget == Long.MAX_VALUE) return;
+
+ // Refresh stats from native layers
+ allocator.refreshStats();
+
+ // Snapshot per-pool state
+ Map snapshots = new HashMap<>();
+ for (String name : allPools) {
+ long allocated = allocator.getAllocated(name);
+ long effectiveLimit = allocator.getEffectiveLimit(name);
+ long min = allocator.getPoolMin(name);
+ long max = allocator.getPoolMax(name);
+ double utilization = effectiveLimit > 0 ? (double) allocated / effectiveLimit : 0;
+ snapshots.put(name, new PoolSnapshot(allocated, effectiveLimit, min, max, utilization));
+ }
+
+ // Identify pressured pools — if none, nothing to do
+ Map desires = new HashMap<>();
+ long totalDesired = 0;
+ for (var entry : snapshots.entrySet()) {
+ PoolSnapshot s = entry.getValue();
+ if (s.utilization > pressureThreshold) {
+ long desired = Math.max(1, (long) (s.allocated * 0.25));
+ desires.put(entry.getKey(), desired);
+ totalDesired += desired;
+ }
+ }
+ if (totalDesired == 0) {
+ logger.debug("Rebalancer: no pools under pressure, skipping");
+ return;
+ }
+
+ // Shrink idle pools, floor at min
+ long freedCapacity = 0;
+ for (var entry : snapshots.entrySet()) {
+ PoolSnapshot s = entry.getValue();
+ if (s.utilization < idleThreshold) {
+ long newLimit = Math.max((long) (s.effectiveLimit * (1.0 - shrinkFactor)), s.min);
+ newLimit = Math.max(newLimit, s.allocated);
+ if (newLimit < s.effectiveLimit) {
+ freedCapacity += s.effectiveLimit - newLimit;
+ allocator.setPoolEffectiveLimit(entry.getKey(), newLimit);
+ s.effectiveLimit = newLimit;
+ }
+ }
+ }
+
+ if (freedCapacity == 0) {
+ logger.debug("Rebalancer: no capacity freed from idle pools");
+ return;
+ }
+
+ // Distribute freed capacity to pressured pools (can exceed max)
+ long totalGranted = 0;
+ long grantCap = Math.min(freedCapacity, totalDesired);
+ for (var entry : desires.entrySet()) {
+ String name = entry.getKey();
+ long desired = entry.getValue();
+ PoolSnapshot s = snapshots.get(name);
+ long grant = (long) ((double) grantCap * desired / totalDesired);
+ grant = Math.min(grant, grantCap - totalGranted);
+ if (grant > 0) {
+ try {
+ long newLimit = s.effectiveLimit + grant;
+ allocator.setPoolEffectiveLimit(name, newLimit);
+ totalGranted += grant;
+ logger.debug("Rebalancer: grew pool [{}] by {} bytes to {} (max={})", name, grant, newLimit, s.max);
+ } catch (Exception e) {
+ logger.warn(() -> new ParameterizedMessage("Rebalancer: failed to grow pool [{}]", name), e);
+ }
+ }
+ }
+
+ // Return any excess freed capacity back to idle pools
+ long excess = freedCapacity - totalGranted;
+ if (excess > 0) {
+ returnToIdlePools(snapshots, excess);
+ }
+ }
+
+ // ─── Private helpers ─────────────────────────────────────────────────────────
+
+ private void returnToIdlePools(Map snapshots, long capacity) { // hands `capacity` bytes back to idle pools, in proportion to their limits
+ long totalIdleSize = 0;
+ for (PoolSnapshot s : snapshots.values()) {
+ if (s.utilization < idleThreshold) { // "idle" is judged on the utilization captured in the snapshot
+ totalIdleSize += s.effectiveLimit;
+ }
+ }
+ if (totalIdleSize == 0) return; // no idle pool with a non-zero limit: nothing is handed back
+
+ long totalReturned = 0;
+ for (var entry : snapshots.entrySet()) {
+ PoolSnapshot s = entry.getValue();
+ if (s.utilization < idleThreshold) {
+ long share = (long) ((double) capacity * s.effectiveLimit / totalIdleSize); // truncates; NOTE(review): the rounding remainder is not handed to any pool
+ share = Math.min(share, capacity - totalReturned); // never hand back more than what is left
+ if (share > 0) {
+ long newLimit = s.effectiveLimit + share;
+ allocator.setPoolEffectiveLimit(entry.getKey(), newLimit);
+ s.effectiveLimit = newLimit; // keep the snapshot in step with the live limit
+ totalReturned += share;
+ }
+ }
+ }
+ }
+
+ /**
+ * Point-in-time snapshot of a pool's state during one rebalance tick.
+ */
+ static class PoolSnapshot {
+ final long allocated;
+ long effectiveLimit;
+ final long min;
+ final long max;
+ final double utilization;
+
+ PoolSnapshot(long allocated, long effectiveLimit, long min, long max, double utilization) {
+ this.allocated = allocated;
+ this.effectiveLimit = effectiveLimit;
+ this.min = min;
+ this.max = max;
+ this.utilization = utilization;
+ }
+ }
+}
diff --git a/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowBasePluginTests.java b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowBasePluginTests.java
index ad72b70b8cbbb..df2673a5a8a7b 100644
--- a/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowBasePluginTests.java
+++ b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowBasePluginTests.java
@@ -8,12 +8,11 @@
package org.opensearch.arrow.allocator;
-import org.apache.arrow.memory.BufferAllocator;
-import org.apache.arrow.memory.OutOfMemoryException;
import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
import org.opensearch.common.settings.ClusterSettings;
import org.opensearch.common.settings.Setting;
import org.opensearch.common.settings.Settings;
+import org.opensearch.node.resource.tracker.ResourceTrackerSettings;
import org.opensearch.test.OpenSearchTestCase;
import java.util.HashSet;
@@ -21,90 +20,23 @@
public class ArrowBasePluginTests extends OpenSearchTestCase {
- public void testDeriveRootLimitDefaultUnsetReturnsLongMaxValue() {
- // Explicit 0 expresses "AC unconfigured" — default is now ram - heap, so Settings.EMPTY
- // would resolve to a real value on whatever machine the test runs on.
- Settings s = Settings.builder().put("node.native_memory.limit", "0b").build();
- assertEquals(Long.toString(Long.MAX_VALUE), ArrowBasePlugin.deriveRootLimitDefault(s));
- }
-
- public void testDeriveRootLimitDefaultUsesAcLimitWhenSet() {
- Settings s = Settings.builder().put("node.native_memory.limit", "1gb").build();
- // ROOT_LIMIT defaults to 20% of node.native_memory.limit — the Arrow framework gets a
- // small fraction of native budget; DataFusion's Rust runtime takes the larger share.
- long oneGiB = 1024L * 1024 * 1024;
- assertEquals(Long.toString(oneGiB * 20 / 100), ArrowBasePlugin.deriveRootLimitDefault(s));
- }
-
- public void testDeriveRootLimitDefaultIgnoresBufferPercent() {
- // node.native_memory.buffer_percent is admission control's throttle margin, not a
- // framework budget reduction. The framework default takes its 20% fraction off
- // node.native_memory.limit directly so AC's safety margin sits between AC's throttle
- // threshold and the framework's hard cap rather than being collapsed into the cap.
- // 1000 bytes limit, 20% buffer => root.limit still 20% of 1000 = 200.
- Settings s = Settings.builder().put("node.native_memory.limit", "1000b").put("node.native_memory.buffer_percent", 20).build();
- assertEquals("200", ArrowBasePlugin.deriveRootLimitDefault(s));
- }
-
- public void testRootLimitSettingExposesDerivedDefault() {
- Settings s = Settings.builder().put("node.native_memory.limit", "10gb").build();
- // 20% of 10 GiB.
- long expected = 10L * 1024 * 1024 * 1024 * 20 / 100;
- assertEquals(Long.valueOf(expected), ArrowBasePlugin.ROOT_LIMIT_SETTING.get(s));
- }
-
- public void testRootLimitSettingExplicitOverridesDerived() {
- Settings s = Settings.builder()
- .put("node.native_memory.limit", "8gb")
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 1024L)
- .build();
- assertEquals(Long.valueOf(1024L), ArrowBasePlugin.ROOT_LIMIT_SETTING.get(s));
- }
-
- public void testRootLimitRejectsNegative() {
- Settings s = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, -1L).build();
- IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> ArrowBasePlugin.ROOT_LIMIT_SETTING.get(s));
- assertTrue(e.getMessage().contains("must be >= 0"));
- }
-
public void testQuerySettingsExposeDefaults() {
// Explicit 0 expresses "AC unconfigured" so QUERY_MAX falls back to Long.MAX_VALUE.
- // Settings.EMPTY would resolve via ram - heap default to a finite, machine-dependent value.
Settings s = Settings.builder().put("node.native_memory.limit", "0b").build();
assertEquals(Long.valueOf(0L), ArrowBasePlugin.QUERY_MIN_SETTING.get(s));
assertEquals(Long.valueOf(Long.MAX_VALUE), ArrowBasePlugin.QUERY_MAX_SETTING.get(s));
}
- public void testFlightAndIngestMinDefaultsToZero() {
- // The grouped validator (validateMinSum) treats per-pool mins as a guarantee
- // floor — defaults of Long.MAX_VALUE caused the validator to reject any PUT
- // that set a non-MAX root. Pool mins must default to zero so the baseline
- // configuration is consistent.
- Settings s = Settings.EMPTY;
- assertEquals(Long.valueOf(0L), ArrowBasePlugin.FLIGHT_MIN_SETTING.get(s));
- assertEquals(Long.valueOf(0L), ArrowBasePlugin.INGEST_MIN_SETTING.get(s));
- }
-
- public void testQuerySettingsAcceptValues() {
- Settings s = Settings.builder()
- .put(NativeAllocatorPoolConfig.SETTING_QUERY_MIN, 100L)
- .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 1000L)
- .build();
- assertEquals(Long.valueOf(100L), ArrowBasePlugin.QUERY_MIN_SETTING.get(s));
- assertEquals(Long.valueOf(1000L), ArrowBasePlugin.QUERY_MAX_SETTING.get(s));
+ public void testFlightAndIngestMinDerivedFromBudget() {
+ // 1 GiB native memory limit: the flight min is expected at 2% of it and the ingest min at 4%.
+ final long oneGiB = 1024L * 1024 * 1024;
+ final Settings withLimit = Settings.builder().put("node.native_memory.limit", "1gb").build();
+ final Long wantFlight = oneGiB * 2 / 100, wantIngest = oneGiB * 4 / 100;
+ assertEquals(wantFlight, ArrowBasePlugin.FLIGHT_MIN_SETTING.get(withLimit));
+ assertEquals(wantIngest, ArrowBasePlugin.INGEST_MIN_SETTING.get(withLimit));
}
- // -- Pool max defaults derived from node.native_memory.limit ----------
- // Pool maxes anchor to the operator's off-heap budget (node.native_memory.limit),
- // not to native.allocator.root.limit. This matches the PR #21732 partitioning
- // diagram where pool fractions (5%/8%/5%) are of native_memory.limit. Sum of
- // pool maxes (18% of native_memory.limit) fits within root.limit (20% of
- // native_memory.limit) by default, leaving 2 pp headroom inside the root cap.
-
public void testPoolMaxDefaultsAreLongMaxValueWhenAcUnset() {
- // AC explicitly unconfigured — pool maxes default to Long.MAX_VALUE (unbounded),
- // preserving pre-AC behaviour. The default for node.native_memory.limit is
- // 79% of (ram - heap), so to test the "unset" branch we must explicitly set it to 0.
Settings s = Settings.builder().put("node.native_memory.limit", "0b").build();
assertEquals(Long.valueOf(Long.MAX_VALUE), ArrowBasePlugin.FLIGHT_MAX_SETTING.get(s));
assertEquals(Long.valueOf(Long.MAX_VALUE), ArrowBasePlugin.INGEST_MAX_SETTING.get(s));
@@ -112,10 +44,6 @@ public void testPoolMaxDefaultsAreLongMaxValueWhenAcUnset() {
}
public void testPoolMaxDefaultsScaleFromAcBudget() {
- // 10 GiB native memory limit. Pool maxes per the partitioning model in PR #21732:
- // FLIGHT_MAX = 5% INGEST_MAX = 8% QUERY_MAX = 5%
- // Anchored to node.native_memory.limit, not to root.limit (which defaults to 20%
- // of native_memory.limit) — see derivePoolMaxDefault Javadoc.
Settings s = Settings.builder().put("node.native_memory.limit", "10gb").build();
long limit = 10L * 1024 * 1024 * 1024;
assertEquals(Long.valueOf(limit * 5 / 100), ArrowBasePlugin.FLIGHT_MAX_SETTING.get(s));
@@ -123,51 +51,14 @@ public void testPoolMaxDefaultsScaleFromAcBudget() {
assertEquals(Long.valueOf(limit * 5 / 100), ArrowBasePlugin.QUERY_MAX_SETTING.get(s));
}
- public void testPoolMaxDefaultsIgnoreRootLimitOverride() {
- // Pool maxes anchor to node.native_memory.limit, not to root.limit. An operator
- // who overrides root.limit (e.g. to 4 GiB instead of the default 20% of
- // native_memory.limit = 2 GiB) does not shrink pool defaults proportionally;
- // the diagrammed partitioning of native_memory.limit holds.
- Settings s = Settings.builder()
- .put("node.native_memory.limit", "10gb")
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 4L * 1024 * 1024 * 1024)
- .build();
- long limit = 10L * 1024 * 1024 * 1024;
- assertEquals(Long.valueOf(limit * 5 / 100), ArrowBasePlugin.FLIGHT_MAX_SETTING.get(s));
- assertEquals(Long.valueOf(limit * 8 / 100), ArrowBasePlugin.INGEST_MAX_SETTING.get(s));
- assertEquals(Long.valueOf(limit * 5 / 100), ArrowBasePlugin.QUERY_MAX_SETTING.get(s));
- }
-
public void testPoolMaxDefaultsIgnoreBufferPercent() {
- // node.native_memory.buffer_percent is AC's throttle margin, not a framework budget
- // reduction. Pool maxes default off node.native_memory.limit directly so AC's safety
- // margin sits between AC's throttle threshold and the framework's hard cap rather than
- // being collapsed into the cap.
- // 1000 bytes limit, 20% buffer => pool maxes are still 5/8/5% of 1000 = 50/80/50.
Settings s = Settings.builder().put("node.native_memory.limit", "1000b").put("node.native_memory.buffer_percent", 20).build();
assertEquals(Long.valueOf(50L), ArrowBasePlugin.FLIGHT_MAX_SETTING.get(s));
assertEquals(Long.valueOf(80L), ArrowBasePlugin.INGEST_MAX_SETTING.get(s));
assertEquals(Long.valueOf(50L), ArrowBasePlugin.QUERY_MAX_SETTING.get(s));
}
- public void testPoolMaxExplicitOverridesDerived() {
- // Operator-set values must win over derived defaults.
- Settings s = Settings.builder()
- .put("node.native_memory.limit", "10gb")
- .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, 7L)
- .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 8L)
- .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 9L)
- .build();
- assertEquals(Long.valueOf(7L), ArrowBasePlugin.FLIGHT_MAX_SETTING.get(s));
- assertEquals(Long.valueOf(8L), ArrowBasePlugin.INGEST_MAX_SETTING.get(s));
- assertEquals(Long.valueOf(9L), ArrowBasePlugin.QUERY_MAX_SETTING.get(s));
- }
-
public void testPoolMaxRejectsNegative() {
- // Negative pool max is rejected at parse time, mirroring ROOT_LIMIT_SETTING.
- // Each pool's parser has its own message so we exercise all three to lock down
- // the per-pool error contract (and keep coverage honest on what is otherwise
- // boilerplate-but-distinct branches).
Settings flight = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, -1L).build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> ArrowBasePlugin.FLIGHT_MAX_SETTING.get(flight));
assertTrue(e.getMessage().contains("must be >= 0"));
@@ -188,253 +79,75 @@ public void testPoolMaxRejectsNegative() {
}
// -----------------------------------------------------------------
- // End-to-end wiring tests — verify that Setting.Property.Dynamic settings
- // actually flow through to the live allocator. These guard against the
- // "dynamic in name only" failure mode where a setting parses, the validator
- // runs, the cluster-state update succeeds, and the runtime component
- // silently does nothing because the addSettingsUpdateConsumer line was
- // never registered. Bare-setter unit tests do not catch this; tests must
- // drive a real ClusterSettings#applySettings round-trip.
+ // End-to-end wiring tests
// -----------------------------------------------------------------
- /**
- * Builds a {@link ClusterSettings} preloaded with all of {@link ArrowBasePlugin}'s
- * settings, mirroring what {@code SettingsModule} does at node startup. Returns the
- * fresh allocator with the framework's pools created and consumers registered
- * — the same wiring path {@code createComponents} runs.
- */
- private static ArrowNativeAllocator newWiredAllocator(Settings nodeSettings, ClusterSettings cs) {
- long rootLimit = ArrowBasePlugin.ROOT_LIMIT_SETTING.get(nodeSettings);
- ArrowNativeAllocator allocator = new ArrowNativeAllocator(rootLimit);
- allocator.setRebalanceInterval(ArrowBasePlugin.REBALANCE_INTERVAL_SETTING.get(nodeSettings));
- allocator.getOrCreatePool(
- NativeAllocatorPoolConfig.POOL_FLIGHT,
- ArrowBasePlugin.FLIGHT_MIN_SETTING.get(nodeSettings),
- ArrowBasePlugin.FLIGHT_MAX_SETTING.get(nodeSettings)
- );
- allocator.getOrCreatePool(
- NativeAllocatorPoolConfig.POOL_INGEST,
- ArrowBasePlugin.INGEST_MIN_SETTING.get(nodeSettings),
- ArrowBasePlugin.INGEST_MAX_SETTING.get(nodeSettings)
- );
- allocator.getOrCreatePool(
- NativeAllocatorPoolConfig.POOL_QUERY,
- ArrowBasePlugin.QUERY_MIN_SETTING.get(nodeSettings),
- ArrowBasePlugin.QUERY_MAX_SETTING.get(nodeSettings)
- );
- ArrowBasePlugin.registerSettingsUpdateConsumers(cs, allocator);
- return allocator;
- }
-
private static ClusterSettings newClusterSettings(Settings nodeSettings) {
Set<Setting<?>> registered = new HashSet<>();
registered.addAll(new ArrowBasePlugin().getSettings());
return new ClusterSettings(nodeSettings, registered);
}
- public void testBuildAllocatorWiresAllPoolsAndSettingsConsumers() {
- // Verifies the full createComponents code path — the helper extracted from
- // createComponents builds the allocator, creates all three pools (FLIGHT, INGEST,
- // QUERY), and registers the cluster-settings update consumers. We bypass the
- // heavyweight ClusterService fixture and inject a real ClusterSettings directly,
- // which is what production wiring also passes through to buildAllocator after
- // unpacking the createComponents arguments.
+ public void testBuildAllocatorWiresAllPools() throws Exception {
Settings nodeSettings = Settings.builder()
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024)
+ .put("node.native_memory.limit", "10gb")
.put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, 1L * 1024 * 1024 * 1024)
.put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 2L * 1024 * 1024 * 1024)
.put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 1L * 1024 * 1024 * 1024)
+ .put("native.allocator.rebalancer.enabled", false)
.build();
ClusterSettings cs = newClusterSettings(nodeSettings);
- ArrowNativeAllocator allocator = ArrowBasePlugin.buildAllocator(nodeSettings, cs);
+ ArrowBasePlugin plugin = new ArrowBasePlugin();
+ long budget = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(nodeSettings).getBytes();
+ ArrowNativeAllocator allocator = plugin.buildAllocator(nodeSettings, cs, () -> budget);
try {
- // All three pools created.
Set poolNames = allocator.getPoolNames();
assertEquals("buildAllocator must register exactly the framework's three pools", 3, poolNames.size());
assertTrue(poolNames.contains(NativeAllocatorPoolConfig.POOL_FLIGHT));
assertTrue(poolNames.contains(NativeAllocatorPoolConfig.POOL_INGEST));
assertTrue(poolNames.contains(NativeAllocatorPoolConfig.POOL_QUERY));
- // Pool maxes match the operator-set values (rebalancer disabled by default,
+ // Pool maxes match the operator-set values (rebalancer disabled,
// so initial limit == max).
assertEquals(1L * 1024 * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_FLIGHT).getLimit());
assertEquals(2L * 1024 * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_INGEST).getLimit());
assertEquals(1L * 1024 * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY).getLimit());
-
- // Cluster-settings update consumers are registered: a PUT to a pool max must
- // propagate to the live allocator.
- cs.applySettings(
- Settings.builder()
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024)
- .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 4L * 1024 * 1024 * 1024)
- .build()
- );
- assertEquals(
- "buildAllocator must wire the INGEST_MAX cluster-settings consumer",
- 4L * 1024 * 1024 * 1024,
- allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_INGEST).getLimit()
- );
- } finally {
- allocator.close();
- }
- }
-
- public void testQueryMaxClusterSettingPropagatesToAllocator() {
- // The full wired path: node starts at default settings, plugin registers
- // consumers, operator PUTs a new max via _cluster/settings.
- Settings nodeSettings = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024).build();
- ClusterSettings cs = newClusterSettings(nodeSettings);
- ArrowNativeAllocator allocator = newWiredAllocator(nodeSettings, cs);
- try {
- cs.applySettings(
- Settings.builder()
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024)
- .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 1024L * 1024 * 1024)
- .build()
- );
- assertEquals(
- "PUT to query max must update the live BufferAllocator limit",
- 1024L * 1024 * 1024,
- allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY).getLimit()
- );
- assertEquals(1024L * 1024 * 1024, allocator.getPoolMax(NativeAllocatorPoolConfig.POOL_QUERY));
- } finally {
- allocator.close();
- }
- }
-
- public void testFlightMinClusterSettingPropagatesToAllocator() {
- // Min is the regression-prone path: prior to the live-propagation fix,
- // setPoolMin only updated the poolMins map and operators got HTTP 200 with
- // no observable behavior change.
- Settings nodeSettings = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024).build();
- ClusterSettings cs = newClusterSettings(nodeSettings);
- ArrowNativeAllocator allocator = newWiredAllocator(nodeSettings, cs);
- try {
- // Pool starts at max (rebalancer disabled by default), so a min PUT below
- // the current limit is a no-op on the live limit but updates poolMins.
- // Use a min ABOVE the current limit to force the live raise path.
- cs.applySettings(
- Settings.builder()
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024)
- .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, 4L * 1024 * 1024 * 1024)
- .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN, 2L * 1024 * 1024 * 1024)
- .build()
- );
- assertEquals(
- "PUT to flight min must update the recorded min for the rebalancer",
- 2L * 1024 * 1024 * 1024,
- allocator.getPoolMin(NativeAllocatorPoolConfig.POOL_FLIGHT)
- );
- assertTrue(
- "PUT to flight min must raise the live BufferAllocator limit when min exceeds current",
- allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_FLIGHT).getLimit() >= 2L * 1024 * 1024 * 1024
- );
- } finally {
- allocator.close();
- }
- }
-
- public void testRootLimitClusterSettingPropagatesToAllocator() {
- Settings nodeSettings = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024).build();
- ClusterSettings cs = newClusterSettings(nodeSettings);
- ArrowNativeAllocator allocator = newWiredAllocator(nodeSettings, cs);
- try {
- cs.applySettings(Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 16L * 1024 * 1024 * 1024).build());
- assertEquals(
- "PUT to root limit must update the RootAllocator's limit",
- 16L * 1024 * 1024 * 1024,
- allocator.getRootAllocator().getLimit()
- );
} finally {
allocator.close();
+ plugin.close();
}
}
- public void testValidatorRejectsSumOfMinsExceedingRoot() {
- // The cross-setting grouped validator must reject PUTs that would over-
- // subscribe the root. Test the rejection path end-to-end through ClusterSettings.
- // Set node.native_memory.limit=0b explicitly so pool maxes default to Long.MAX_VALUE
- // — minmax path firing first.
+ public void testBuildAllocatorWithRebalancerPoolsStartAtMax() throws Exception {
Settings nodeSettings = Settings.builder()
- .put("node.native_memory.limit", "0b")
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 10L * 1024 * 1024 * 1024)
+ .put("node.native_memory.limit", "1gb")
+ .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN, 10L * 1024 * 1024)
+ .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, 200L * 1024 * 1024)
+ .put(NativeAllocatorPoolConfig.SETTING_INGEST_MIN, 20L * 1024 * 1024)
+ .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 200L * 1024 * 1024)
+ .put(NativeAllocatorPoolConfig.SETTING_QUERY_MIN, 10L * 1024 * 1024)
+ .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 200L * 1024 * 1024)
+ .put("native.allocator.rebalancer.enabled", true)
+ .put("native.allocator.rebalance.interval_seconds", 5L)
.build();
ClusterSettings cs = newClusterSettings(nodeSettings);
- ArrowNativeAllocator allocator = newWiredAllocator(nodeSettings, cs);
+
+ ArrowBasePlugin plugin = new ArrowBasePlugin();
+ long budget2 = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(nodeSettings).getBytes();
+ ArrowNativeAllocator allocator = plugin.buildAllocator(nodeSettings, cs, () -> budget2);
try {
- // root=10gb, flight_min=6gb, ingest_min=6gb => sum_mins=12gb > root=10gb.
- IllegalArgumentException e = expectThrows(
- IllegalArgumentException.class,
- () -> cs.applySettings(
- Settings.builder()
- .put("node.native_memory.limit", "0b")
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 10L * 1024 * 1024 * 1024)
- .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN, 6L * 1024 * 1024 * 1024)
- .put(NativeAllocatorPoolConfig.SETTING_INGEST_MIN, 6L * 1024 * 1024 * 1024)
- .build()
- )
- );
- assertTrue(
- "expected sum-exceeds-root in error, got: " + e.getMessage(),
- e.getMessage().contains("exceeds root limit") || e.getMessage().contains("Sum of pool minimums")
- );
+ // Pools always start at max regardless of rebalancer state
+ assertEquals(200L * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_FLIGHT).getLimit());
+ assertEquals(200L * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_INGEST).getLimit());
+ assertEquals(200L * 1024 * 1024, allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY).getLimit());
} finally {
allocator.close();
+ plugin.close();
}
}
- public void testChildAllocatorInheritsParentCapAfterPoolLimitUpdate() {
- // Sanity check for the AnalyticsSearchService / FlightTransport pattern:
- // when a consumer creates a child of the framework's pool with Long.MAX_VALUE
- // limit, a PUT to the pool's max takes effect on the child's allocations
- // automatically via Arrow's parent-cap check at allocateBytes — no listener needed.
- //
- // The contract we rely on (Arrow Accountant.allocate, lines 191-203 in 18.3.0):
- // when the child's reservation is exhausted, it calls parent.allocate(...) which
- // checks the parent's allocationLimit on every allocation. Setting the child's own
- // limit to Long.MAX_VALUE means the child has no own-cap on top of the parent's;
- // setLimit on the parent is observed atomically by all subsequent allocations
- // through any descendant.
- Settings nodeSettings = Settings.builder().put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024).build();
- ClusterSettings cs = newClusterSettings(nodeSettings);
- ArrowNativeAllocator allocator = newWiredAllocator(nodeSettings, cs);
- try {
- BufferAllocator queryPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY);
- BufferAllocator child = queryPool.newChildAllocator("consumer", 0, Long.MAX_VALUE);
- try {
- // Step 1: a small allocation through the child succeeds with the original pool max.
- try (var buf = child.buffer(1024)) {
- assertEquals("child accounting reflects allocation", 1024L, child.getAllocatedMemory());
- assertEquals("parent pool sees child allocation", 1024L, queryPool.getAllocatedMemory());
- }
-
- // Step 2: PUT a small pool max via cluster settings.
- cs.applySettings(
- Settings.builder()
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 8L * 1024 * 1024 * 1024)
- .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 1L * 1024 * 1024) // 1 MB
- .build()
- );
- assertEquals("pool's own limit reflects the PUT", 1L * 1024 * 1024, queryPool.getLimit());
- assertEquals("child's own limit is intentionally uncapped", Long.MAX_VALUE, child.getLimit());
-
- // Step 3: allocations within the new parent cap still work.
- try (var withinCap = child.buffer(512 * 1024)) { // 512 KB, under 1 MB cap
- assertEquals(512L * 1024, child.getAllocatedMemory());
- }
-
- // Step 4: allocation exceeding the new parent cap fails — this is the
- // behavior the deleted listener pattern was emulating, now provided
- // natively by Arrow's parent-cap check.
- expectThrows(OutOfMemoryException.class, () -> child.buffer(2L * 1024 * 1024)); // 2 MB, over 1 MB cap
- } finally {
- child.close();
- }
- } finally {
- allocator.close();
- }
+ public void testRebalanceIntervalSettingDefault() {
+ assertEquals(Long.valueOf(5L), ArrowBasePlugin.REBALANCE_INTERVAL_SETTING.get(Settings.EMPTY)); // empty settings => default value
}
}
diff --git a/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowNativeAllocatorTests.java b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowNativeAllocatorTests.java
index dc10a93bd6b74..fae60d010e5ed 100644
--- a/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowNativeAllocatorTests.java
+++ b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/ArrowNativeAllocatorTests.java
@@ -10,6 +10,7 @@
import org.apache.arrow.memory.BufferAllocator;
import org.opensearch.arrow.spi.NativeAllocator;
+import org.opensearch.arrow.spi.PoolGroup;
import org.opensearch.plugin.stats.NativeAllocatorPoolStats;
import org.opensearch.test.OpenSearchTestCase;
@@ -20,7 +21,7 @@ public class ArrowNativeAllocatorTests extends OpenSearchTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
- allocator = new ArrowNativeAllocator(1024L * 1024 * 1024); // 1 GB for tests
+ allocator = new ArrowNativeAllocator();
}
@Override
@@ -30,21 +31,21 @@ public void tearDown() throws Exception {
}
public void testCreatePool() {
- NativeAllocator.PoolHandle handle = allocator.getOrCreatePool("test-pool", 100 * 1024 * 1024);
+ NativeAllocator.PoolHandle handle = allocator.getOrCreatePool("test-pool", 0L, 100 * 1024 * 1024, null);
assertNotNull(handle);
assertEquals(100 * 1024 * 1024, handle.limit());
assertEquals(0, handle.allocatedBytes());
}
public void testGetOrCreatePoolIdempotent() {
- NativeAllocator.PoolHandle first = allocator.getOrCreatePool("idempotent", 50 * 1024 * 1024);
- NativeAllocator.PoolHandle second = allocator.getOrCreatePool("idempotent", 999 * 1024 * 1024);
+ NativeAllocator.PoolHandle first = allocator.getOrCreatePool("idempotent", 0L, 50 * 1024 * 1024, null);
+ NativeAllocator.PoolHandle second = allocator.getOrCreatePool("idempotent", 0L, 999 * 1024 * 1024, null);
assertSame(first, second);
assertEquals(50 * 1024 * 1024, second.limit());
}
public void testPoolChildAllocation() {
- allocator.getOrCreatePool("parent", 200 * 1024 * 1024);
+ allocator.getOrCreatePool("parent", 0L, 200 * 1024 * 1024, null);
BufferAllocator child = allocator.getPoolAllocator("parent").newChildAllocator("child-1", 0, 50 * 1024 * 1024);
try {
child.buffer(1024).close();
@@ -55,7 +56,7 @@ public void testPoolChildAllocation() {
}
public void testSetPoolLimit() {
- allocator.getOrCreatePool("resizable", 100 * 1024 * 1024);
+ allocator.getOrCreatePool("resizable", 0L, 100 * 1024 * 1024, null);
allocator.setPoolLimit("resizable", 200 * 1024 * 1024);
assertEquals(200 * 1024 * 1024, allocator.getPoolAllocator("resizable").getLimit());
}
@@ -68,147 +69,54 @@ public void testGetPoolAllocatorNonExistent() {
expectThrows(IllegalStateException.class, () -> allocator.getPoolAllocator("ghost"));
}
- public void testSetRootLimit() {
- allocator.setRootLimit(512 * 1024 * 1024);
- assertEquals(512 * 1024 * 1024, allocator.getRootAllocator().getLimit());
- }
-
public void testStats() {
- allocator.getOrCreatePool("stats-pool", 64 * 1024 * 1024);
+ allocator.getOrCreatePool("stats-pool", 0L, 64 * 1024 * 1024, PoolGroup.SEARCH);
NativeAllocatorPoolStats stats = allocator.stats();
assertNotNull(stats);
- assertEquals(1024 * 1024 * 1024, stats.getRootLimitBytes());
- assertEquals(0, stats.getRootAllocatedBytes());
+ assertEquals(-1, stats.getNativeAllocatedBytes());
+ assertEquals(-1, stats.getNativeResidentBytes());
assertEquals(1, stats.getPools().size());
NativeAllocatorPoolStats.PoolStats poolStats = stats.getPools().get(0);
assertEquals("stats-pool", poolStats.getName());
assertEquals(64 * 1024 * 1024, poolStats.getLimitBytes());
assertEquals(0, poolStats.getAllocatedBytes());
- // child_count is no longer rendered in stats; getPoolAllocator(...).getChildAllocators()
- // is the runtime accessor for that detail if needed.
}
public void testStatsMultiplePools() {
- allocator.getOrCreatePool("pool-a", 100 * 1024 * 1024);
- allocator.getOrCreatePool("pool-b", 200 * 1024 * 1024);
+ allocator.getOrCreatePool("pool-a", 0L, 100 * 1024 * 1024, null);
+ allocator.getOrCreatePool("pool-b", 0L, 200 * 1024 * 1024, null);
NativeAllocatorPoolStats stats = allocator.stats();
assertEquals(2, stats.getPools().size());
}
public void testGetPoolNames() {
- allocator.getOrCreatePool("alpha", 10 * 1024 * 1024);
- allocator.getOrCreatePool("beta", 20 * 1024 * 1024);
+ allocator.getOrCreatePool("alpha", 0L, 10 * 1024 * 1024, null);
+ allocator.getOrCreatePool("beta", 0L, 20 * 1024 * 1024, null);
assertTrue(allocator.getPoolNames().contains("alpha"));
assertTrue(allocator.getPoolNames().contains("beta"));
assertEquals(2, allocator.getPoolNames().size());
}
- public void testRebalanceDistributesHeadroomToAllPools() {
- allocator.setRootLimit(100 * 1024 * 1024);
- allocator.getOrCreatePool("active", 10 * 1024 * 1024, 100 * 1024 * 1024);
- allocator.getOrCreatePool("idle", 10 * 1024 * 1024, 100 * 1024 * 1024);
-
- // Simulate activity: allocate in "active" pool.
- BufferAllocator activeAlloc = allocator.getPoolAllocator("active");
- BufferAllocator child = activeAlloc.newChildAllocator("worker", 0, 100 * 1024 * 1024);
- var buf = child.buffer(5 * 1024 * 1024);
-
- try {
- allocator.rebalance();
-
- // Active pool gets bonus headroom on top of its min.
- long activeLimit = activeAlloc.getLimit();
- assertTrue("Active pool limit should exceed min after rebalance, got " + activeLimit, activeLimit > 10 * 1024 * 1024);
-
- // Idle pool also receives headroom: distributing to all pools (not just
- // currently-active ones) avoids the dead-pool corner case where a pool
- // with min = 0 starts at limit = 0 and can never make a first allocation.
- // Idle pools that don't end up needing the headroom return it on the next
- // tick once they remain at zero allocation.
- long idleLimit = allocator.getPoolAllocator("idle").getLimit();
- assertTrue("Idle pool should also receive headroom, got " + idleLimit, idleLimit > 10 * 1024 * 1024);
- } finally {
- buf.close();
- child.close();
- }
- }
-
- public void testRebalanceLetsZeroMinPoolAllocate() {
- // Regression test: under the previous "active pools only" rebalance algorithm,
- // a pool with min = 0 would start at limit = 0 (rebalancer-on path), be unable
- // to allocate, never become "active", and so never receive a bonus — permanently
- // dead. Distributing headroom across all pools fixes the chicken-and-egg.
- allocator.setRebalanceInterval(60);
- allocator.setRootLimit(100 * 1024 * 1024);
- allocator.getOrCreatePool("zero-min", 0L, 100 * 1024 * 1024);
- try {
- allocator.rebalance();
- BufferAllocator pool = allocator.getPoolAllocator("zero-min");
- assertTrue("Zero-min pool should receive headroom, got " + pool.getLimit(), pool.getLimit() > 0);
- } finally {
- allocator.setRebalanceInterval(0);
- }
- }
-
- public void testRebalanceNeverDropsBelowCurrentAllocation() {
- allocator.setRootLimit(50 * 1024 * 1024);
- allocator.getOrCreatePool("busy", 10 * 1024 * 1024);
-
- BufferAllocator pool = allocator.getPoolAllocator("busy");
- BufferAllocator child = pool.newChildAllocator("w", 0, 10 * 1024 * 1024);
- var buf = child.buffer(8 * 1024 * 1024); // 8 MB allocated
-
- try {
- allocator.rebalance();
- assertTrue("Limit should never drop below current allocation", pool.getLimit() >= pool.getAllocatedMemory());
- } finally {
- buf.close();
- child.close();
- }
- }
-
- public void testRebalanceWithNoPools() {
- // Should not throw
- allocator.rebalance();
- }
-
- public void testInitialLimitIsMaxWhenRebalancerDisabled() {
- // Default tearDown allocator has rebalancer disabled (interval=0).
- NativeAllocator.PoolHandle handle = allocator.getOrCreatePool("burst", 10 * 1024 * 1024, 100 * 1024 * 1024);
- // With the rebalancer off, pools must start at their max so consumers can allocate
- // immediately. Otherwise default-configured pools (min=0) would reject everything.
+ public void testInitialLimitIsMax() {
+ NativeAllocator.PoolHandle handle = allocator.getOrCreatePool("burst", 10 * 1024 * 1024, 100 * 1024 * 1024, null);
assertEquals(100 * 1024 * 1024, handle.limit());
}
- public void testInitialLimitIsMinWhenRebalancerEnabled() {
- // Enabling the rebalancer reverts to the original "guarantee + burst" semantics:
- // pools start at min and grow via the next rebalance tick.
- allocator.setRebalanceInterval(60); // any positive value enables the flag
- NativeAllocator.PoolHandle handle = allocator.getOrCreatePool("guaranteed", 10 * 1024 * 1024, 100 * 1024 * 1024);
- assertEquals(10 * 1024 * 1024, handle.limit());
- // Disable so subsequent tests aren't affected by the scheduled task.
- allocator.setRebalanceInterval(0);
- }
-
public void testCloseReleasesAllPools() {
- allocator.getOrCreatePool("close-test", 10 * 1024 * 1024);
+ allocator.getOrCreatePool("close-test", 0L, 10 * 1024 * 1024, null);
allocator.close();
assertTrue(allocator.getPoolNames().isEmpty());
// Recreate for tearDown
- allocator = new ArrowNativeAllocator(1024L * 1024 * 1024);
+ allocator = new ArrowNativeAllocator();
}
- public void testSetPoolMinRaisesLiveLimitWhenRebalancerOff() {
- // setPoolMin must affect the live BufferAllocator immediately, not just the
- // poolMins map. Otherwise it's a Dynamic setting that returns HTTP 200 and
- // does nothing observable until the operator also enables the rebalancer.
- allocator.setRootLimit(100 * 1024 * 1024);
- allocator.getOrCreatePool("p", 0L, 100 * 1024 * 1024);
+ public void testSetPoolMinRaisesLiveLimitWhenNeeded() {
+ allocator.getOrCreatePool("p", 0L, 100 * 1024 * 1024, null);
BufferAllocator pool = allocator.getPoolAllocator("p");
long startLimit = pool.getLimit();
@@ -222,11 +130,7 @@ public void testSetPoolMinRaisesLiveLimitWhenRebalancerOff() {
}
public void testSetPoolMinDoesNotShrinkLiveLimit() {
- // Dropping the min must not shrink an in-flight pool — the rebalancer is the
- // only path that reduces limits, so a min change on its own should never
- // reclaim capacity.
- allocator.setRootLimit(100 * 1024 * 1024);
- allocator.getOrCreatePool("p", 0L, 100 * 1024 * 1024);
+ allocator.getOrCreatePool("p", 0L, 100 * 1024 * 1024, null);
BufferAllocator pool = allocator.getPoolAllocator("p");
long startLimit = pool.getLimit();
diff --git a/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/NativeMemoryRebalancerTests.java b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/NativeMemoryRebalancerTests.java
new file mode 100644
index 0000000000000..b92a477a67ea3
--- /dev/null
+++ b/plugins/arrow-base/src/test/java/org/opensearch/arrow/allocator/NativeMemoryRebalancerTests.java
@@ -0,0 +1,152 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.arrow.allocator;
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.opensearch.test.OpenSearchTestCase;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public class NativeMemoryRebalancerTests extends OpenSearchTestCase {
+
+ private static final long MB = 1024L * 1024;
+ private static final long BUDGET = 100 * MB;
+
+ private ArrowNativeAllocator allocator;
+ private NativeMemoryRebalancer rebalancer;
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ allocator = new ArrowNativeAllocator();
+ allocator.setBudget(BUDGET);
+ rebalancer = new NativeMemoryRebalancer(allocator, () -> BUDGET, 0.75, 0.50, 0.10);
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ allocator.close();
+ super.tearDown();
+ }
+
+ public void testPoolsStartAtMax() {
+ allocator.getOrCreatePool("a", 5 * MB, 40 * MB, null);
+ allocator.getOrCreatePool("b", 10 * MB, 50 * MB, null);
+
+ assertEquals(40 * MB, allocator.getPoolAllocator("a").getLimit());
+ assertEquals(50 * MB, allocator.getPoolAllocator("b").getLimit());
+ }
+
+ public void testShrinksIdlePool() {
+ allocator.getOrCreatePool("idle", 5 * MB, 50 * MB, null);
+ allocator.getOrCreatePool("pressured", 5 * MB, 50 * MB, null);
+
+ BufferAllocator pressuredPool = allocator.getPoolAllocator("pressured");
+ ArrowBuf buf = pressuredPool.buffer((long) (50 * MB * 0.8));
+
+ try {
+ long idleLimitBefore = allocator.getPoolAllocator("idle").getLimit();
+ rebalancer.rebalance();
+ long idleLimitAfter = allocator.getPoolAllocator("idle").getLimit();
+ assertTrue("Idle pool should shrink, was " + idleLimitBefore + " now " + idleLimitAfter, idleLimitAfter < idleLimitBefore);
+ } finally {
+ buf.close();
+ }
+ }
+
+ public void testGrowsPressuredPoolAboveMax() {
+ allocator.getOrCreatePool("idle", 5 * MB, 50 * MB, null);
+ allocator.getOrCreatePool("pressured", 5 * MB, 20 * MB, null);
+
+ BufferAllocator pressuredPool = allocator.getPoolAllocator("pressured");
+ ArrowBuf buf = pressuredPool.buffer((long) (20 * MB * 0.8));
+
+ try {
+ rebalancer.rebalance();
+ long pressuredLimit = pressuredPool.getLimit();
+ assertTrue("Pressured pool should grow above max (20MB), got " + pressuredLimit, pressuredLimit > 20 * MB);
+ } finally {
+ buf.close();
+ }
+ }
+
+ public void testNeverDropsBelowMin() {
+ allocator.getOrCreatePool("floored", 10 * MB, 50 * MB, null);
+ allocator.getOrCreatePool("pressured", 5 * MB, 50 * MB, null);
+
+ BufferAllocator pressuredPool = allocator.getPoolAllocator("pressured");
+ ArrowBuf buf = pressuredPool.buffer((long) (50 * MB * 0.8));
+
+ try {
+ for (int i = 0; i < 20; i++) {
+ rebalancer.rebalance();
+ }
+ long flooredLimit = allocator.getPoolAllocator("floored").getLimit();
+ assertTrue("Pool limit (" + flooredLimit + ") should not drop below min (10MB)", flooredLimit >= 10 * MB);
+ } finally {
+ buf.close();
+ }
+ }
+
+ public void testNoActionWhenNoPressure() {
+ allocator.getOrCreatePool("a", 5 * MB, 50 * MB, null);
+ allocator.getOrCreatePool("b", 5 * MB, 50 * MB, null);
+
+ long limitA = allocator.getPoolAllocator("a").getLimit();
+ long limitB = allocator.getPoolAllocator("b").getLimit();
+
+ rebalancer.rebalance();
+
+ assertEquals(limitA, allocator.getPoolAllocator("a").getLimit());
+ assertEquals(limitB, allocator.getPoolAllocator("b").getLimit());
+ }
+
+ public void testResetAllPoolsToMax() {
+     allocator.getOrCreatePool("a", 5 * MB, 40 * MB, null);
+     allocator.getOrCreatePool("b", 5 * MB, 50 * MB, null);
+
+     BufferAllocator bPool = allocator.getPoolAllocator("b");
+     // Put "b" under load for one rebalance pass; try-with-resources releases the buffer even if rebalance() throws.
+     try (ArrowBuf buf = bPool.buffer((long) (50 * MB * 0.8))) {
+         rebalancer.rebalance();
+     }
+     allocator.resetAllPoolsToMax();
+
+     assertEquals(40 * MB, allocator.getPoolAllocator("a").getLimit());
+     assertEquals(50 * MB, allocator.getPoolAllocator("b").getLimit());
+ }
+
+ public void testSumLimitsNeverExceedsBudget() {
+     allocator.getOrCreatePool("p1", 5 * MB, 30 * MB, null);
+     allocator.getOrCreatePool("p2", 5 * MB, 30 * MB, null);
+     allocator.getOrCreatePool("p3", 5 * MB, 30 * MB, null);
+
+     List<ArrowBuf> bufs = new ArrayList<>();
+     try {
+         // Allocate inside the try so buffers already obtained are released if a later allocation throws.
+         for (String name : new String[] { "p1", "p2", "p3" }) {
+             BufferAllocator pool = allocator.getPoolAllocator(name);
+             bufs.add(pool.buffer((long) (pool.getLimit() * 0.8)));
+         }
+         for (int i = 0; i < 10; i++) {
+             rebalancer.rebalance();
+         }
+
+         long sumLimits = 0;
+         for (String name : new String[] { "p1", "p2", "p3" }) {
+             sumLimits += allocator.getPoolAllocator(name).getLimit();
+         }
+         assertTrue("Sum of limits (" + sumLimits + ") should not exceed budget (" + BUDGET + ")", sumLimits <= BUDGET);
+     } finally {
+         bufs.forEach(ArrowBuf::close);
+     }
+ }
+}
diff --git a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/BackpressureProducerIT.java b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/BackpressureProducerIT.java
index ff7f1359df089..a32a0ffe8bc58 100644
--- a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/BackpressureProducerIT.java
+++ b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/BackpressureProducerIT.java
@@ -122,9 +122,10 @@ protected Settings nodeSettings(int nodeOrdinal) {
return Settings.builder()
.put(super.nodeSettings(nodeOrdinal))
.put("node.native_memory.limit", "512mb")
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, 256 * MB)
.put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, FLIGHT_POOL_CAP_BYTES)
+ .put(NativeAllocatorPoolConfig.SETTING_INGEST_MIN, 8 * MB)
.put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 16 * MB)
+ .put(NativeAllocatorPoolConfig.SETTING_QUERY_MIN, 8 * MB)
.put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 16 * MB)
.put(ServerConfig.FLIGHT_OUTBOUND_BUFFER_THRESHOLD.getKey(), new ByteSizeValue(GRPC_THRESHOLD_BYTES, ByteSizeUnit.BYTES))
.build();
diff --git a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeAllocatorBoundaryIT.java b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeAllocatorBoundaryIT.java
index 5633cfa429c5d..d849458c0b048 100644
--- a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeAllocatorBoundaryIT.java
+++ b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeAllocatorBoundaryIT.java
@@ -12,7 +12,6 @@
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.OutOfMemoryException;
-import org.opensearch.action.admin.cluster.settings.ClusterUpdateSettingsResponse;
import org.opensearch.arrow.allocator.ArrowBasePlugin;
import org.opensearch.arrow.allocator.ArrowNativeAllocator;
import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
@@ -31,27 +30,17 @@
*
* Boots a single-node cluster with tight memory settings, then exercises
* the actual Arrow allocation path to verify that the framework's
- * configured caps are enforced at allocation time (not just at config-parse
- * time). Complements unit-level tests in {@code ArrowBasePluginTests} by
- * verifying that the production wiring (Guice -> ArrowNativeAllocator ->
- * Arrow's RootAllocator chain) honors the caps end-to-end.
- *
- * <p>Each test sets explicit byte limits and allocates real
- * {@link org.apache.arrow.memory.ArrowBuf} buffers, asserting either
- * successful allocation or {@link OutOfMemoryException} based on whether
- * the request fits within the configured cap.
+ * configured caps are enforced at allocation time.
*/
@ThreadLeakScope(ThreadLeakScope.Scope.NONE)
@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, minNumDataNodes = 1, maxNumDataNodes = 1)
public class NativeAllocatorBoundaryIT extends OpenSearchIntegTestCase {
- /** 1 MiB. Chosen small enough that tests run fast but large enough that
- * Arrow's internal accounting doesn't round it away. */
+ /** 1 MiB. */
private static final long MB = 1024L * 1024;
- /** Cap large enough for the framework's own bookkeeping but small enough
- * to trigger OOM well before exhausting host memory. */
- private static final long ROOT_CAP_BYTES = 16 * MB;
+ /** Per-pool cap for tests. */
+ private static final long POOL_CAP_BYTES = 16 * MB;
@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
@@ -60,127 +49,83 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
@Override
protected Settings nodeSettings(int nodeOrdinal) {
- // Set node.native_memory.limit explicitly so framework defaults derive
- // from a known value rather than the (machine-dependent) ram-heap default.
- // ROOT_LIMIT and pool maxes are then overridden per-test via cluster
- // settings PUT or directly via this node-settings layer.
return Settings.builder()
.put(super.nodeSettings(nodeOrdinal))
.put("node.native_memory.limit", "256mb")
- // Tight root cap: 16 MiB total Arrow framework budget.
- .put(NativeAllocatorPoolConfig.SETTING_ROOT_LIMIT, ROOT_CAP_BYTES)
- // Per-pool maxes set generously so per-pool caps don't trip
- // before root.limit. Tests targeting per-pool caps override below.
- .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, ROOT_CAP_BYTES)
- .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, ROOT_CAP_BYTES)
- .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, ROOT_CAP_BYTES)
+ .put("native.allocator.rebalancer.enabled", false)
+ // Per-pool maxes set to a known value for testing.
+ .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, POOL_CAP_BYTES)
+ .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, POOL_CAP_BYTES)
+ .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, POOL_CAP_BYTES)
.build();
}
/**
- * Verifies that {@code parquet.native.pool.query.max} caps allocations
- * through the QUERY pool: a buffer request exceeding the per-pool cap
- * throws {@link OutOfMemoryException} even when root has headroom.
+ * Verifies that per-pool max caps allocations through the QUERY pool.
*/
public void testPoolMaxRejectsAllocationsBeyondCap() {
- // Tighten QUERY pool to 4 MiB while leaving root at 16 MiB.
- long poolCap = 4 * MB;
- ClusterUpdateSettingsResponse resp = client().admin()
- .cluster()
- .prepareUpdateSettings()
- .setTransientSettings(Settings.builder().put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, poolCap))
- .get();
- assertTrue("PUT to query.max must succeed", resp.isAcknowledged());
-
ArrowNativeAllocator allocator = internalCluster().getInstance(ArrowNativeAllocator.class);
BufferAllocator queryPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY);
- assertThat("pool's live limit reflects the PUT", queryPool.getLimit(), is(poolCap));
+ assertThat("pool's live limit reflects the configured max", queryPool.getLimit(), is(POOL_CAP_BYTES));
// Sub-cap allocation succeeds.
try (var withinCap = queryPool.buffer(2 * MB)) {
assertThat(queryPool.getAllocatedMemory(), greaterThanOrEqualTo(2 * MB));
}
- // Cap+1 allocation fails — Arrow's parent-cap check at allocateBytes
- // walks queryPool's allocationLimit and rejects.
- expectThrows(OutOfMemoryException.class, () -> queryPool.buffer(8 * MB));
+ // An allocation 1 MiB larger than the pool cap fails.
+ expectThrows(OutOfMemoryException.class, () -> queryPool.buffer(POOL_CAP_BYTES + MB));
}
/**
- * Verifies that {@code native.allocator.root.limit} caps allocations
- * across all pools combined: when the sum of in-flight pool allocations
- * approaches the root cap, the next allocation is rejected at the root
- * level even if each individual pool's max would allow it.
+ * Verifies that pools are capped independently: memory held in the FLIGHT pool does not
+ * reduce QUERY's headroom, and a request larger than QUERY's own cap is still rejected.
*/
- public void testRootLimitRejectsAllocationsBeyondCap() {
+ public void testPoolLimitRejectsAllocationsBeyondCap() {
ArrowNativeAllocator allocator = internalCluster().getInstance(ArrowNativeAllocator.class);
BufferAllocator flightPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_FLIGHT);
BufferAllocator queryPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY);
- BufferAllocator root = allocator.getRootAllocator();
-
- // Sanity-check setup: confirm the live limits match nodeSettings.
- // If these fail, the test setup is wrong and the body's expectations are
- // meaningless — surface the misconfiguration instead of misleading failures below.
- assertThat("root.limit must match nodeSettings", root.getLimit(), is(ROOT_CAP_BYTES));
- assertThat("flight.max must match nodeSettings", flightPool.getLimit(), is(ROOT_CAP_BYTES));
- assertThat("query.max must match nodeSettings", queryPool.getLimit(), is(ROOT_CAP_BYTES));
-
- // Hold 8 MiB through the FLIGHT pool. With root at 16 MiB this leaves 8 MiB
- // headroom across the root. (Power-of-2 sizes avoid Arrow's chunked-allocation
- // rounding surprises; e.g. a 12 MiB request actually consumes 16 MiB.)
+
+ assertThat("flight.max must match nodeSettings", flightPool.getLimit(), is(POOL_CAP_BYTES));
+ assertThat("query.max must match nodeSettings", queryPool.getLimit(), is(POOL_CAP_BYTES));
+
+ // Hold 8 MiB through the FLIGHT pool.
try (var flightHold = flightPool.buffer(8 * MB)) {
assertThat("FLIGHT pool reflects 8MB allocation", flightPool.getAllocatedMemory(), is(8L * MB));
- assertThat("root reflects 8MB allocation", root.getAllocatedMemory(), is(8L * MB));
- // A 4 MiB allocation through QUERY succeeds (within remaining root headroom).
+ // A 4 MiB allocation through QUERY succeeds (within its own pool cap).
try (var queryFit = queryPool.buffer(4 * MB)) {
- assertThat(allocator.getRootAllocator().getAllocatedMemory(), is(12L * MB));
+ assertThat(queryPool.getAllocatedMemory(), greaterThanOrEqualTo(4 * MB));
}
- // An 8 MiB allocation through QUERY would push the root past the 16 MiB cap
- // (8 MiB FLIGHT + 8 MiB QUERY). Arrow's parent-cap check at allocateBytes
- // walks queryPool -> root and rejects with OOM, even though QUERY's own
- // (16 MiB) max would individually allow it.
- expectThrows(OutOfMemoryException.class, () -> queryPool.buffer(16 * MB));
+ // An allocation exceeding the QUERY pool's own cap fails.
+ expectThrows(OutOfMemoryException.class, () -> queryPool.buffer(POOL_CAP_BYTES + MB));
}
}
/**
- * Verifies that a dynamic PUT to a pool's max takes effect on
- * subsequent allocations through descendants of that pool. This is the
- * behavior the deleted {@code NativeAllocatorListener} SPI was emulating;
- * it is now provided natively by Arrow's parent-cap check at allocateBytes.
+ * Verifies that setPoolLimit dynamically adjusts the pool cap and
+ * subsequent allocations respect the new limit.
*/
- public void testDynamicPoolResizeAffectsInFlightAllocations() {
+ public void testSetPoolLimitAffectsInFlightAllocations() {
ArrowNativeAllocator allocator = internalCluster().getInstance(ArrowNativeAllocator.class);
BufferAllocator queryPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_QUERY);
- // Step 1: create a child allocator at Long.MAX_VALUE — the AnalyticsSearchService /
- // DefaultPlanExecutor pattern. The child intentionally has no own-cap; it relies
- // on the parent pool's allocationLimit at allocation time.
try (BufferAllocator child = queryPool.newChildAllocator("boundary-it-child", 0, Long.MAX_VALUE)) {
- // Step 2: a small buffer through the child succeeds with the initial pool max.
+ // A small buffer through the child succeeds with the initial pool max.
try (var buf = child.buffer(2 * MB)) {
assertThat(child.getAllocatedMemory(), greaterThanOrEqualTo(2 * MB));
}
- // Step 3: PUT a tighter pool max via cluster settings.
+ // Programmatically tighten the pool limit.
long newPoolCap = 1 * MB;
- ClusterUpdateSettingsResponse resp = client().admin()
- .cluster()
- .prepareUpdateSettings()
- .setTransientSettings(Settings.builder().put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, newPoolCap))
- .get();
- assertTrue("PUT to query.max must succeed", resp.isAcknowledged());
- assertThat("pool's own limit reflects the PUT", queryPool.getLimit(), is(newPoolCap));
- assertThat("child's own limit is intentionally uncapped", child.getLimit(), is(Long.MAX_VALUE));
-
- // Step 4: an allocation that fit before the resize now exceeds the parent cap.
- // Arrow's parent-cap check at allocateBytes walks queryPool.allocationLimit
- // and rejects — no listener machinery needed.
+ allocator.setPoolLimit(NativeAllocatorPoolConfig.POOL_QUERY, newPoolCap);
+ assertThat("pool's own limit reflects the update", queryPool.getLimit(), is(newPoolCap));
+
+ // An allocation that fit before the resize now exceeds the parent cap.
expectThrows(OutOfMemoryException.class, () -> child.buffer(2 * MB));
- // Step 5: an allocation under the new cap still succeeds.
+ // An allocation under the new cap still succeeds.
try (var smallBuf = child.buffer(512 * 1024)) {
assertThat(child.getAllocatedMemory(), greaterThanOrEqualTo(512L * 1024));
}
diff --git a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeMemoryRebalancerIT.java b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeMemoryRebalancerIT.java
new file mode 100644
index 0000000000000..e28e373fe69a9
--- /dev/null
+++ b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/NativeMemoryRebalancerIT.java
@@ -0,0 +1,100 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.arrow.flight;
+
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
+
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.opensearch.arrow.allocator.ArrowBasePlugin;
+import org.opensearch.arrow.allocator.ArrowNativeAllocator;
+import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.plugins.Plugin;
+import org.opensearch.test.OpenSearchIntegTestCase;
+
+import java.util.Collection;
+import java.util.List;
+
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
+
+/**
+ * Integration test for the NativeMemoryRebalancer.
+ *
+ * Boots a single-node cluster with the rebalancer enabled and verifies that
+ * pools start at their max and the rebalancer shrinks idle pools / grows pressured ones.
+ */
+@ThreadLeakScope(ThreadLeakScope.Scope.NONE)
+@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, minNumDataNodes = 1, maxNumDataNodes = 1)
+public class NativeMemoryRebalancerIT extends OpenSearchIntegTestCase {
+
+ private static final long MB = 1024L * 1024;
+
+ @Override
+ protected Collection<Class<? extends Plugin>> nodePlugins() {
+     return List.of(ArrowBasePlugin.class); // the only plugin this test installs on its nodes
+ }
+
+ @Override
+ protected Settings nodeSettings(int nodeOrdinal) {
+ return Settings.builder()
+ .put(super.nodeSettings(nodeOrdinal))
+ .put("native.allocator.rebalancer.enabled", true)
+ .put("native.allocator.rebalance.interval_seconds", 1)
+ .put("node.native_memory.limit", "1gb")
+ .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MIN, 5 * MB)
+ .put(NativeAllocatorPoolConfig.SETTING_FLIGHT_MAX, 200 * MB)
+ .put(NativeAllocatorPoolConfig.SETTING_INGEST_MIN, 5 * MB)
+ .put(NativeAllocatorPoolConfig.SETTING_INGEST_MAX, 200 * MB)
+ .put(NativeAllocatorPoolConfig.SETTING_QUERY_MIN, 5 * MB)
+ .put(NativeAllocatorPoolConfig.SETTING_QUERY_MAX, 200 * MB)
+ .build();
+ }
+
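+ /**
+ * Verifies that the ingest pool's configured max is 200 MB and that its live limit lies within [min, max].
+ * The rebalancer runs every second in this test, so the limit may already have shrunk below max; equality with max is not asserted.
+ */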
+ public void testPoolsStartAtMax() {
+ ArrowNativeAllocator allocator = internalCluster().getInstance(ArrowNativeAllocator.class);
+ BufferAllocator ingestPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_INGEST);
+
+ // The rebalancer may have already run (1s interval), so the pool may have shrunk.
+ // Verify it's at least at min and the configured max is correct.
+ long max = allocator.getPoolMax(NativeAllocatorPoolConfig.POOL_INGEST);
+ assertEquals("Ingest pool max should be configured at 200MB", 200 * MB, max);
+ // Pool limit should be between min and max (rebalancer may have shrunk it)
+ long limit = ingestPool.getLimit();
+ assertThat("Ingest pool limit should be >= min", limit, org.hamcrest.Matchers.greaterThanOrEqualTo(5 * MB));
+ assertThat("Ingest pool limit should be <= max", limit, lessThanOrEqualTo(200 * MB));
+ }
+
+ /**
+ * Verifies that an idle pool shrinks after rebalancer ticks when another pool is pressured.
+ */
+ public void testIdlePoolShrinksWhenOtherPressured() throws Exception {
+ ArrowNativeAllocator allocator = internalCluster().getInstance(ArrowNativeAllocator.class);
+ BufferAllocator ingestPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_INGEST);
+ BufferAllocator flightPool = allocator.getPoolAllocator(NativeAllocatorPoolConfig.POOL_FLIGHT);
+
+ // Allocate > 75% of ingest pool to create pressure
+ long toAllocate = (long) (ingestPool.getLimit() * 0.8);
+ ArrowBuf buf = ingestPool.buffer(toAllocate);
+
+ try {
+ // Flight pool is idle — wait for rebalancer to shrink it
+ assertBusy(() -> {
+ long flightLimit = flightPool.getLimit();
+ assertThat("Flight pool should shrink when idle", flightLimit, org.hamcrest.Matchers.lessThan(200 * MB));
+ });
+ } finally {
+ buf.close();
+ }
+ }
+}
diff --git a/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/UnifiedNativeMemoryStatsIT.java b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/UnifiedNativeMemoryStatsIT.java
new file mode 100644
index 0000000000000..fd0e165103e86
--- /dev/null
+++ b/plugins/arrow-flight-rpc/src/internalClusterTest/java/org/opensearch/arrow/flight/UnifiedNativeMemoryStatsIT.java
@@ -0,0 +1,95 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.arrow.flight;
+
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
+
+import org.opensearch.action.admin.cluster.node.stats.NodesStatsRequest;
+import org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse;
+import org.opensearch.arrow.allocator.ArrowBasePlugin;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.plugin.stats.NativeAllocatorPoolStats;
+import org.opensearch.plugins.Plugin;
+import org.opensearch.test.OpenSearchIntegTestCase;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
+import static org.hamcrest.Matchers.hasItems;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.notNullValue;
+
+/**
+ * Integration test verifying the unified native memory stats endpoint.
+ * Boots a single-node cluster with ArrowBasePlugin and confirms that
+ * all registered pools (Arrow + virtual) appear in _nodes/stats/native_memory.
+ */
+@ThreadLeakScope(ThreadLeakScope.Scope.NONE)
+@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, minNumDataNodes = 1, maxNumDataNodes = 1)
+public class UnifiedNativeMemoryStatsIT extends OpenSearchIntegTestCase {
+
+ @Override
+ protected Collection<Class<? extends Plugin>> nodePlugins() {
+     return List.of(ArrowBasePlugin.class); // the only plugin this test installs on its nodes
+ }
+
+ @Override
+ protected Settings nodeSettings(int nodeOrdinal) {
+ return Settings.builder().put(super.nodeSettings(nodeOrdinal)).put("node.native_memory.limit", "1gb").build();
+ }
+
+ /**
+ * Verifies that the Arrow pools (flight, ingest, query) are visible in
+ * _nodes/stats/native_memory with correct structure.
+ */
+ public void testArrowPoolsVisibleInStats() {
+     NodesStatsResponse response = client().admin()
+         .cluster()
+         .prepareNodesStats()
+         .addMetric(NodesStatsRequest.Metric.NATIVE_MEMORY.metricName())
+         .get();
+
+     assertThat(response.getNodes().isEmpty(), is(false));
+     NativeAllocatorPoolStats stats = response.getNodes().get(0).getNativeAllocatorStats();
+     assertThat("native_memory stats should be present", stats, notNullValue());
+
+     // Dump the stats for debugging
+     StringBuilder sb = new StringBuilder();
+     sb.append("nativeAllocated=").append(stats.getNativeAllocatedBytes());
+     sb.append(", nativeResident=").append(stats.getNativeResidentBytes());
+     sb.append(", pools=[");
+     for (NativeAllocatorPoolStats.PoolStats p : stats.getPools()) {
+         sb.append(p.getName())
+             .append("(alloc=")
+             .append(p.getAllocatedBytes())
+             .append(",peak=")
+             .append(p.getPeakBytes())
+             .append(",limit=")
+             .append(p.getLimitBytes())
+             .append(") ");
+     }
+     sb.append("]");
+     logger.info("=== NATIVE_MEMORY STATS: {} ===", sb);
+
+     // All Arrow pools should be present (typed Set instead of a raw type)
+     Set<String> poolNames = stats.getPools().stream().map(NativeAllocatorPoolStats.PoolStats::getName).collect(Collectors.toSet());
+     assertThat(poolNames, hasItems("flight", "ingest", "query"));
+
+     // Each pool should have limit > 0 (derived from 1gb native_memory.limit)
+     for (NativeAllocatorPoolStats.PoolStats pool : stats.getPools()) {
+         assertThat("Pool '" + pool.getName() + "' should have limit > 0", pool.getLimitBytes(), greaterThan(0L));
+         assertThat("Pool '" + pool.getName() + "' allocated should be >= 0", pool.getAllocatedBytes(), greaterThanOrEqualTo(0L));
+     }
+ }
+
+}
diff --git a/plugins/arrow-flight-rpc/src/test/java/org/opensearch/arrow/flight/transport/FlightTransportTestBase.java b/plugins/arrow-flight-rpc/src/test/java/org/opensearch/arrow/flight/transport/FlightTransportTestBase.java
index f5eacc771be92..add8e0a2bbfee 100644
--- a/plugins/arrow-flight-rpc/src/test/java/org/opensearch/arrow/flight/transport/FlightTransportTestBase.java
+++ b/plugins/arrow-flight-rpc/src/test/java/org/opensearch/arrow/flight/transport/FlightTransportTestBase.java
@@ -91,8 +91,8 @@ public void setUp() throws Exception {
// FlightTransport sources its allocator from the framework's FLIGHT pool. Construct one
// here so the test has a usable allocator; tearDown closes it.
- nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE);
- nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_FLIGHT, 0L, Long.MAX_VALUE);
+ nativeAllocator = new ArrowNativeAllocator();
+ nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_FLIGHT, 0L, Long.MAX_VALUE, null);
flightTransport = new FlightTransport(
settings,
diff --git a/sandbox/libs/dataformat-native/rust/common/src/lib.rs b/sandbox/libs/dataformat-native/rust/common/src/lib.rs
index 0f4b8c132407f..c44fa871c4fb3 100644
--- a/sandbox/libs/dataformat-native/rust/common/src/lib.rs
+++ b/sandbox/libs/dataformat-native/rust/common/src/lib.rs
@@ -11,6 +11,7 @@
pub mod error;
pub mod logger;
pub mod allocator;
+pub mod memory_pool;
// Re-export the proc macro so plugins use `#[native_bridge_common::ffm_safe]`
pub use native_bridge_macros::ffm_safe;
diff --git a/sandbox/libs/dataformat-native/rust/common/src/memory_pool.rs b/sandbox/libs/dataformat-native/rust/common/src/memory_pool.rs
new file mode 100644
index 0000000000000..7fc5c7c83a20c
--- /dev/null
+++ b/sandbox/libs/dataformat-native/rust/common/src/memory_pool.rs
@@ -0,0 +1,370 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Memory pool for tracking native memory usage across write and merge operations.
+//!
+//! Provides an atomic counter with a configurable limit. Operations that allocate
+//! significant memory call `try_grow` before allocating and `shrink` after freeing.
+//! The pool rejects allocations that would exceed the configured limit.
+//!
+//! `MemoryReservation` is an RAII handle that automatically returns memory to the
+//! pool on drop, preventing leaks even on error paths.
+
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::{Arc, Condvar, Mutex};
+use std::time::Duration;
+use std::fmt;
+
+/// Default timeout for blocking wait (300 seconds).
+pub const DEFAULT_WAIT_TIMEOUT: Duration = Duration::from_secs(300);
+
+/// Merge operations can wait longer (600 seconds).
+pub const MERGE_WAIT_TIMEOUT: Duration = Duration::from_secs(600);
+
+/// Controls whether an allocation blocks or rejects immediately.
+///
+/// Consumed by `MemoryReservation::request`, which calls
+/// `MemoryPool::wait_and_grow` for `Wait` and `MemoryPool::try_grow` for `Reject`.
+#[derive(Debug, Clone)]
+pub enum PoolBehavior {
+ /// Block until memory is available, up to the given timeout.
+ Wait(Duration),
+ /// Fail immediately if pool is full.
+ Reject,
+}
+
+/// Error returned when a pool cannot satisfy an allocation request.
+#[derive(Debug, Clone)]
+pub struct PoolExhausted {
+ /// Name of the pool that rejected the request.
+ pub pool_name: &'static str,
+ /// Number of bytes the caller asked for.
+ pub requested: usize,
+ /// Pool usage in bytes, read after the rejection (so it may already differ
+ /// from the value that caused it).
+ pub used: usize,
+ /// Limit in bytes the request was checked against (`0` = unlimited).
+ pub limit: usize,
+}
+
+impl fmt::Display for PoolExhausted {
+    /// Writes a one-line, human-readable description of the rejected request.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self { pool_name, requested, used, limit } = self;
+        write!(
+            f,
+            "[{}] memory limit exceeded: requested {} bytes, used {}, limit {}",
+            pool_name, requested, used, limit
+        )
+    }
+}
+
+impl std::error::Error for PoolExhausted {}
+
+/// Error returned when wait_and_grow times out.
+#[derive(Debug, Clone)]
+pub struct PoolTimeout {
+ /// Name of the pool the caller was waiting on.
+ pub pool_name: &'static str,
+ /// Number of bytes the caller asked for.
+ pub requested: usize,
+ /// Pool usage in bytes, read at the moment the wait was abandoned.
+ pub used: usize,
+ /// Pool limit in bytes, read at the moment the wait was abandoned.
+ pub limit: usize,
+ /// Time elapsed between the start of the wait and giving up.
+ pub waited: Duration,
+}
+
+impl fmt::Display for PoolTimeout {
+    /// Writes a one-line, human-readable description of the timed-out wait.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let Self { pool_name, requested, used, limit, waited } = self;
+        write!(
+            f,
+            "[{}] timed out waiting for {} bytes after {:?} (used: {}, limit: {})",
+            pool_name, requested, waited, used, limit
+        )
+    }
+}
+
+impl std::error::Error for PoolTimeout {}
+
+/// A node-level memory pool backed by an atomic counter with blocking wait support.
+pub struct MemoryPool {
+ /// Pool name; copied into `PoolExhausted` / `PoolTimeout` errors.
+ name: &'static str,
+ /// Bytes currently reserved from the pool.
+ used: AtomicUsize,
+ /// Maximum bytes that may be reserved; `0` means unlimited.
+ limit: AtomicUsize,
+ /// Highest value `used` has reached through `try_grow` / `grow`.
+ peak: AtomicUsize,
+ /// Signalled by `shrink` and `set_limit`; waited on by `wait_and_grow`.
+ notify: Condvar,
+ /// Mutex required by `notify`; it guards no data.
+ notify_lock: Mutex<()>,
+}
+
+impl fmt::Debug for MemoryPool {
+    /// Formats the pool name plus a snapshot of the `used`, `limit` and `peak`
+    /// counters. The counters are read independently, so the snapshot is not
+    /// guaranteed to be mutually consistent under concurrent updates.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let used = self.used.load(Ordering::Relaxed);
+        let limit = self.limit.load(Ordering::Relaxed);
+        let peak = self.peak.load(Ordering::Relaxed);
+
+        let mut out = f.debug_struct("MemoryPool");
+        out.field("name", &self.name);
+        out.field("used", &used);
+        out.field("limit", &limit);
+        out.field("peak", &peak);
+        out.finish()
+    }
+}
+
+impl MemoryPool {
+    /// Create a new pool. `limit = 0` means unlimited.
+    pub fn new(name: &'static str, limit: usize) -> Self {
+        Self {
+            name,
+            used: AtomicUsize::new(0),
+            limit: AtomicUsize::new(limit),
+            peak: AtomicUsize::new(0),
+            notify: Condvar::new(),
+            notify_lock: Mutex::new(()),
+        }
+    }
+
+    /// Lock the mutex paired with `notify`, recovering from poisoning.
+    ///
+    /// The mutex guards no data (`()`), so a panic on another thread while it
+    /// was held cannot leave anything inconsistent. Propagating the poison would
+    /// only turn one panic into a panic in every later waiter or releaser.
+    fn lock_notify(&self) -> std::sync::MutexGuard<'_, ()> {
+        self.notify_lock
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner())
+    }
+
+    /// Wake every thread blocked in `wait_and_grow`.
+    ///
+    /// Must be called *after* the state change (usage or limit) it announces.
+    /// The notification is sent while holding `notify_lock`: a waiter checks the
+    /// pool under that same lock and only releases it by entering the condvar
+    /// wait, so the waiter either observes the new state or is already waiting
+    /// when the notification arrives. Notifying without the lock would let a
+    /// release slip in between a waiter's failed check and its wait.
+    fn notify_waiters(&self) {
+        let _guard = self.lock_notify();
+        self.notify.notify_all();
+    }
+
+    /// Attempt to reserve `bytes`. Returns error if it would exceed the limit.
+    ///
+    /// A zero-byte request always succeeds. The request is also rejected when
+    /// `used + bytes` would overflow `usize`, even for an unlimited pool.
+    /// On success the peak counter is raised to the new usage if it is higher.
+    pub fn try_grow(&self, bytes: usize) -> Result<(), PoolExhausted> {
+        if bytes == 0 {
+            return Ok(());
+        }
+        let limit = self.limit.load(Ordering::Relaxed);
+        let result = self.used.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |used| {
+            let new_used = used.checked_add(bytes)?;
+            if limit > 0 && new_used > limit {
+                None
+            } else {
+                Some(new_used)
+            }
+        });
+
+        match result {
+            Ok(old) => {
+                // `old + bytes` cannot overflow: the closure's checked_add succeeded.
+                self.peak.fetch_max(old + bytes, Ordering::Relaxed);
+                Ok(())
+            }
+            Err(_) => Err(PoolExhausted {
+                pool_name: self.name,
+                requested: bytes,
+                used: self.used.load(Ordering::Relaxed),
+                limit,
+            }),
+        }
+    }
+
+    /// Blocks until `bytes` can be reserved, or timeout expires.
+    ///
+    /// Tries a lock-free reservation first. If that fails, the pool is re-checked
+    /// under `notify_lock` before each wait so that a concurrent `shrink` or
+    /// `set_limit` cannot be missed. Returns `PoolTimeout` once `timeout` has
+    /// elapsed without a successful reservation.
+    pub fn wait_and_grow(&self, bytes: usize, timeout: Duration) -> Result<(), PoolTimeout> {
+        if bytes == 0 || self.try_grow(bytes).is_ok() {
+            return Ok(());
+        }
+
+        let start = std::time::Instant::now();
+        let mut guard = self.lock_notify();
+        loop {
+            // Checked while holding the lock; see `notify_waiters`.
+            if self.try_grow(bytes).is_ok() {
+                return Ok(());
+            }
+
+            let elapsed = start.elapsed();
+            if elapsed >= timeout {
+                return Err(PoolTimeout {
+                    pool_name: self.name,
+                    requested: bytes,
+                    used: self.used.load(Ordering::Relaxed),
+                    limit: self.limit.load(Ordering::Relaxed),
+                    waited: elapsed,
+                });
+            }
+
+            // Cap each wait at one second as a safety net so the pool is
+            // re-polled periodically even if a notification never arrives.
+            let remaining = timeout - elapsed;
+            let slice = remaining.min(Duration::from_secs(1));
+            guard = match self.notify.wait_timeout(guard, slice) {
+                Ok((reacquired, _timed_out)) => reacquired,
+                Err(poisoned) => poisoned.into_inner().0,
+            };
+        }
+    }
+
+    /// Infallible grow — use when the allocation has already happened.
+    ///
+    /// Does not consult the limit, so usage may exceed it afterwards.
+    pub fn grow(&self, bytes: usize) {
+        if bytes == 0 {
+            return;
+        }
+        let new_used = self.used.fetch_add(bytes, Ordering::Relaxed) + bytes;
+        self.peak.fetch_max(new_used, Ordering::Relaxed);
+    }
+
+    /// Release `bytes` back to the pool. Notifies any waiting threads.
+    ///
+    /// Usage saturates at zero if more is released than was reserved.
+    pub fn shrink(&self, bytes: usize) {
+        if bytes == 0 {
+            return;
+        }
+        // The closure always returns `Some`, so the update cannot fail.
+        let _ = self
+            .used
+            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |current| {
+                Some(current.saturating_sub(bytes))
+            });
+        self.notify_waiters();
+    }
+
+    /// Bytes currently reserved.
+    pub fn used(&self) -> usize {
+        self.used.load(Ordering::Relaxed)
+    }
+
+    /// Highest usage recorded so far.
+    pub fn peak(&self) -> usize {
+        self.peak.load(Ordering::Relaxed)
+    }
+
+    /// Current limit in bytes; `0` means unlimited.
+    pub fn limit(&self) -> usize {
+        self.limit.load(Ordering::Relaxed)
+    }
+
+    /// Name given to the pool at construction.
+    pub fn name(&self) -> &'static str {
+        self.name
+    }
+
+    /// Atomically update the limit. Called by the Java rebalancer.
+    pub fn set_limit(&self, new_limit: usize) {
+        self.limit.store(new_limit, Ordering::Release);
+        // Wake waiters — new limit might allow blocked allocations
+        self.notify_waiters();
+    }
+}
+
+/// RAII handle that tracks a portion of memory reserved from a [`MemoryPool`].
+/// Automatically releases all held memory on drop.
+pub struct MemoryReservation {
+    /// Pool this reservation draws from and releases to.
+    pool: Arc<MemoryPool>,
+    /// Label identifying who holds the reservation.
+    consumer: &'static str,
+    /// Bytes currently held by this reservation.
+    size: usize,
+    /// Whether `request` blocks or rejects when the pool is full.
+    behavior: PoolBehavior,
+}
+
+impl MemoryReservation {
+    /// Create an empty reservation (size 0) against `pool`.
+    ///
+    /// `consumer` labels the holder; `behavior` selects how `request` reacts
+    /// when the pool cannot satisfy a request.
+    pub fn new(pool: &Arc<MemoryPool>, consumer: &'static str, behavior: PoolBehavior) -> Self {
+        Self {
+            pool: Arc::clone(pool),
+            consumer,
+            size: 0,
+            behavior,
+        }
+    }
+
+    /// Grow based on the reservation's behavior: block (Wait) or reject (Reject).
+    ///
+    /// On success the reservation's size increases by `bytes`. On failure the
+    /// size is unchanged and the pool error (`PoolExhausted` or `PoolTimeout`)
+    /// is returned boxed.
+    // NOTE(review): the boxed error's trait bound was not legible in the reviewed
+    // text; `dyn std::error::Error` is assumed. Confirm whether callers need
+    // `+ Send + Sync`.
+    pub fn request(&mut self, bytes: usize) -> Result<(), Box<dyn std::error::Error>> {
+        match &self.behavior {
+            PoolBehavior::Reject => self.pool.try_grow(bytes)?,
+            PoolBehavior::Wait(timeout) => self.pool.wait_and_grow(bytes, *timeout)?,
+        }
+        // Only reached when the pool accepted the request.
+        self.size += bytes;
+        Ok(())
+    }
+
+    /// Infallible grow.
+    pub fn grow(&mut self, bytes: usize) {
+        self.pool.grow(bytes);
+        self.size += bytes;
+    }
+
+    /// Release `bytes` from this reservation.
+    ///
+    /// The amount is clamped to what the reservation currently holds, so it can
+    /// never return more to the pool than it took.
+    pub fn shrink(&mut self, bytes: usize) {
+        let actual = bytes.min(self.size);
+        self.pool.shrink(actual);
+        self.size -= actual;
+    }
+
+    /// Release all memory back to the pool. Returns the number of bytes released.
+    pub fn free(&mut self) -> usize {
+        let released = self.size;
+        if released > 0 {
+            self.pool.shrink(released);
+            self.size = 0;
+        }
+        released
+    }
+
+    /// Bytes currently held by this reservation.
+    pub fn size(&self) -> usize {
+        self.size
+    }
+
+    /// Label supplied when the reservation was created.
+    pub fn consumer(&self) -> &'static str {
+        self.consumer
+    }
+
+    /// Create a sibling reservation from the same pool with a different consumer name.
+    ///
+    /// The new reservation starts empty and copies this reservation's behavior.
+    pub fn child(&self, consumer: &'static str) -> Self {
+        Self {
+            pool: Arc::clone(&self.pool),
+            consumer,
+            size: 0,
+            behavior: self.behavior.clone(),
+        }
+    }
+}
+
+impl Drop for MemoryReservation {
+    /// Returns whatever the reservation still holds to the pool.
+    fn drop(&mut self) {
+        // `free` is a no-op for an empty reservation; the released byte count
+        // is of no use here.
+        let _released = self.free();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ // A request that fits under the limit is accounted in both the reservation and the pool.
+ #[test]
+ fn test_try_grow_within_limit() {
+ let pool = Arc::new(MemoryPool::new("test", 1024));
+ let mut res = MemoryReservation::new(&pool, "writer", PoolBehavior::Reject);
+ assert!(res.request(512).is_ok());
+ assert_eq!(res.size(), 512);
+ assert_eq!(pool.used(), 512);
+ }
+
+ // A rejected request leaves both the reservation and the pool untouched.
+ #[test]
+ fn test_try_grow_exceeds_limit() {
+ let pool = Arc::new(MemoryPool::new("test", 1024));
+ let mut res = MemoryReservation::new(&pool, "writer", PoolBehavior::Reject);
+ assert!(res.request(2048).is_err());
+ assert_eq!(res.size(), 0);
+ assert_eq!(pool.used(), 0);
+ }
+
+ // Dropping a reservation returns its bytes to the pool.
+ #[test]
+ fn test_drop_releases_memory() {
+ let pool = Arc::new(MemoryPool::new("test", 1024));
+ {
+ let mut res = MemoryReservation::new(&pool, "writer", PoolBehavior::Reject);
+ res.request(500).unwrap();
+ assert_eq!(pool.used(), 500);
+ }
+ assert_eq!(pool.used(), 0);
+ }
+
+ // Raising the limit lets a previously rejected request succeed.
+ #[test]
+ fn test_set_limit_allows_growth() {
+ let pool = Arc::new(MemoryPool::new("test", 100));
+ let mut res = MemoryReservation::new(&pool, "writer", PoolBehavior::Reject);
+ assert!(res.request(200).is_err());
+ pool.set_limit(500);
+ assert!(res.request(200).is_ok());
+ }
+
+ // Peak keeps the high-water mark after usage drops.
+ #[test]
+ fn test_peak_tracking() {
+ let pool = Arc::new(MemoryPool::new("test", 1024));
+ let mut res = MemoryReservation::new(&pool, "writer", PoolBehavior::Reject);
+ res.request(800).unwrap();
+ res.shrink(500);
+ assert_eq!(pool.peak(), 800);
+ assert_eq!(pool.used(), 300);
+ }
+
+ // A child reservation carries its own consumer name and draws from the parent's pool.
+ #[test]
+ fn test_child_reservation() {
+ let pool = Arc::new(MemoryPool::new("test", 1024));
+ let res = MemoryReservation::new(&pool, "parent", PoolBehavior::Reject);
+ let mut child = res.child("child");
+ child.request(100).unwrap();
+ assert_eq!(child.consumer(), "child");
+ assert_eq!(pool.used(), 100);
+ }
+}
diff --git a/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionPlugin.java b/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionPlugin.java
index d2cc0417ee59c..a92d6365c2050 100644
--- a/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionPlugin.java
+++ b/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DataFusionPlugin.java
@@ -12,11 +12,15 @@
import org.apache.logging.log4j.Logger;
import org.opensearch.analytics.spi.AnalyticsSearchBackendPlugin;
import org.opensearch.analytics.spi.QueryExecutionMetrics;
+import org.opensearch.arrow.allocator.ArrowNativeAllocator;
+import org.opensearch.arrow.spi.NativeAllocator;
+import org.opensearch.arrow.spi.PoolGroup;
import org.opensearch.be.datafusion.action.DataFusionStatsAction;
import org.opensearch.be.datafusion.nativelib.NativeBridge;
import org.opensearch.cluster.metadata.IndexNameExpressionResolver;
import org.opensearch.cluster.node.DiscoveryNodes;
import org.opensearch.cluster.service.ClusterService;
+import org.opensearch.common.Nullable;
import org.opensearch.common.settings.ClusterSettings;
import org.opensearch.common.settings.IndexScopedSettings;
import org.opensearch.common.settings.Setting;
@@ -86,7 +90,7 @@ public class DataFusionPlugin extends Plugin
* ({@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}), which is
* the same off-heap budget admission control throttles against. The DataFusion Rust
* runtime is the dominant native-memory consumer for analytics workloads (see PR #21732
- * partitioning model), so the default takes 75% of {@code node.native_memory.limit}.
+ * partitioning model), so the default takes 74% of {@code node.native_memory.limit}.
* If the AC limit is unset (== 0), the default is {@link Long#MAX_VALUE} — unbounded — to
* preserve pre-AC behaviour rather than make up a number from JVM heap (which is a
* separate, already-allocated region with no relation to native-memory sizing).
@@ -109,7 +113,7 @@ public class DataFusionPlugin extends Plugin
);
/**
- * Computes the default for {@link #DATAFUSION_MEMORY_POOL_LIMIT} as 75% of
+ * Computes the default for {@link #DATAFUSION_MEMORY_POOL_LIMIT} as 74% of
* {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}, falling back to
* {@link Long#MAX_VALUE} when AC is unconfigured.
*
@@ -125,10 +129,10 @@ static String deriveMemoryPoolLimitDefault(Settings settings) {
if (nativeLimit.getBytes() <= 0) {
return Long.toString(Long.MAX_VALUE);
}
- // 75% of node.native_memory.limit. DataFusion is the dominant native consumer for
+ // 74% of node.native_memory.limit. DataFusion is the dominant native consumer for
// analytics workloads; operators tune via the dynamic setting once they characterize
// their workload.
- long pool = Math.max(0L, nativeLimit.getBytes() * 75 / 100);
+ long pool = Math.max(0L, nativeLimit.getBytes() * 74 / 100);
return Long.toString(pool);
}
@@ -331,6 +335,39 @@ public Collection createComponents(
IndexNameExpressionResolver indexNameExpressionResolver,
Supplier repositoriesServiceSupplier,
DataFormatRegistry dataFormatRegistry
+ ) {
+ return createComponents(
+ client,
+ clusterService,
+ threadPool,
+ resourceWatcherService,
+ scriptService,
+ xContentRegistry,
+ environment,
+ nodeEnvironment,
+ namedWriteableRegistry,
+ indexNameExpressionResolver,
+ repositoriesServiceSupplier,
+ dataFormatRegistry,
+ null
+ );
+ }
+
+ @Override
+ public Collection createComponents(
+ Client client,
+ ClusterService clusterService,
+ ThreadPool threadPool,
+ ResourceWatcherService resourceWatcherService,
+ ScriptService scriptService,
+ NamedXContentRegistry xContentRegistry,
+ Environment environment,
+ NodeEnvironment nodeEnvironment,
+ NamedWriteableRegistry namedWriteableRegistry,
+ IndexNameExpressionResolver indexNameExpressionResolver,
+ Supplier repositoriesServiceSupplier,
+ DataFormatRegistry dataFormatRegistry,
+ @Nullable NativeAllocator nativeAllocator
) {
this.dataFormatRegistry = dataFormatRegistry;
this.clusterService = clusterService;
@@ -350,13 +387,8 @@ public Collection createComponents(
dataFusionService.start();
logger.debug("DataFusion plugin initialized — memory pool {}B, spill limit {}B", memoryPoolLimit, spillMemoryLimit);
- // Wire the dynamic memory pool limit setting to the native runtime so updates via the
- // cluster settings API take effect without restarting the node. The framework's
- // parquet.native.pool.datafusion.{min,max} controls the Java-side Arrow pool that
- // sources the per-query allocators handed to DataFusion; this setting controls the
- // Rust runtime's internal MemoryPool used by query execution. They're separate
- // accounting layers — operators tune them independently.
- clusterService.getClusterSettings().addSettingsUpdateConsumer(DATAFUSION_MEMORY_POOL_LIMIT, this::updateMemoryPoolLimit);
+ // Wire the dynamic spill limit setting to the native runtime so updates via the
+ // cluster settings API take effect without restarting the node.
clusterService.getClusterSettings().addSettingsUpdateConsumer(DATAFUSION_SPILL_MEMORY_LIMIT, this::updateSpillMemoryLimit);
clusterService.getClusterSettings().addSettingsUpdateConsumer(DATAFUSION_MIN_TARGET_PARTITIONS, this::updateMinTargetPartitions);
clusterService.getClusterSettings()
@@ -382,19 +414,48 @@ public Collection createComponents(
this.datafusionSettings = new DatafusionSettings(clusterService);
- // Expose per-task native-memory usage to search backpressure. The tracker calls
- // this supplier once per refresh (invoked by the backpressure service at the top of
- // doRun() and nodeStats()), snapshotting all live queries in one FFM call. Per-task
- // evaluation then reads from the tracker's cached map — no FFM call per task.
- //
- // The OpenSearch task id is used as the DataFusion context_id at query launch
- // (see ShardScanInstructionHandler / DatafusionSearchExecEngine), so the map is
- // already keyed by Task#getId on the consumer side.
+ // Expose per-task native-memory usage to search backpressure.
NativeMemoryUsageTracker.setSnapshotSupplier(this::currentBytesByTaskId);
NativeMemoryUsageTracker.setNativeMemoryBudgetSupplier(() -> DATAFUSION_MEMORY_POOL_LIMIT.get(clusterService.getSettings()));
this.substraitExtensions = loadSubstraitExtensions();
+        // Register with the unified allocator if available. The registration below needs
+        // ArrowNativeAllocator-specific methods, so any other NativeAllocator implementation
+        // is treated like "no allocator" instead of failing node startup with a
+        // ClassCastException.
+        if (nativeAllocator instanceof ArrowNativeAllocator) {
+            ClusterSettings clusterSettings = clusterService.getClusterSettings();
+            ArrowNativeAllocator arrowAllocator = (ArrowNativeAllocator) nativeAllocator;
+
+            // updateMemoryPoolLimit is handed to the allocator as the pool's limit callback.
+            NativeAllocator.VirtualPoolHandle dfPool = arrowAllocator.registerVirtualPool(
+                DatafusionSettings.POOL_DATAFUSION,
+                DatafusionSettings.DATAFUSION_MEMORY_POOL_MIN.get(settings),
+                DATAFUSION_MEMORY_POOL_LIMIT.get(settings),
+                PoolGroup.SEARCH,
+                this::updateMemoryPoolLimit
+            );
+
+            // Publish the native runtime's current pool usage into the allocator's stats.
+            arrowAllocator.addStatsRefresher(() -> {
+                if (dataFusionService != null) {
+                    long usage = dataFusionService.getMemoryPoolUsage();
+                    dfPool.updateStats(usage, usage);
+                }
+            });
+
+            arrowAllocator.setNativeMemoryStatsSupplier(() -> {
+                AnalyticsBackendNativeMemoryStats s = NativeMemoryFetcher.fetch();
+                return new long[] { s.getAllocatedBytes(), s.getResidentBytes() };
+            });
+
+            // Wire dynamic setting consumers for pool min/max
+            clusterSettings.addSettingsUpdateConsumer(
+                DATAFUSION_MEMORY_POOL_LIMIT,
+                newMax -> arrowAllocator.setPoolLimit(DatafusionSettings.POOL_DATAFUSION, newMax)
+            );
+            clusterSettings.addSettingsUpdateConsumer(
+                DatafusionSettings.DATAFUSION_MEMORY_POOL_MIN,
+                newMin -> arrowAllocator.setPoolMin(DatafusionSettings.POOL_DATAFUSION, newMin)
+            );
+        } else {
+            if (nativeAllocator != null) {
+                logger.warn(
+                    "Unsupported native allocator [{}]; DataFusion memory pool is not registered with the unified allocator",
+                    nativeAllocator.getClass().getName()
+                );
+            }
+            // Without the unified allocator nothing else forwards limit changes, so keep the
+            // dynamic setting wired straight to the native runtime; otherwise updates made
+            // through the cluster settings API would be silently ignored until restart.
+            clusterService.getClusterSettings().addSettingsUpdateConsumer(DATAFUSION_MEMORY_POOL_LIMIT, this::updateMemoryPoolLimit);
+        }
+
return Collections.singletonList(dataFusionService);
}
diff --git a/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DatafusionSettings.java b/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DatafusionSettings.java
index cf3e9bbd817d9..5521489b57ab3 100644
--- a/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DatafusionSettings.java
+++ b/sandbox/plugins/analytics-backend-datafusion/src/main/java/org/opensearch/be/datafusion/DatafusionSettings.java
@@ -14,6 +14,8 @@
import org.opensearch.common.settings.ClusterSettings;
import org.opensearch.common.settings.Setting;
import org.opensearch.common.settings.Settings;
+import org.opensearch.core.common.unit.ByteSizeValue;
+import org.opensearch.node.resource.tracker.ResourceTrackerSettings;
import org.opensearch.search.SearchService;
import java.util.List;
@@ -32,6 +34,8 @@
@ExperimentalApi
public final class DatafusionSettings {
+ public static final String POOL_DATAFUSION = "datafusion";
+
// ── New indexed query settings ──
/** Number of rows per batch in the indexed query execution path. */
@@ -164,6 +168,21 @@ public final class DatafusionSettings {
// ── Concurrency gate settings ──
+    /**
+     * Minimum guaranteed bytes for the DataFusion memory pool. Default is half of datafusion max (37% of budget),
+     * as computed by {@link #derivePoolMinDefault(Settings, int)}. Values must be whole numbers {@code >= 0}.
+     */
+    public static final Setting<Long> DATAFUSION_MEMORY_POOL_MIN = new Setting<>(
+        "datafusion.memory_pool_min_bytes",
+        s -> derivePoolMinDefault(s, 37),
+        s -> {
+            long v;
+            try {
+                v = Long.parseLong(s);
+            } catch (NumberFormatException e) {
+                // Name the setting so the operator can tell which value was malformed.
+                throw new IllegalArgumentException("Setting [datafusion.memory_pool_min_bytes] must be a whole number, got [" + s + "]", e);
+            }
+            if (v < 0) {
+                throw new IllegalArgumentException("Setting [datafusion.memory_pool_min_bytes] must be >= 0, got " + v);
+            }
+            return v;
+        },
+        Setting.Property.NodeScope,
+        Setting.Property.Dynamic
+    );
+
/** Datanode concurrency gate multiplier: max concurrent partition-equivalents = cpu_threads × multiplier. */
public static final Setting CONCURRENCY_DATANODE_MULTIPLIER = Setting.doubleSetting(
"datafusion.concurrency.datanode_multiplier",
@@ -223,6 +242,19 @@ public final class DatafusionSettings {
Setting.Property.Dynamic
);
+    /**
+     * Computes the default for a pool min as a percentage of
+     * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}.
+     * Returns 0 when AC is unconfigured.
+     *
+     * @param settings node settings to read the native memory limit from
+     * @param percent  share of the limit to reserve, expected in {@code [0, 100]}
+     * @return the computed byte count as a decimal string, never negative
+     */
+    static String derivePoolMinDefault(Settings settings, int percent) {
+        ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
+        long limitBytes = nativeLimit.getBytes();
+        if (limitBytes <= 0) {
+            return "0";
+        }
+        // Same result as floor(limitBytes * percent / 100), but split into quotient and
+        // remainder so the intermediate product cannot overflow long for very large limits.
+        long share = limitBytes / 100 * percent + limitBytes % 100 * percent / 100;
+        return Long.toString(Math.max(0L, share));
+    }
+
// ── All settings registered by the plugin ──
public static final List> ALL_SETTINGS = List.of(
@@ -237,6 +269,7 @@ public final class DatafusionSettings {
DataFusionPlugin.DATAFUSION_MEMORY_GUARD_ADMISSION_REJECT_THRESHOLD,
DataFusionPlugin.DATAFUSION_MEMORY_GUARD_EXECUTION_SPILL_THRESHOLD,
DataFusionPlugin.DATAFUSION_MEMORY_GUARD_EXECUTION_CRITICAL_THRESHOLD,
+ DATAFUSION_MEMORY_POOL_MIN,
// Cache settings — metadata and statistics cache configuration
CacheSettings.METADATA_CACHE_SIZE_LIMIT,
diff --git a/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DataFusionPluginSettingsTests.java b/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DataFusionPluginSettingsTests.java
index aa1847ec84500..38b9ceecdc9eb 100644
--- a/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DataFusionPluginSettingsTests.java
+++ b/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DataFusionPluginSettingsTests.java
@@ -28,6 +28,7 @@ public void testMemoryPoolLimitIsDynamic() {
"datafusion.memory_pool_limit_bytes must be dynamic to support runtime updates",
DataFusionPlugin.DATAFUSION_MEMORY_POOL_LIMIT.isDynamic()
);
+ assertTrue("datafusion.memory_pool_limit_bytes must have node scope", DataFusionPlugin.DATAFUSION_MEMORY_POOL_LIMIT.hasNodeScope());
}
public void testSpillMemoryLimitIsDynamic() {
@@ -111,7 +112,7 @@ public void testGetSettingsReturnsAllIndexedSettings() {
public void testGetSettingsReturnsTotalExpectedCount() {
try (DataFusionPlugin plugin = new DataFusionPlugin()) {
List> settings = plugin.getSettings();
- assertEquals(25, settings.size());
+ assertEquals(26, settings.size());
} catch (Exception e) {
throw new AssertionError(e);
}
@@ -134,11 +135,11 @@ public void testDeriveMemoryPoolLimitDefaultUnsetReturnsLongMaxValue() {
}
public void testDeriveMemoryPoolLimitDefaultUsesNativeMemoryLimit() {
- // 10 GiB native memory limit — default takes 75% straight from limit, not
+ // 10 GiB native memory limit — default takes 74% straight from limit, not
// from limit - buffer_percent (which is AC's throttle margin, not a framework
- // budget reduction). 75% of 10 GiB.
+ // budget reduction). 74% of 10 GiB.
Settings s = Settings.builder().put("node.native_memory.limit", "10gb").build();
- long expected = (10L * 1024 * 1024 * 1024) * 75 / 100;
+ long expected = (10L * 1024 * 1024 * 1024) * 74 / 100;
assertEquals(Long.toString(expected), DataFusionPlugin.deriveMemoryPoolLimitDefault(s));
}
@@ -146,14 +147,14 @@ public void testDeriveMemoryPoolLimitDefaultIgnoresBufferPercent() {
// node.native_memory.buffer_percent is AC's throttle margin. The framework default
// takes its fraction off node.native_memory.limit directly so the buffer can sit
// between AC's throttle threshold and the framework's hard cap.
- // 1000 bytes limit, 20% buffer => pool max still 75% of 1000 = 750.
+ // 1000 bytes limit, 20% buffer => pool max still 74% of 1000 = 740.
Settings s = Settings.builder().put("node.native_memory.limit", "1000b").put("node.native_memory.buffer_percent", 20).build();
- assertEquals("750", DataFusionPlugin.deriveMemoryPoolLimitDefault(s));
+ assertEquals("740", DataFusionPlugin.deriveMemoryPoolLimitDefault(s));
}
public void testMemoryPoolLimitSettingExposesDerivedDefault() {
Settings s = Settings.builder().put("node.native_memory.limit", "10gb").build();
- long expected = (10L * 1024 * 1024 * 1024) * 75 / 100;
+ long expected = (10L * 1024 * 1024 * 1024) * 74 / 100;
assertEquals(Long.valueOf(expected), DataFusionPlugin.DATAFUSION_MEMORY_POOL_LIMIT.get(s));
}
diff --git a/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DatafusionSettingsTests.java b/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DatafusionSettingsTests.java
index 3dccb2479dfea..22dc688fa7e05 100644
--- a/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DatafusionSettingsTests.java
+++ b/sandbox/plugins/analytics-backend-datafusion/src/test/java/org/opensearch/be/datafusion/DatafusionSettingsTests.java
@@ -69,7 +69,7 @@ public void testMaxCollectorParallelismSettingDefinition() {
}
public void testAllSettingsContainsAllExpectedSettings() {
- assertEquals(25, DatafusionSettings.ALL_SETTINGS.size());
+ assertEquals(26, DatafusionSettings.ALL_SETTINGS.size());
assertTrue(DatafusionSettings.ALL_SETTINGS.contains(DataFusionPlugin.DATAFUSION_REDUCE_TARGET_PARTITIONS));
assertTrue(DatafusionSettings.ALL_SETTINGS.contains(DatafusionSettings.INDEXED_BATCH_SIZE));
assertTrue(DatafusionSettings.ALL_SETTINGS.contains(DatafusionSettings.INDEXED_PARQUET_PUSHDOWN_FILTERS));
diff --git a/sandbox/plugins/composite-engine/src/internalClusterTest/java/org/opensearch/composite/UnifiedNativeMemoryFullStackIT.java b/sandbox/plugins/composite-engine/src/internalClusterTest/java/org/opensearch/composite/UnifiedNativeMemoryFullStackIT.java
new file mode 100644
index 0000000000000..5678d0e08c3e3
--- /dev/null
+++ b/sandbox/plugins/composite-engine/src/internalClusterTest/java/org/opensearch/composite/UnifiedNativeMemoryFullStackIT.java
@@ -0,0 +1,84 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.composite;
+
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
+
+import org.opensearch.action.admin.cluster.node.stats.NodesStatsRequest;
+import org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse;
+import org.opensearch.arrow.allocator.ArrowBasePlugin;
+import org.opensearch.be.datafusion.DataFusionPlugin;
+import org.opensearch.be.lucene.LucenePlugin;
+import org.opensearch.common.settings.Settings;
+import org.opensearch.parquet.ParquetDataFormatPlugin;
+import org.opensearch.plugin.stats.NativeAllocatorPoolStats;
+import org.opensearch.plugins.Plugin;
+import org.opensearch.test.OpenSearchIntegTestCase;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
+import static org.hamcrest.Matchers.hasItems;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.notNullValue;
+
+/**
+ * Full-stack IT verifying all 6 pools appear in _nodes/stats/native_memory on a single-data-node cluster.
+ */
+@ThreadLeakScope(ThreadLeakScope.Scope.NONE)
+@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, minNumDataNodes = 1, maxNumDataNodes = 1)
+public class UnifiedNativeMemoryFullStackIT extends OpenSearchIntegTestCase {
+
+    @Override
+    protected Collection<Class<? extends Plugin>> nodePlugins() {
+        return Arrays.asList(
+            ArrowBasePlugin.class,
+            ParquetDataFormatPlugin.class,
+            CompositeDataFormatPlugin.class,
+            LucenePlugin.class,
+            DataFusionPlugin.class
+        );
+    }
+
+    @Override
+    protected Settings nodeSettings(int nodeOrdinal) {
+        return Settings.builder().put(super.nodeSettings(nodeOrdinal)).put("node.native_memory.limit", "2gb").build();
+    }
+
+    public void testAllSixPoolsVisibleInStats() {
+        NodesStatsResponse response = client().admin()
+            .cluster()
+            .prepareNodesStats()
+            .addMetric(NodesStatsRequest.Metric.NATIVE_MEMORY.metricName())
+            .get();
+
+        assertThat(response.getNodes().isEmpty(), is(false));
+        NativeAllocatorPoolStats stats = response.getNodes().get(0).getNativeAllocatorStats();
+        assertThat("native_memory stats should be present", stats, notNullValue());
+
+        // All 6 pools should be present
+        Set<String> poolNames = stats.getPools().stream().map(NativeAllocatorPoolStats.PoolStats::getName).collect(Collectors.toSet());
+        assertThat(poolNames, hasItems("flight", "ingest", "query", "datafusion", "write", "merge"));
+
+        // Each pool should have limit > 0
+        for (NativeAllocatorPoolStats.PoolStats pool : stats.getPools()) {
+            assertThat("Pool '" + pool.getName() + "' limit should be > 0", pool.getLimitBytes(), greaterThan(0L));
+            assertThat("Pool '" + pool.getName() + "' allocated should be >= 0", pool.getAllocatedBytes(), greaterThanOrEqualTo(0L));
+        }
+
+        // Native memory stats (jemalloc) should be available since DataFusion plugin sets the supplier
+        assertThat("native allocated_bytes should be > 0", stats.getNativeAllocatedBytes(), greaterThan(0L));
+        assertThat("native resident_bytes should be > 0", stats.getNativeResidentBytes(), greaterThan(0L));
+    }
+
+}
diff --git a/sandbox/plugins/composite-engine/src/main/java/org/opensearch/composite/CompositeDataFormatPlugin.java b/sandbox/plugins/composite-engine/src/main/java/org/opensearch/composite/CompositeDataFormatPlugin.java
index f1757189361ad..315cc5b78fd05 100644
--- a/sandbox/plugins/composite-engine/src/main/java/org/opensearch/composite/CompositeDataFormatPlugin.java
+++ b/sandbox/plugins/composite-engine/src/main/java/org/opensearch/composite/CompositeDataFormatPlugin.java
@@ -163,7 +163,7 @@ public class CompositeDataFormatPlugin extends Plugin implements DataFormatPlugi
*/
public static final Setting MERGE_ON_REFRESH_MAX_SIZE = Setting.byteSizeSetting(
"index.composite.merge_on_refresh_max_size",
- new ByteSizeValue(10, ByteSizeUnit.MB),
+ new ByteSizeValue(0, ByteSizeUnit.MB),
Setting.Property.IndexScope,
Setting.Property.Dynamic
);
diff --git a/sandbox/plugins/parquet-data-format/benchmarks/src/main/java/org/opensearch/parquet/benchmark/VSRRotationBenchmark.java b/sandbox/plugins/parquet-data-format/benchmarks/src/main/java/org/opensearch/parquet/benchmark/VSRRotationBenchmark.java
index 8a65d5fe52734..0986a8790f2e5 100644
--- a/sandbox/plugins/parquet-data-format/benchmarks/src/main/java/org/opensearch/parquet/benchmark/VSRRotationBenchmark.java
+++ b/sandbox/plugins/parquet-data-format/benchmarks/src/main/java/org/opensearch/parquet/benchmark/VSRRotationBenchmark.java
@@ -11,6 +11,9 @@
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.opensearch.Version;
+import org.opensearch.arrow.allocator.ArrowNativeAllocator;
+import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
+import org.opensearch.arrow.spi.PoolGroup;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.index.IndexSettings;
@@ -79,7 +82,7 @@ public class VSRRotationBenchmark {
private ThreadPool threadPool;
private ArrowBufferPool bufferPool;
- private org.opensearch.arrow.allocator.ArrowNativeAllocator nativeAllocator;
+ private ArrowNativeAllocator nativeAllocator;
private Schema schema;
private List fieldTypes;
private VSRManager vsrManager;
@@ -126,8 +129,8 @@ public void setupTrial() {
@Setup(Level.Invocation)
public void setup() throws IOException {
- nativeAllocator = new org.opensearch.arrow.allocator.ArrowNativeAllocator(Long.MAX_VALUE);
- nativeAllocator.getOrCreatePool(org.opensearch.arrow.spi.NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE);
+ nativeAllocator = new ArrowNativeAllocator();
+ nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, PoolGroup.INDEXING);
bufferPool = new ArrowBufferPool(Settings.EMPTY, nativeAllocator);
filePath = Path.of(System.getProperty("java.io.tmpdir"), "benchmark_vsr_" + System.nanoTime() + ".parquet").toString();
Settings idxSettings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT).build();
diff --git a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetDataFormatPlugin.java b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetDataFormatPlugin.java
index aa38e5a2f9455..d3a815888e5f7 100644
--- a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetDataFormatPlugin.java
+++ b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetDataFormatPlugin.java
@@ -9,8 +9,11 @@
package org.opensearch.parquet;
import org.opensearch.arrow.allocator.ArrowNativeAllocator;
+import org.opensearch.arrow.spi.NativeAllocator;
+import org.opensearch.arrow.spi.PoolGroup;
import org.opensearch.cluster.metadata.IndexNameExpressionResolver;
import org.opensearch.cluster.service.ClusterService;
+import org.opensearch.common.settings.ClusterSettings;
import org.opensearch.common.settings.Setting;
import org.opensearch.common.settings.Settings;
import org.opensearch.common.util.concurrent.OpenSearchExecutors;
@@ -28,6 +31,7 @@
import org.opensearch.index.engine.dataformat.IndexingExecutionEngine;
import org.opensearch.index.engine.dataformat.StoreStrategy;
import org.opensearch.index.store.PrecomputedChecksumStrategy;
+import org.opensearch.parquet.bridge.RustBridge;
import org.opensearch.parquet.engine.ParquetDataFormat;
import org.opensearch.parquet.engine.ParquetIndexingEngine;
import org.opensearch.parquet.fields.ArrowSchemaBuilder;
@@ -100,8 +104,61 @@ public Collection createComponents(
) {
this.settings = clusterService.getSettings();
this.threadPool = threadPool;
- this.nativeAllocator = pluginComponentRegistry.getComponent(ArrowNativeAllocator.class)
- .orElseThrow(() -> new IllegalStateException("ArrowNativeAllocator not available; arrow-base plugin must be installed"));
+ this.nativeAllocator = pluginComponentRegistry.getComponent(ArrowNativeAllocator.class).orElse(null);
+
+ // Initialize native write/merge memory pools
+ long writeMax = ParquetSettings.WRITE_POOL_MAX.get(this.settings);
+ long mergeMax = ParquetSettings.MERGE_POOL_MAX.get(this.settings);
+ RustBridge.initMemoryPools(writeMax, mergeMax);
+
+ // Register virtual pools if allocator is available (arrow-base loaded)
+ if (nativeAllocator != null) {
+ NativeAllocator.VirtualPoolHandle writePool = nativeAllocator.registerVirtualPool(
+ ParquetSettings.POOL_WRITE,
+ ParquetSettings.WRITE_POOL_MIN.get(this.settings),
+ writeMax,
+ PoolGroup.INDEXING,
+ newLimit -> RustBridge.setWritePoolLimit(newLimit)
+ );
+ NativeAllocator.VirtualPoolHandle mergePool = nativeAllocator.registerVirtualPool(
+ ParquetSettings.POOL_MERGE,
+ ParquetSettings.MERGE_POOL_MIN.get(this.settings),
+ mergeMax,
+ PoolGroup.MERGE,
+ newLimit -> RustBridge.setMergePoolLimit(newLimit)
+ );
+
+ // Wire dynamic setting consumers via allocator
+ ClusterSettings cs = clusterService.getClusterSettings();
+ cs.addSettingsUpdateConsumer(
+ ParquetSettings.WRITE_POOL_MAX,
+ newMax -> nativeAllocator.setPoolLimit(ParquetSettings.POOL_WRITE, newMax)
+ );
+ cs.addSettingsUpdateConsumer(
+ ParquetSettings.WRITE_POOL_MIN,
+ newMin -> nativeAllocator.setPoolMin(ParquetSettings.POOL_WRITE, newMin)
+ );
+ cs.addSettingsUpdateConsumer(
+ ParquetSettings.MERGE_POOL_MAX,
+ newMax -> nativeAllocator.setPoolLimit(ParquetSettings.POOL_MERGE, newMax)
+ );
+ cs.addSettingsUpdateConsumer(
+ ParquetSettings.MERGE_POOL_MIN,
+ newMin -> nativeAllocator.setPoolMin(ParquetSettings.POOL_MERGE, newMin)
+ );
+
+ nativeAllocator.addStatsRefresher(() -> {
+ long[] s = RustBridge.getPoolStats();
+ writePool.updateStats(s[1], s[2]);
+ mergePool.updateStats(s[4], s[5]);
+ });
+ } else {
+ // No allocator — wire dynamic consumers directly to Rust pools
+ ClusterSettings cs = clusterService.getClusterSettings();
+ cs.addSettingsUpdateConsumer(ParquetSettings.WRITE_POOL_MAX, newMax -> RustBridge.setWritePoolLimit(newMax));
+ cs.addSettingsUpdateConsumer(ParquetSettings.MERGE_POOL_MAX, newMax -> RustBridge.setMergePoolLimit(newMax));
+ }
+
return Collections.emptyList();
}
diff --git a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetSettings.java b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetSettings.java
index 643b9809f0367..af1e85193edc2 100644
--- a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetSettings.java
+++ b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/ParquetSettings.java
@@ -18,6 +18,7 @@
import org.opensearch.common.settings.Settings;
import org.opensearch.core.common.unit.ByteSizeUnit;
import org.opensearch.core.common.unit.ByteSizeValue;
+import org.opensearch.node.resource.tracker.ResourceTrackerSettings;
import java.util.Collections;
import java.util.HashMap;
@@ -34,6 +35,9 @@ public final class ParquetSettings {
private ParquetSettings() {}
+ public static final String POOL_WRITE = "write";
+ public static final String POOL_MERGE = "merge";
+
public static final String DEFAULT_MAX_NATIVE_ALLOCATION = "10%";
public static final int DEFAULT_MAX_ROWS_PER_VSR = 65536;
@@ -168,6 +172,93 @@ private ParquetSettings() {}
Setting.Property.NodeScope
);
+ /** Minimum guaranteed bytes for the native write pool. Default is 2% of node.native_memory.limit, or 0 when that limit is unset. */
+ public static final Setting WRITE_POOL_MIN = new Setting<>(
+ "parquet.native.pool.write.min",
+ s -> derivePoolMinDefault(s, 2),
+ s -> {
+ long v = Long.parseLong(s);
+ if (v < 0) {
+ throw new IllegalArgumentException("Setting [parquet.native.pool.write.min] must be >= 0, got " + v);
+ }
+ return v;
+ },
+ Setting.Property.NodeScope,
+ Setting.Property.Dynamic
+ );
+
+ /** Maximum bytes the native write pool can burst to. Default is 5% of node.native_memory.limit, or Long.MAX_VALUE when that limit is unset. */
+ public static final Setting WRITE_POOL_MAX = new Setting<>(
+ "parquet.native.pool.write.max",
+ s -> derivePoolMaxDefault(s, 5),
+ s -> {
+ long v = Long.parseLong(s);
+ if (v < 0) {
+ throw new IllegalArgumentException("Setting [parquet.native.pool.write.max] must be >= 0, got " + v);
+ }
+ return v;
+ },
+ Setting.Property.NodeScope,
+ Setting.Property.Dynamic
+ );
+
+ /** Minimum guaranteed bytes for the native merge pool. Default is 1% of node.native_memory.limit, or 0 when that limit is unset. */
+ public static final Setting MERGE_POOL_MIN = new Setting<>(
+ "parquet.native.pool.merge.min",
+ s -> derivePoolMinDefault(s, 1),
+ s -> {
+ long v = Long.parseLong(s);
+ if (v < 0) {
+ throw new IllegalArgumentException("Setting [parquet.native.pool.merge.min] must be >= 0, got " + v);
+ }
+ return v;
+ },
+ Setting.Property.NodeScope,
+ Setting.Property.Dynamic
+ );
+
+ /** Maximum bytes the native merge pool can burst to. Default is 3% of node.native_memory.limit, or Long.MAX_VALUE when that limit is unset. */
+ public static final Setting MERGE_POOL_MAX = new Setting<>(
+ "parquet.native.pool.merge.max",
+ s -> derivePoolMaxDefault(s, 3),
+ s -> {
+ long v = Long.parseLong(s);
+ if (v < 0) {
+ throw new IllegalArgumentException("Setting [parquet.native.pool.merge.max] must be >= 0, got " + v);
+ }
+ return v;
+ },
+ Setting.Property.NodeScope,
+ Setting.Property.Dynamic
+ );
+
+ /**
+ * Computes the default for a pool max as a percentage of
+ * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}.
+ * Falls back to {@link Long#MAX_VALUE} when that limit is not positive (i.e. AC is unconfigured).
+ */
+ static String derivePoolMaxDefault(Settings settings, int percent) {
+ ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
+ if (nativeLimit.getBytes() <= 0) {
+ return Long.toString(Long.MAX_VALUE);
+ }
+ long pool = Math.max(0L, nativeLimit.getBytes() * percent / 100);
+ return Long.toString(pool);
+ }
+
+ /**
+ * Computes the default for a pool min as a percentage of
+ * {@link ResourceTrackerSettings#NODE_NATIVE_MEMORY_LIMIT_SETTING}.
+ * Returns 0 when that limit is not positive (i.e. AC is unconfigured), unlike the max default, which is Long.MAX_VALUE.
+ */
+ static String derivePoolMinDefault(Settings settings, int percent) {
+ ByteSizeValue nativeLimit = ResourceTrackerSettings.NODE_NATIVE_MEMORY_LIMIT_SETTING.get(settings);
+ if (nativeLimit.getBytes() <= 0) {
+ return "0";
+ }
+ return Long.toString(Math.max(0L, nativeLimit.getBytes() * percent / 100));
+ }
+
public static final Set VALID_ENCODINGS = Set.of(
"PLAIN",
"RLE",
@@ -666,6 +757,10 @@ public static List> getSettings() {
MERGE_BATCH_SIZE,
MERGE_RAYON_THREADS,
MERGE_IO_THREADS,
+ WRITE_POOL_MIN,
+ WRITE_POOL_MAX,
+ MERGE_POOL_MIN,
+ MERGE_POOL_MAX,
ENCODING_FIELD_SETTING,
ENCODING_VALUE_SETTING,
COMPRESSION_FIELD_SETTING,
diff --git a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/bridge/RustBridge.java b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/bridge/RustBridge.java
index e59a3549a0dd1..d3f99e1aa1a1f 100644
--- a/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/bridge/RustBridge.java
+++ b/sandbox/plugins/parquet-data-format/src/main/java/org/opensearch/parquet/bridge/RustBridge.java
@@ -45,6 +45,10 @@ public class RustBridge {
private static final MethodHandle FREE_MERGE_RESULT;
private static final MethodHandle READ_AS_JSON;
private static final MethodHandle FREE_ROW_ID_MAPPING;
+ private static final MethodHandle INIT_MEMORY_POOLS;
+ private static final MethodHandle SET_WRITE_POOL_LIMIT;
+ private static final MethodHandle SET_MERGE_POOL_LIMIT;
+ private static final MethodHandle GET_POOL_STATS;
static {
SymbolLookup lib = NativeLibraryLoader.symbolLookup();
@@ -251,6 +255,22 @@ public class RustBridge {
ValueLayout.JAVA_LONG // mapping_len
)
);
+ INIT_MEMORY_POOLS = linker.downcallHandle(
+ lib.find("parquet_init_memory_pools").orElseThrow(),
+ FunctionDescriptor.ofVoid(ValueLayout.JAVA_LONG, ValueLayout.JAVA_LONG)
+ );
+ SET_WRITE_POOL_LIMIT = linker.downcallHandle(
+ lib.find("parquet_set_write_pool_limit").orElseThrow(),
+ FunctionDescriptor.ofVoid(ValueLayout.JAVA_LONG)
+ );
+ SET_MERGE_POOL_LIMIT = linker.downcallHandle(
+ lib.find("parquet_set_merge_pool_limit").orElseThrow(),
+ FunctionDescriptor.ofVoid(ValueLayout.JAVA_LONG)
+ );
+ GET_POOL_STATS = linker.downcallHandle(
+ lib.find("parquet_get_pool_stats").orElseThrow(),
+ FunctionDescriptor.ofVoid(ValueLayout.ADDRESS)
+ );
}
public static void initLogger() {}
@@ -688,5 +708,25 @@ private static LongMapArrays toLongMapArrays(NativeCall call, Map
return new LongMapArrays(call.strArray(keys), seg);
}
+ public static void initMemoryPools(long writeLimit, long mergeLimit) {
+ NativeCall.invokeVoid(INIT_MEMORY_POOLS, writeLimit, mergeLimit);
+ }
+
+ public static void setWritePoolLimit(long newLimit) {
+ NativeCall.invokeVoid(SET_WRITE_POOL_LIMIT, newLimit);
+ }
+
+ public static void setMergePoolLimit(long newLimit) {
+ NativeCall.invokeVoid(SET_MERGE_POOL_LIMIT, newLimit);
+ }
+
+ public static long[] getPoolStats() {
+ try (var call = new NativeCall()) {
+ var buf = call.buf(6 * 8);
+ NativeCall.invokeVoid(GET_POOL_STATS, buf);
+ return buf.toArray(ValueLayout.JAVA_LONG);
+ }
+ }
+
private RustBridge() {}
}
diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/src/ffm.rs b/sandbox/plugins/parquet-data-format/src/main/rust/src/ffm.rs
index e96d6b70d5b00..9a150f9fe1972 100644
--- a/sandbox/plugins/parquet-data-format/src/main/rust/src/ffm.rs
+++ b/sandbox/plugins/parquet-data-format/src/main/rust/src/ffm.rs
@@ -694,3 +694,35 @@ pub unsafe extern "C" fn parquet_free_row_id_mapping(
let _ = Box::from_raw(slice::from_raw_parts_mut(mapping_ptr as *mut i64, mapping_len as usize));
}
}
+
+// ---------------------------------------------------------------------------
+// Memory pool management (Phase 1 stubs)
+// ---------------------------------------------------------------------------
+
+/// Initialize write and merge memory pool counters.
+#[no_mangle]
+pub extern "C" fn parquet_init_memory_pools(write_limit: i64, merge_limit: i64) {
+ crate::memory::init_pools(write_limit as usize, merge_limit as usize);
+}
+
+/// Set write pool limit. Called by Java rebalancer via FFM.
+#[no_mangle]
+pub extern "C" fn parquet_set_write_pool_limit(new_limit: i64) {
+ crate::memory::set_write_limit(new_limit as usize);
+}
+
+/// Set merge pool limit. Called by Java rebalancer via FFM.
+#[no_mangle]
+pub extern "C" fn parquet_set_merge_pool_limit(new_limit: i64) {
+ crate::memory::set_merge_limit(new_limit as usize);
+}
+
+/// Get pool stats: writes 6 i64s to out_buf.
+/// Layout: [write_limit, write_used, write_peak, merge_limit, merge_used, merge_peak]
+#[no_mangle]
+pub unsafe extern "C" fn parquet_get_pool_stats(out_buf: *mut i64) {
+ let stats = crate::memory::get_stats();
+ for (i, val) in stats.iter().enumerate() {
+ *out_buf.add(i) = *val as i64;
+ }
+}
diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/src/lib.rs b/sandbox/plugins/parquet-data-format/src/main/rust/src/lib.rs
index 2ce15506f12c4..9a2fac354e97c 100644
--- a/sandbox/plugins/parquet-data-format/src/main/rust/src/lib.rs
+++ b/sandbox/plugins/parquet-data-format/src/main/rust/src/lib.rs
@@ -14,6 +14,7 @@ mod tests;
pub mod writer;
pub mod ffm;
+pub mod memory;
pub mod native_settings;
pub mod field_config;
pub mod writer_properties_builder;
diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/src/memory.rs b/sandbox/plugins/parquet-data-format/src/main/rust/src/memory.rs
new file mode 100644
index 0000000000000..d88ee47f8e831
--- /dev/null
+++ b/sandbox/plugins/parquet-data-format/src/main/rust/src/memory.rs
@@ -0,0 +1,57 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Write and merge memory pools backed by `native_bridge_common::memory_pool::MemoryPool`.
+
+use std::sync::{Arc, OnceLock};
+
+use native_bridge_common::memory_pool::MemoryPool;
+
+static WRITE_POOL: OnceLock> = OnceLock::new();
+static MERGE_POOL: OnceLock> = OnceLock::new();
+
+/// Initialize write and merge pools. Called once from Java; a pool that is already initialized is left unchanged.
+pub fn init_pools(write_limit: usize, merge_limit: usize) {
+ WRITE_POOL.get_or_init(|| Arc::new(MemoryPool::new("write", write_limit)));
+ MERGE_POOL.get_or_init(|| Arc::new(MemoryPool::new("merge", merge_limit)));
+}
+
+/// Returns the write pool, or panics if not initialized.
+pub fn write_pool() -> &'static Arc {
+ WRITE_POOL.get().expect("write pool not initialized")
+}
+
+/// Returns the merge pool, or panics if not initialized.
+pub fn merge_pool() -> &'static Arc {
+ MERGE_POOL.get().expect("merge pool not initialized")
+}
+
+pub fn set_write_limit(v: usize) {
+ if let Some(p) = WRITE_POOL.get() {
+ p.set_limit(v);
+ }
+}
+
+pub fn set_merge_limit(v: usize) {
+ if let Some(p) = MERGE_POOL.get() {
+ p.set_limit(v);
+ }
+}
+
+/// Returns [write_limit, write_used, write_peak, merge_limit, merge_used, merge_peak].
+pub fn get_stats() -> [usize; 6] {
+ let w = WRITE_POOL
+ .get()
+ .map(|p| (p.limit(), p.used(), p.peak()))
+ .unwrap_or((0, 0, 0));
+ let m = MERGE_POOL
+ .get()
+ .map(|p| (p.limit(), p.used(), p.peak()))
+ .unwrap_or((0, 0, 0));
+ [w.0, w.1, w.2, m.0, m.1, m.2]
+}
diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/src/tests/mod.rs b/sandbox/plugins/parquet-data-format/src/main/rust/src/tests/mod.rs
index 1e03fd66ce905..b04b4beb0c3a2 100644
--- a/sandbox/plugins/parquet-data-format/src/main/rust/src/tests/mod.rs
+++ b/sandbox/plugins/parquet-data-format/src/main/rust/src/tests/mod.rs
@@ -3295,9 +3295,9 @@ fn test_writer_properties_defaults_single_chunk() {
assert!(matches!(compression, parquet::basic::Compression::LZ4_RAW),
"Default compression should be LZ4_RAW, got: {:?}", compression);
- // Default bloom filter is enabled
- assert!(has_bloom_filter_in_parquet(&filename),
- "Default bloom_filter_enabled should be true");
+ // Default bloom filter is disabled
+ assert!(!has_bloom_filter_in_parquet(&filename),
+ "Default bloom_filter_enabled should be false");
// Format version always stamped
let format_version = read_format_version_from_parquet(&filename);
@@ -3339,9 +3339,9 @@ fn test_writer_properties_defaults_multi_chunk() {
assert!(matches!(compression, parquet::basic::Compression::LZ4_RAW),
"Default compression should be LZ4_RAW in multi-chunk path, got: {:?}", compression);
- // Default bloom filter is enabled
- assert!(has_bloom_filter_in_parquet(&filename),
- "Default bloom_filter_enabled should be true in multi-chunk path");
+ // Default bloom filter is disabled
+ assert!(!has_bloom_filter_in_parquet(&filename),
+ "Default bloom_filter_enabled should be false in multi-chunk path");
// Format version always stamped
let format_version = read_format_version_from_parquet(&filename);
diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/src/writer_properties_builder.rs b/sandbox/plugins/parquet-data-format/src/main/rust/src/writer_properties_builder.rs
index d3954369b2d46..89417020918a1 100644
--- a/sandbox/plugins/parquet-data-format/src/main/rust/src/writer_properties_builder.rs
+++ b/sandbox/plugins/parquet-data-format/src/main/rust/src/writer_properties_builder.rs
@@ -1043,7 +1043,8 @@ mod tests {
#[test]
fn test_build_stamps_format_version() {
let config = NativeSettings::default();
- let props = WriterPropertiesBuilder::build(&config);
+ let schema = ArrowSchema::new(Vec::::new());
+ let props = WriterPropertiesBuilder::build(&config, &schema).expect("build failed");
let kv = props.key_value_metadata().expect("KV metadata missing");
let found = kv.iter().find(|k| k.key == FORMAT_VERSION_KEY);
let entry = found.expect("format_version KV entry missing");
@@ -1053,7 +1054,8 @@ mod tests {
#[test]
fn test_build_with_generation_stamps_both() {
let config = NativeSettings::default();
- let props = WriterPropertiesBuilder::build_with_generation(&config, Some(42));
+ let schema = ArrowSchema::new(Vec::::new());
+ let props = WriterPropertiesBuilder::build_with_generation(&config, Some(42), &schema).expect("build failed");
let kv = props.key_value_metadata().expect("KV metadata missing");
let has_format = kv.iter().any(|k| k.key == FORMAT_VERSION_KEY && k.value.as_deref() == Some(FORMAT_VERSION));
let has_gen = kv.iter().any(|k| k.key == WRITER_GENERATION_KEY && k.value.as_deref() == Some("42"));
diff --git a/sandbox/plugins/parquet-data-format/src/main/rust/tests/writer_integration_tests.rs b/sandbox/plugins/parquet-data-format/src/main/rust/tests/writer_integration_tests.rs
index ca74236316ea5..18f56283abede 100644
--- a/sandbox/plugins/parquet-data-format/src/main/rust/tests/writer_integration_tests.rs
+++ b/sandbox/plugins/parquet-data-format/src/main/rust/tests/writer_integration_tests.rs
@@ -33,8 +33,8 @@ fn test_complete_writer_lifecycle() {
assert!(file_path.metadata().unwrap().len() > 0);
let read_metadata = NativeParquetWriter::get_file_metadata(filename.clone()).unwrap();
- assert_eq!(read_metadata.num_rows(), metadata.metadata.file_metadata().num_rows());
- assert_eq!(read_metadata.version(), metadata.metadata.file_metadata().version());
+ assert_eq!(read_metadata.file_metadata().num_rows(), metadata.metadata.file_metadata().num_rows());
+ assert_eq!(read_metadata.file_metadata().version(), metadata.metadata.file_metadata().version());
}
#[test]
@@ -231,7 +231,7 @@ fn test_ipc_staging_sorted_writer_integration() {
assert_eq!(ids, vec![10, 20, 30, 40, 50, 60]);
let read_metadata = NativeParquetWriter::get_file_metadata(filename).unwrap();
- assert_eq!(read_metadata.num_rows(), 6);
+ assert_eq!(read_metadata.file_metadata().num_rows(), 6);
cleanup_ffi_schema(schema_ptr);
}
diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetDataFormatAwareEngineTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetDataFormatAwareEngineTests.java
index 609a3a9e6c4c9..358d244221c95 100644
--- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetDataFormatAwareEngineTests.java
+++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetDataFormatAwareEngineTests.java
@@ -13,6 +13,8 @@
import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.lucene.search.Query;
+import org.opensearch.arrow.allocator.ArrowNativeAllocator;
+import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.network.InetAddresses;
import org.opensearch.common.settings.Settings;
@@ -115,14 +117,14 @@ public Query termQuery(Object value, QueryShardContext context) {
};
private Schema schema;
- private org.opensearch.arrow.allocator.ArrowNativeAllocator nativeAllocator;
+ private ArrowNativeAllocator nativeAllocator;
@Override
public void setUp() throws Exception {
super.setUp();
RustBridge.initLogger();
- nativeAllocator = new org.opensearch.arrow.allocator.ArrowNativeAllocator(Long.MAX_VALUE);
- nativeAllocator.getOrCreatePool(org.opensearch.arrow.spi.NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE);
+ nativeAllocator = new ArrowNativeAllocator();
+ nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null);
schema = buildSchema();
}
diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetIndexingEngineTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetIndexingEngineTests.java
index 54c783e2ac6de..1b6b7f414955e 100644
--- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetIndexingEngineTests.java
+++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/engine/ParquetIndexingEngineTests.java
@@ -58,7 +58,7 @@
public class ParquetIndexingEngineTests extends OpenSearchTestCase {
- private org.opensearch.arrow.allocator.ArrowNativeAllocator nativeAllocator;
+ private ArrowNativeAllocator nativeAllocator;
private MappedFieldType idField;
private MappedFieldType nameField;
private MappedFieldType scoreField;
@@ -71,8 +71,8 @@ public class ParquetIndexingEngineTests extends OpenSearchTestCase {
public void setUp() throws Exception {
super.setUp();
RustBridge.initLogger();
- nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE);
- nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE);
+ nativeAllocator = new ArrowNativeAllocator();
+ nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null);
idField = new NumberFieldMapper.NumberFieldType("id", NumberFieldMapper.NumberType.INTEGER);
nameField = new KeywordFieldMapper.KeywordFieldType("name");
scoreField = new NumberFieldMapper.NumberFieldType("score", NumberFieldMapper.NumberType.LONG);
diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/memory/ArrowBufferPoolTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/memory/ArrowBufferPoolTests.java
index 9e18ad0e79b6a..d242572838129 100644
--- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/memory/ArrowBufferPoolTests.java
+++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/memory/ArrowBufferPoolTests.java
@@ -24,8 +24,8 @@ public void setUp() throws Exception {
super.setUp();
// Each test gets its own allocator with the standard pools pre-created.
// Production code receives this via dependency injection; tests build it explicitly.
- nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE);
- nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE);
+ nativeAllocator = new ArrowNativeAllocator();
+ nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null);
}
@Override
diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRManagerTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRManagerTests.java
index c9584463dc6ad..bb7a9bae9f834 100644
--- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRManagerTests.java
+++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRManagerTests.java
@@ -15,6 +15,7 @@
import org.apache.arrow.vector.types.pojo.Schema;
import org.opensearch.Version;
import org.opensearch.arrow.allocator.ArrowNativeAllocator;
+import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.index.IndexSettings;
@@ -50,8 +51,8 @@ public class VSRManagerTests extends OpenSearchTestCase {
public void setUp() throws Exception {
super.setUp();
RustBridge.initLogger();
- nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE);
- nativeAllocator.getOrCreatePool(org.opensearch.arrow.spi.NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE);
+ nativeAllocator = new ArrowNativeAllocator();
+ nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null);
bufferPool = new ArrowBufferPool(Settings.EMPTY, nativeAllocator);
schema = new Schema(List.of(new Field("val", FieldType.nullable(new ArrowType.Int(32, true)), null)));
Settings indexSettingsBuilder = Settings.builder()
diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRPoolTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRPoolTests.java
index 8cb2626921fad..80e7af76f4b4b 100644
--- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRPoolTests.java
+++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/vsr/VSRPoolTests.java
@@ -30,8 +30,8 @@ public class VSRPoolTests extends OpenSearchTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
- nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE);
- nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE);
+ nativeAllocator = new ArrowNativeAllocator();
+ nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null);
bufferPool = new ArrowBufferPool(Settings.EMPTY, nativeAllocator);
schema = new Schema(List.of(new Field("val", FieldType.nullable(new ArrowType.Int(32, true)), null)));
}
diff --git a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/writer/ParquetWriterTests.java b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/writer/ParquetWriterTests.java
index 09bf28a908441..fd76ff204cc9a 100644
--- a/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/writer/ParquetWriterTests.java
+++ b/sandbox/plugins/parquet-data-format/src/test/java/org/opensearch/parquet/writer/ParquetWriterTests.java
@@ -12,6 +12,7 @@
import org.apache.arrow.vector.types.pojo.Schema;
import org.opensearch.Version;
import org.opensearch.arrow.allocator.ArrowNativeAllocator;
+import org.opensearch.arrow.spi.NativeAllocatorPoolConfig;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.index.IndexSettings;
@@ -55,8 +56,8 @@ public class ParquetWriterTests extends OpenSearchTestCase {
public void setUp() throws Exception {
super.setUp();
RustBridge.initLogger();
- nativeAllocator = new ArrowNativeAllocator(Long.MAX_VALUE);
- nativeAllocator.getOrCreatePool(org.opensearch.arrow.spi.NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE);
+ nativeAllocator = new ArrowNativeAllocator();
+ nativeAllocator.getOrCreatePool(NativeAllocatorPoolConfig.POOL_INGEST, 0L, Long.MAX_VALUE, null);
bufferPool = new ArrowBufferPool(Settings.EMPTY, nativeAllocator);
idField = new NumberFieldMapper.NumberFieldType("id", NumberFieldMapper.NumberType.INTEGER);
nameField = new KeywordFieldMapper.KeywordFieldType("name");
diff --git a/server/build.gradle b/server/build.gradle
index bd14f4b6606d3..ad38c8b21ed02 100644
--- a/server/build.gradle
+++ b/server/build.gradle
@@ -77,6 +77,7 @@ dependencies {
compileOnly project(":libs:agent-sm:bootstrap")
compileOnly project(':libs:opensearch-plugin-classloader')
+ api project(":libs:opensearch-arrow-spi")
testRuntimeOnly project(':libs:opensearch-plugin-classloader')
api libs.bundles.lucene
diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java
index 7eece7a11595e..3ae869f42d766 100644
--- a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java
+++ b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodeStats.java
@@ -60,7 +60,6 @@
import org.opensearch.node.AdaptiveSelectionStats;
import org.opensearch.node.NodesResourceUsageStats;
import org.opensearch.node.remotestore.RemoteStoreNodeStats;
-import org.opensearch.plugin.stats.AnalyticsBackendNativeMemoryStats;
import org.opensearch.plugin.stats.NativeAllocatorPoolStats;
import org.opensearch.plugins.BlockCacheStats;
import org.opensearch.ratelimitting.admissioncontrol.stats.AdmissionControlStats;
@@ -189,9 +188,6 @@ public class NodeStats extends BaseNodeResponse implements ToXContentFragment {
*/
private long totalEstimatedNativeBytes;
- @Nullable
- private AnalyticsBackendNativeMemoryStats nativeMemoryStats;
-
public NodeStats(StreamInput in) throws IOException {
super(in);
timestamp = in.readVLong();
@@ -290,11 +286,6 @@ public NodeStats(StreamInput in) throws IOException {
} else {
nativeAllocatorStats = null;
}
- if (in.getVersion().onOrAfter(Version.V_3_7_0)) {
- nativeMemoryStats = in.readOptionalWriteable(AnalyticsBackendNativeMemoryStats::new);
- } else {
- nativeMemoryStats = null;
- }
if (in.getVersion().onOrAfter(Version.V_3_7_0)) {
totalEstimatedNativeBytes = in.readLong();
} else {
@@ -336,7 +327,6 @@ public NodeStats(
@Nullable NodeCacheStats nodeCacheStats,
@Nullable RemoteStoreNodeStats remoteStoreNodeStats,
@Nullable NativeAllocatorPoolStats nativeAllocatorStats,
- @Nullable AnalyticsBackendNativeMemoryStats nativeMemoryStats,
long totalEstimatedNativeBytes
) {
super(node);
@@ -372,7 +362,6 @@ public NodeStats(
this.nodeCacheStats = nodeCacheStats;
this.remoteStoreNodeStats = remoteStoreNodeStats;
this.nativeAllocatorStats = nativeAllocatorStats;
- this.nativeMemoryStats = nativeMemoryStats;
this.totalEstimatedNativeBytes = totalEstimatedNativeBytes;
}
@@ -568,14 +557,6 @@ public long getTotalEstimatedNativeBytes() {
return totalEstimatedNativeBytes;
}
- /**
- * Returns the analytics backend native memory stats, or {@code null} if not available.
- */
- @Nullable
- public AnalyticsBackendNativeMemoryStats getAnalyticsBackendNativeMemoryStats() {
- return nativeMemoryStats;
- }
-
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
@@ -644,9 +625,6 @@ public void writeTo(StreamOutput out) throws IOException {
if (out.getVersion().onOrAfter(Version.V_3_7_0)) {
out.writeOptionalWriteable(nativeAllocatorStats);
}
- if (out.getVersion().onOrAfter(Version.V_3_7_0)) {
- out.writeOptionalWriteable(nativeMemoryStats);
- }
if (out.getVersion().onOrAfter(Version.V_3_7_0)) {
out.writeLong(totalEstimatedNativeBytes);
}
@@ -771,17 +749,20 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
getRemoteStoreNodeStats().toXContent(builder, params);
}
// total_estimated_bytes ≈ RssAnon - JVM heap committed - JVM non-heap committed.
- // Always emit so operators see the per-node value even when no plugin contributes
- // an inner stats block. The value is captured on the data node in NodeService.stats()
- // and serialized; the coordinator never re-reads its own OsProbe here.
+ // native_memory: unified view of all native memory pools and jemalloc stats.
+ // NativeAllocatorPoolStats now includes jemalloc allocated/resident + all pools.
builder.startObject("native_memory");
builder.field("total_estimated_bytes", totalEstimatedNativeBytes);
- if (getAnalyticsBackendNativeMemoryStats() != null) {
- getAnalyticsBackendNativeMemoryStats().toXContent(builder, params);
- }
if (getNativeAllocatorStats() != null) {
- builder.startObject("native_allocator");
- getNativeAllocatorStats().toXContent(builder, params);
+ NativeAllocatorPoolStats stats = getNativeAllocatorStats();
+ builder.startObject("runtime");
+ builder.field("allocated_bytes", stats.getNativeAllocatedBytes());
+ builder.field("resident_bytes", stats.getNativeResidentBytes());
+ builder.endObject();
+ builder.startObject("memory_pools");
+ for (var entry : stats.getGroupedStats().entrySet()) {
+ entry.getValue().toXContent(builder, params);
+ }
builder.endObject();
}
builder.endObject();
diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java
index 80ef0b6cc6d8e..544caa0b6da78 100644
--- a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java
+++ b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/NodesStatsRequest.java
@@ -226,7 +226,6 @@ public enum Metric {
ADMISSION_CONTROL("admission_control"),
CACHE_STATS("caches"),
REMOTE_STORE("remote_store"),
- NATIVE_ALLOCATOR("native_allocator"),
NATIVE_MEMORY("native_memory");
private String metricName;
diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java
index 64b0fee32408b..40e8788fc2238 100644
--- a/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java
+++ b/server/src/main/java/org/opensearch/action/admin/cluster/node/stats/TransportNodesStatsAction.java
@@ -132,7 +132,6 @@ protected NodeStats nodeOperation(NodeStatsRequest nodeStatsRequest) {
NodesStatsRequest.Metric.ADMISSION_CONTROL.containedIn(metrics),
NodesStatsRequest.Metric.CACHE_STATS.containedIn(metrics),
NodesStatsRequest.Metric.REMOTE_STORE.containedIn(metrics),
- NodesStatsRequest.Metric.NATIVE_ALLOCATOR.containedIn(metrics),
NodesStatsRequest.Metric.NATIVE_MEMORY.containedIn(metrics)
);
}
diff --git a/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java b/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java
index 1d39f635606d7..c8d06034e6fdf 100644
--- a/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java
+++ b/server/src/main/java/org/opensearch/action/admin/cluster/stats/TransportClusterStatsAction.java
@@ -201,7 +201,6 @@ protected ClusterStatsNodeResponse nodeOperation(ClusterStatsNodeRequest nodeReq
false,
false,
false,
- false,
false
);
List shardsStats = new ArrayList<>();
diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java
index 8321ab127bb0a..06596fecbfd6d 100644
--- a/server/src/main/java/org/opensearch/node/Node.java
+++ b/server/src/main/java/org/opensearch/node/Node.java
@@ -56,6 +56,7 @@
import org.opensearch.action.search.StreamSearchTransportService;
import org.opensearch.action.support.TransportAction;
import org.opensearch.action.update.UpdateHelper;
+import org.opensearch.arrow.spi.NativeAllocator;
import org.opensearch.bootstrap.BootstrapCheck;
import org.opensearch.bootstrap.BootstrapContext;
import org.opensearch.cluster.ClusterInfoService;
@@ -1205,6 +1206,14 @@ protected Node(final Environment initialEnvironment, Collection clas
// Add the telemetryAwarePlugin components to the existing pluginComponents collection.
pluginComponents.addAll(telemetryAwarePluginComponents);
+ // Extract the NativeAllocator instance (published by ArrowBasePlugin in phase 1)
+ // so it can be passed to SearchBackEndPlugin.createComponents for virtual pool registration.
+ final NativeAllocator nativeAllocator = pluginComponents.stream()
+ .filter(c -> c instanceof NativeAllocator)
+ .map(c -> (NativeAllocator) c)
+ .findFirst()
+ .orElse(null);
+
@SuppressWarnings("rawtypes")
Collection searchBackEndPluginComponents = pluginsService.filterPlugins(SearchBackEndPlugin.class)
.stream()
@@ -1221,7 +1230,8 @@ protected Node(final Environment initialEnvironment, Collection clas
namedWriteableRegistry,
clusterModule.getIndexNameExpressionResolver(),
repositoriesServiceReference::get,
- dataFormatRegistry
+ dataFormatRegistry,
+ nativeAllocator
).stream()
)
.collect(Collectors.toList());
diff --git a/server/src/main/java/org/opensearch/node/NodeService.java b/server/src/main/java/org/opensearch/node/NodeService.java
index 3a7aa0ee0b5dd..1ab06c811b8d3 100644
--- a/server/src/main/java/org/opensearch/node/NodeService.java
+++ b/server/src/main/java/org/opensearch/node/NodeService.java
@@ -263,7 +263,6 @@ public NodeStats stats(
boolean admissionControl,
boolean cacheService,
boolean remoteStoreNodeStats,
- boolean nativeAllocator,
boolean nativeMemory
) {
// for indices stats we want to include previous allocated shards stats as well (it will
@@ -301,8 +300,7 @@ public NodeStats stats(
admissionControl ? this.admissionControlService.stats() : null,
cacheService ? this.cacheService.stats(indices) : null,
remoteStoreNodeStats ? new RemoteStoreNodeStats() : null,
- nativeAllocator ? collectNativeAllocatorStats() : null,
- nativeMemory ? monitorService.memoryReportingService().nativeStats() : null,
+ nativeMemory ? collectNativeAllocatorStats() : null,
// Always capture the process-level native memory estimate on this data node.
// Serialized over the wire so the coordinator renders the source node's value,
// not its own. Returns -1 on non-Linux platforms or when /proc/self/status is
diff --git a/server/src/main/java/org/opensearch/plugin/stats/NativeAllocatorPoolStats.java b/server/src/main/java/org/opensearch/plugin/stats/NativeAllocatorPoolStats.java
index 1f27e44d9423d..be8251038a60a 100644
--- a/server/src/main/java/org/opensearch/plugin/stats/NativeAllocatorPoolStats.java
+++ b/server/src/main/java/org/opensearch/plugin/stats/NativeAllocatorPoolStats.java
@@ -17,57 +17,46 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.LinkedHashMap;
import java.util.List;
+import java.util.Map;
/**
- * Point-in-time snapshot of native allocator pool statistics for a node.
+ * Point-in-time snapshot of native memory statistics for a node.
*
- * Arrow-agnostic POJO. The plugin that owns the allocator (e.g. {@code arrow-base})
- * constructs instances of this class and exposes them through a
- * {@link NativeAllocatorStatsRegistry} component returned from its
- * {@code createComponents()}. Server is the type's home so that the cross-module
- * dependency from {@code :server} to {@code :libs:opensearch-arrow-spi} is unnecessary,
- * mirroring the placement of {@link AnalyticsBackendNativeMemoryStats}.
+ * <p>Includes process-wide native memory stats (allocated/resident from jemalloc)
+ * and per-pool stats for all registered pools (Arrow and virtual).
*
- * <p>Renders as the inner body of the {@code native_allocator} object inside
- * {@code _nodes/stats[/native_allocator]} — the caller ({@code NodeStats.toXContent})
- * is responsible for opening the {@code native_allocator} wrapper. Each pool exposes
- * {@code allocated_bytes}, {@code peak_bytes}, and {@code limit_bytes}; root exposes
- * the same.
+ * <p>Renders as the body of the {@code native_memory} object inside
+ * {@code _nodes/stats/native_memory}.
*
* @opensearch.api
*/
public class NativeAllocatorPoolStats implements Writeable, ToXContentFragment {
- private final long rootAllocatedBytes;
- private final long rootPeakBytes;
- private final long rootLimitBytes;
+ private final long nativeAllocatedBytes;
+ private final long nativeResidentBytes;
private final List pools;
/**
- * Creates a new stats snapshot from the given values.
+ * Creates a new stats snapshot.
*
- * @param rootAllocatedBytes current bytes allocated by the root
- * @param rootPeakBytes peak bytes ever allocated by the root since process start
- * @param rootLimitBytes configured root limit
- * @param pools per-pool stats
+ * @param nativeAllocatedBytes process-wide native allocated bytes (jemalloc), -1 if unavailable
+ * @param nativeResidentBytes process-wide native resident bytes (jemalloc RSS), -1 if unavailable
+ * @param pools per-pool stats (Arrow + virtual)
*/
- public NativeAllocatorPoolStats(long rootAllocatedBytes, long rootPeakBytes, long rootLimitBytes, List pools) {
- this.rootAllocatedBytes = rootAllocatedBytes;
- this.rootPeakBytes = rootPeakBytes;
- this.rootLimitBytes = rootLimitBytes;
+ public NativeAllocatorPoolStats(long nativeAllocatedBytes, long nativeResidentBytes, List pools) {
+ this.nativeAllocatedBytes = nativeAllocatedBytes;
+ this.nativeResidentBytes = nativeResidentBytes;
this.pools = Collections.unmodifiableList(pools);
}
/**
* Deserializes from stream.
- *
- * @param in the stream input
*/
public NativeAllocatorPoolStats(StreamInput in) throws IOException {
- this.rootAllocatedBytes = in.readVLong();
- this.rootPeakBytes = in.readVLong();
- this.rootLimitBytes = in.readVLong();
+ this.nativeAllocatedBytes = in.readLong();
+ this.nativeResidentBytes = in.readLong();
int count = in.readVInt();
List list = new ArrayList<>(count);
for (int i = 0; i < count; i++) {
@@ -78,9 +67,8 @@ public NativeAllocatorPoolStats(StreamInput in) throws IOException {
@Override
public void writeTo(StreamOutput out) throws IOException {
- out.writeVLong(rootAllocatedBytes);
- out.writeVLong(rootPeakBytes);
- out.writeVLong(rootLimitBytes);
+ out.writeLong(nativeAllocatedBytes);
+ out.writeLong(nativeResidentBytes);
out.writeVInt(pools.size());
for (PoolStats pool : pools) {
pool.writeTo(out);
@@ -89,11 +77,8 @@ public void writeTo(StreamOutput out) throws IOException {
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
- builder.startObject("root");
- builder.field("allocated_bytes", rootAllocatedBytes);
- builder.field("peak_bytes", rootPeakBytes);
- builder.field("limit_bytes", rootLimitBytes);
- builder.endObject();
+ builder.field("allocated_bytes", nativeAllocatedBytes);
+ builder.field("resident_bytes", nativeResidentBytes);
builder.startObject("pools");
for (PoolStats pool : pools) {
@@ -103,19 +88,14 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
return builder;
}
- /** Returns the root allocator's currently allocated bytes. */
- public long getRootAllocatedBytes() {
- return rootAllocatedBytes;
- }
-
- /** Returns the root allocator's peak allocated bytes since process start. */
- public long getRootPeakBytes() {
- return rootPeakBytes;
+ /** Returns process-wide native allocated bytes, or -1 if unavailable. */
+ public long getNativeAllocatedBytes() {
+ return nativeAllocatedBytes;
}
- /** Returns the root allocator's configured limit in bytes. */
- public long getRootLimitBytes() {
- return rootLimitBytes;
+ /** Returns process-wide native resident bytes (RSS), or -1 if unavailable. */
+ public long getNativeResidentBytes() {
+ return nativeResidentBytes;
}
/** Returns the per-pool statistics. */
@@ -123,6 +103,26 @@ public List getPools() {
return pools;
}
+ /**
+ * Returns per-group aggregates of the pool stats, in first-seen order.
+ *
+ * <p>Pools are keyed by {@link PoolStats#getGroup()}, falling back to the pool name when the
+ * group is {@code null}. Within a group, allocated, limit and min bytes are summed; peak is the
+ * maximum of the member peaks, because individual pool peaks occur at different times and are
+ * therefore not additive.
+ *
+ * @return a new insertion-ordered map from group key to the aggregated stats for that group;
+ * each aggregate is named after its key and carries a {@code null} group
+ */
+ public Map<String, PoolStats> getGroupedStats() {
+ // Accumulator layout per group: [allocated, peak, limit, min].
+ Map<String, long[]> grouped = new LinkedHashMap<>();
+ for (PoolStats pool : pools) {
+ String key = pool.getGroup() != null ? pool.getGroup() : pool.getName();
+ grouped.merge(
+ key,
+ new long[] { pool.getAllocatedBytes(), pool.getPeakBytes(), pool.getLimitBytes(), pool.getMinBytes() },
+ (a, b) -> new long[] { a[0] + b[0], Math.max(a[1], b[1]), a[2] + b[2], a[3] + b[3] }
+ );
+ }
+ Map<String, PoolStats> result = new LinkedHashMap<>();
+ for (Map.Entry<String, long[]> e : grouped.entrySet()) {
+ long[] totals = e.getValue();
+ // Use the 6-arg constructor so the summed min_bytes is rendered; the 4-arg form would
+ // silently report min_bytes = 0 for every group.
+ result.put(e.getKey(), new PoolStats(e.getKey(), totals[0], totals[1], totals[2], null, totals[3]));
+ }
+ return result;
+ }
+
/**
* Per-pool statistics snapshot.
*/
@@ -132,32 +132,29 @@ public static class PoolStats implements Writeable, ToXContentFragment {
private final long allocatedBytes;
private final long peakBytes;
private final long limitBytes;
+ private final String group;
+ private final long minBytes;
- /**
- * Creates a new pool stats snapshot.
- *
- * @param name pool name
- * @param allocatedBytes current allocated bytes
- * @param peakBytes peak bytes ever allocated since process start
- * @param limitBytes configured limit
- */
public PoolStats(String name, long allocatedBytes, long peakBytes, long limitBytes) {
+ this(name, allocatedBytes, peakBytes, limitBytes, null, 0L);
+ }
+
+ public PoolStats(String name, long allocatedBytes, long peakBytes, long limitBytes, String group, long minBytes) {
this.name = name;
this.allocatedBytes = allocatedBytes;
this.peakBytes = peakBytes;
this.limitBytes = limitBytes;
+ this.group = group;
+ this.minBytes = minBytes;
}
- /**
- * Deserializes from stream.
- *
- * @param in the stream input
- */
public PoolStats(StreamInput in) throws IOException {
this.name = in.readString();
this.allocatedBytes = in.readVLong();
this.peakBytes = in.readVLong();
this.limitBytes = in.readVLong();
+ this.group = in.readOptionalString();
+ this.minBytes = in.readVLong();
}
@Override
@@ -166,36 +163,45 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeVLong(allocatedBytes);
out.writeVLong(peakBytes);
out.writeVLong(limitBytes);
+ out.writeOptionalString(group);
+ out.writeVLong(minBytes);
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(name);
builder.field("allocated_bytes", allocatedBytes);
- builder.field("peak_bytes", peakBytes);
builder.field("limit_bytes", limitBytes);
+ builder.field("min_bytes", minBytes);
+ if (group != null) {
+ builder.field("group", group);
+ }
builder.endObject();
return builder;
}
- /** Returns the pool name. */
public String getName() {
return name;
}
- /** Returns the currently allocated bytes. */
public long getAllocatedBytes() {
return allocatedBytes;
}
- /** Returns the peak allocated bytes since process start. */
public long getPeakBytes() {
return peakBytes;
}
- /** Returns the configured limit in bytes. */
public long getLimitBytes() {
return limitBytes;
}
+
+ public String getGroup() {
+ return group;
+ }
+
+ public long getMinBytes() {
+ return minBytes;
+ }
}
}
diff --git a/server/src/main/java/org/opensearch/plugins/SearchBackEndPlugin.java b/server/src/main/java/org/opensearch/plugins/SearchBackEndPlugin.java
index 1761a0c35d9b7..c103c7dff10a2 100644
--- a/server/src/main/java/org/opensearch/plugins/SearchBackEndPlugin.java
+++ b/server/src/main/java/org/opensearch/plugins/SearchBackEndPlugin.java
@@ -8,6 +8,7 @@
package org.opensearch.plugins;
+import org.opensearch.arrow.spi.NativeAllocator;
import org.opensearch.cluster.metadata.IndexNameExpressionResolver;
import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.Nullable;
@@ -105,6 +106,44 @@ default Collection createComponents(
return Collections.emptyList();
}
+ /**
+ * Extended variant that also receives the unified native memory allocator.
+ * Plugins that need to register virtual pools (e.g., DataFusion) override this method.
+ * The default delegates to the original method for backwards compatibility.
+ *
+ * <p>The default implementation does not use {@code nativeAllocator}; every other argument is
+ * forwarded unchanged to the 12-argument {@code createComponents} overload.
+ *
+ * @param nativeAllocator the unified native allocator, or null if arrow-base is not installed
+ * @return the components returned by the 12-argument {@code createComponents} overload
+ */
+ default Collection createComponents(
+ Client client,
+ ClusterService clusterService,
+ ThreadPool threadPool,
+ ResourceWatcherService resourceWatcherService,
+ ScriptService scriptService,
+ NamedXContentRegistry xContentRegistry,
+ Environment environment,
+ NodeEnvironment nodeEnvironment,
+ NamedWriteableRegistry namedWriteableRegistry,
+ IndexNameExpressionResolver indexNameExpressionResolver,
+ Supplier repositoriesServiceSupplier,
+ DataFormatRegistry dataFormatRegistry,
+ @Nullable NativeAllocator nativeAllocator
+ ) {
+ return createComponents(
+ client,
+ clusterService,
+ threadPool,
+ resourceWatcherService,
+ scriptService,
+ xContentRegistry,
+ environment,
+ nodeEnvironment,
+ namedWriteableRegistry,
+ indexNameExpressionResolver,
+ repositoriesServiceSupplier,
+ dataFormatRegistry
+ );
+ }
+
/**
* Returns a supplier for native task cancellation stats, or {@code null} if not available.
*
diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/AnalyticsBackendNativeMemoryStatsVersionGateTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/AnalyticsBackendNativeMemoryStatsVersionGateTests.java
deleted file mode 100644
index c2dc2424d74ab..0000000000000
--- a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/AnalyticsBackendNativeMemoryStatsVersionGateTests.java
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * SPDX-License-Identifier: Apache-2.0
- *
- * The OpenSearch Contributors require contributions made to
- * this file be licensed under the Apache-2.0 license or a
- * compatible open source license.
- */
-
-package org.opensearch.action.admin.cluster.node.stats;
-
-import org.opensearch.Version;
-import org.opensearch.cluster.node.DiscoveryNode;
-import org.opensearch.common.io.stream.BytesStreamOutput;
-import org.opensearch.core.common.io.stream.StreamInput;
-import org.opensearch.plugin.stats.AnalyticsBackendNativeMemoryStats;
-import org.opensearch.test.OpenSearchTestCase;
-
-import java.io.IOException;
-
-import static java.util.Collections.emptyMap;
-import static java.util.Collections.emptySet;
-
-/**
- * Property-based tests for version-gated serialization of {@link AnalyticsBackendNativeMemoryStats}
- * within {@link NodeStats}.
- *
- * Verifies that when NodeStats containing a non-null AnalyticsBackendNativeMemoryStats is serialized
- * to a stream with a version older than V_3_7_0, the deserialized NodeStats has
- * nativeMemoryStats == null. Conversely, when serialized to V_3_7_0 or later, the
- * AnalyticsBackendNativeMemoryStats is preserved.
- */
-public class AnalyticsBackendNativeMemoryStatsVersionGateTests extends OpenSearchTestCase {
-
- /**
- * Property 3: Version-gated serialization preserves null for old versions.
- *
- * For any NodeStats containing a non-null AnalyticsBackendNativeMemoryStats, serializing to a stream
- * with version older than the native-memory support version (V_3_7_0) and then
- * deserializing SHALL yield a NodeStats with nativeMemoryStats == null.
- *
- * Validates: Requirements 3.4, 3.5
- */
- public void testVersionGatedSerializationOmitsAnalyticsBackendNativeMemoryStatsForOldVersions() throws IOException {
- for (int i = 0; i < 100; i++) {
- long allocatedBytes = randomLongBetween(Long.MIN_VALUE, Long.MAX_VALUE);
- long residentBytes = randomLongBetween(Long.MIN_VALUE, Long.MAX_VALUE);
- AnalyticsBackendNativeMemoryStats nativeMemoryStats = new AnalyticsBackendNativeMemoryStats(allocatedBytes, residentBytes);
-
- NodeStats nodeStats = createNodeStatsWithNativeMemory(nativeMemoryStats);
-
- // Serialize with a version older than V_3_7_0
- try (BytesStreamOutput out = new BytesStreamOutput()) {
- out.setVersion(Version.V_2_18_0);
- nodeStats.writeTo(out);
-
- try (StreamInput in = out.bytes().streamInput()) {
- in.setVersion(Version.V_2_18_0);
- NodeStats deserialized = new NodeStats(in);
-
- assertNull(
- "nativeMemoryStats should be null when deserialized from version < V_3_7_0, "
- + "iteration "
- + i
- + " with values ["
- + allocatedBytes
- + ", "
- + residentBytes
- + "]",
- deserialized.getAnalyticsBackendNativeMemoryStats()
- );
- }
- }
- }
- }
-
- /**
- * Positive case: Version-gated serialization preserves AnalyticsBackendNativeMemoryStats for V_3_7_0+.
- *
- * For any NodeStats containing a non-null AnalyticsBackendNativeMemoryStats, serializing to a stream
- * with version V_3_7_0 or later and then deserializing SHALL yield a NodeStats with
- * nativeMemoryStats containing the original values.
- *
- * Validates: Requirements 3.4, 3.5
- */
- public void testVersionGatedSerializationPreservesAnalyticsBackendNativeMemoryStatsForCurrentVersion() throws IOException {
- for (int i = 0; i < 100; i++) {
- long allocatedBytes = randomLongBetween(Long.MIN_VALUE, Long.MAX_VALUE);
- long residentBytes = randomLongBetween(Long.MIN_VALUE, Long.MAX_VALUE);
- AnalyticsBackendNativeMemoryStats nativeMemoryStats = new AnalyticsBackendNativeMemoryStats(allocatedBytes, residentBytes);
-
- NodeStats nodeStats = createNodeStatsWithNativeMemory(nativeMemoryStats);
-
- // Serialize with V_3_7_0 (the version that introduced native memory support)
- try (BytesStreamOutput out = new BytesStreamOutput()) {
- out.setVersion(Version.V_3_7_0);
- nodeStats.writeTo(out);
-
- try (StreamInput in = out.bytes().streamInput()) {
- in.setVersion(Version.V_3_7_0);
- NodeStats deserialized = new NodeStats(in);
-
- assertNotNull(
- "nativeMemoryStats should be non-null when deserialized from version >= V_3_7_0, " + "iteration " + i,
- deserialized.getAnalyticsBackendNativeMemoryStats()
- );
- assertEquals(
- "allocatedBytes mismatch on iteration " + i,
- allocatedBytes,
- deserialized.getAnalyticsBackendNativeMemoryStats().getAllocatedBytes()
- );
- assertEquals(
- "residentBytes mismatch on iteration " + i,
- residentBytes,
- deserialized.getAnalyticsBackendNativeMemoryStats().getResidentBytes()
- );
- }
- }
- }
- }
-
- /**
- * Creates a minimal NodeStats with the given AnalyticsBackendNativeMemoryStats and all other fields null.
- * Uses the current version for the DiscoveryNode.
- */
- private NodeStats createNodeStatsWithNativeMemory(AnalyticsBackendNativeMemoryStats nativeMemoryStats) {
- DiscoveryNode node = new DiscoveryNode("test_node", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
-
- return new NodeStats(
- node,
- System.currentTimeMillis(),
- null, // indices
- null, // os
- null, // process
- null, // jvm
- null, // threadPool
- null, // fs
- null, // transport
- null, // http
- null, // breaker
- null, // scriptStats
- null, // discoveryStats
- null, // ingestStats
- null, // adaptiveSelectionStats
- null, // resourceUsageStats
- null, // scriptCacheStats
- null, // indexingPressureStats
- null, // shardIndexingPressureStats
- null, // searchBackpressureStats
- null, // clusterManagerThrottlingStats
- null, // weightedRoutingStats
- null, // fileCacheStats
- null, // fileCacheOnlyStats
- null, // blockCacheOnlyStats
- null, // taskCancellationStats
- null, // searchPipelineStats
- null, // segmentReplicationRejectionStats
- null, // repositoriesStats
- null, // admissionControlStats
- null, // nodeCacheStats
- null, // remoteStoreNodeStats
- null, // nativeAllocator
- nativeMemoryStats,
- -1L // totalEstimatedNativeBytes
- );
- }
-}
diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java
index 05f27c3f98562..7d9f77f1d0bd8 100644
--- a/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java
+++ b/server/src/test/java/org/opensearch/action/admin/cluster/node/stats/NodeStatsTests.java
@@ -1056,7 +1056,6 @@ public long getLastSuccessfulFetchOfPinnedTimestamps() {
nodeCacheStats,
remoteStoreNodeStats,
null,
- null,
-1L
);
}
@@ -1524,7 +1523,6 @@ public void testNativeAllocatorStatsBwcEmptyOnOldVersion() throws IOException {
NativeAllocatorPoolStats stats = new NativeAllocatorPoolStats(
1024L,
2048L,
- 8192L,
List.of(new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L))
);
DiscoveryNode node = new DiscoveryNode("node1", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
@@ -1554,7 +1552,6 @@ public void testNativeAllocatorStatsRoundTripCurrentVersion() throws IOException
NativeAllocatorPoolStats stats = new NativeAllocatorPoolStats(
1024L,
2048L,
- 8192L,
List.of(
new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L),
new NativeAllocatorPoolStats.PoolStats("ingest", 200L, 400L, 4096L),
@@ -1572,9 +1569,8 @@ public void testNativeAllocatorStatsRoundTripCurrentVersion() throws IOException
NodeStats roundtripped = new NodeStats(in);
NativeAllocatorPoolStats decoded = roundtripped.getNativeAllocatorStats();
assertNotNull("native allocator stats must round-trip on current wire version", decoded);
- assertEquals(1024L, decoded.getRootAllocatedBytes());
- assertEquals(2048L, decoded.getRootPeakBytes());
- assertEquals(8192L, decoded.getRootLimitBytes());
+ assertEquals(1024L, decoded.getNativeAllocatedBytes());
+ assertEquals(2048L, decoded.getNativeResidentBytes());
assertEquals(3, decoded.getPools().size());
assertEquals("flight", decoded.getPools().get(0).getName());
assertEquals(100L, decoded.getPools().get(0).getAllocatedBytes());
@@ -1586,15 +1582,13 @@ public void testNativeAllocatorStatsRoundTripCurrentVersion() throws IOException
/**
* Renders {@code NodeStats.toXContent} when {@code nativeAllocatorStats} is non-null and
- * asserts the JSON shape: a top-level {@code native_memory.native_allocator} block with
- * the SPI's inner {@code root}/{@code pools.} structure. Covers the conditional
- * branch in {@code NodeStats.toXContent} that opens the {@code native_allocator} wrapper.
+ * asserts the JSON shape: a top-level {@code native_memory} block with
+ * {@code runtime.allocated_bytes}/{@code runtime.resident_bytes} and grouped {@code memory_pools}.
*/
public void testNativeAllocatorStatsXContentRendersInsideNativeMemory() throws IOException {
NativeAllocatorPoolStats stats = new NativeAllocatorPoolStats(
1024L,
2048L,
- 8192L,
List.of(new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L))
);
DiscoveryNode node = new DiscoveryNode("node1", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
@@ -1608,20 +1602,22 @@ public void testNativeAllocatorStatsXContentRendersInsideNativeMemory() throws I
@SuppressWarnings("unchecked")
Map nativeMemory = (Map) root.get("native_memory");
assertNotNull("native_memory wrapper must be opened when allocator stats are present", nativeMemory);
+
+ // Runtime stats are nested under "runtime"
@SuppressWarnings("unchecked")
- Map nativeAllocator = (Map) nativeMemory.get("native_allocator");
- assertNotNull("native_allocator block must be present", nativeAllocator);
- @SuppressWarnings("unchecked")
- Map rootBlock = (Map) nativeAllocator.get("root");
- assertEquals(1024L, ((Number) rootBlock.get("allocated_bytes")).longValue());
- assertEquals(2048L, ((Number) rootBlock.get("peak_bytes")).longValue());
- assertEquals(8192L, ((Number) rootBlock.get("limit_bytes")).longValue());
+ Map runtime = (Map) nativeMemory.get("runtime");
+ assertNotNull("runtime block must be present", runtime);
+ assertEquals(1024L, ((Number) runtime.get("allocated_bytes")).longValue());
+ assertEquals(2048L, ((Number) runtime.get("resident_bytes")).longValue());
+
+ // Pools are grouped under "memory_pools"
@SuppressWarnings("unchecked")
- Map pools = (Map) nativeAllocator.get("pools");
+ Map pools = (Map) nativeMemory.get("memory_pools");
+ assertNotNull("memory_pools block must be present", pools);
@SuppressWarnings("unchecked")
Map flight = (Map) pools.get("flight");
+ assertNotNull("flight pool must be present in memory_pools", flight);
assertEquals(100L, ((Number) flight.get("allocated_bytes")).longValue());
- assertEquals(200L, ((Number) flight.get("peak_bytes")).longValue());
assertEquals(2048L, ((Number) flight.get("limit_bytes")).longValue());
}
@@ -1704,7 +1700,6 @@ private static NodeStats newNodeStatsWithNativeAllocator(
null, // nodeCacheStats
null,
nativeAllocatorStats,
- null,
totalEstimatedNativeBytes
);
}
diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsNodesTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsNodesTests.java
index 9820102840829..fcdd26912848f 100644
--- a/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsNodesTests.java
+++ b/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsNodesTests.java
@@ -354,7 +354,6 @@ private ClusterStatsNodeResponse createClusterStatsNodeResponse(
null,
null,
null, // nativeAllocator
- null,
-1L // totalEstimatedNativeBytes
);
if (defaultBehavior) {
diff --git a/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsResponseTests.java b/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsResponseTests.java
index b1ae92df3793c..7ff9dd3d6a89e 100644
--- a/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsResponseTests.java
+++ b/server/src/test/java/org/opensearch/action/admin/cluster/stats/ClusterStatsResponseTests.java
@@ -226,7 +226,6 @@ private ClusterStatsNodeResponse createClusterStatsNodeResponse(DiscoveryNode no
null,
null,
null, // nativeAllocator
- null,
-1L // totalEstimatedNativeBytes
);
return new ClusterStatsNodeResponse(node, null, nodeInfo, nodeStats, shardStats);
diff --git a/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java b/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java
index ec689d6554b33..2bf4b1fcd8b94 100644
--- a/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java
+++ b/server/src/test/java/org/opensearch/cluster/DiskUsageTests.java
@@ -216,7 +216,6 @@ public void testFillDiskUsage() {
null,
null,
null, // nativeAllocator
- null,
-1L // totalEstimatedNativeBytes
),
new NodeStats(
@@ -253,7 +252,6 @@ public void testFillDiskUsage() {
null,
null,
null, // nativeAllocator
- null,
-1L // totalEstimatedNativeBytes
),
new NodeStats(
@@ -290,7 +288,6 @@ public void testFillDiskUsage() {
null,
null,
null, // nativeAllocator
- null,
-1L // totalEstimatedNativeBytes
)
);
@@ -358,7 +355,6 @@ public void testFillDiskUsageSomeInvalidValues() {
null,
null,
null, // nativeAllocator
- null,
-1L // totalEstimatedNativeBytes
),
new NodeStats(
@@ -395,7 +391,6 @@ public void testFillDiskUsageSomeInvalidValues() {
null,
null,
null, // nativeAllocator
- null,
-1L // totalEstimatedNativeBytes
),
new NodeStats(
@@ -432,7 +427,6 @@ public void testFillDiskUsageSomeInvalidValues() {
null,
null,
null, // nativeAllocator
- null,
-1L // totalEstimatedNativeBytes
)
);
@@ -529,7 +523,6 @@ private NodeStats makeNodeStatsWithResourceUsage(DiscoveryNode node, NodesResour
null,
null,
null,
- null,
-1L
);
diff --git a/server/src/test/java/org/opensearch/node/NodeServiceNativeMemoryTests.java b/server/src/test/java/org/opensearch/node/NodeServiceNativeMemoryTests.java
index 2598165dc6dcc..e69eefd2be669 100644
--- a/server/src/test/java/org/opensearch/node/NodeServiceNativeMemoryTests.java
+++ b/server/src/test/java/org/opensearch/node/NodeServiceNativeMemoryTests.java
@@ -16,19 +16,13 @@
import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.settings.Settings;
import org.opensearch.common.settings.SettingsFilter;
-import org.opensearch.common.xcontent.XContentHelper;
-import org.opensearch.common.xcontent.json.JsonXContent;
import org.opensearch.core.indices.breaker.CircuitBreakerService;
-import org.opensearch.core.xcontent.ToXContent;
-import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.discovery.Discovery;
import org.opensearch.index.IndexingPressureService;
import org.opensearch.index.SegmentReplicationStatsTracker;
import org.opensearch.indices.IndicesService;
import org.opensearch.ingest.IngestService;
import org.opensearch.monitor.MonitorService;
-import org.opensearch.monitor.memory.MemoryReportingService;
-import org.opensearch.plugin.stats.AnalyticsBackendNativeMemoryStats;
import org.opensearch.plugin.stats.NativeAllocatorPoolStats;
import org.opensearch.plugins.PluginsService;
import org.opensearch.ratelimitting.admissioncontrol.AdmissionControlService;
@@ -43,7 +37,6 @@
import java.util.Collections;
import java.util.List;
-import java.util.Map;
import java.util.function.Supplier;
import static org.mockito.Mockito.mock;
@@ -52,38 +45,21 @@
/**
* Unit tests for NodeService native memory stats delegation logic.
*
- * Validates that NodeService correctly delegates to the native memory stats
+ * Validates that NodeService correctly delegates to the native allocator stats
* supplier when nativeMemory=true and the supplier is non-null,
* and returns null otherwise.
*/
public class NodeServiceNativeMemoryTests extends OpenSearchTestCase {
- private NodeService createNodeService(AnalyticsBackendNativeMemoryStats nativeStats) {
- return createNodeService(nativeStats, null);
- }
-
- private NodeService createNodeService(
- AnalyticsBackendNativeMemoryStats nativeStats,
- Supplier<NativeAllocatorPoolStats> nativeAllocatorStatsSupplier
- ) {
+ private NodeService createNodeService(Supplier<NativeAllocatorPoolStats> nativeAllocatorStatsSupplier) {
TransportService transportService = mock(TransportService.class);
DiscoveryNode localNode = new DiscoveryNode("test_node", buildNewFakeTransportAddress(), Version.CURRENT);
when(transportService.getLocalNode()).thenReturn(localNode);
- ClusterService clusterService = mock(ClusterService.class);
- IngestService ingestService = mock(IngestService.class);
- SearchPipelineService searchPipelineService = mock(SearchPipelineService.class);
-
- MemoryReportingService memoryReportingService = mock(MemoryReportingService.class);
- when(memoryReportingService.nativeStats()).thenReturn(nativeStats);
-
- MonitorService monitorService = mock(MonitorService.class);
- when(monitorService.memoryReportingService()).thenReturn(memoryReportingService);
-
return new NodeService(
Settings.EMPTY,
mock(ThreadPool.class),
- monitorService,
+ mock(MonitorService.class),
mock(Discovery.class),
transportService,
mock(IndicesService.class),
@@ -91,16 +67,16 @@ private NodeService createNodeService(
mock(CircuitBreakerService.class),
mock(ScriptService.class),
null, // httpServerTransport
- ingestService,
- clusterService,
+ mock(IngestService.class),
+ mock(ClusterService.class),
new SettingsFilter(Collections.emptyList()),
null, // responseCollectorService - not needed when adaptiveSelection=false
mock(SearchTransportService.class),
mock(IndexingPressureService.class),
null, // aggregationUsageService
mock(SearchBackpressureService.class),
- searchPipelineService,
- null, // fileCache
+ mock(SearchPipelineService.class),
+ null, // nodeCacheService
mock(TaskCancellationMonitoringService.class),
null, // resourceUsageCollectorService
mock(SegmentReplicationStatsTracker.class),
@@ -115,10 +91,13 @@ private NodeService createNodeService(
* Tests that stats() with nativeMemory=true and a non-null supplier
* returns the stats from the supplier.
*/
- public void testStatsWithNativeMemoryTrueAndServicePresent() {
- AnalyticsBackendNativeMemoryStats expectedStats = new AnalyticsBackendNativeMemoryStats(1024L, 2048L);
-
- NodeService nodeService = createNodeService(expectedStats);
+ public void testStatsWithNativeMemoryTrueAndSupplierPresent() {
+ NativeAllocatorPoolStats expected = new NativeAllocatorPoolStats(
+ 1024L,
+ 2048L,
+ List.of(new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L))
+ );
+ NodeService nodeService = createNodeService(() -> expected);
NodeStats nodeStats = nodeService.stats(
CommonStatsFlags.NONE,
@@ -150,21 +129,18 @@ public void testStatsWithNativeMemoryTrueAndServicePresent() {
false, // admissionControl
false, // cacheService
false, // remoteStoreNodeStats
- false, // nativeAllocator
true // nativeMemory
);
- assertNotNull(nodeStats.getAnalyticsBackendNativeMemoryStats());
- assertSame(expectedStats, nodeStats.getAnalyticsBackendNativeMemoryStats());
- assertEquals(1024L, nodeStats.getAnalyticsBackendNativeMemoryStats().getAllocatedBytes());
- assertEquals(2048L, nodeStats.getAnalyticsBackendNativeMemoryStats().getResidentBytes());
+ assertNotNull("nativeAllocatorStats should be present when supplier returns non-null", nodeStats.getNativeAllocatorStats());
+ assertSame(expected, nodeStats.getNativeAllocatorStats());
}
/**
- * Tests that stats() with nativeMemory=true and a null supplier
- * returns null for the nativeMemoryStats field.
+ * Tests that stats() with nativeMemory=true and no supplier
+ * returns null for the nativeAllocatorStats field.
*/
- public void testStatsWithNativeMemoryTrueAndNullService() {
+ public void testStatsWithNativeMemoryTrueAndNoSupplier() {
NodeService nodeService = createNodeService(null);
NodeStats nodeStats = nodeService.stats(
@@ -197,21 +173,23 @@ public void testStatsWithNativeMemoryTrueAndNullService() {
false, // admissionControl
false, // cacheService
false, // remoteStoreNodeStats
- false, // nativeAllocator
true // nativeMemory
);
- assertNull(nodeStats.getAnalyticsBackendNativeMemoryStats());
+ assertNull("nativeAllocatorStats should be null when no supplier registered", nodeStats.getNativeAllocatorStats());
}
/**
* Tests that stats() with nativeMemory=false returns null for the
- * nativeMemoryStats field regardless of whether the supplier is present.
+ * nativeAllocatorStats field regardless of whether the supplier is present.
*/
public void testStatsWithNativeMemoryFalse() {
- AnalyticsBackendNativeMemoryStats expectedStats = new AnalyticsBackendNativeMemoryStats(4096L, 8192L);
-
- NodeService nodeService = createNodeService(expectedStats);
+ NativeAllocatorPoolStats expected = new NativeAllocatorPoolStats(
+ 4096L,
+ 8192L,
+ List.of(new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L))
+ );
+ NodeService nodeService = createNodeService(() -> expected);
NodeStats nodeStats = nodeService.stats(
CommonStatsFlags.NONE,
@@ -243,225 +221,9 @@ public void testStatsWithNativeMemoryFalse() {
false, // admissionControl
false, // cacheService
false, // remoteStoreNodeStats
- false, // nativeAllocator
- false // nativeMemory
- );
-
- assertNull(nodeStats.getAnalyticsBackendNativeMemoryStats());
- }
-
- /**
- * Integration test: verifies that the _nodes/stats/native_memory response format
- * contains the expected "native_memory" object with "allocated_bytes" and "resident_bytes" fields.
- * This ensures the response format is unchanged after the refactor.
- */
- @SuppressWarnings("unchecked")
- public void testNativeMemoryResponseFormatUnchanged() throws Exception {
- AnalyticsBackendNativeMemoryStats expectedStats = new AnalyticsBackendNativeMemoryStats(123456789L, 987654321L);
-
- NodeService nodeService = createNodeService(expectedStats);
-
- NodeStats nodeStats = nodeService.stats(
- CommonStatsFlags.NONE,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false, // fileCacheDetailed
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false, // nativeAllocator
- true // nativeMemory
- );
-
- assertNotNull("nativeMemoryStats should be present", nodeStats.getAnalyticsBackendNativeMemoryStats());
-
- // Render the parent NodeStats to JSON — NodeStats now opens the `native_memory`
- // wrapper, emits `total_estimated_bytes` from OsProbe, then delegates to
- // AnalyticsBackendNativeMemoryStats which renders only the `analytics_backend` block.
- XContentBuilder builder = JsonXContent.contentBuilder();
- builder.startObject();
- nodeStats.toXContent(builder, ToXContent.EMPTY_PARAMS);
- builder.endObject();
- String json = builder.toString();
-
- Map<String, Object> root = XContentHelper.convertToMap(JsonXContent.jsonXContent, json, false);
-
- // Verify "native_memory" object is present
- assertTrue("Response should contain 'native_memory' key", root.containsKey("native_memory"));
-
- @SuppressWarnings("unchecked")
- Map<String, Object> nativeMemory = (Map<String, Object>) root.get("native_memory");
- assertNotNull("native_memory object should not be null", nativeMemory);
-
- // Verify nested "analytics_backend" with correct values
- assertTrue("native_memory should contain 'analytics_backend'", nativeMemory.containsKey("analytics_backend"));
- @SuppressWarnings("unchecked")
- Map<String, Object> analyticsBackend = (Map<String, Object>) nativeMemory.get("analytics_backend");
- assertEquals(123456789L, ((Number) analyticsBackend.get("allocated_bytes")).longValue());
- assertEquals(987654321L, ((Number) analyticsBackend.get("resident_bytes")).longValue());
- }
-
- /**
- * Integration test: verifies that when native stats are unavailable (null supplier),
- * the response omits the native_memory object entirely.
- */
- public void testNativeMemoryOmittedWhenUnavailable() throws Exception {
- NodeService nodeService = createNodeService(null);
-
- NodeStats nodeStats = nodeService.stats(
- CommonStatsFlags.NONE,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false, // fileCacheDetailed
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false, // nativeAllocator
- true // nativeMemory
- );
-
- assertNull("nativeMemoryStats should be null when supplier is null", nodeStats.getAnalyticsBackendNativeMemoryStats());
- }
-
- /**
- * Tests that {@code stats(... nativeAllocator=true ...)} invokes the constructor-injected
- * {@code Supplier<NativeAllocatorPoolStats>} and surfaces its return value on
- * {@link NodeStats#getNativeAllocatorStats()}. Covers the supplier-invocation branch in
- * {@code collectNativeAllocatorStats}.
- */
- public void testStatsWithNativeAllocatorTrueAndSupplierPresent() {
- NativeAllocatorPoolStats expected = new NativeAllocatorPoolStats(
- 1024L,
- 2048L,
- 8192L,
- List.of(new NativeAllocatorPoolStats.PoolStats("flight", 100L, 200L, 2048L))
- );
- NodeService nodeService = createNodeService(null, () -> expected);
-
- NodeStats nodeStats = nodeService.stats(
- CommonStatsFlags.NONE,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- true, // nativeAllocator
false // nativeMemory
);
- assertNotNull("nativeAllocatorStats should be present when supplier returns non-null", nodeStats.getNativeAllocatorStats());
- assertSame(expected, nodeStats.getNativeAllocatorStats());
- }
-
- /**
- * Tests that {@code stats(... nativeAllocator=true ...)} returns {@code null} for the
- * allocator stats when no supplier was injected at construction.
- */
- public void testStatsWithNativeAllocatorTrueAndNoSupplier() {
- NodeService nodeService = createNodeService(null);
- // No supplier passed to the factory — defaults to null.
-
- NodeStats nodeStats = nodeService.stats(
- CommonStatsFlags.NONE,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- false,
- true, // nativeAllocator
- false // nativeMemory
- );
-
- assertNull("nativeAllocatorStats should be null when no supplier registered", nodeStats.getNativeAllocatorStats());
+ assertNull("nativeAllocatorStats should be null when nativeMemory=false", nodeStats.getNativeAllocatorStats());
}
}
diff --git a/server/src/test/java/org/opensearch/plugin/stats/NativeAllocatorPoolStatsTests.java b/server/src/test/java/org/opensearch/plugin/stats/NativeAllocatorPoolStatsTests.java
index 8257f58f0466c..a49c784a34e51 100644
--- a/server/src/test/java/org/opensearch/plugin/stats/NativeAllocatorPoolStatsTests.java
+++ b/server/src/test/java/org/opensearch/plugin/stats/NativeAllocatorPoolStatsTests.java
@@ -25,7 +25,7 @@ public void testSerializationRoundTrip() throws IOException {
new NativeAllocatorPoolStats.PoolStats("flight", 1000, 2000, 3000),
new NativeAllocatorPoolStats.PoolStats("query", 4000, 5000, 6000)
);
- NativeAllocatorPoolStats original = new NativeAllocatorPoolStats(10000, 20000, 30000, pools);
+ NativeAllocatorPoolStats original = new NativeAllocatorPoolStats(10000, 20000, pools);
BytesStreamOutput out = new BytesStreamOutput();
original.writeTo(out);
@@ -33,9 +33,8 @@ public void testSerializationRoundTrip() throws IOException {
StreamInput in = out.bytes().streamInput();
NativeAllocatorPoolStats deserialized = new NativeAllocatorPoolStats(in);
- assertEquals(original.getRootAllocatedBytes(), deserialized.getRootAllocatedBytes());
- assertEquals(original.getRootPeakBytes(), deserialized.getRootPeakBytes());
- assertEquals(original.getRootLimitBytes(), deserialized.getRootLimitBytes());
+ assertEquals(original.getNativeAllocatedBytes(), deserialized.getNativeAllocatedBytes());
+ assertEquals(original.getNativeResidentBytes(), deserialized.getNativeResidentBytes());
assertEquals(original.getPools().size(), deserialized.getPools().size());
for (int i = 0; i < pools.size(); i++) {
@@ -49,7 +48,7 @@ public void testSerializationRoundTrip() throws IOException {
}
public void testEmptyPoolsSerialization() throws IOException {
- NativeAllocatorPoolStats original = new NativeAllocatorPoolStats(0, 0, 16000000000L, List.of());
+ NativeAllocatorPoolStats original = new NativeAllocatorPoolStats(-1, -1, List.of());
BytesStreamOutput out = new BytesStreamOutput();
original.writeTo(out);
@@ -57,23 +56,16 @@ public void testEmptyPoolsSerialization() throws IOException {
StreamInput in = out.bytes().streamInput();
NativeAllocatorPoolStats deserialized = new NativeAllocatorPoolStats(in);
- assertEquals(0, deserialized.getRootAllocatedBytes());
- assertEquals(0, deserialized.getRootPeakBytes());
- assertEquals(16000000000L, deserialized.getRootLimitBytes());
+ assertEquals(-1, deserialized.getNativeAllocatedBytes());
+ assertEquals(-1, deserialized.getNativeResidentBytes());
assertTrue(deserialized.getPools().isEmpty());
}
- /**
- * Asserts the JSON shape: {@code root}/{@code pools.<name>} blocks expose
- * {@code allocated_bytes}, {@code peak_bytes}, and {@code limit_bytes}. Caller is
- * responsible for the outer {@code native_allocator} wrapper, so this test does
- * not expect it.
- */
public void testToXContent() throws IOException {
List<NativeAllocatorPoolStats.PoolStats> pools = List.of(
new NativeAllocatorPoolStats.PoolStats("flight", 1024, 1048576, 2147483648L)
);
- NativeAllocatorPoolStats stats = new NativeAllocatorPoolStats(4096, 8192, 17179869184L, pools);
+ NativeAllocatorPoolStats stats = new NativeAllocatorPoolStats(4096, 8192, pools);
XContentBuilder builder = JsonXContent.contentBuilder();
builder.startObject();
@@ -81,17 +73,12 @@ public void testToXContent() throws IOException {
builder.endObject();
String json = builder.toString();
- assertTrue(json.contains("\"root\""));
+ assertTrue(json.contains("\"allocated_bytes\""));
+ assertTrue(json.contains("\"resident_bytes\""));
assertTrue(json.contains("\"pools\""));
assertTrue(json.contains("\"flight\""));
- assertTrue(json.contains("\"allocated_bytes\""));
- assertTrue(json.contains("\"peak_bytes\""));
assertTrue(json.contains("\"limit_bytes\""));
-
- // Removed fields must NOT appear in the JSON.
- assertFalse("child_count was dropped from the stats shape", json.contains("\"child_count\""));
- assertFalse("human-readable byte string was dropped", json.contains("\"allocated\":"));
- assertFalse("human-readable byte string was dropped", json.contains("\"limit\":"));
+ assertFalse("root object should not exist", json.contains("\"root\""));
}
public void testPoolStatsSerializationRoundTrip() throws IOException {
diff --git a/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java b/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java
index 9576112b8b12b..bd2842cdaa20d 100644
--- a/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java
+++ b/test/framework/src/main/java/org/opensearch/cluster/MockInternalClusterInfoService.java
@@ -154,7 +154,6 @@ List<NodeStats> adjustNodesStats(List<NodeStats> nodesStats) {
nodeStats.getNodeCacheStats(),
nodeStats.getRemoteStoreNodeStats(),
nodeStats.getNativeAllocatorStats(),
- nodeStats.getAnalyticsBackendNativeMemoryStats(),
nodeStats.getTotalEstimatedNativeBytes()
);
}).collect(Collectors.toList());
diff --git a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java
index e01f4d651d979..e92c1c6402a9f 100644
--- a/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java
+++ b/test/framework/src/main/java/org/opensearch/test/InternalTestCluster.java
@@ -2709,7 +2709,6 @@ public void ensureEstimatedStats() {
false,
false,
false,
- false,
false
);
assertThat(