From 8d0dfb6ef910a4aa18aa7e48ca6f19644dbe555f Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Wed, 30 Jul 2025 23:02:54 -0700 Subject: [PATCH 1/3] Initial commit of plugin engine-datafusion Signed-off-by: Marc Handalian --- plugins/engine-datafusion/.gitignore | 38 +++++ plugins/engine-datafusion/build.gradle | 111 ++++++++++++++ plugins/engine-datafusion/jni/Cargo.toml | 43 ++++++ plugins/engine-datafusion/jni/src/lib.rs | 47 ++++++ .../opensearch/datafusion/DataFusionJNI.java | 77 ++++++++++ .../datafusion/DataFusionPlugin.java | 137 ++++++++++++++++++ .../datafusion/DataFusionService.java | 109 ++++++++++++++ .../datafusion/action/DataFusionAction.java | 70 +++++++++ .../datafusion/action/NodeDataFusionInfo.java | 85 +++++++++++ .../action/NodesDataFusionInfoAction.java | 29 ++++ .../action/NodesDataFusionInfoRequest.java | 76 ++++++++++ .../action/NodesDataFusionInfoResponse.java | 98 +++++++++++++ .../TransportNodesDataFusionInfoAction.java | 115 +++++++++++++++ .../datafusion/core/SessionContext.java | 38 +++++ .../datafusion/DataFusionServiceTest.java | 60 ++++++++ 15 files changed, 1133 insertions(+) create mode 100644 plugins/engine-datafusion/.gitignore create mode 100644 plugins/engine-datafusion/build.gradle create mode 100644 plugins/engine-datafusion/jni/Cargo.toml create mode 100644 plugins/engine-datafusion/jni/src/lib.rs create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/DataFusionAction.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodeDataFusionInfo.java create mode 100644 
plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoAction.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoRequest.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/TransportNodesDataFusionInfoAction.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/SessionContext.java create mode 100644 plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTest.java diff --git a/plugins/engine-datafusion/.gitignore b/plugins/engine-datafusion/.gitignore new file mode 100644 index 0000000000000..8e535981ee076 --- /dev/null +++ b/plugins/engine-datafusion/.gitignore @@ -0,0 +1,38 @@ +# Gradle +.gradle/ +build/ + +# Java +*.class +*.jar +*.war +*.ear +hs_err_pid* + +# IDE +.idea/ +*.iml +*.ipr +*.iws +.vscode/ +.settings/ +.project +.classpath + +# OS +.DS_Store +Thumbs.db + +# Rust +jni/target/ +jni/Cargo.lock + +# Native libraries +src/main/resources/native/ + +# Logs +*.log + +# Temporary files +*.tmp +*.temp diff --git a/plugins/engine-datafusion/build.gradle b/plugins/engine-datafusion/build.gradle new file mode 100644 index 0000000000000..afd994a9eea3a --- /dev/null +++ b/plugins/engine-datafusion/build.gradle @@ -0,0 +1,111 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +apply plugin: 'java' +apply plugin: 'idea' +apply plugin: 'opensearch.internal-cluster-test' +apply plugin: 'opensearch.yaml-rest-test' +apply plugin: 'opensearch.pluginzip' + +def pluginName = 'engine-datafusion' +def pluginDescription = 'OpenSearch plugin providing access to DataFusion via JNI' +def projectPath = 'org.opensearch' +def pathToPlugin = 'datafusion.DataFusionPlugin' +def pluginClassName = 'DataFusionPlugin' + +opensearchplugin { + name = pluginName + description = pluginDescription + classname = "${projectPath}.${pathToPlugin}" + licenseFile = rootProject.file('LICENSE.txt') + noticeFile = rootProject.file('NOTICE.txt') +} + +dependencies { + implementation "org.apache.logging.log4j:log4j-api:${versions.log4j}" + implementation "org.apache.logging.log4j:log4j-core:${versions.log4j}" + testImplementation "junit:junit:${versions.junit}" + testImplementation "org.hamcrest:hamcrest:${versions.hamcrest}" + testImplementation "org.mockito:mockito-core:${versions.mockito}" +} + +// Task to build the Rust JNI library +task buildRustLibrary(type: Exec) { + description = 'Build the Rust JNI library using Cargo' + group = 'build' + + workingDir file('jni') + + // Determine the target directory and library name based on OS + def osName = System.getProperty('os.name').toLowerCase() + def libPrefix = osName.contains('windows') ? '' : 'lib' + def libExtension = osName.contains('windows') ? '.dll' : (osName.contains('mac') ? '.dylib' : '.so') + + // Use debug build for development, release for production + def buildType = project.hasProperty('rustRelease') ? 
'release' : 'debug' + def targetDir = "target/${buildType}" + + def cargoArgs = ['cargo', 'build'] + if (buildType == 'release') { + cargoArgs.add('--release') + } + + if (osName.contains('windows')) { + commandLine cargoArgs + } else { + commandLine cargoArgs + } + + // Set environment variables for cross-compilation if needed + environment 'CARGO_TARGET_DIR', file('jni/target').absolutePath + + inputs.files fileTree('jni/src') + inputs.file 'jni/Cargo.toml' + outputs.files file("jni/${targetDir}/${libPrefix}opensearch_datafusion_jni${libExtension}") + System.out.println("Building Rust library in ${buildType} mode"); +} + +// Task to copy the native library to resources +task copyNativeLibrary(type: Copy, dependsOn: buildRustLibrary) { + description = 'Copy the native library to Java resources' + group = 'build' + + def osName = System.getProperty('os.name').toLowerCase() + def libPrefix = osName.contains('windows') ? '' : 'lib' + def libExtension = osName.contains('windows') ? '.dll' : (osName.contains('mac') ? '.dylib' : '.so') + def buildType = project.hasProperty('rustRelease') ? 
'release' : 'debug' + + from file("jni/target/${buildType}/${libPrefix}opensearch_datafusion_jni${libExtension}") + into file('src/main/resources/native') + + // Rename to a standard name for Java to load + rename { filename -> + "libopensearch_datafusion_jni${libExtension}" + } +} + +// Ensure native library is built before Java compilation +compileJava.dependsOn copyNativeLibrary + +// Ensure processResources depends on copyNativeLibrary +processResources.dependsOn copyNativeLibrary + +// Clean task should also clean Rust artifacts +clean { + delete file('jni/target') + delete file('src/main/resources/native') +} + +test { + // Set system property to help tests find the native library + systemProperty 'java.library.path', file('src/main/resources/native').absolutePath +} + +yamlRestTest { + systemProperty 'tests.security.manager', 'false' +} diff --git a/plugins/engine-datafusion/jni/Cargo.toml b/plugins/engine-datafusion/jni/Cargo.toml new file mode 100644 index 0000000000000..e26317758fb69 --- /dev/null +++ b/plugins/engine-datafusion/jni/Cargo.toml @@ -0,0 +1,43 @@ +[package] +name = "opensearch-datafusion-jni" +version = "0.1.0" +edition = "2021" +description = "JNI bindings for DataFusion integration with OpenSearch" +license = "Apache-2.0" + +[lib] +name = "opensearch_datafusion_jni" +crate-type = ["cdylib"] + +[dependencies] +datafusion = "49.0.0" +arrow = "55.2" +arrow-json = "55.2" + +# JNI dependencies +jni = "0.21" + +# Async runtime +tokio = { version = "1.0", features = ["rt", "rt-multi-thread", "macros"] } + +# Serialization +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" + +# Error handling +anyhow = "1.0" +thiserror = "1.0" + +# Logging +log = "0.4" + +[profile.release] +lto = true +codegen-units = 1 +panic = "abort" + +[profile.dev] +opt-level = 1 # Some optimization for reasonable performance +lto = false # Disable LTO for faster builds +codegen-units = 16 # More parallel compilation +incremental = true # Enable 
incremental compilation diff --git a/plugins/engine-datafusion/jni/src/lib.rs b/plugins/engine-datafusion/jni/src/lib.rs new file mode 100644 index 0000000000000..452a3951dc2fb --- /dev/null +++ b/plugins/engine-datafusion/jni/src/lib.rs @@ -0,0 +1,47 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +use jni::objects::JClass; +use jni::sys::{jlong, jstring}; +use jni::JNIEnv; + +use datafusion::execution::context::SessionContext; + +use datafusion::DATAFUSION_VERSION; +use datafusion::prelude::SessionConfig; + +/// Create a new DataFusion session context +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_createContext( + _env: JNIEnv, + _class: JClass, +) -> jlong { + let config = SessionConfig::new().with_repartition_aggregations(true); + let context = SessionContext::new_with_config(config); + let ctx = Box::into_raw(Box::new(context)) as jlong; + ctx +} + +/// Close and cleanup a DataFusion context +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_closeContext( + _env: JNIEnv, + _class: JClass, + context_id: jlong, +) { + let _ = unsafe { Box::from_raw(context_id as *mut SessionContext) }; +} + +/// Get version information +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_getVersion( + env: JNIEnv, + _class: JClass, +) -> jstring { + env.new_string(DATAFUSION_VERSION).expect("Couldn't create Java string").as_raw() +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java new file mode 100644 index 0000000000000..2353fef35d7ad --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java @@ -0,0 +1,77 @@ +/* + * 
SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; + +/** + * JNI wrapper for DataFusion operations + */ +public class DataFusionJNI { + + private static boolean libraryLoaded = false; + + static { + loadNativeLibrary(); + } + + /** + * Load the native library from resources + */ + private static synchronized void loadNativeLibrary() { + if (libraryLoaded) { + return; + } + + try { + String osName = System.getProperty("os.name").toLowerCase(); + String libExtension; + String libName; + + if (osName.contains("windows")) { + libExtension = ".dll"; + libName = "libopensearch_datafusion_jni.dll"; + } else if (osName.contains("mac")) { + libExtension = ".dylib"; + libName = "libopensearch_datafusion_jni.dylib"; + } else { + libExtension = ".so"; + libName = "libopensearch_datafusion_jni.so"; + } + + // Try to load from resources first + InputStream libStream = DataFusionJNI.class.getResourceAsStream("/native/" + libName); + if (libStream != null) { + // Extract to temporary file and load + Path tempLib = Files.createTempFile("libopensearch_datafusion_jni", libExtension); + Files.copy(libStream, tempLib, StandardCopyOption.REPLACE_EXISTING); + tempLib.toFile().deleteOnExit(); + System.load(tempLib.toAbsolutePath().toString()); + libStream.close(); + } else { + // Fallback to system library path + System.loadLibrary("opensearch_datafusion_jni"); + } + + libraryLoaded = true; + } catch (IOException | UnsatisfiedLinkError e) { + throw new RuntimeException("Failed to load DataFusion JNI library", e); + } + } + + /** + * Get version information + * @return JSON string with version information + */ + public static native String 
getVersion(); +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java new file mode 100644 index 0000000000000..5050e831ea895 --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java @@ -0,0 +1,137 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion; + +import org.opensearch.cluster.metadata.IndexNameExpressionResolver; +import org.opensearch.cluster.node.DiscoveryNodes; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.settings.ClusterSettings; +import org.opensearch.common.settings.IndexScopedSettings; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.settings.SettingsFilter; +import org.opensearch.core.common.io.stream.NamedWriteableRegistry; +import org.opensearch.core.xcontent.NamedXContentRegistry; +import org.opensearch.datafusion.action.DataFusionAction; +import org.opensearch.datafusion.action.NodesDataFusionInfoAction; +import org.opensearch.datafusion.action.TransportNodesDataFusionInfoAction; +import org.opensearch.env.Environment; +import org.opensearch.env.NodeEnvironment; +import org.opensearch.plugins.ActionPlugin; +import org.opensearch.plugins.Plugin; +import org.opensearch.repositories.RepositoriesService; +import org.opensearch.rest.RestController; +import org.opensearch.rest.RestHandler; +import org.opensearch.script.ScriptService; +import org.opensearch.threadpool.ThreadPool; +import org.opensearch.transport.client.Client; +import org.opensearch.watcher.ResourceWatcherService; + +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.function.Supplier; + 
+/** + * Main plugin class for OpenSearch DataFusion integration. + */ +public class DataFusionPlugin extends Plugin implements ActionPlugin { + + private DataFusionService dataFusionService; + private final boolean isDataFusionEnabled; + + /** + * Constructor for DataFusionPlugin. + * @param settings The settings for the DataFusionPlugin. + */ + public DataFusionPlugin(Settings settings) { + // For now, DataFusion is always enabled if the plugin is loaded + // In the future, this could be controlled by a feature flag + this.isDataFusionEnabled = true; + } + + /** + * Creates components for the DataFusion plugin. + * @param client The client instance. + * @param clusterService The cluster service instance. + * @param threadPool The thread pool instance. + * @param resourceWatcherService The resource watcher service instance. + * @param scriptService The script service instance. + * @param xContentRegistry The named XContent registry. + * @param environment The environment instance. + * @param nodeEnvironment The node environment instance. + * @param namedWriteableRegistry The named writeable registry. + * @param indexNameExpressionResolver The index name expression resolver instance. + * @param repositoriesServiceSupplier The supplier for the repositories service. 
+ * @return Collection of created components + */ + @Override + public Collection createComponents( + Client client, + ClusterService clusterService, + ThreadPool threadPool, + ResourceWatcherService resourceWatcherService, + ScriptService scriptService, + NamedXContentRegistry xContentRegistry, + Environment environment, + NodeEnvironment nodeEnvironment, + NamedWriteableRegistry namedWriteableRegistry, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier repositoriesServiceSupplier + ) { + if (!isDataFusionEnabled) { + return Collections.emptyList(); + } + + dataFusionService = new DataFusionService(); + return Collections.singletonList(dataFusionService); + } + + /** + * Gets the REST handlers for the DataFusion plugin. + * @param settings The settings for the plugin. + * @param restController The REST controller instance. + * @param clusterSettings The cluster settings instance. + * @param indexScopedSettings The index scoped settings instance. + * @param settingsFilter The settings filter instance. + * @param indexNameExpressionResolver The index name expression resolver instance. + * @param nodesInCluster The supplier for the discovery nodes. + * @return A list of REST handlers. + */ + @Override + public List getRestHandlers( + Settings settings, + RestController restController, + ClusterSettings clusterSettings, + IndexScopedSettings indexScopedSettings, + SettingsFilter settingsFilter, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier nodesInCluster + ) { + if (!isDataFusionEnabled) { + return Collections.emptyList(); + } + return List.of( + new DataFusionAction() + ); + } + + /** + * Gets the list of action handlers for the DataFusion plugin. + * @return A list of action handlers. 
+ */ + @Override + public List> getActions() { + if (!isDataFusionEnabled) { + return Collections.emptyList(); + } + return List.of( + new ActionHandler<>(NodesDataFusionInfoAction.INSTANCE, TransportNodesDataFusionInfoAction.class) + ); + } +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java new file mode 100644 index 0000000000000..17bbb4738db9b --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java @@ -0,0 +1,109 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.common.lifecycle.AbstractLifecycleComponent; +import org.opensearch.common.util.concurrent.ConcurrentCollections; +import org.opensearch.common.util.concurrent.ConcurrentMapLong; +import org.opensearch.datafusion.core.SessionContext; + +import java.util.concurrent.atomic.AtomicLong; + +/** + * Service for managing DataFusion contexts and operations - essentially like SearchService + */ +public class DataFusionService extends AbstractLifecycleComponent { + + private static final Logger logger = LogManager.getLogger(DataFusionService.class); + + // in memory contexts, similar to ReaderContext in SearchService, just a ptr to SessionContext for now. 
+ private final ConcurrentMapLong contexts = ConcurrentCollections.newConcurrentMapLongWithAggressiveConcurrency(); + + private final AtomicLong idGenerator = new AtomicLong(); + + @Override + protected void doStart() { + logger.info("Starting DataFusion service"); + try { + // Test that the native library loads correctly + String version = DataFusionJNI.getVersion(); + logger.info("DataFusion service started successfully. Version info: {}", version); + } catch (Exception e) { + logger.error("Failed to start DataFusion service", e); + throw new RuntimeException("Failed to initialize DataFusion JNI", e); + } + } + + @Override + protected void doStop() { + logger.info("Stopping DataFusion service"); + // Close all named contexts + for (SessionContext ctx : contexts.values()) { + try { + ctx.close(); + } catch (Exception e) { + logger.warn("Error closing DataFusion context", e); + } + } + contexts.clear(); + logger.info("DataFusion service stopped"); + } + + @Override + protected void doClose() { + // Ensure all resources are cleaned up + doStop(); + } + + /** + * Create a new named DataFusion context + * @return the context ID + */ + long createContext() { + SessionContext ctx = new SessionContext(); + // just stores the context for now + long id = idGenerator.incrementAndGet(); + SessionContext existing = contexts.put(id, ctx); + assert existing == null; + return id; + } + + /** + * Get a context by id + * @param id the context id + * @return the context ID, or null if not found + */ + SessionContext getContext(long id) { + return contexts.get(id); + } + + /** + * Close a context + * @param contextId the context id + * @return true if the context was found and closed, false otherwise + */ + public boolean closeContext(long contextId) { + try (SessionContext ignored = contexts.remove(contextId)) { + // do nothing + } catch (Exception e) { + throw new RuntimeException(e); + } + return false; + } + + /** + * Get version information + * @return JSON version string + */ 
+ public String getVersion() { + return DataFusionJNI.getVersion(); + } +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/DataFusionAction.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/DataFusionAction.java new file mode 100644 index 0000000000000..66dd36d2d0bfe --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/DataFusionAction.java @@ -0,0 +1,70 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.action; + +import org.opensearch.rest.BaseRestHandler; +import org.opensearch.rest.RestRequest; +import org.opensearch.rest.action.RestToXContentListener; +import org.opensearch.transport.client.node.NodeClient; + +import java.util.List; + +import static org.opensearch.rest.RestRequest.Method.GET; + +/** + * REST handler for DataFusion information operations. + * It handles GET requests for retrieving DataFusion server information. + */ +public class DataFusionAction extends BaseRestHandler { + + /** + * Constructor for DataFusionRestHandler. + */ + public DataFusionAction() {} + + /** + * Returns the name of the action. + * @return The name of the action. + */ + @Override + public String getName() { + return "datafusion_info_action"; + } + + /** + * Returns the list of routes for the action. + * @return The list of routes for the action. + */ + @Override + public List routes() { + return List.of( + new Route(GET, "/_plugins/datafusion/info"), + new Route(GET, "/_plugins/datafusion/info/{nodeId}") + ); + } + + /** + * Prepares the request for the action. + * @param request The REST request. + * @param client The node client. + * @return The rest channel consumer. 
+ */ + @Override + protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) { + String nodeId = request.param("nodeId"); + if (nodeId != null) { + // Query specific node + NodesDataFusionInfoRequest nodesRequest = new NodesDataFusionInfoRequest(nodeId); + return channel -> client.execute(NodesDataFusionInfoAction.INSTANCE, nodesRequest, new RestToXContentListener<>(channel)); + } else { + NodesDataFusionInfoRequest nodesRequest = new NodesDataFusionInfoRequest(); + return channel -> client.execute(NodesDataFusionInfoAction.INSTANCE, nodesRequest, new RestToXContentListener<>(channel)); + } + } +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodeDataFusionInfo.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodeDataFusionInfo.java new file mode 100644 index 0000000000000..6d50e2d40af78 --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodeDataFusionInfo.java @@ -0,0 +1,85 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.action; + +import org.opensearch.action.support.nodes.BaseNodeResponse; +import org.opensearch.cluster.node.DiscoveryNode; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.xcontent.ToXContentFragment; +import org.opensearch.core.xcontent.XContentBuilder; + +import java.io.IOException; + +/** + * Information about DataFusion on a specific node + */ +public class NodeDataFusionInfo extends BaseNodeResponse implements ToXContentFragment { + + private final String dataFusionVersion; + + /** + * Constructor for NodeDataFusionInfo. + * @param node The discovery node. + * @param dataFusionVersion The DataFusion version. 
+ */ + public NodeDataFusionInfo( + DiscoveryNode node, + String dataFusionVersion + ) { + super(node); + this.dataFusionVersion = dataFusionVersion; + } + + /** + * Constructor for NodeDataFusionInfo from stream input. + * @param in The stream input. + * @throws IOException If an I/O error occurs. + */ + public NodeDataFusionInfo(StreamInput in) throws IOException { + super(in); + this.dataFusionVersion = in.readString(); + } + + /** + * Writes the node info to the stream output. + * @param out The stream output. + * @throws IOException If an I/O error occurs. + */ + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeString(dataFusionVersion); + } + + /** + * Converts the node info to XContent. + * @param builder The XContent builder. + * @param params The parameters. + * @return The XContent builder. + * @throws IOException If an I/O error occurs. + */ + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + builder.startObject("data_fusion_info"); + builder.field("datafusion_version", dataFusionVersion); + builder.endObject(); + builder.endObject(); + return builder; + } + + /** + * Gets the DataFusion version. + * @return The DataFusion version. 
+ */ + public String getDataFusionVersion() { + return dataFusionVersion; + } +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoAction.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoAction.java new file mode 100644 index 0000000000000..198c7973e6a9c --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoAction.java @@ -0,0 +1,29 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.action; + +import org.opensearch.action.ActionType; + +/** + * Action to retrieve DataFusion info from nodes + */ +public class NodesDataFusionInfoAction extends ActionType { + /** + * Singleton instance of NodesDataFusionInfoAction. + */ + public static final NodesDataFusionInfoAction INSTANCE = new NodesDataFusionInfoAction(); + /** + * Name of this action. + */ + public static final String NAME = "cluster:admin/datafusion/info"; + + NodesDataFusionInfoAction() { + super(NAME, NodesDataFusionInfoResponse::new); + } +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoRequest.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoRequest.java new file mode 100644 index 0000000000000..61ce2444722ee --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoRequest.java @@ -0,0 +1,76 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.datafusion.action; + +import org.opensearch.action.support.nodes.BaseNodesRequest; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; + +import java.io.IOException; + +/** + * Request for retrieving DataFusion information from nodes + */ +public class NodesDataFusionInfoRequest extends BaseNodesRequest { + + /** + * Default constructor for NodesDataFusionInfoRequest. + */ + public NodesDataFusionInfoRequest() { + super((String[]) null); + } + + /** + * Constructor for NodesDataFusionInfoRequest with specific node IDs. + * @param nodeIds The node IDs to query. + */ + public NodesDataFusionInfoRequest(String... nodeIds) { + super(nodeIds); + } + + /** + * Constructor for NodesDataFusionInfoRequest from stream input. + * @param in The stream input. + * @throws IOException If an I/O error occurs. + */ + public NodesDataFusionInfoRequest(StreamInput in) throws IOException { + super(in); + } + + /** + * Writes the request to the stream output. + * @param out The stream output. + * @throws IOException If an I/O error occurs. + */ + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + } + + + /** + * Node-level request for DataFusion information + */ + public static class NodeDataFusionInfoRequest extends org.opensearch.transport.TransportRequest { + + /** + * Default constructor for NodeDataFusionInfoRequest. + */ + public NodeDataFusionInfoRequest() {} + + /** + * Constructor for NodeDataFusionInfoRequest from stream input. + * @param in The stream input. + * @throws IOException If an I/O error occurs. 
+ */ + public NodeDataFusionInfoRequest(StreamInput in) throws IOException { + super(in); + } + } +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java new file mode 100644 index 0000000000000..5c14455da1622 --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java @@ -0,0 +1,98 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.action; + +import org.opensearch.action.FailedNodeException; +import org.opensearch.action.support.nodes.BaseNodesResponse; +import org.opensearch.cluster.ClusterName; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.core.common.io.stream.StreamOutput; +import org.opensearch.core.xcontent.ToXContentFragment; +import org.opensearch.core.xcontent.ToXContentObject; +import org.opensearch.core.xcontent.XContentBuilder; + +import java.io.IOException; +import java.util.List; + +/** + * Response containing DataFusion information from multiple nodes + */ +public class NodesDataFusionInfoResponse extends BaseNodesResponse implements ToXContentObject { + + /** + * Constructor for NodesDataFusionInfoResponse. + * @param clusterName The cluster name. + * @param nodes The list of node DataFusion info. + * @param failures The list of failed node exceptions. 
+ */ + public NodesDataFusionInfoResponse( + ClusterName clusterName, + List nodes, + List failures + ) { + super(clusterName, nodes, failures); + } + + @Override + protected List readNodesFrom(StreamInput in) throws IOException { + return in.readList(NodeDataFusionInfo::new); + } + + /** + * Constructor for NodesDataFusionInfoResponse from stream input. + * @param in The stream input. + * @throws IOException If an I/O error occurs. + */ + public NodesDataFusionInfoResponse(StreamInput in) throws IOException { + super(in); + } + + /** + * Writes the node response to stream output. + * @param out The stream output. + * @throws IOException If an I/O error occurs. + */ + @Override + protected void writeNodesTo(StreamOutput out, List nodes) throws IOException { + out.writeList(nodes); + } + + /** + * Converts the response to XContent. + * @param builder The XContent builder. + * @param params The parameters. + * @return The XContent builder. + * @throws IOException If an I/O error occurs. + */ + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + builder.startObject("nodes"); + for (NodeDataFusionInfo nodeInfo : getNodes()) { + builder.field(nodeInfo.getNode().getId()); +// builder.field("name", nodeInfo.getNode().getName()); +// builder.field("transport_address", nodeInfo.getNode().getAddress().toString()); + nodeInfo.toXContent(builder, params); + } + builder.endObject(); + + if (!failures().isEmpty()) { + builder.startArray("failures"); + for (FailedNodeException failure : failures()) { + builder.startObject(); + builder.field("node_id", failure.nodeId()); + builder.field("reason", failure.getMessage()); + builder.endObject(); + } + builder.endArray(); + } + builder.endObject(); + return builder; + } +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/TransportNodesDataFusionInfoAction.java 
b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/TransportNodesDataFusionInfoAction.java new file mode 100644 index 0000000000000..1ba5fd9af3210 --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/TransportNodesDataFusionInfoAction.java @@ -0,0 +1,115 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.action; + +import org.opensearch.action.FailedNodeException; +import org.opensearch.action.support.ActionFilters; +import org.opensearch.action.support.nodes.TransportNodesAction; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.inject.Inject; +import org.opensearch.core.common.io.stream.StreamInput; +import org.opensearch.datafusion.DataFusionService; +import org.opensearch.threadpool.ThreadPool; +import org.opensearch.transport.TransportService; + +import java.io.IOException; +import java.util.List; + +/** + * Transport action for retrieving DataFusion information from nodes + */ +public class TransportNodesDataFusionInfoAction extends TransportNodesAction< + NodesDataFusionInfoRequest, + NodesDataFusionInfoResponse, + NodesDataFusionInfoRequest.NodeDataFusionInfoRequest, + NodeDataFusionInfo> { + + private final DataFusionService dataFusionService; + + /** + * Constructor for TransportNodesDataFusionInfoAction. + * @param threadPool The thread pool. + * @param clusterService The cluster service. + * @param transportService The transport service. + * @param actionFilters The action filters. + * @param dataFusionService The DataFusion service. 
+ */ + @Inject + public TransportNodesDataFusionInfoAction( + ThreadPool threadPool, + ClusterService clusterService, + TransportService transportService, + ActionFilters actionFilters, + DataFusionService dataFusionService + ) { + super( + NodesDataFusionInfoAction.NAME, + threadPool, + clusterService, + transportService, + actionFilters, + NodesDataFusionInfoRequest::new, + NodesDataFusionInfoRequest.NodeDataFusionInfoRequest::new, + ThreadPool.Names.MANAGEMENT, + NodeDataFusionInfo.class + ); + this.dataFusionService = dataFusionService; + } + + /** + * Creates a new nodes response. + * @param request The nodes request. + * @param responses The list of node responses. + * @param failures The list of failed node exceptions. + * @return The nodes response. + */ + @Override + protected NodesDataFusionInfoResponse newResponse( + NodesDataFusionInfoRequest request, + List responses, + List failures + ) { + return new NodesDataFusionInfoResponse(clusterService.getClusterName(), responses, failures); + } + + /** + * Creates a new node request. + * @param request The nodes request. + * @return The node request. + */ + @Override + protected NodesDataFusionInfoRequest.NodeDataFusionInfoRequest newNodeRequest(NodesDataFusionInfoRequest request) { + return new NodesDataFusionInfoRequest.NodeDataFusionInfoRequest(); + } + + @Override + protected NodeDataFusionInfo newNodeResponse(StreamInput in) throws IOException { + return new NodeDataFusionInfo(in); + } + + /** + * Handles the node request and returns the node response. + * @param request The node request. + * @return The node response. 
+ */ + @Override + protected NodeDataFusionInfo nodeOperation(NodesDataFusionInfoRequest.NodeDataFusionInfoRequest request) { + try { + return new NodeDataFusionInfo( + clusterService.localNode(), + dataFusionService.getVersion() + ); + } catch (Exception e) { + return new NodeDataFusionInfo( + clusterService.localNode(), + "unknown" + ); + } + } +}
diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/SessionContext.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/SessionContext.java
new file mode 100644
index 0000000000000..58a750351fe3c
--- /dev/null
+++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/SessionContext.java
@@ -0,0 +1,59 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.datafusion.core;
+/**
+ * Session context for datafusion.
+ * <p>
+ * Holds the pointer returned by {@link #createContext()} and passes it to
+ * {@link #closeContext(long)} at most once, on the first call to {@link #close()}.
+ */
+public class SessionContext implements AutoCloseable {
+
+    // ptr to context in df
+    private final long ptr;
+
+    // Set once ptr has been handed to closeContext; only touched inside the synchronized close().
+    private boolean closed;
+
+    /**
+     * Create a new DataFusion session context
+     * @return context ID for subsequent operations
+     */
+    static native long createContext();
+
+    /**
+     * Close and cleanup a DataFusion context
+     * @param contextId the context ID to close
+     */
+    public static native void closeContext(long contextId);
+
+    /**
+     * Creates a session context backed by a new native context.
+     */
+    public SessionContext() {
+        this.ptr = createContext();
+    }
+
+    /**
+     * Closes the native context. Only the first call reaches native code; repeated
+     * calls return immediately, so this instance never passes the same pointer to
+     * {@link #closeContext(long)} twice.
+     *
+     * @throws Exception declared by the existing signature; this method adds no checked exception of its own
+     */
+    @Override
+    public synchronized void close() throws Exception {
+        if (closed) {
+            return;
+        }
+        // Mark closed before the native call so a failure there is not retried with the same pointer.
+        closed = true;
+        closeContext(this.ptr);
+    }
+}
diff --git a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTest.java b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTest.java new file mode 100644 index 0000000000000..af39b70fcab13 --- /dev/null +++ b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTest.java @@ -0,0 +1,60 @@ +/* + *
SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion; + +import org.junit.Before; +import org.junit.Test; +import org.junit.Assume; +import org.opensearch.datafusion.core.SessionContext; + +import static org.junit.Assert.*; + +/** + * Unit tests for DataFusionService + * + * Note: These tests require the native library to be available. + * They are disabled by default and can be enabled by setting the system property: + * -Dtest.native.enabled=true + */ +public class DataFusionServiceTest { + + private DataFusionService service; + + @Before + public void setUp() { + service = new DataFusionService(); + service.doStart(); + } + + @Test + public void testGetVersion() { + String version = service.getVersion(); + assertNotNull(version); + assertTrue(version.contains("datafusion_version")); + assertTrue(version.contains("arrow_version")); + } + + @Test + public void testCreateAndCloseContext() { + // Create context + long contextId = service.createContext(); + assertTrue(contextId > 0); + + // Verify context exists + SessionContext context = service.getContext(contextId); + assertNotNull(context); + + // Close context + boolean closed = service.closeContext(contextId); + assertTrue(closed); + + // Verify context is gone + assertNull(service.getContext(contextId)); + } +} From bdbd927da6baf997fac22ff2850632479cf910fc Mon Sep 17 00:00:00 2001 From: bharath-techie Date: Fri, 8 Aug 2025 00:01:15 +0530 Subject: [PATCH 2/3] Add extensions for csv codec Signed-off-by: bharath-techie --- libs/dataformat-csv/build.gradle | 84 ++++++++ libs/dataformat-csv/jni/Cargo.toml | 53 +++++ libs/dataformat-csv/jni/src/context.rs | 70 +++++++ libs/dataformat-csv/jni/src/csv_exec.rs | 24 +++ libs/dataformat-csv/jni/src/lib.rs | 198 ++++++++++++++++++ libs/dataformat-csv/jni/src/runtime.rs | 27 +++ 
libs/dataformat-csv/jni/src/stream.rs | 43 ++++ libs/dataformat-csv/jni/src/substrait.rs | 37 ++++ libs/dataformat-csv/jni/src/util.rs | 63 ++++++ .../datafusion/csv/CsvDataSourceCodec.java | 142 +++++++++++++ .../datafusion/csv/CsvRecordBatchStream.java | 119 +++++++++++ .../datafusion/csv/JniLibraryLoader.java | 151 +++++++++++++ ....opensearch.datafusion.spi.DataSourceCodec | 1 + plugins/engine-datafusion/build.gradle | 3 + plugins/engine-datafusion/jni/Cargo.toml | 36 ++-- plugins/engine-datafusion/jni/src/lib.rs | 46 ++++ .../opensearch/datafusion/DataFusionJNI.java | 13 +- .../datafusion/DataFusionPlugin.java | 1 + .../datafusion/DataFusionService.java | 164 +++++++++++---- .../action/NodesDataFusionInfoResponse.java | 1 - .../datafusion/core/GlobalRuntimeEnv.java | 31 +++ .../datafusion/spi/DataSourceCodec.java | 52 +++++ .../datafusion/spi/DataSourceRegistry.java | 120 +++++++++++ .../datafusion/spi/RecordBatchStream.java | 39 ++++ ....opensearch.datafusion.spi.DataSourceCodec | 5 + ...t.java => TestDataFusionServiceTests.java} | 27 ++- 26 files changed, 1479 insertions(+), 71 deletions(-) create mode 100644 libs/dataformat-csv/build.gradle create mode 100644 libs/dataformat-csv/jni/Cargo.toml create mode 100644 libs/dataformat-csv/jni/src/context.rs create mode 100644 libs/dataformat-csv/jni/src/csv_exec.rs create mode 100644 libs/dataformat-csv/jni/src/lib.rs create mode 100644 libs/dataformat-csv/jni/src/runtime.rs create mode 100644 libs/dataformat-csv/jni/src/stream.rs create mode 100644 libs/dataformat-csv/jni/src/substrait.rs create mode 100644 libs/dataformat-csv/jni/src/util.rs create mode 100644 libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java create mode 100644 libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java create mode 100644 libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java create mode 100644 
libs/dataformat-csv/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/GlobalRuntimeEnv.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceCodec.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceRegistry.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/RecordBatchStream.java create mode 100644 plugins/engine-datafusion/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec rename plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/{DataFusionServiceTest.java => TestDataFusionServiceTests.java} (62%)
diff --git a/libs/dataformat-csv/build.gradle b/libs/dataformat-csv/build.gradle
new file mode 100644
index 0000000000000..a6dadddcb3dea
--- /dev/null
+++ b/libs/dataformat-csv/build.gradle
@@ -0,0 +1,84 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+apply plugin: 'opensearch.java'
+
+dependencies {
+    // TODO : circular dependency
+    compileOnly project(':plugins:engine-datafusion')
+
+    implementation "org.apache.logging.log4j:log4j-api:${versions.log4j}"
+    implementation "org.apache.logging.log4j:log4j-core:${versions.log4j}"
+
+    testImplementation "junit:junit:${versions.junit}"
+}
+
+// Task to build the Rust JNI library
+task buildRustLibrary(type: Exec) {
+    description = 'Build the Rust JNI library using Cargo'
+    group = 'build'
+
+    workingDir file('jni')
+    def osName = System.getProperty('os.name').toLowerCase(Locale.ROOT)
+    def libPrefix = osName.contains('windows') ? '' : 'lib'
+    def libExtension = osName.contains('windows') ? '.dll' : (osName.contains('mac') ? '.dylib' : '.so')
+
+    def buildType = project.hasProperty('rustRelease') ? 'release' : 'debug'
+    def targetDir = "target/${buildType}"
+
+    def cargoArgs = ['cargo', 'build']
+    if (buildType == 'release') {
+        cargoArgs.add('--release')
+    }
+
+    commandLine cargoArgs
+    environment 'CARGO_TARGET_DIR', file('jni/target').absolutePath
+
+    inputs.files fileTree('jni/src')
+    inputs.file 'jni/Cargo.toml'
+    outputs.files file("jni/${targetDir}/${libPrefix}opensearch_datafusion_csv_jni${libExtension}")
+    // Log when the task actually runs; a bare println here would fire on every configuration.
+    doFirst {
+        logger.lifecycle("Building Rust library in ${buildType} mode")
+    }
+}
+
+task copyNativeLibrary(type: Copy, dependsOn: buildRustLibrary) {
+    description = 'Copy the native library to Java resources'
+    group = 'build'
+
+    def osName = System.getProperty('os.name').toLowerCase(Locale.ROOT)
+    def libPrefix = osName.contains('windows') ? '' : 'lib'
+    def libExtension = osName.contains('windows') ? '.dll' : (osName.contains('mac') ? '.dylib' : '.so')
+    def buildType = project.hasProperty('rustRelease') ? 'release' : 'debug'
+
+    from file("jni/target/${buildType}/${libPrefix}opensearch_datafusion_csv_jni${libExtension}")
+    into file('src/main/resources')
+
+    rename { filename ->
+        "libopensearch_datafusion_csv_jni${libExtension}"
+    }
+}
+
+compileJava.dependsOn copyNativeLibrary
+
+processResources.dependsOn copyNativeLibrary
+
+jar {
+    archiveBaseName = 'opensearch-dataformat-csv-codec'
+    duplicatesStrategy = DuplicatesStrategy.WARN
+    dependsOn copyNativeLibrary
+}
+
+clean {
+    delete file('jni/target')
+    // copyNativeLibrary renames every platform's output to the lib-prefixed name, .dll included.
+    delete file('src/main/resources/libopensearch_datafusion_csv_jni.dylib')
+    delete file('src/main/resources/libopensearch_datafusion_csv_jni.so')
+    delete file('src/main/resources/libopensearch_datafusion_csv_jni.dll')
+}
+
+test {
+    systemProperty 'java.library.path', file('src/main/resources').absolutePath
+}
diff --git a/libs/dataformat-csv/jni/Cargo.toml b/libs/dataformat-csv/jni/Cargo.toml new file mode 100644 index 0000000000000..be5b6c92bfa66 --- /dev/null +++
b/libs/dataformat-csv/jni/Cargo.toml @@ -0,0 +1,53 @@ +[package] +name = "opensearch-datafusion-csv-jni" +version = "0.1.0" +edition = "2021" + +[lib] +name = "opensearch_datafusion_csv_jni" +crate-type = ["cdylib"] + +[dependencies] +# DataFusion dependencies +datafusion = "49.0.0" +datafusion-substrait = "49.0.0" +arrow = "54.0.0" +arrow-array = "54.0.0" +arrow-schema = "54.0.0" +arrow-buffer = "54.0.0" + +# JNI dependencies +jni = "0.21" + +# Async runtime +tokio = { version = "1.0", features = ["full"] } +futures = "0.3" +futures-util = "0.3" + +# Serialization +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" + +# Error handling +anyhow = "1.0" +thiserror = "1.0" + +# Logging +log = "0.4" + +# Parquet support +parquet = "54.0.0" + +# Object store for file access +object_store = "0.11" +url = "2.0" + +# Substrait support +substrait = "0.47" +prost = "0.13" + +# Temporary directory support +tempfile = "3.0" + +[build-dependencies] +cbindgen = "0.27" diff --git a/libs/dataformat-csv/jni/src/context.rs b/libs/dataformat-csv/jni/src/context.rs new file mode 100644 index 0000000000000..0878254479201 --- /dev/null +++ b/libs/dataformat-csv/jni/src/context.rs @@ -0,0 +1,70 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + */ + +use datafusion::prelude::*; +use datafusion::execution::context::SessionContext; +use std::collections::HashMap; +use std::sync::Arc; +use anyhow::Result; + +/// Manages DataFusion session contexts +pub struct SessionContextManager { + contexts: HashMap<*mut SessionContext, Arc>, + next_runtime_id: u64, +} + +impl SessionContextManager { + pub fn new() -> Self { + Self { + contexts: HashMap::new(), + next_runtime_id: 1, + } + } + + pub async fn register_directory( + &mut self, + table_name: &str, + directory_path: &str, + options: HashMap, + ) -> Result { + // Placeholder implementation - would register csv directory as table + log::info!("Registering directory: {} at path: {} with options: {:?}", + table_name, 
directory_path, options); + + let runtime_id = self.next_runtime_id; + self.next_runtime_id += 1; + Ok(runtime_id) + } + + pub async fn create_session_context( + &mut self, + config: HashMap, + ) -> Result<*mut SessionContext> { + // Create actual DataFusion session context + let mut session_config = SessionConfig::new(); + + // Apply configuration options + if let Some(batch_size) = config.get("batch_size") { + if let Ok(size) = batch_size.parse::() { + session_config = session_config.with_batch_size(size); + } + } + + let ctx = Arc::new(SessionContext::new_with_config(session_config)); + let ctx_ptr = Arc::as_ptr(&ctx) as *mut SessionContext; + + self.contexts.insert(ctx_ptr, ctx); + + Ok(ctx_ptr) + } + + pub async fn close_session_context(&mut self, ctx_ptr: *mut SessionContext) -> Result<()> { + self.contexts.remove(&ctx_ptr); + Ok(()) + } + + pub fn get_context(&self, ctx_ptr: *mut SessionContext) -> Option<&Arc> { + self.contexts.get(&ctx_ptr) + } +} diff --git a/libs/dataformat-csv/jni/src/csv_exec.rs b/libs/dataformat-csv/jni/src/csv_exec.rs new file mode 100644 index 0000000000000..2043be331b35a --- /dev/null +++ b/libs/dataformat-csv/jni/src/csv_exec.rs @@ -0,0 +1,24 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + */ + +use anyhow::Result; + +/// Csv-specific execution utilities - placeholder implementation +pub struct CsvExecutor; + +impl CsvExecutor { + pub fn new() -> Self { + Self + } + + /// Create a listing table for Csv files - placeholder + pub async fn create_csv_table( + &self, + table_path: &str, + ) -> Result { + // Placeholder implementation + log::info!("Creating csv table for path: {}", table_path); + Ok(1) // Return dummy table ID + } +} diff --git a/libs/dataformat-csv/jni/src/lib.rs b/libs/dataformat-csv/jni/src/lib.rs new file mode 100644 index 0000000000000..34618f94a9372 --- /dev/null +++ b/libs/dataformat-csv/jni/src/lib.rs @@ -0,0 +1,198 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require 
contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +//! OpenSearch DataFusion Csv JNI Library +//! +//! This library provides JNI bindings for DataFusion query execution, + +use jni::JNIEnv; +use jni::objects::{JClass, JString, JObjectArray, JByteArray}; +use jni::sys::{jlong, jstring}; +use std::ptr; +use std::collections::HashMap; + +mod context; +mod runtime; +mod stream; +mod substrait; +mod util; +mod csv_exec; + +use context::SessionContextManager; +use runtime::RuntimeManager; +use stream::RecordBatchStreamWrapper; +use substrait::SubstraitExecutor; +use datafusion::execution::context::SessionContext; +use datafusion::execution::runtime_env::RuntimeEnv; + +/** +TODO : Put more thought into this +**/ +static mut RUNTIME_MANAGER: Option = None; + +static mut SESSION_MANAGER: Option = None; + +/// Initialize the managers (call once) +fn init_managers() { + unsafe { + if RUNTIME_MANAGER.is_none() { + RUNTIME_MANAGER = Some(RuntimeManager::new()); + } + if SESSION_MANAGER.is_none() { + SESSION_MANAGER = Some(SessionContextManager::new()); + } + } +} +static mut RUNTIME_ENVIRONMENTS: Option> = None; + + +/// Register a directory as a table in the global context and return runtime environment ID +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_csv_CsvDataSourceCodec_nativeRegisterDirectory( + mut env: JNIEnv, + _class: JClass, + table_name: JString, + directory_path: JString, + files: JObjectArray, + runtime_id: jlong +) { + let runtimeEnv = unsafe { &mut *(runtime_id as *mut RuntimeEnv) }; + // placeholder +} + +/// Create a new session context +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_csv_CsvDataSourceCodec_nativeCreateSessionContext( + mut env: JNIEnv, + _class: JClass, + config_keys: JObjectArray, + config_values: JObjectArray, +) -> jlong { + // Initialize managers if not already done + init_managers(); + + // PLACEHOLDER + // Parse 
configuration from JNI arrays + let config = match util::parse_string_map(&mut env, config_keys, config_values) { + Ok(cfg) => cfg, + Err(e) => { + util::throw_exception(&mut env, &format!("Failed to parse config: {}", e)); + return 0; + } + }; + + // Create session context + match unsafe { + RUNTIME_MANAGER.as_ref().unwrap().block_on(async { + SESSION_MANAGER.as_mut().unwrap().create_session_context(config).await + }) + } { + Ok(context_ptr) => context_ptr as jlong, + Err(e) => { + util::throw_exception(&mut env, &format!("Failed to create session context: {}", e)); + 0 + } + } +} + +/// Execute a Substrait query plan +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_csv_CsvDataSourceCodec_nativeExecuteSubstraitQuery( + mut env: JNIEnv, + _class: JClass, + session_context_ptr: jlong, + substrait_plan: JByteArray, +) -> jlong { + + // Convert JByteArray to Vec + let substrait_plan_bytes = match env.convert_byte_array(substrait_plan) { + Ok(bytes) => bytes, + Err(e) => { + util::throw_exception(&mut env, &format!("Failed to convert substrait plan: {}", e)); + return 0; + } + }; + + // Execute the query + match unsafe { + RUNTIME_MANAGER.as_ref().unwrap().block_on(async { + let executor = SubstraitExecutor::new(); + executor.execute_plan(session_context_ptr as *mut SessionContext, &substrait_plan_bytes).await + }) + } { + Ok(stream_ptr) => stream_ptr as jlong, + Err(e) => { + util::throw_exception(&mut env, &format!("Failed to execute query: {}", e)); + 0 + } + } +} + +/// Close a session context +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_csv_CsvDataSourceCodec_nativeCloseSessionContext( + mut env: JNIEnv, + _class: JClass, + session_context_ptr: jlong, +) { + + if let Err(e) = unsafe { + RUNTIME_MANAGER.as_ref().unwrap().block_on(async { + SESSION_MANAGER.as_mut().unwrap() + .close_session_context(session_context_ptr as *mut SessionContext) + .await + }) + } { + util::throw_exception(&mut env, &format!("Failed to close 
session context: {}", e)); + } +} + +/// Get the next record batch from a stream +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_csv_CsvRecordBatchStream_nativeNextBatch( + mut env: JNIEnv, + _class: JClass, + stream_ptr: jlong, +) -> jstring { + + let stream = unsafe { &mut *(stream_ptr as *mut RecordBatchStreamWrapper) }; + + match unsafe { + RUNTIME_MANAGER.as_ref().unwrap().block_on(async { + stream.next_batch().await + }) + } { + Ok(Some(batch_json)) => { + match env.new_string(&batch_json) { + Ok(jstr) => jstr.into_raw(), + Err(e) => { + util::throw_exception(&mut env, &format!("Failed to create Java string: {}", e)); + ptr::null_mut() + } + } + } + Ok(None) => ptr::null_mut(), // End of stream + Err(e) => { + util::throw_exception(&mut env, &format!("Failed to get next batch: {}", e)); + ptr::null_mut() + } + } +} + +/// Close a record batch stream +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_csv_CsvRecordBatchStream_nativeCloseStream( + _env: JNIEnv, + _class: JClass, + stream_ptr: jlong, +) { + if stream_ptr != 0 { + let stream = unsafe { Box::from_raw(stream_ptr as *mut RecordBatchStreamWrapper) }; + drop(stream); + } +} diff --git a/libs/dataformat-csv/jni/src/runtime.rs b/libs/dataformat-csv/jni/src/runtime.rs new file mode 100644 index 0000000000000..bcd48a7dee58b --- /dev/null +++ b/libs/dataformat-csv/jni/src/runtime.rs @@ -0,0 +1,27 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + */ + +use tokio::runtime::Runtime; +use std::future::Future; + +/// Manages the Tokio runtime for async operations +pub struct RuntimeManager { + runtime: Runtime, +} + +impl RuntimeManager { + pub fn new() -> Self { + // Placeholder + + let runtime = Runtime::new().expect("Failed to create Tokio runtime"); + Self { runtime } + } + + pub fn block_on(&self, future: F) -> F::Output + where + F: Future, + { + self.runtime.block_on(future) + } +} diff --git a/libs/dataformat-csv/jni/src/stream.rs 
b/libs/dataformat-csv/jni/src/stream.rs new file mode 100644 index 0000000000000..2fe30f941223b --- /dev/null +++ b/libs/dataformat-csv/jni/src/stream.rs @@ -0,0 +1,43 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + */ + +use anyhow::Result; +use serde_json; + +/// Wrapper for DataFusion record batch streams - placeholder implementation +pub struct RecordBatchStreamWrapper { + batch_count: u32, + is_placeholder: bool, +} + +impl RecordBatchStreamWrapper { + pub fn new_placeholder() -> Self { + Self { + batch_count: 0, + is_placeholder: true, + } + } + + pub async fn next_batch(&mut self) -> Result> { + // Return placeholder data for first few calls, then None + if self.is_placeholder { + if self.batch_count < 2 { + self.batch_count += 1; + let placeholder_data = serde_json::json!({ + "rows": [ + {"id": self.batch_count, "name": format!("placeholder_row_{}", self.batch_count)} + ], + "num_rows": 1, + "num_columns": 2 + }); + Ok(Some(serde_json::to_string(&placeholder_data)?)) + } else { + Ok(None) // End of stream + } + } else { + // Real implementation would go here + Ok(None) + } + } +} diff --git a/libs/dataformat-csv/jni/src/substrait.rs b/libs/dataformat-csv/jni/src/substrait.rs new file mode 100644 index 0000000000000..d8ca0f2846fd7 --- /dev/null +++ b/libs/dataformat-csv/jni/src/substrait.rs @@ -0,0 +1,37 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + */ + +use datafusion::execution::context::SessionContext; +use crate::stream::RecordBatchStreamWrapper; +use anyhow::Result; + +/// Executes Substrait query plans +pub struct SubstraitExecutor; + +impl SubstraitExecutor { + pub fn new() -> Self { + Self + } + + pub async fn execute_plan( + &self, + session_context_ptr: *mut SessionContext, + substrait_plan_bytes: &[u8], + ) -> Result<*mut RecordBatchStreamWrapper> { + // Placeholder implementation - would normally: + // 1. Parse Substrait plan from substrait_plan_bytes + // 2. Convert to DataFusion logical plan using datafusion-substrait + // 3. 
Execute using the session context + // 4. Return actual record batch stream + + log::info!("Executing Substrait plan with {} bytes for session: {:?}", + substrait_plan_bytes.len(), session_context_ptr); + + // For now, return a placeholder stream + let wrapper = RecordBatchStreamWrapper::new_placeholder(); + let wrapper_ptr = Box::into_raw(Box::new(wrapper)); + + Ok(wrapper_ptr) + } +}
diff --git a/libs/dataformat-csv/jni/src/util.rs b/libs/dataformat-csv/jni/src/util.rs
new file mode 100644
index 0000000000000..5055c1312791a
--- /dev/null
+++ b/libs/dataformat-csv/jni/src/util.rs
@@ -0,0 +1,63 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+use jni::JNIEnv;
+use jni::objects::{JObjectArray, JString};
+use std::collections::HashMap;
+use anyhow::Result;
+
+/// Parse a string map from JNI arrays
+pub fn parse_string_map(
+    env: &mut JNIEnv,
+    keys: JObjectArray,
+    values: JObjectArray,
+) -> Result<HashMap<String, String>> {
+    let mut map = HashMap::new();
+
+    let keys_len = env.get_array_length(&keys)?;
+    let values_len = env.get_array_length(&values)?;
+
+    if keys_len != values_len {
+        return Err(anyhow::anyhow!("Keys and values arrays must have the same length"));
+    }
+
+    for i in 0..keys_len {
+        let key_obj = env.get_object_array_element(&keys, i)?;
+        let value_obj = env.get_object_array_element(&values, i)?;
+
+        let key_jstring = JString::from(key_obj);
+        let value_jstring = JString::from(value_obj);
+
+        let key_str = env.get_string(&key_jstring)?;
+        let value_str = env.get_string(&value_jstring)?;
+
+        map.insert(key_str.to_string_lossy().to_string(), value_str.to_string_lossy().to_string());
+    }
+
+    Ok(map)
+}
+
+/// Parse a JNI string array into a vector of Rust strings.
+///
+/// JNI failures are returned as `Err`, like `parse_string_map`, instead of panicking
+/// through `unwrap`/`expect`, so the caller decides how to report them.
+pub fn parse_string_arr(
+    env: &mut JNIEnv,
+    files: JObjectArray,
+) -> Result<Vec<String>> {
+    let length = env.get_array_length(&files)?;
+    let mut rust_strings: Vec<String> = Vec::with_capacity(length as usize);
+    for i in 0..length {
+        let file_obj = env.get_object_array_element(&files, i)?;
+        let jstring = JString::from(file_obj);
+        let rust_str: String = env.get_string(&jstring)?.into();
+        rust_strings.push(rust_str);
+    }
+    Ok(rust_strings)
+}
+
+/// Throw a Java exception
+pub fn throw_exception(env: &mut JNIEnv, message: &str) {
+    let _ = env.throw_new("java/lang/RuntimeException", message);
+}
diff --git a/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java b/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java new file mode 100644 index 0000000000000..ea796c6b14ef2 --- /dev/null +++ b/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java @@ -0,0 +1,142 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.csv; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.datafusion.spi.DataSourceCodec; +import org.opensearch.datafusion.spi.RecordBatchStream; + +import java.util.List; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Datasource codec implementation for CSV files + */ +public class CsvDataSourceCodec implements DataSourceCodec { + + private static final Logger logger = LogManager.getLogger(CsvDataSourceCodec.class); + private static final AtomicLong runtimeIdGenerator = new AtomicLong(0); + private static final AtomicLong sessionIdGenerator = new AtomicLong(0); + private final ConcurrentHashMap sessionContexts = new ConcurrentHashMap<>(); + + // JNI library loading + static { + try { + JniLibraryLoader.loadLibrary(); + logger.info("DataFusion JNI library loaded successfully"); + } catch (Exception e) { +
logger.error("Failed to load DataFusion JNI library", e); + throw new RuntimeException("Failed to initialize DataFusion JNI library", e); + } + } + + @Override + public CompletableFuture registerDirectory(String directoryPath, List fileNames, long runtimeId) { + return CompletableFuture.supplyAsync(() -> { + try { + logger.debug("Registering directory: {} with {} files", directoryPath, fileNames.size()); + + // Convert file names to arrays for JNI + String[] fileArray = fileNames.toArray(new String[0]); + + // Call native method to register directory + nativeRegisterDirectory("csv_table", directoryPath, fileArray, runtimeId); + return null; + } catch (Exception e) { + logger.error("Failed to register directory: " + directoryPath, e); + throw new CompletionException("Failed to register directory", e); + } + }); + } + + @Override + public CompletableFuture createSessionContext(long globalRuntimeEnvId) { + return CompletableFuture.supplyAsync(() -> { + try { + long sessionId = sessionIdGenerator.incrementAndGet(); + logger.debug("Creating session context with ID: {} for runtime: {}", sessionId, globalRuntimeEnvId); + + // Default configuration + String[] configKeys = { "batch_size", "target_partitions" }; + String[] configValues = { "1024", "4" }; + + // Create native session context + long nativeContextPtr = nativeCreateSessionContext(configKeys, configValues); + sessionContexts.put(sessionId, nativeContextPtr); + + logger.info("Created session context with ID: {}", sessionId); + return sessionId; + } catch (Exception e) { + logger.error("Failed to create session context for runtime: " + globalRuntimeEnvId, e); + throw new CompletionException("Failed to create session context", e); + } + }); + } + + @Override + public CompletableFuture executeSubstraitQuery(long sessionContextId, byte[] substraitPlanBytes) { + return CompletableFuture.supplyAsync(() -> { + try { + logger.debug("Executing Substrait query for session: {}", sessionContextId); + + Long nativeContextPtr = 
sessionContexts.get(sessionContextId); + if (nativeContextPtr == null) { + throw new IllegalArgumentException("Invalid session context ID: " + sessionContextId); + } + + // Execute query and get native stream pointer + long nativeStreamPtr = nativeExecuteSubstraitQuery(nativeContextPtr, substraitPlanBytes); + + // Create Java wrapper for the native stream + RecordBatchStream stream = new CsvRecordBatchStream(nativeStreamPtr); + + logger.info("Successfully executed Substrait query for session: {}", sessionContextId); + return stream; + } catch (Exception e) { + logger.error("Failed to execute Substrait query for session: " + sessionContextId, e); + throw new CompletionException("Failed to execute Substrait query", e); + } + }); + } + + @Override + public CompletableFuture closeSessionContext(long sessionContextId) { + return CompletableFuture.supplyAsync(() -> { + try { + logger.debug("Closing session context: {}", sessionContextId); + + Long nativeContextPtr = sessionContexts.remove(sessionContextId); + if (nativeContextPtr != null) { + nativeCloseSessionContext(nativeContextPtr); + logger.info("Successfully closed session context: {}", sessionContextId); + } else { + logger.warn("Session context not found: {}", sessionContextId); + } + + return null; + } catch (Exception e) { + logger.error("Failed to close session context: " + sessionContextId, e); + throw new CompletionException("Failed to close session context", e); + } + }); + } + + // Native method declarations - these will be implemented in the JNI library + private static native void nativeRegisterDirectory(String tableName, String directoryPath, String[] files, long runtimeId); + + private static native long nativeCreateSessionContext(String[] configKeys, String[] configValues); + + private static native long nativeExecuteSubstraitQuery(long sessionContextPtr, byte[] substraitPlan); + + private static native void nativeCloseSessionContext(long sessionContextPtr); +} diff --git 
a/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java b/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java new file mode 100644 index 0000000000000..16feb1149885b --- /dev/null +++ b/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java @@ -0,0 +1,129 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.csv; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.datafusion.spi.RecordBatchStream; + +import java.util.concurrent.CompletableFuture; + +/** + * TODO : this need not be here - nothing specific to CSV + * Native implementation of RecordBatchStream that wraps a JNI stream pointer. + * This class provides a Java interface over native DataFusion record batches. + */ +public class CsvRecordBatchStream implements RecordBatchStream { + + private static final Logger logger = LogManager.getLogger(CsvRecordBatchStream.class); + + private final long nativeStreamPtr; + private volatile boolean closed = false; + private volatile boolean hasNextCached = false; + private volatile boolean hasNextValue = false; + // Batch fetched by hasNext() and not yet handed out by next(); null when nothing is buffered + private volatile String peekedBatch; + + /** + * Creates a new CsvRecordBatchStream wrapping the given native stream pointer.
+ * + * @param nativeStreamPtr Pointer to the native DataFusion RecordBatch stream + */ + public CsvRecordBatchStream(long nativeStreamPtr) { + if (nativeStreamPtr == 0) { + throw new IllegalArgumentException("Invalid native stream pointer"); + } + this.nativeStreamPtr = nativeStreamPtr; + logger.debug("Created CsvRecordBatchStream with pointer: {}", nativeStreamPtr); + } + + @Override + public Object getSchema() { + return "CsvSchema"; // Placeholder + } + + @Override + public CompletableFuture next() { + // PlaceholderImpl + return CompletableFuture.supplyAsync(() -> { + if (closed) { + return null; + } + + try { + String batch; + if (hasNextCached) { + // hasNext() already pulled this batch off the native stream; return it instead of dropping it + batch = peekedBatch; + peekedBatch = null; + } else { + // Get the next batch from native code + batch = nativeNextBatch(nativeStreamPtr); + } + + // Reset cached hasNext value since we consumed a batch + hasNextCached = false; + + logger.trace("Retrieved next batch from stream pointer: {}", nativeStreamPtr); + return batch; + } catch (Exception e) { + logger.error("Error getting next batch from stream", e); + return null; + } + }); + } + + @Override + public boolean hasNext() { + // Placeholder impl + if (closed) { + return false; + } + + if (hasNextCached) { + return hasNextValue; + } + + try { + // Pull the next batch from the native stream and buffer it so that the + // following next() call returns it; nativeNextBatch consumes the batch, + // so it has to be kept here rather than discarded + String nextBatch = nativeNextBatch(nativeStreamPtr); + peekedBatch = nextBatch; + hasNextValue = (nextBatch != null); + hasNextCached = true; + + logger.trace("hasNext() = {} for stream pointer: {}", hasNextValue, nativeStreamPtr); + return hasNextValue; + } catch (Exception e) { + logger.error("Error checking for next batch in stream", e); + return false; + } + } + + @Override + public void close() { + if (!closed) { + logger.debug("Closing CsvRecordBatchStream with pointer: {}", nativeStreamPtr); + try { + nativeCloseStream(nativeStreamPtr); + closed = true; + logger.debug("Successfully closed CsvRecordBatchStream"); + } catch (Exception e) { + logger.error("Error
closing CsvRecordBatchStream", e); + throw e; + } + } + } + + // Native method declarations + private static native String nativeNextBatch(long streamPtr); + + private static native void nativeCloseStream(long streamPtr); +} diff --git a/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java b/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java new file mode 100644 index 0000000000000..49fb8d9b79c13 --- /dev/null +++ b/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java @@ -0,0 +1,151 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.csv; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; + +/** + * Utility class for loading the data source JNI library. + */ +public class JniLibraryLoader { + + private static final Logger logger = LogManager.getLogger(JniLibraryLoader.class); + private static volatile boolean libraryLoaded = false; + + private static final String LIBRARY_NAME = "opensearch_datafusion_csv_jni"; + + /** + * Loads the DataFusion JNI library. This method is thread-safe and will only + * load the library once. 
+ */ + public static synchronized void loadLibrary() { + if (libraryLoaded) { + return; + } + + try { + // First try to load from system library path + System.loadLibrary(LIBRARY_NAME); + logger.info("Loaded DataFusion JNI library from system path"); + libraryLoaded = true; + return; + } catch (UnsatisfiedLinkError e) { + logger.debug("Could not load library from system path, trying to extract from JAR", e); + } + + // Try to extract and load from JAR resources + String libraryPath = extractLibraryFromJar(); + if (libraryPath != null) { + try { + System.load(libraryPath); + logger.info("Loaded DataFusion JNI library from extracted path: {}", libraryPath); + libraryLoaded = true; + return; + } catch (UnsatisfiedLinkError e) { + logger.error("Failed to load extracted library from: " + libraryPath, e); + } + } + + throw new RuntimeException("Failed to load DataFusion JNI library"); + } + + /** + * Extracts the platform-specific JNI library from JAR resources to a temporary file. + * + * @return Path to the extracted library file, or null if extraction failed + */ + private static String extractLibraryFromJar() { + String osName = System.getProperty("os.name").toLowerCase(); + String osArch = System.getProperty("os.arch").toLowerCase(); + + logger.debug("Detecting platform: OS={}, Arch={}", osName, osArch); + + String libraryFileName = getLibraryFileName(osName); + if (libraryFileName == null) { + logger.error("Unsupported platform: {}", osName); + return null; + } + + String resourcePath = "/" + libraryFileName; + logger.debug("Looking for library resource: {}", resourcePath); + + try (InputStream inputStream = JniLibraryLoader.class.getResourceAsStream(resourcePath)) { + if (inputStream == null) { + logger.error("Library resource not found: {}", resourcePath); + return null; + } + + // Create temporary file + Path tempDir = Files.createTempDirectory("datafusion-jni"); + Path tempLibrary = tempDir.resolve(libraryFileName); + + // Extract library to temporary file + 
Files.copy(inputStream, tempLibrary, StandardCopyOption.REPLACE_EXISTING); + + // Make executable on Unix-like systems + if (!osName.contains("windows")) { + tempLibrary.toFile().setExecutable(true); + } + + // Schedule cleanup on JVM shutdown; deleteOnExit() deletes in reverse registration order, so the directory is registered first and is empty by the time it is removed + tempDir.toFile().deleteOnExit(); + tempLibrary.toFile().deleteOnExit(); + + String libraryPath = tempLibrary.toAbsolutePath().toString(); + logger.debug("Extracted library to: {}", libraryPath); + return libraryPath; + + } catch (IOException e) { + logger.error("Failed to extract library from JAR", e); + return null; + } + } + + /** + * Gets the platform-specific library file name. + * + * @param osName Operating system name + * @return Library file name, or null if platform is unsupported + */ + private static String getLibraryFileName(String osName) { + String prefix; + String extension; + + if (osName.contains("windows")) { + prefix = ""; + extension = ".dll"; + } else if (osName.contains("mac") || osName.contains("darwin")) { + prefix = "lib"; + extension = ".dylib"; + } else if (osName.contains("linux") || osName.contains("unix")) { + prefix = "lib"; + extension = ".so"; + } else { + return null; + } + + return prefix + LIBRARY_NAME + extension; + } + + /** + * Checks if the JNI library has been loaded.
+ * + * @return true if the library is loaded, false otherwise + */ + public static boolean isLibraryLoaded() { + return libraryLoaded; + } +} diff --git a/libs/dataformat-csv/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec b/libs/dataformat-csv/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec new file mode 100644 index 0000000000000..452b39dc4abf7 --- /dev/null +++ b/libs/dataformat-csv/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec @@ -0,0 +1 @@ +org.opensearch.datafusion.csv.CsvDataSourceCodec diff --git a/plugins/engine-datafusion/build.gradle b/plugins/engine-datafusion/build.gradle index afd994a9eea3a..53989c99a13f1 100644 --- a/plugins/engine-datafusion/build.gradle +++ b/plugins/engine-datafusion/build.gradle @@ -32,6 +32,9 @@ dependencies { testImplementation "junit:junit:${versions.junit}" testImplementation "org.hamcrest:hamcrest:${versions.hamcrest}" testImplementation "org.mockito:mockito-core:${versions.mockito}" + + // Add CSV codec for testing + testImplementation project(':libs:opensearch-dataformat-csv') // TODO : adding implementation results in cycle dependency } // Task to build the Rust JNI library diff --git a/plugins/engine-datafusion/jni/Cargo.toml b/plugins/engine-datafusion/jni/Cargo.toml index e26317758fb69..75097bb55e70c 100644 --- a/plugins/engine-datafusion/jni/Cargo.toml +++ b/plugins/engine-datafusion/jni/Cargo.toml @@ -2,23 +2,25 @@ name = "opensearch-datafusion-jni" version = "0.1.0" edition = "2021" -description = "JNI bindings for DataFusion integration with OpenSearch" -license = "Apache-2.0" [lib] name = "opensearch_datafusion_jni" crate-type = ["cdylib"] [dependencies] +# DataFusion dependencies datafusion = "49.0.0" -arrow = "55.2" -arrow-json = "55.2" +datafusion-substrait = "49.0.0" +arrow = "55.2.0" +arrow-array = "55.2.0" +arrow-schema = "55.2.0" +arrow-buffer = "55.2.0" # JNI dependencies jni = "0.21" # Async runtime 
-tokio = { version = "1.0", features = ["rt", "rt-multi-thread", "macros"] } +tokio = { version = "1.0", features = ["full"] } # Serialization serde = { version = "1.0", features = ["derive"] } @@ -31,13 +33,19 @@ thiserror = "1.0" # Logging log = "0.4" -[profile.release] -lto = true -codegen-units = 1 -panic = "abort" +# Parquet support +parquet = "53.0.0" -[profile.dev] -opt-level = 1 # Some optimization for reasonable performance -lto = false # Disable LTO for faster builds -codegen-units = 16 # More parallel compilation -incremental = true # Enable incremental compilation +# Object store for file access +object_store = "0.11" +url = "2.0" + +# Substrait support +substrait = "0.47" +prost = "0.13" + +# Temporary directory support +tempfile = "3.0" + +[build-dependencies] +cbindgen = "0.27" diff --git a/plugins/engine-datafusion/jni/src/lib.rs b/plugins/engine-datafusion/jni/src/lib.rs index 452a3951dc2fb..d158cea89b7cb 100644 --- a/plugins/engine-datafusion/jni/src/lib.rs +++ b/plugins/engine-datafusion/jni/src/lib.rs @@ -13,6 +13,9 @@ use jni::JNIEnv; use datafusion::execution::context::SessionContext; use datafusion::DATAFUSION_VERSION; +use datafusion::execution::cache::cache_manager::{CacheManager, CacheManagerConfig, FileStatisticsCache}; +use datafusion::execution::disk_manager::DiskManagerConfig; +use datafusion::execution::runtime_env::{RuntimeEnv, RuntimeEnvBuilder}; use datafusion::prelude::SessionConfig; /// Create a new DataFusion session context @@ -45,3 +48,46 @@ pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_getVersion( ) -> jstring { env.new_string(DATAFUSION_VERSION).expect("Couldn't create Java string").as_raw() } + +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_createGlobalRuntime( + _env: JNIEnv, + _class: JClass, +) -> jlong { + let runtime_env = RuntimeEnvBuilder::default().build().unwrap(); + /** + // We can copy global runtime to local runtime - file statistics cache, and most of 
the things + // will be shared across session contexts. But list files cache will be specific to session + // context + + let fsCache = runtimeEnv.clone().cache_manager.get_file_statistic_cache().unwrap(); + let localCacheManagerConfig = CacheManagerConfig::default().with_files_statistics_cache(Option::from(fsCache)); + let localCacheManager = CacheManager::try_new(&localCacheManagerConfig); + let localRuntimeEnv = RuntimeEnvBuilder::new() + .with_cache_manager(localCacheManagerConfig) + .with_disk_manager(DiskManagerConfig::new_existing(runtimeEnv.disk_manager)) + .with_memory_pool(runtimeEnv.memory_pool) + .with_object_store_registry(runtimeEnv.object_store_registry) + .build(); + let config = SessionConfig::new().with_repartition_aggregations(true); + let context = SessionContext::new_with_config(config); + **/ + let ctx = Box::into_raw(Box::new(runtime_env)) as jlong; + ctx +} + +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_closeGlobalRuntime( + _env: JNIEnv, + _class: JClass, + runtime_env_id: jlong, +) { + // Convert raw pointer back to a Box + let _ = unsafe { Box::from_raw(runtime_env_id as *mut RuntimeEnv) }; + // Box automatically drops here, cleaning up the runtime +} + + + + + diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java index 2353fef35d7ad..25bdc353541ea 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java @@ -59,7 +59,6 @@ private static synchronized void loadNativeLibrary() { System.load(tempLib.toAbsolutePath().toString()); libStream.close(); } else { - // Fallback to system library path System.loadLibrary("opensearch_datafusion_jni"); } @@ -69,6 +68,18 @@ private static synchronized void loadNativeLibrary() { } } + /** + * Create a new 
global runtime environment + * @return runtime env pointer for subsequent operations + */ + public static native long createGlobalRuntime(); + + /** + * Closes the global runtime environment and frees its native resources; the pointer must not be used afterwards + * @param pointer runtime env pointer returned by {@link #createGlobalRuntime()} + */ + public static native void closeGlobalRuntime(long pointer); + /** * Get version information * @return JSON string with version information diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java index 5050e831ea895..13d5ca2afb467 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java @@ -39,6 +39,7 @@ /** * Main plugin class for OpenSearch DataFusion integration. + * */ public class DataFusionPlugin extends Plugin implements ActionPlugin { diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java index 17bbb4738db9b..03b678a494d2f 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java @@ -13,9 +13,13 @@ import org.opensearch.common.lifecycle.AbstractLifecycleComponent; import org.opensearch.common.util.concurrent.ConcurrentCollections; import org.opensearch.common.util.concurrent.ConcurrentMapLong; -import org.opensearch.datafusion.core.SessionContext; +import org.opensearch.datafusion.core.GlobalRuntimeEnv; +import org.opensearch.datafusion.spi.DataSourceCodec; +import org.opensearch.datafusion.spi.DataSourceRegistry; +import org.opensearch.datafusion.spi.RecordBatchStream; -import java.util.concurrent.atomic.AtomicLong; +import java.util.List; +import
java.util.concurrent.CompletableFuture; /** * Service for managing DataFusion contexts and operations - essentially like SearchService @@ -23,87 +27,167 @@ public class DataFusionService extends AbstractLifecycleComponent { private static final Logger logger = LogManager.getLogger(DataFusionService.class); + private final ConcurrentMapLong sessionEngines = ConcurrentCollections.newConcurrentMapLongWithAggressiveConcurrency(); - // in memory contexts, similar to ReaderContext in SearchService, just a ptr to SessionContext for now. - private final ConcurrentMapLong contexts = ConcurrentCollections.newConcurrentMapLongWithAggressiveConcurrency(); + private final DataSourceRegistry dataSourceRegistry; + private final GlobalRuntimeEnv globalRuntimeEnv; - private final AtomicLong idGenerator = new AtomicLong(); + public DataFusionService() { + this.dataSourceRegistry = DataSourceRegistry.getInstance(); + String version = DataFusionJNI.getVersion(); + this.globalRuntimeEnv = new GlobalRuntimeEnv(); + } @Override protected void doStart() { logger.info("Starting DataFusion service"); try { - // Test that the native library loads correctly - String version = DataFusionJNI.getVersion(); - logger.info("DataFusion service started successfully. 
Version info: {}", version); + // Initialize the data source registry + dataSourceRegistry.initialize(); + + // Test that at least one data source is available + if (!dataSourceRegistry.hasCodecs()) { + logger.warn("No data sources available"); + } else { + logger.info("DataFusion service started successfully with {} data sources: {}", + dataSourceRegistry.getCodecNames().size(), dataSourceRegistry.getCodecNames()); + + } } catch (Exception e) { logger.error("Failed to start DataFusion service", e); - throw new RuntimeException("Failed to initialize DataFusion JNI", e); + throw new RuntimeException("Failed to initialize DataFusion service", e); } } @Override protected void doStop() { logger.info("Stopping DataFusion service"); - // Close all named contexts - for (SessionContext ctx : contexts.values()) { + + // Close all session contexts + for (Long sessionId : sessionEngines.keySet()) { try { - ctx.close(); + closeSessionContext(sessionId).get(); } catch (Exception e) { - logger.warn("Error closing DataFusion context", e); + logger.warn("Error closing session context {}", sessionId, e); } } - contexts.clear(); + + // Shutdown the engine registry + dataSourceRegistry.shutdown(); + sessionEngines.clear(); + globalRuntimeEnv.close(); logger.info("DataFusion service stopped"); } @Override protected void doClose() { - // Ensure all resources are cleaned up doStop(); } /** - * Create a new named DataFusion context - * @return the context ID + * Register a directory with list of files to create a runtime environment + * with listing files cache of DataFusion + * + * @param directoryPath path to the directory containing files + * @param fileNames list of file names in the directory + * @return runtime environment ID */ - long createContext() { - SessionContext ctx = new SessionContext(); - // just stores the context for now - long id = idGenerator.incrementAndGet(); - SessionContext existing = contexts.put(id, ctx); - assert existing == null; - return id; + public 
CompletableFuture registerDirectory(String directoryPath, List fileNames) { + DataSourceCodec engine = dataSourceRegistry.getDefaultEngine(); + if (engine == null) { + return CompletableFuture.failedFuture( + new IllegalStateException("No DataFusion engine available")); + } + + logger.debug("Registering directory {} with {} files using engine {}", + directoryPath, fileNames.size(), engine.getClass().getSimpleName()); + + return engine.registerDirectory(directoryPath, fileNames, globalRuntimeEnv.getPointer()); } /** - * Get a context by id - * @param id the context id - * @return the context ID, or null if not found + * Create a session context + * + * @return session context ID */ - SessionContext getContext(long id) { - return contexts.get(id); + public CompletableFuture createSessionContext() { + long runtimeEnvironmentId = globalRuntimeEnv.getPointer(); + DataSourceCodec codec = dataSourceRegistry.getDefaultEngine(); + if (codec == null) { + return CompletableFuture.failedFuture( + new IllegalStateException("No DataFusion engine available")); + } + + logger.debug("Creating session context for runtime environment {} using engine {}", + runtimeEnvironmentId, codec.getClass().getSimpleName()); + + return codec.createSessionContext(runtimeEnvironmentId) + .thenApply(sessionId -> { + // Track which engine created this session context + sessionEngines.put(sessionId, codec); + logger.debug("Created session context {} with engine {}", + sessionId, codec.getClass().getSimpleName()); + return sessionId; + }); } /** - * Close a context - * @param contextId the context id - * @return true if the context was found and closed, false otherwise + * Execute a query accepting substrait plan bytes and run via session context + * + * @param sessionContextId the session context ID + * @param substraitPlanBytes the substrait plan as byte array + * @return record batch stream containing query results */ - public boolean closeContext(long contextId) { - try
(SessionContext ignored = contexts.remove(contextId)) { - // do nothing - } catch (Exception e) { - throw new RuntimeException(e); + public CompletableFuture executeSubstraitQuery(long sessionContextId, byte[] substraitPlanBytes) { + DataSourceCodec engine = sessionEngines.get(sessionContextId); + if (engine == null) { + return CompletableFuture.failedFuture( + new IllegalArgumentException("Session context not found: " + sessionContextId)); } - return false; + + logger.debug("Executing substrait query for session {} with plan size {} bytes using engine {}", + sessionContextId, substraitPlanBytes.length, engine.getClass().getSimpleName()); + + return engine.executeSubstraitQuery(sessionContextId, substraitPlanBytes); } /** - * Get version information + * Close the session context and clean up resources + * + * @param sessionContextId the session context ID to close + * @return future that completes when cleanup is done + */ + public CompletableFuture closeSessionContext(long sessionContextId) { + DataSourceCodec engine = sessionEngines.remove(sessionContextId); + if (engine == null) { + logger.debug("Session context {} not found or already closed", sessionContextId); + return CompletableFuture.completedFuture(null); + } + + logger.debug("Closing session context {} using engine {}", + sessionContextId, engine.getClass().getSimpleName()); + + return engine.closeSessionContext(sessionContextId); + } + + /** + * Get version information from available codecs * @return JSON version string */ public String getVersion() { - return DataFusionJNI.getVersion(); + StringBuilder version = new StringBuilder(); + version.append("{\"codecs\":["); + + boolean first = true; + for (String engineName : dataSourceRegistry.getCodecNames()) { + if (!first) { + version.append(","); + } + version.append("{\"name\":\"").append(engineName).append("\"}"); + first = false; + } + + version.append("]}"); + return version.toString(); } } diff --git 
a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java index 5c14455da1622..1c7d99627c7e5 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java @@ -13,7 +13,6 @@ import org.opensearch.cluster.ClusterName; import org.opensearch.core.common.io.stream.StreamInput; import org.opensearch.core.common.io.stream.StreamOutput; -import org.opensearch.core.xcontent.ToXContentFragment; import org.opensearch.core.xcontent.ToXContentObject; import org.opensearch.core.xcontent.XContentBuilder; diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/GlobalRuntimeEnv.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/GlobalRuntimeEnv.java new file mode 100644 index 0000000000000..8380f8ea2dd67 --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/GlobalRuntimeEnv.java @@ -0,0 +1,40 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license.
+ */ + +package org.opensearch.datafusion.core; + +import static org.opensearch.datafusion.DataFusionJNI.closeGlobalRuntime; +import static org.opensearch.datafusion.DataFusionJNI.createGlobalRuntime; + +/** + * Holds the pointer to the native runtime environment created through JNI and releases it on close. + */ +public class GlobalRuntimeEnv implements AutoCloseable{ + // ptr to runtime environment in df + private final long ptr; + // true once the native runtime has been released; makes close() safe to call more than once + private boolean closed; + + public GlobalRuntimeEnv() { + this.ptr = createGlobalRuntime(); + } + + public long getPointer() { + return ptr; + } + + @Override + public synchronized void close() { + // Releasing the same native pointer twice would be a double free, so only the first call reaches JNI + if (closed) { + return; + } + closed = true; + closeGlobalRuntime(this.ptr); + } +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceCodec.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceCodec.java new file mode 100644 index 0000000000000..201e3e3b055a5 --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceCodec.java @@ -0,0 +1,52 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.spi; + +import java.util.List; +import java.util.concurrent.CompletableFuture; + +/** + * Service Provider Interface for DataFusion data source codecs. + * Implementations provide access to different data formats (CSV, Parquet, etc.) + * through the DataFusion query engine. + */ +public interface DataSourceCodec { + + /** + * Register a directory containing data files with the runtime environment to prewarm cache + * This ideally should be used as part of each refresh - equivalent of acquire searcher + * where we register the files associated with this particular refresh point + */ + CompletableFuture registerDirectory(String directoryPath, List fileNames, long runtimeId); + + /** + * Create a new session context for query execution.
+ * + * @param globalRuntimeEnvId the global runtime environment ID + * @return a CompletableFuture containing the session context ID + */ + CompletableFuture createSessionContext(long globalRuntimeEnvId); + + /** + * Execute a Substrait query plan. + * + * @param sessionContextId the session context ID + * @param substraitPlanBytes the serialized Substrait query plan + * @return a CompletableFuture containing the result stream + */ + CompletableFuture executeSubstraitQuery(long sessionContextId, byte[] substraitPlanBytes); + + /** + * Close a session context and free associated resources. + * + * @param sessionContextId the session context ID to close + * @return a CompletableFuture that completes when the context is closed + */ + CompletableFuture closeSessionContext(long sessionContextId); +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceRegistry.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceRegistry.java new file mode 100644 index 0000000000000..e5684054979ed --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceRegistry.java @@ -0,0 +1,120 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.spi; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.ArrayList; +import java.util.List; +import java.util.ServiceLoader; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Registry for DataFusion data source codecs. 
+ */ +public class DataSourceRegistry { + + private static final Logger logger = LogManager.getLogger(DataSourceRegistry.class); + private static final DataSourceRegistry INSTANCE = new DataSourceRegistry(); + + private final ConcurrentHashMap codecs = new ConcurrentHashMap<>(); + private volatile boolean initialized = false; + + private DataSourceRegistry() { + // Private constructor for singleton + } + + /** + * Get the singleton instance of the registry. + * + * @return the registry instance + */ + public static DataSourceRegistry getInstance() { + return INSTANCE; + } + + /** + * Initialize the registry by loading available codecs. + */ + public synchronized void initialize() { + if (initialized) { + return; + } + + logger.info("Initializing DataSource registry"); + + try { + // Use ServiceLoader to discover codec implementations + ServiceLoader loader = ServiceLoader.load(DataSourceCodec.class); + + for (DataSourceCodec codec : loader) { + String codecName = codec.getClass().getSimpleName(); + codecs.put(codecName, codec); + logger.info("Registered DataSource codec: {}", codecName); + } + + initialized = true; + logger.info("DataSource registry initialized with {} codecs", codecs.size()); + + } catch (Exception e) { + logger.error("Failed to initialize DataSource registry", e); + throw new RuntimeException("Failed to initialize DataSource registry", e); + } + } + + /** + * Shutdown the registry and clean up resources. + */ + public synchronized void shutdown() { + logger.info("Shutting down DataSource registry"); + codecs.clear(); + initialized = false; + } + + /** + * Check if any codecs are available. + * + * @return true if codecs are available, false otherwise + */ + public boolean hasCodecs() { + return !codecs.isEmpty(); + } + + /** + * Get the names of all registered codecs. + * + * @return list of codec names + */ + public List getCodecNames() { + return new ArrayList<>(codecs.keySet()); + } + + /** + * Get the default codec (first available codec). 
+ * + * @return the default codec, or null if none available + */ + public DataSourceCodec getDefaultEngine() { + if (codecs.isEmpty()) { + return null; + } + return codecs.values().iterator().next(); + } + + /** + * Get a codec by name. + * + * @param name the codec name + * @return the codec, or null if not found + */ + public DataSourceCodec getCodec(String name) { + return codecs.get(name); + } +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/RecordBatchStream.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/RecordBatchStream.java new file mode 100644 index 0000000000000..18cfb71e93292 --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/RecordBatchStream.java @@ -0,0 +1,39 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.spi; + +import java.util.concurrent.CompletableFuture; + +/** + * Represents a stream of record batches from a DataFusion query execution. + * This interface provides access to query results in a streaming fashion. + */ +public interface RecordBatchStream extends AutoCloseable { + + /** + * Check if there are more record batches available in the stream. + * + * @return true if more batches are available, false otherwise + */ + boolean hasNext(); + + Object getSchema(); + /** + * Get the next record batch from the stream. + * + * @return the next record batch as a byte array, or null if no more batches + */ + CompletableFuture next(); + + /** + * Close the stream and free associated resources. 
+ */ + @Override + void close(); +} diff --git a/plugins/engine-datafusion/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec b/plugins/engine-datafusion/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec new file mode 100644 index 0000000000000..9b1ec055f7ea2 --- /dev/null +++ b/plugins/engine-datafusion/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec @@ -0,0 +1,5 @@ +# DataFusion Engine implementations +# Add your custom implementations here, e.g.: +# com.example.CustomCsvDataFusionEngine + +# Note: Built-in csv engine is now in separate library diff --git a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTest.java b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/TestDataFusionServiceTests.java similarity index 62% rename from plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTest.java rename to plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/TestDataFusionServiceTests.java index af39b70fcab13..e63e749b9cdf5 100644 --- a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/DataFusionServiceTest.java +++ b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/TestDataFusionServiceTests.java @@ -12,6 +12,9 @@ import org.junit.Test; import org.junit.Assume; import org.opensearch.datafusion.core.SessionContext; +import org.opensearch.test.OpenSearchTestCase; + +import java.util.List; import static org.junit.Assert.*; @@ -22,12 +25,13 @@ * They are disabled by default and can be enabled by setting the system property: * -Dtest.native.enabled=true */ -public class DataFusionServiceTest { +public class TestDataFusionServiceTests extends OpenSearchTestCase { private DataFusionService service; @Before - public void setUp() { + public void setUp() throws Exception { + super.setUp(); service = new DataFusionService(); service.doStart(); } @@ -36,25 
+40,18 @@ public void setUp() { public void testGetVersion() { String version = service.getVersion(); assertNotNull(version); - assertTrue(version.contains("datafusion_version")); - assertTrue(version.contains("arrow_version")); + // The service returns codec information in JSON format + assertTrue("Version should contain codecs", version.contains("codecs")); + assertTrue("Version should contain CsvDataSourceCodec", version.contains("CsvDataSourceCodec")); } @Test public void testCreateAndCloseContext() { + service.registerDirectory("/somedir", List.of("some.csv")); + long contextId = service.createSessionContext().join(); // Create context - long contextId = service.createContext(); assertTrue(contextId > 0); - // Verify context exists - SessionContext context = service.getContext(contextId); - assertNotNull(context); - - // Close context - boolean closed = service.closeContext(contextId); - assertTrue(closed); - - // Verify context is gone - assertNull(service.getContext(contextId)); + service.getVersion(); } } From 2fb9ed1b404835487ab52e1694de3f7a559250fa Mon Sep 17 00:00:00 2001 From: bharath-techie Date: Fri, 15 Aug 2025 19:11:22 +0530 Subject: [PATCH 3/3] adding libs , data source plugin and data source aware plugin Signed-off-by: bharath-techie --- libs/dataformat-csv/build.gradle | 84 ----------- libs/vectorized-exec-spi/build.gradle | 30 ++++ .../vectorized/execution/package-info.java | 13 ++ .../execution}/spi/DataSourceCodec.java | 6 +- .../execution}/spi/RecordBatchStream.java | 7 +- .../execution/spi/package-info.java | 13 ++ plugins/dataformat-csv/build.gradle | 112 ++++++++++++++ .../dataformat-csv/jni/Cargo.toml | 0 .../dataformat-csv/jni/src/context.rs | 0 .../dataformat-csv/jni/src/csv_exec.rs | 0 .../dataformat-csv/jni/src/lib.rs | 0 .../dataformat-csv/jni/src/runtime.rs | 0 .../dataformat-csv/jni/src/stream.rs | 0 .../dataformat-csv/jni/src/substrait.rs | 0 .../dataformat-csv/jni/src/util.rs | 0 .../datafusion/csv/CsvDataFormatPlugin.java | 43 
++++++ .../datafusion/csv/CsvDataSourceCodec.java | 4 +- .../datafusion/csv/CsvRecordBatchStream.java | 4 +- .../datafusion/csv/JniLibraryLoader.java | 39 +++-- .../datafusion/csv/package-info.java | 13 ++ ...h.vectorized.execution.spi.DataSourceCodec | 0 .../resources/plugin-descriptor.properties | 7 + .../csv/CsvDataFormatPluginTests.java | 25 ++++ plugins/engine-datafusion/build.gradle | 28 +++- plugins/engine-datafusion/jni/src/lib.rs | 40 +++-- .../opensearch/datafusion/DataFusionJNI.java | 88 ----------- .../datafusion/DataFusionPlugin.java | 59 ++++---- .../datafusion/DataFusionQueryJNI.java | 141 ++++++++++++++++++ .../datafusion/DataFusionService.java | 82 +++++----- .../datafusion/DataSourceRegistry.java | 72 +++++++++ .../datafusion/action/DataFusionAction.java | 5 +- .../datafusion/action/NodeDataFusionInfo.java | 5 +- .../action/NodesDataFusionInfoRequest.java | 1 - .../action/NodesDataFusionInfoResponse.java | 10 +- .../TransportNodesDataFusionInfoAction.java | 11 +- .../datafusion/action/package-info.java | 13 ++ .../datafusion/core/GlobalRuntimeEnv.java | 18 ++- .../datafusion/core/SessionContext.java | 4 + .../datafusion/core/package-info.java | 13 ++ .../opensearch/datafusion/package-info.java | 13 ++ .../datafusion/spi/DataSourceRegistry.java | 120 --------------- ....vectorized.execution.spi.DataSourceCodec} | 0 .../TestDataFusionServiceTests.java | 20 ++- server/build.gradle | 2 + .../main/java/org/opensearch/node/Node.java | 33 +++- .../plugins/DataSourceAwarePlugin.java | 51 +++++++ .../opensearch/plugins/DataSourcePlugin.java | 21 +++ 47 files changed, 827 insertions(+), 423 deletions(-) delete mode 100644 libs/dataformat-csv/build.gradle create mode 100644 libs/vectorized-exec-spi/build.gradle create mode 100644 libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/package-info.java rename {plugins/engine-datafusion/src/main/java/org/opensearch/datafusion => 
libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution}/spi/DataSourceCodec.java (85%) rename {plugins/engine-datafusion/src/main/java/org/opensearch/datafusion => libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution}/spi/RecordBatchStream.java (86%) create mode 100644 libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/spi/package-info.java create mode 100644 plugins/dataformat-csv/build.gradle rename {libs => plugins}/dataformat-csv/jni/Cargo.toml (100%) rename {libs => plugins}/dataformat-csv/jni/src/context.rs (100%) rename {libs => plugins}/dataformat-csv/jni/src/csv_exec.rs (100%) rename {libs => plugins}/dataformat-csv/jni/src/lib.rs (100%) rename {libs => plugins}/dataformat-csv/jni/src/runtime.rs (100%) rename {libs => plugins}/dataformat-csv/jni/src/stream.rs (100%) rename {libs => plugins}/dataformat-csv/jni/src/substrait.rs (100%) rename {libs => plugins}/dataformat-csv/jni/src/util.rs (100%) create mode 100644 plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataFormatPlugin.java rename {libs => plugins}/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java (97%) rename {libs => plugins}/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java (96%) rename {libs => plugins}/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java (78%) create mode 100644 plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/package-info.java rename libs/dataformat-csv/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec => plugins/dataformat-csv/src/main/resources/META-INF/services/org.opensearch.vectorized.execution.spi.DataSourceCodec (100%) create mode 100644 plugins/dataformat-csv/src/main/resources/plugin-descriptor.properties create mode 100644 plugins/dataformat-csv/src/test/java/org/opensearch/datafusion/csv/CsvDataFormatPluginTests.java delete mode 
100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionQueryJNI.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataSourceRegistry.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/package-info.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/package-info.java create mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/package-info.java delete mode 100644 plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceRegistry.java rename plugins/engine-datafusion/src/main/resources/META-INF/services/{org.opensearch.datafusion.spi.DataSourceCodec => org.opensearch.vectorized.execution.spi.DataSourceCodec} (100%) create mode 100644 server/src/main/java/org/opensearch/plugins/DataSourceAwarePlugin.java create mode 100644 server/src/main/java/org/opensearch/plugins/DataSourcePlugin.java diff --git a/libs/dataformat-csv/build.gradle b/libs/dataformat-csv/build.gradle deleted file mode 100644 index a6dadddcb3dea..0000000000000 --- a/libs/dataformat-csv/build.gradle +++ /dev/null @@ -1,84 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - */ - -apply plugin: 'opensearch.java' - -dependencies { - // TODO : circular dependency - compileOnly project(':plugins:engine-datafusion') - - implementation "org.apache.logging.log4j:log4j-api:${versions.log4j}" - implementation "org.apache.logging.log4j:log4j-core:${versions.log4j}" - - testImplementation "junit:junit:${versions.junit}" -} - -// Task to build the Rust JNI library -task buildRustLibrary(type: Exec) { - description = 'Build the Rust JNI library using Cargo' - group = 'build' - - workingDir file('jni') - def osName = System.getProperty('os.name').toLowerCase() - def libPrefix = osName.contains('windows') ? 
'' : 'lib' - def libExtension = osName.contains('windows') ? '.dll' : (osName.contains('mac') ? '.dylib' : '.so') - - def buildType = project.hasProperty('rustRelease') ? 'release' : 'debug' - def targetDir = "target/${buildType}" - - def cargoArgs = ['cargo', 'build'] - if (buildType == 'release') { - cargoArgs.add('--release') - } - - if (osName.contains('windows')) { - commandLine cargoArgs - } else { - commandLine cargoArgs - } - environment 'CARGO_TARGET_DIR', file('jni/target').absolutePath - - inputs.files fileTree('jni/src') - inputs.file 'jni/Cargo.toml' - outputs.files file("jni/${targetDir}/${libPrefix}opensearch_datafusion_csv_jni${libExtension}") - System.out.println("Building Rust library in ${buildType} mode"); -} - -task copyNativeLibrary(type: Copy, dependsOn: buildRustLibrary) { - description = 'Copy the native library to Java resources' - group = 'build' - - def osName = System.getProperty('os.name').toLowerCase() - def libPrefix = osName.contains('windows') ? '' : 'lib' - def libExtension = osName.contains('windows') ? '.dll' : (osName.contains('mac') ? '.dylib' : '.so') - def buildType = project.hasProperty('rustRelease') ? 
'release' : 'debug' - - from file("jni/target/${buildType}/${libPrefix}opensearch_datafusion_csv_jni${libExtension}") - into file('src/main/resources') - - rename { filename -> - "libopensearch_datafusion_csv_jni${libExtension}" - } -} - -compileJava.dependsOn copyNativeLibrary - -processResources.dependsOn copyNativeLibrary - -jar { - archiveBaseName = 'opensearch-dataformat-csv-codec' - duplicatesStrategy = DuplicatesStrategy.WARN - dependsOn copyNativeLibrary -} - -clean { - delete file('jni/target') - delete file('src/main/resources/libopensearch_datafusion_csv_jni.dylib') - delete file('src/main/resources/libopensearch_datafusion_csv_jni.so') - delete file('src/main/resources/opensearch_datafusion_csv_jni.dll') -} - -test { - systemProperty 'java.library.path', file('src/main/resources').absolutePath -} diff --git a/libs/vectorized-exec-spi/build.gradle b/libs/vectorized-exec-spi/build.gradle new file mode 100644 index 0000000000000..dfb95964d01f5 --- /dev/null +++ b/libs/vectorized-exec-spi/build.gradle @@ -0,0 +1,30 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +apply plugin: 'opensearch.build' + +description = 'Vectorized engine common interfaces for OpenSearch' + +dependencies { + api project(':libs:opensearch-core') + api project(':libs:opensearch-common') + + testImplementation(project(":test:framework")) { + exclude group: 'org.opensearch', module: 'vectorized-exec-spi' + } +} + +tasks.named('forbiddenApisMain').configure { + replaceSignatureFiles 'jdk-signatures' +} + +jarHell.enabled = false + +test { + systemProperty 'tests.security.manager', 'false' +} diff --git a/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/package-info.java b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/package-info.java new file mode 100644 index 0000000000000..8d91260830538 --- /dev/null +++ b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/package-info.java @@ -0,0 +1,13 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/** + * Common interfaces for vectorized execution engines in OpenSearch. + * Engine- and format-agnostic; the data source codec SPI lives in the {@code spi} sub-package.
+ */ +package org.opensearch.vectorized.execution; diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceCodec.java b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/spi/DataSourceCodec.java similarity index 85% rename from plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceCodec.java rename to libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/spi/DataSourceCodec.java index 201e3e3b055a5..c42b5d67c8791 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceCodec.java +++ b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/spi/DataSourceCodec.java @@ -6,7 +6,7 @@ * compatible open source license. */ -package org.opensearch.datafusion.spi; +package org.opensearch.vectorized.execution.spi; import java.util.List; import java.util.concurrent.CompletableFuture; @@ -22,6 +22,10 @@ public interface DataSourceCodec { * Register a directory containing data files with the runtime environment to prewarm cache * This ideally should be used as part of each refresh - equivalent of acquire searcher * where we register the files associated with this particular refresh point + * @param directoryPath the path to the directory containing data files + * @param fileNames the list of file names to register + * @param runtimeId the runtime environment ID + * @return a CompletableFuture that completes when registration is done */ CompletableFuture registerDirectory(String directoryPath, List fileNames, long runtimeId); diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/RecordBatchStream.java b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/spi/RecordBatchStream.java similarity index 86% rename from plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/RecordBatchStream.java rename to 
libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/spi/RecordBatchStream.java index 18cfb71e93292..b79f895c243b9 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/RecordBatchStream.java +++ b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/spi/RecordBatchStream.java @@ -6,7 +6,7 @@ * compatible open source license. */ -package org.opensearch.datafusion.spi; +package org.opensearch.vectorized.execution.spi; import java.util.concurrent.CompletableFuture; @@ -23,7 +23,12 @@ public interface RecordBatchStream extends AutoCloseable { */ boolean hasNext(); + /** + * Get the schema of the record batches in this stream. + * @return the schema object + */ Object getSchema(); + /** * Get the next record batch from the stream. * diff --git a/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/spi/package-info.java b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/spi/package-info.java new file mode 100644 index 0000000000000..9402386b8746b --- /dev/null +++ b/libs/vectorized-exec-spi/src/main/java/org/opensearch/vectorized/execution/spi/package-info.java @@ -0,0 +1,13 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/** + * Service Provider Interface (SPI) for DataFusion data source codecs. + * Defines interfaces for implementing different data format support. 
+ */ +package org.opensearch.vectorized.execution.spi; diff --git a/plugins/dataformat-csv/build.gradle b/plugins/dataformat-csv/build.gradle new file mode 100644 index 0000000000000..99860394bff22 --- /dev/null +++ b/plugins/dataformat-csv/build.gradle @@ -0,0 +1,112 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +apply plugin: 'opensearch.opensearchplugin' + +opensearchplugin { + name = 'dataformat-csv' + description = 'CSV data format plugin for OpenSearch DataFusion' + classname = 'org.opensearch.datafusion.csv.CsvDataFormatPlugin' + hasNativeController = false +} + +dependencies { + api project(':libs:opensearch-vectorized-exec-spi') + api project(':libs:opensearch-core') + api project(':libs:opensearch-common') + + testImplementation(project(":test:framework")) { + exclude group: 'org.opensearch', module: 'opensearch-dataformat-csv' + } +} + +// JNI library configuration +task buildJni(type: Exec) { + description = 'Build the Rust JNI library using Cargo' + group = 'build' + + workingDir 'jni' + + // Determine the target directory and library name based on OS + def osName = System.getProperty('os.name').toLowerCase() + def libPrefix = osName.contains('windows') ? '' : 'lib' + def libExtension = osName.contains('windows') ? '.dll' : (osName.contains('mac') ? 
'.dylib' : '.so') + + // Find cargo executable - try common locations + def cargoExecutable = 'cargo' + def possibleCargoPaths = [ + System.getenv('HOME') + '/.cargo/bin/cargo', + '/usr/local/bin/cargo', + 'cargo' + ] + + for (String path : possibleCargoPaths) { + if (new File(path).exists()) { + cargoExecutable = path + break + } + } + + // Use release build + //def cargoArgs = ['cargo', 'build', '--release'] + + def cargoArgs = [cargoExecutable, 'build', '--release'] + + if (osName.contains('windows')) { + commandLine cargoArgs + } else { + commandLine cargoArgs + } + + // Set environment variables for cross-compilation if needed + environment 'CARGO_TARGET_DIR', file('jni/target').absolutePath + + inputs.files fileTree('jni/src') + inputs.file 'jni/Cargo.toml' + outputs.files file("jni/target/release/${libPrefix}opensearch_datafusion_csv_jni${libExtension}") +} + +task copyJniLib(type: Copy, dependsOn: buildJni) { + from 'jni/target/release' + into 'src/main/resources' + include '*.dylib', '*.so', '*.dll' + + doLast { + // Remove executable permissions from copied native libraries + fileTree('src/main/resources').matching { + include '*.dylib', '*.so', '*.dll' + }.each { file -> + file.setExecutable(false, false) + file.setReadable(true, false) + file.setWritable(true, false) + } + } +} + +processResources.dependsOn copyJniLib +sourcesJar.dependsOn copyJniLib + +// Ensure file permissions check runs after JNI library is copied +tasks.named('filepermissions').configure { + dependsOn copyJniLib +} + +// Ensure forbidden patterns check runs after JNI library is copied +tasks.named('forbiddenPatterns').configure { + dependsOn copyJniLib + exclude '**/*.dylib', '**/*.so', '**/*.dll' +} + +// Ensure spotless check runs after JNI library is copied +tasks.named('spotlessJava').configure { + dependsOn copyJniLib +} + +test { + systemProperty 'tests.security.manager', 'false' +} diff --git a/libs/dataformat-csv/jni/Cargo.toml b/plugins/dataformat-csv/jni/Cargo.toml 
similarity index 100% rename from libs/dataformat-csv/jni/Cargo.toml rename to plugins/dataformat-csv/jni/Cargo.toml diff --git a/libs/dataformat-csv/jni/src/context.rs b/plugins/dataformat-csv/jni/src/context.rs similarity index 100% rename from libs/dataformat-csv/jni/src/context.rs rename to plugins/dataformat-csv/jni/src/context.rs diff --git a/libs/dataformat-csv/jni/src/csv_exec.rs b/plugins/dataformat-csv/jni/src/csv_exec.rs similarity index 100% rename from libs/dataformat-csv/jni/src/csv_exec.rs rename to plugins/dataformat-csv/jni/src/csv_exec.rs diff --git a/libs/dataformat-csv/jni/src/lib.rs b/plugins/dataformat-csv/jni/src/lib.rs similarity index 100% rename from libs/dataformat-csv/jni/src/lib.rs rename to plugins/dataformat-csv/jni/src/lib.rs diff --git a/libs/dataformat-csv/jni/src/runtime.rs b/plugins/dataformat-csv/jni/src/runtime.rs similarity index 100% rename from libs/dataformat-csv/jni/src/runtime.rs rename to plugins/dataformat-csv/jni/src/runtime.rs diff --git a/libs/dataformat-csv/jni/src/stream.rs b/plugins/dataformat-csv/jni/src/stream.rs similarity index 100% rename from libs/dataformat-csv/jni/src/stream.rs rename to plugins/dataformat-csv/jni/src/stream.rs diff --git a/libs/dataformat-csv/jni/src/substrait.rs b/plugins/dataformat-csv/jni/src/substrait.rs similarity index 100% rename from libs/dataformat-csv/jni/src/substrait.rs rename to plugins/dataformat-csv/jni/src/substrait.rs diff --git a/libs/dataformat-csv/jni/src/util.rs b/plugins/dataformat-csv/jni/src/util.rs similarity index 100% rename from libs/dataformat-csv/jni/src/util.rs rename to plugins/dataformat-csv/jni/src/util.rs diff --git a/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataFormatPlugin.java b/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataFormatPlugin.java new file mode 100644 index 0000000000000..e8f0d2306d2e6 --- /dev/null +++ 
b/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataFormatPlugin.java @@ -0,0 +1,43 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.csv; + +import org.opensearch.plugins.DataSourcePlugin; +import org.opensearch.plugins.Plugin; +import org.opensearch.vectorized.execution.spi.DataSourceCodec; + +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +/** + * Plugin for CSV data format support in OpenSearch DataFusion. + * This plugin provides CSV data source codec through ServiceLoader mechanism. + * + * Todo: implement vectorized exec specific plugin + */ +public class CsvDataFormatPlugin extends Plugin implements DataSourcePlugin { + + /** + * Creates a new CSV data format plugin. + */ + public CsvDataFormatPlugin() { + // Plugin initialization + } + + // Returns the CSV codec registered under the "csv-v1" key. TODO : move to vectorized exec specific plugin + @Override + public Optional<Map<String, DataSourceCodec>> getDataSourceCodecs() { + Map<String, DataSourceCodec> codecs = new HashMap<>(); + // TODO : version it correctly - similar to lucene codecs?
+ codecs.put("csv-v1", new CsvDataSourceCodec()); + return Optional.of(codecs); + // return Optional.empty(); + } +} diff --git a/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java b/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java similarity index 97% rename from libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java rename to plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java index ea796c6b14ef2..80622fbda6e31 100644 --- a/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java +++ b/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvDataSourceCodec.java @@ -10,8 +10,8 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.opensearch.datafusion.spi.DataSourceCodec; -import org.opensearch.datafusion.spi.RecordBatchStream; +import org.opensearch.vectorized.execution.spi.DataSourceCodec; +import org.opensearch.vectorized.execution.spi.RecordBatchStream; import java.util.List; import java.util.concurrent.CompletableFuture; diff --git a/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java b/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java similarity index 96% rename from libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java rename to plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java index 16feb1149885b..56738a87cbddf 100644 --- a/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java +++ b/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/CsvRecordBatchStream.java @@ -10,12 +10,12 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import
org.opensearch.datafusion.spi.RecordBatchStream; +import org.opensearch.vectorized.execution.spi.RecordBatchStream; import java.util.concurrent.CompletableFuture; /** - * TODO : this need not be here - nothing specific to CSV + * TODO : this need not be here - nothing specific to CSV - move to LIB ? * Native implementation of RecordBatchStream that wraps a JNI stream pointer. * This class provides a Java interface over native DataFusion record batches. */ diff --git a/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java b/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java similarity index 78% rename from libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java rename to plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java index 49fb8d9b79c13..6f3e68baa10d1 100644 --- a/libs/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java +++ b/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/JniLibraryLoader.java @@ -16,6 +16,9 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; +import java.nio.file.attribute.PosixFilePermission; +import java.util.Locale; +import java.util.Set; /** * Utility class for loading the data source JNI library. @@ -27,6 +30,13 @@ public class JniLibraryLoader { private static final String LIBRARY_NAME = "opensearch_datafusion_csv_jni"; + /** + * Private constructor to prevent instantiation of utility class. + */ + private JniLibraryLoader() { + // Utility class + } + /** * Loads the DataFusion JNI library. This method is thread-safe and will only * load the library once. 
@@ -68,8 +78,8 @@ public static synchronized void loadLibrary() { * @return Path to the extracted library file, or null if extraction failed */ private static String extractLibraryFromJar() { - String osName = System.getProperty("os.name").toLowerCase(); - String osArch = System.getProperty("os.arch").toLowerCase(); + String osName = System.getProperty("os.name").toLowerCase(Locale.ROOT); + String osArch = System.getProperty("os.arch").toLowerCase(Locale.ROOT); logger.debug("Detecting platform: OS={}, Arch={}", osName, osArch); @@ -88,21 +98,31 @@ private static String extractLibraryFromJar() { return null; } - // Create temporary file - Path tempDir = Files.createTempDirectory("datafusion-jni"); + // Create temporary file in system temp directory + Path tempDir = Files.createTempDirectory(Path.of(System.getProperty("java.io.tmpdir")), "datafusion-jni"); Path tempLibrary = tempDir.resolve(libraryFileName); // Extract library to temporary file Files.copy(inputStream, tempLibrary, StandardCopyOption.REPLACE_EXISTING); - // Make executable on Unix-like systems + // Make executable on Unix-like systems using NIO if (!osName.contains("windows")) { - tempLibrary.toFile().setExecutable(true); + Set permissions = Files.getPosixFilePermissions(tempLibrary); + permissions.add(PosixFilePermission.OWNER_EXECUTE); + permissions.add(PosixFilePermission.GROUP_EXECUTE); + permissions.add(PosixFilePermission.OTHERS_EXECUTE); + Files.setPosixFilePermissions(tempLibrary, permissions); } - // Schedule cleanup on JVM shutdown - tempLibrary.toFile().deleteOnExit(); - tempDir.toFile().deleteOnExit(); + // Register for cleanup on JVM shutdown using NIO + Runtime.getRuntime().addShutdownHook(new Thread(() -> { + try { + Files.deleteIfExists(tempLibrary); + Files.deleteIfExists(tempDir); + } catch (IOException e) { + logger.debug("Failed to cleanup temporary files", e); + } + })); String libraryPath = tempLibrary.toAbsolutePath().toString(); logger.debug("Extracted library to: {}", 
libraryPath); @@ -138,6 +158,7 @@ private static String getLibraryFileName(String osName) { } return prefix + LIBRARY_NAME + extension; + } /** diff --git a/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/package-info.java b/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/package-info.java new file mode 100644 index 0000000000000..35fd564c68e51 --- /dev/null +++ b/plugins/dataformat-csv/src/main/java/org/opensearch/datafusion/csv/package-info.java @@ -0,0 +1,13 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/** + * CSV data format implementation for DataFusion integration. + * Provides CSV file reading capabilities through DataFusion query engine. + */ +package org.opensearch.datafusion.csv; diff --git a/libs/dataformat-csv/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec b/plugins/dataformat-csv/src/main/resources/META-INF/services/org.opensearch.vectorized.execution.spi.DataSourceCodec similarity index 100% rename from libs/dataformat-csv/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec rename to plugins/dataformat-csv/src/main/resources/META-INF/services/org.opensearch.vectorized.execution.spi.DataSourceCodec diff --git a/plugins/dataformat-csv/src/main/resources/plugin-descriptor.properties b/plugins/dataformat-csv/src/main/resources/plugin-descriptor.properties new file mode 100644 index 0000000000000..713d226cce94a --- /dev/null +++ b/plugins/dataformat-csv/src/main/resources/plugin-descriptor.properties @@ -0,0 +1,7 @@ +# Plugin descriptor for CSV data format plugin +description=CSV data format plugin for OpenSearch DataFusion +version=${project.version} +name=dataformat-csv +classname=org.opensearch.datafusion.csv.CsvDataFormatPlugin +java.version=${versions.java} 
+opensearch.version=${opensearch_version} diff --git a/plugins/dataformat-csv/src/test/java/org/opensearch/datafusion/csv/CsvDataFormatPluginTests.java b/plugins/dataformat-csv/src/test/java/org/opensearch/datafusion/csv/CsvDataFormatPluginTests.java new file mode 100644 index 0000000000000..27ea2251e66b6 --- /dev/null +++ b/plugins/dataformat-csv/src/test/java/org/opensearch/datafusion/csv/CsvDataFormatPluginTests.java @@ -0,0 +1,25 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion.csv; + +import org.opensearch.test.OpenSearchTestCase; + +/** + * Tests for the CSV data format plugin. + */ +public class CsvDataFormatPluginTests extends OpenSearchTestCase { + + /** + * Test that the plugin can be instantiated. + */ + public void testPluginInstantiation() { + CsvDataFormatPlugin plugin = new CsvDataFormatPlugin(); + assertNotNull("Plugin should not be null", plugin); + } +} diff --git a/plugins/engine-datafusion/build.gradle b/plugins/engine-datafusion/build.gradle index 53989c99a13f1..1b4c41371af14 100644 --- a/plugins/engine-datafusion/build.gradle +++ b/plugins/engine-datafusion/build.gradle @@ -27,14 +27,15 @@ opensearchplugin { } dependencies { + api project(':libs:opensearch-vectorized-exec-spi') implementation "org.apache.logging.log4j:log4j-api:${versions.log4j}" implementation "org.apache.logging.log4j:log4j-core:${versions.log4j}" testImplementation "junit:junit:${versions.junit}" testImplementation "org.hamcrest:hamcrest:${versions.hamcrest}" testImplementation "org.mockito:mockito-core:${versions.mockito}" - // Add CSV codec for testing - testImplementation project(':libs:opensearch-dataformat-csv') // TODO : adding implementation results in cycle dependency + // Add CSV plugin for testing + // testImplementation project(':plugins:dataformat-csv') } // Task 
to build the Rust JNI library @@ -53,7 +54,22 @@ task buildRustLibrary(type: Exec) { def buildType = project.hasProperty('rustRelease') ? 'release' : 'debug' def targetDir = "target/${buildType}" - def cargoArgs = ['cargo', 'build'] + // Find cargo executable - try common locations + def cargoExecutable = 'cargo' + def possibleCargoPaths = [ + System.getenv('HOME') + '/.cargo/bin/cargo', + '/usr/local/bin/cargo', + 'cargo' + ] + + for (String path : possibleCargoPaths) { + if (new File(path).exists()) { + cargoExecutable = path + break + } + } + + def cargoArgs = [cargoExecutable, 'build'] if (buildType == 'release') { cargoArgs.add('--release') } @@ -97,6 +113,12 @@ compileJava.dependsOn copyNativeLibrary // Ensure processResources depends on copyNativeLibrary processResources.dependsOn copyNativeLibrary +sourcesJar.dependsOn copyNativeLibrary + +// Ensure filepermissions task depends on copyNativeLibrary +tasks.named('filepermissions').configure { + dependsOn copyNativeLibrary +} // Clean task should also clean Rust artifacts clean { diff --git a/plugins/engine-datafusion/jni/src/lib.rs b/plugins/engine-datafusion/jni/src/lib.rs index d158cea89b7cb..1e9981e9abae3 100644 --- a/plugins/engine-datafusion/jni/src/lib.rs +++ b/plugins/engine-datafusion/jni/src/lib.rs @@ -9,6 +9,7 @@ use jni::objects::JClass; use jni::sys::{jlong, jstring}; use jni::JNIEnv; +use std::sync::Arc; use datafusion::execution::context::SessionContext; @@ -20,7 +21,7 @@ use datafusion::prelude::SessionConfig; /// Create a new DataFusion session context #[no_mangle] -pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_createContext( +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionQueryJNI_createContext( _env: JNIEnv, _class: JClass, ) -> jlong { @@ -32,7 +33,7 @@ pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_createContex /// Close and cleanup a DataFusion context #[no_mangle] -pub extern "system" fn 
Java_org_opensearch_datafusion_DataFusionJNI_closeContext( +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionQueryJNI_closeContext( _env: JNIEnv, _class: JClass, context_id: jlong, @@ -42,7 +43,17 @@ pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_closeContext /// Get version information #[no_mangle] -pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_getVersion( +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionQueryJNI_getVersionInfo( + env: JNIEnv, + _class: JClass, +) -> jstring { + let version_info = format!(r#"{{"version": "{}", "codecs": ["CsvDataSourceCodec"]}}"#, DATAFUSION_VERSION); + env.new_string(version_info).map(|s| s.as_raw()).unwrap_or(std::ptr::null_mut()) +} + +/// Get version information (legacy method name) +#[no_mangle] +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionQueryJNI_getVersion( env: JNIEnv, _class: JClass, ) -> jstring { @@ -50,7 +61,7 @@ pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_getVersion( } #[no_mangle] -pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_createGlobalRuntime( +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionQueryJNI_createGlobalRuntime( _env: JNIEnv, _class: JClass, ) -> jlong { @@ -77,14 +88,25 @@ pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_createGlobal } #[no_mangle] -pub extern "system" fn Java_org_opensearch_datafusion_DataFusionJNI_closeGlobalRuntime( +pub extern "system" fn Java_org_opensearch_datafusion_DataFusionQueryJNI_createSessionContext( _env: JNIEnv, _class: JClass, - runtime_env_id: jlong, + runtime_id: jlong, +) -> jlong { + if runtime_id == 0 { return 0; } // null runtime handle: hand back a null context instead of dereferencing it + let runtime_env = unsafe { &*(runtime_id as *const RuntimeEnv) }; + let config = SessionConfig::new().with_repartition_aggregations(true); + let context = SessionContext::new_with_config_rt(config, Arc::new(runtime_env.clone())); + Box::into_raw(Box::new(context)) as jlong +} + +#[no_mangle] +pub extern
"system" fn Java_org_opensearch_datafusion_DataFusionQueryJNI_closeSessionContext( + _env: JNIEnv, + _class: JClass, + context_id: jlong, ) { - // Convert raw pointer back to a Box - let _ = unsafe { Box::from_raw(runtime_env_id as *mut RuntimeEnv) }; - // Box automatically drops here, cleaning up the runtime + let _ = unsafe { Box::from_raw(context_id as *mut SessionContext) }; } diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java deleted file mode 100644 index 25bdc353541ea..0000000000000 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionJNI.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.datafusion; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardCopyOption; - -/** - * JNI wrapper for DataFusion operations - */ -public class DataFusionJNI { - - private static boolean libraryLoaded = false; - - static { - loadNativeLibrary(); - } - - /** - * Load the native library from resources - */ - private static synchronized void loadNativeLibrary() { - if (libraryLoaded) { - return; - } - - try { - String osName = System.getProperty("os.name").toLowerCase(); - String libExtension; - String libName; - - if (osName.contains("windows")) { - libExtension = ".dll"; - libName = "libopensearch_datafusion_jni.dll"; - } else if (osName.contains("mac")) { - libExtension = ".dylib"; - libName = "libopensearch_datafusion_jni.dylib"; - } else { - libExtension = ".so"; - libName = "libopensearch_datafusion_jni.so"; - } - - // Try to load from resources first - InputStream libStream = 
DataFusionJNI.class.getResourceAsStream("/native/" + libName); - if (libStream != null) { - // Extract to temporary file and load - Path tempLib = Files.createTempFile("libopensearch_datafusion_jni", libExtension); - Files.copy(libStream, tempLib, StandardCopyOption.REPLACE_EXISTING); - tempLib.toFile().deleteOnExit(); - System.load(tempLib.toAbsolutePath().toString()); - libStream.close(); - } else { - System.loadLibrary("opensearch_datafusion_jni"); - } - - libraryLoaded = true; - } catch (IOException | UnsatisfiedLinkError e) { - throw new RuntimeException("Failed to load DataFusion JNI library", e); - } - } - - /** - * Create a new global runtime environment - * @return runtime env pointer for subsequent operations - */ - public static native long createGlobalRuntime(); - - /** - * Closes global runtime environment - * @return runtime env pointer for subsequent operations - */ - public static native long closeGlobalRuntime(long pointer); - - /** - * Get version information - * @return JSON string with version information - */ - public static native String getVersion(); -} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java index 13d5ca2afb467..224075b9c2414 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionPlugin.java @@ -23,6 +23,7 @@ import org.opensearch.env.Environment; import org.opensearch.env.NodeEnvironment; import org.opensearch.plugins.ActionPlugin; +import org.opensearch.plugins.DataSourceAwarePlugin; import org.opensearch.plugins.Plugin; import org.opensearch.repositories.RepositoriesService; import org.opensearch.rest.RestController; @@ -30,18 +31,20 @@ import org.opensearch.script.ScriptService; import org.opensearch.threadpool.ThreadPool; import 
org.opensearch.transport.client.Client; +import org.opensearch.vectorized.execution.spi.DataSourceCodec; import org.opensearch.watcher.ResourceWatcherService; import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.function.Supplier; /** * Main plugin class for OpenSearch DataFusion integration. * */ -public class DataFusionPlugin extends Plugin implements ActionPlugin { +public class DataFusionPlugin extends Plugin implements ActionPlugin, DataSourceAwarePlugin { private DataFusionService dataFusionService; private final boolean isDataFusionEnabled; @@ -73,23 +76,24 @@ public DataFusionPlugin(Settings settings) { */ @Override public Collection createComponents( - Client client, - ClusterService clusterService, - ThreadPool threadPool, - ResourceWatcherService resourceWatcherService, - ScriptService scriptService, - NamedXContentRegistry xContentRegistry, - Environment environment, - NodeEnvironment nodeEnvironment, - NamedWriteableRegistry namedWriteableRegistry, - IndexNameExpressionResolver indexNameExpressionResolver, - Supplier repositoriesServiceSupplier + Client client, + ClusterService clusterService, + ThreadPool threadPool, + ResourceWatcherService resourceWatcherService, + ScriptService scriptService, + NamedXContentRegistry xContentRegistry, + Environment environment, + NodeEnvironment nodeEnvironment, + NamedWriteableRegistry namedWriteableRegistry, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier repositoriesServiceSupplier, + Map dataSourceCodecs ) { if (!isDataFusionEnabled) { return Collections.emptyList(); } - - dataFusionService = new DataFusionService(); + dataFusionService = new DataFusionService(dataSourceCodecs); + // return Collections.emptyList(); return Collections.singletonList(dataFusionService); } @@ -106,20 +110,18 @@ public Collection createComponents( */ @Override public List getRestHandlers( - Settings settings, - RestController restController, - 
ClusterSettings clusterSettings, - IndexScopedSettings indexScopedSettings, - SettingsFilter settingsFilter, - IndexNameExpressionResolver indexNameExpressionResolver, - Supplier nodesInCluster + Settings settings, + RestController restController, + ClusterSettings clusterSettings, + IndexScopedSettings indexScopedSettings, + SettingsFilter settingsFilter, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier nodesInCluster ) { if (!isDataFusionEnabled) { return Collections.emptyList(); } - return List.of( - new DataFusionAction() - ); + return List.of(new DataFusionAction()); } /** @@ -131,8 +133,11 @@ public List getRestHandlers( if (!isDataFusionEnabled) { return Collections.emptyList(); } - return List.of( - new ActionHandler<>(NodesDataFusionInfoAction.INSTANCE, TransportNodesDataFusionInfoAction.class) - ); + return List.of(new ActionHandler<>(NodesDataFusionInfoAction.INSTANCE, TransportNodesDataFusionInfoAction.class)); + } + + @Override + public void registerDataSources(Map dataSourceCodecs) { + dataFusionService = new DataFusionService(dataSourceCodecs); } } diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionQueryJNI.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionQueryJNI.java new file mode 100644 index 0000000000000..48578c987226d --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionQueryJNI.java @@ -0,0 +1,141 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion; + +/** + * JNI wrapper for DataFusion operations + */ +public class DataFusionQueryJNI { + + private static boolean libraryLoaded = false; + + static { + loadNativeLibrary(); + } + + /** + * Private constructor to prevent instantiation of utility class. 
+ */ + private DataFusionQueryJNI() { + // Utility class + } + + /** + * Load the native library once: try java.library.path first, then fall back to the copy bundled under /native + */ + private static synchronized void loadNativeLibrary() { + if (libraryLoaded) { + return; + } + + try { + // Try to load the library directly + System.loadLibrary("opensearch_datafusion_jni"); + libraryLoaded = true; + } catch (UnsatisfiedLinkError e) { + // Try loading from resources + try { + String osName = System.getProperty("os.name").toLowerCase(java.util.Locale.ROOT); + String libExtension = osName.contains("windows") ? ".dll" : (osName.contains("mac") ? ".dylib" : ".so"); + String libName = "libopensearch_datafusion_jni" + libExtension; + + java.io.InputStream is = DataFusionQueryJNI.class.getResourceAsStream("/native/" + libName); + if (is != null) { + java.io.File tempFile = java.io.File.createTempFile("libopensearch_datafusion_jni", libExtension); + tempFile.deleteOnExit(); + + try (java.io.InputStream in = is; java.io.FileOutputStream fos = new java.io.FileOutputStream(tempFile)) { + byte[] buffer = new byte[8192]; + int bytesRead; + while ((bytesRead = in.read(buffer)) != -1) { + fos.write(buffer, 0, bytesRead); + } + } + + System.load(tempFile.getAbsolutePath()); + libraryLoaded = true; + } else { + throw new RuntimeException("Native library not found: " + libName, e); + } + } catch (Exception ex) { + throw new RuntimeException("Failed to load native library", ex); + } + } + } + + /** + * Create a new global runtime environment + * @return runtime env pointer for subsequent operations + */ + public static native long createGlobalRuntime(); + + /** + * Closes global runtime environment + * @param pointer the runtime environment pointer to close + * @return status code + */ + public static native long closeGlobalRuntime(long pointer); + + /** + * Get version information + * @return JSON string with version information + */ + public static native String getVersionInfo(); + + /** + * Create a new DataFusion session context + * @param runtimeId the global runtime environment ID + * @return
context ID for subsequent operations + */ + public static native long createSessionContext(long runtimeId); + + /** + * Close and cleanup a DataFusion context + * @param contextId the context ID to close + */ + public static native void closeSessionContext(long contextId); + + /** + * Execute a Substrait query plan + * @param contextId the session context ID + * @param substraitPlan the serialized Substrait query plan + * @return stream pointer for result iteration + */ + public static native long executeSubstraitQuery(long contextId, byte[] substraitPlan); + + /** + * Register a directory with CSV files + * @param contextId the session context ID + * @param tableName the table name to register + * @param directoryPath the directory path containing CSV files + * @param fileNames array of file names to register + * @return status code + */ + public static native int registerCsvDirectory(long contextId, String tableName, String directoryPath, String[] fileNames); + + /** + * Check if stream has more data + * @param streamPtr the stream pointer + * @return true if more data available + */ + public static native boolean streamHasNext(long streamPtr); + + /** + * Get next batch from stream + * @param streamPtr the stream pointer + * @return byte array containing the next batch, or null if no more data + */ + public static native byte[] streamNext(long streamPtr); + + /** + * Close and cleanup a result stream + * @param streamPtr the stream pointer to close + */ + public static native void closeStream(long streamPtr); +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java index 03b678a494d2f..099ae90d20599 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataFusionService.java @@ -14,11 +14,11 @@ import 
org.opensearch.common.util.concurrent.ConcurrentCollections; import org.opensearch.common.util.concurrent.ConcurrentMapLong; import org.opensearch.datafusion.core.GlobalRuntimeEnv; -import org.opensearch.datafusion.spi.DataSourceCodec; -import org.opensearch.datafusion.spi.DataSourceRegistry; -import org.opensearch.datafusion.spi.RecordBatchStream; +import org.opensearch.vectorized.execution.spi.DataSourceCodec; +import org.opensearch.vectorized.execution.spi.RecordBatchStream; import java.util.List; +import java.util.Map; import java.util.concurrent.CompletableFuture; /** @@ -32,9 +32,14 @@ public class DataFusionService extends AbstractLifecycleComponent { private final DataSourceRegistry dataSourceRegistry; private final GlobalRuntimeEnv globalRuntimeEnv; - public DataFusionService() { - this.dataSourceRegistry = DataSourceRegistry.getInstance(); - String version = DataFusionJNI.getVersion(); + /** + * Creates a new DataFusion service instance. + */ + public DataFusionService(Map dataSourceCodecs) { + this.dataSourceRegistry = new DataSourceRegistry(dataSourceCodecs); + + // to verify jni + String version = DataFusionQueryJNI.getVersionInfo(); this.globalRuntimeEnv = new GlobalRuntimeEnv(); } @@ -43,14 +48,15 @@ protected void doStart() { logger.info("Starting DataFusion service"); try { // Initialize the data source registry - dataSourceRegistry.initialize(); - // Test that at least one data source is available if (!dataSourceRegistry.hasCodecs()) { logger.warn("No data sources available"); } else { - logger.info("DataFusion service started successfully with {} data sources: {}", - dataSourceRegistry.getCodecNames().size(), dataSourceRegistry.getCodecNames()); + logger.info( + "DataFusion service started successfully with {} data sources: {}", + dataSourceRegistry.getCodecNames().size(), + dataSourceRegistry.getCodecNames() + ); } } catch (Exception e) { @@ -71,9 +77,6 @@ protected void doStop() { logger.warn("Error closing session context {}", sessionId, e); } 
} - - // Shutdown the engine registry - dataSourceRegistry.shutdown(); sessionEngines.clear(); globalRuntimeEnv.close(); logger.info("DataFusion service stopped"); @@ -95,12 +98,15 @@ protected void doClose() { public CompletableFuture registerDirectory(String directoryPath, List fileNames) { DataSourceCodec engine = dataSourceRegistry.getDefaultEngine(); if (engine == null) { - return CompletableFuture.failedFuture( - new IllegalStateException("No DataFusion engine available")); + return CompletableFuture.failedFuture(new IllegalStateException("No DataFusion engine available")); } - logger.debug("Registering directory {} with {} files using engine {}", - directoryPath, fileNames.size(), engine.getClass().getSimpleName()); + logger.debug( + "Registering directory {} with {} files using engine {}", + directoryPath, + fileNames.size(), + engine.getClass().getSimpleName() + ); return engine.registerDirectory(directoryPath, fileNames, globalRuntimeEnv.getPointer()); } @@ -114,21 +120,21 @@ public CompletableFuture createSessionContext() { long runtimeEnvironmentId = globalRuntimeEnv.getPointer(); DataSourceCodec codec = dataSourceRegistry.getDefaultEngine(); if (codec == null) { - return CompletableFuture.failedFuture( - new IllegalArgumentException("Runtime environment not found: " + runtimeEnvironmentId)); + return CompletableFuture.failedFuture(new IllegalArgumentException("Runtime environment not found: " + runtimeEnvironmentId)); } - logger.debug("Creating session context for runtime environment {} using engine {}", - runtimeEnvironmentId, codec.getClass().getSimpleName()); - - return codec.createSessionContext(runtimeEnvironmentId) - .thenApply(sessionId -> { - // Track which engine created this session context - sessionEngines.put(sessionId, codec); - logger.debug("Created session context {} with engine {}", - sessionId, codec.getClass().getSimpleName()); - return sessionId; - }); + logger.debug( + "Creating session context for runtime environment {} using 
engine {}", + runtimeEnvironmentId, + codec.getClass().getSimpleName() + ); + + return codec.createSessionContext(runtimeEnvironmentId).thenApply(sessionId -> { + // Track which engine created this session context + sessionEngines.put(sessionId, codec); + logger.debug("Created session context {} with engine {}", sessionId, codec.getClass().getSimpleName()); + return sessionId; + }); } /** @@ -141,12 +147,15 @@ public CompletableFuture createSessionContext() { public CompletableFuture executeSubstraitQuery(long sessionContextId, byte[] substraitPlanBytes) { DataSourceCodec engine = sessionEngines.get(sessionContextId); if (engine == null) { - return CompletableFuture.failedFuture( - new IllegalArgumentException("Session context not found: " + sessionContextId)); + return CompletableFuture.failedFuture(new IllegalArgumentException("Session context not found: " + sessionContextId)); } - logger.debug("Executing substrait query for session {} with plan size {} bytes using engine {}", - sessionContextId, substraitPlanBytes.length, engine.getClass().getSimpleName()); + logger.debug( + "Executing substrait query for session {} with plan size {} bytes using engine {}", + sessionContextId, + substraitPlanBytes.length, + engine.getClass().getSimpleName() + ); return engine.executeSubstraitQuery(sessionContextId, substraitPlanBytes); } @@ -164,8 +173,7 @@ public CompletableFuture closeSessionContext(long sessionContextId) { return CompletableFuture.completedFuture(null); } - logger.debug("Closing session context {} using engine {}", - sessionContextId, engine.getClass().getSimpleName()); + logger.debug("Closing session context {} using engine {}", sessionContextId, engine.getClass().getSimpleName()); return engine.closeSessionContext(sessionContextId); } @@ -179,7 +187,7 @@ public String getVersion() { version.append("{\"codecs\":["); boolean first = true; - for (String engineName : dataSourceRegistry.getCodecNames()) { + for (String engineName : 
this.dataSourceRegistry.getCodecNames()) { if (!first) { version.append(","); } diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataSourceRegistry.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataSourceRegistry.java new file mode 100644 index 0000000000000..9229b861ceef3 --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/DataSourceRegistry.java @@ -0,0 +1,72 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.datafusion; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.vectorized.execution.spi.DataSourceCodec; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Registry for DataFusion data source codecs. + */ +public class DataSourceRegistry { + + private static final Logger logger = LogManager.getLogger(DataSourceRegistry.class); + + private final ConcurrentHashMap codecs = new ConcurrentHashMap<>(); + + public DataSourceRegistry(Map dataSourceCodecMap) { + codecs.putAll(dataSourceCodecMap); + } + + /** + * Check if any codecs are available. + * + * @return true if codecs are available, false otherwise + */ + public boolean hasCodecs() { + return !codecs.isEmpty(); + } + + /** + * Get the names of all registered codecs. + * + * @return list of codec names + */ + public List getCodecNames() { + return new ArrayList<>(codecs.keySet()); + } + + /** + * Get the default codec (first available codec). + * + * @return the default codec, or null if none available + */ + public DataSourceCodec getDefaultEngine() { + if (codecs.isEmpty()) { + return null; + } + return codecs.values().iterator().next(); + } + + /** + * Get a codec by name. 
+ * + * @param name the codec name + * @return the codec, or null if not found + */ + public DataSourceCodec getCodec(String name) { + return codecs.get(name); + } +} diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/DataFusionAction.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/DataFusionAction.java index 66dd36d2d0bfe..99695d2c96266 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/DataFusionAction.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/DataFusionAction.java @@ -43,10 +43,7 @@ public String getName() { */ @Override public List routes() { - return List.of( - new Route(GET, "/_plugins/datafusion/info"), - new Route(GET, "/_plugins/datafusion/info/{nodeId}") - ); + return List.of(new Route(GET, "/_plugins/datafusion/info"), new Route(GET, "/_plugins/datafusion/info/{nodeId}")); } /** diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodeDataFusionInfo.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodeDataFusionInfo.java index 6d50e2d40af78..5512110c576da 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodeDataFusionInfo.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodeDataFusionInfo.java @@ -29,10 +29,7 @@ public class NodeDataFusionInfo extends BaseNodeResponse implements ToXContentFr * @param node The discovery node. * @param dataFusionVersion The DataFusion version. 
*/ - public NodeDataFusionInfo( - DiscoveryNode node, - String dataFusionVersion - ) { + public NodeDataFusionInfo(DiscoveryNode node, String dataFusionVersion) { super(node); this.dataFusionVersion = dataFusionVersion; } diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoRequest.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoRequest.java index 61ce2444722ee..4e32bb3b0f18c 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoRequest.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoRequest.java @@ -53,7 +53,6 @@ public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); } - /** * Node-level request for DataFusion information */ diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java index 1c7d99627c7e5..fca186749cde6 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/NodesDataFusionInfoResponse.java @@ -30,11 +30,7 @@ public class NodesDataFusionInfoResponse extends BaseNodesResponse nodes, - List failures - ) { + public NodesDataFusionInfoResponse(ClusterName clusterName, List nodes, List failures) { super(clusterName, nodes, failures); } @@ -75,8 +71,8 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.startObject("nodes"); for (NodeDataFusionInfo nodeInfo : getNodes()) { builder.field(nodeInfo.getNode().getId()); -// builder.field("name", nodeInfo.getNode().getName()); -// builder.field("transport_address", nodeInfo.getNode().getAddress().toString()); + // builder.field("name", 
nodeInfo.getNode().getName()); + // builder.field("transport_address", nodeInfo.getNode().getAddress().toString()); nodeInfo.toXContent(builder, params); } builder.endObject(); diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/TransportNodesDataFusionInfoAction.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/TransportNodesDataFusionInfoAction.java index 1ba5fd9af3210..8a659f29230d6 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/TransportNodesDataFusionInfoAction.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/TransportNodesDataFusionInfoAction.java @@ -101,15 +101,10 @@ protected NodeDataFusionInfo newNodeResponse(StreamInput in) throws IOException @Override protected NodeDataFusionInfo nodeOperation(NodesDataFusionInfoRequest.NodeDataFusionInfoRequest request) { try { - return new NodeDataFusionInfo( - clusterService.localNode(), - dataFusionService.getVersion() - ); + System.out.println(this.dataFusionService.getVersion()); + return new NodeDataFusionInfo(clusterService.localNode(), dataFusionService.getVersion()); } catch (Exception e) { - return new NodeDataFusionInfo( - clusterService.localNode(), - "unknown" - ); + return new NodeDataFusionInfo(clusterService.localNode(), "unknown"); } } } diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/package-info.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/package-info.java new file mode 100644 index 0000000000000..d3542f4dfe9dc --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/action/package-info.java @@ -0,0 +1,13 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +/** + * REST actions and transport handlers for DataFusion plugin. + * Provides API endpoints for DataFusion functionality. + */ +package org.opensearch.datafusion.action; diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/GlobalRuntimeEnv.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/GlobalRuntimeEnv.java index 8380f8ea2dd67..1867028fcb945 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/GlobalRuntimeEnv.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/GlobalRuntimeEnv.java @@ -8,18 +8,28 @@ package org.opensearch.datafusion.core; -import static org.opensearch.datafusion.DataFusionJNI.closeGlobalRuntime; -import static org.opensearch.datafusion.DataFusionJNI.createGlobalRuntime; +import static org.opensearch.datafusion.DataFusionQueryJNI.closeGlobalRuntime; +import static org.opensearch.datafusion.DataFusionQueryJNI.createGlobalRuntime; -public class GlobalRuntimeEnv implements AutoCloseable{ +/** + * Global runtime environment for DataFusion operations. + * Manages the lifecycle of the native DataFusion runtime. + */ +public class GlobalRuntimeEnv implements AutoCloseable { // ptr to runtime environment in df private final long ptr; - + /** + * Creates a new global runtime environment. + */ public GlobalRuntimeEnv() { this.ptr = createGlobalRuntime(); } + /** + * Gets the native pointer to the runtime environment. 
+ * @return the native pointer + */ public long getPointer() { return ptr; } diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/SessionContext.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/SessionContext.java index 58a750351fe3c..956aa78fdaa30 100644 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/SessionContext.java +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/SessionContext.java @@ -7,6 +7,7 @@ */ package org.opensearch.datafusion.core; + /** * Session context for datafusion */ @@ -27,6 +28,9 @@ public class SessionContext implements AutoCloseable { */ public static native void closeContext(long contextId); + /** + * Creates a new session context. + */ public SessionContext() { this.ptr = createContext(); } diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/package-info.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/package-info.java new file mode 100644 index 0000000000000..2c6e72ef3a582 --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/core/package-info.java @@ -0,0 +1,13 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/** + * Core DataFusion runtime and session management classes. + * Provides runtime environment and session context management. 
+ */ +package org.opensearch.datafusion.core; diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/package-info.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/package-info.java new file mode 100644 index 0000000000000..81017da49c16c --- /dev/null +++ b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/package-info.java @@ -0,0 +1,13 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/** + * DataFusion query engine integration for OpenSearch. + * Provides the main plugin and service classes for DataFusion functionality. + */ +package org.opensearch.datafusion; diff --git a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceRegistry.java b/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceRegistry.java deleted file mode 100644 index e5684054979ed..0000000000000 --- a/plugins/engine-datafusion/src/main/java/org/opensearch/datafusion/spi/DataSourceRegistry.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.datafusion.spi; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.util.ArrayList; -import java.util.List; -import java.util.ServiceLoader; -import java.util.concurrent.ConcurrentHashMap; - -/** - * Registry for DataFusion data source codecs. 
- */ -public class DataSourceRegistry { - - private static final Logger logger = LogManager.getLogger(DataSourceRegistry.class); - private static final DataSourceRegistry INSTANCE = new DataSourceRegistry(); - - private final ConcurrentHashMap codecs = new ConcurrentHashMap<>(); - private volatile boolean initialized = false; - - private DataSourceRegistry() { - // Private constructor for singleton - } - - /** - * Get the singleton instance of the registry. - * - * @return the registry instance - */ - public static DataSourceRegistry getInstance() { - return INSTANCE; - } - - /** - * Initialize the registry by loading available codecs. - */ - public synchronized void initialize() { - if (initialized) { - return; - } - - logger.info("Initializing DataSource registry"); - - try { - // Use ServiceLoader to discover codec implementations - ServiceLoader loader = ServiceLoader.load(DataSourceCodec.class); - - for (DataSourceCodec codec : loader) { - String codecName = codec.getClass().getSimpleName(); - codecs.put(codecName, codec); - logger.info("Registered DataSource codec: {}", codecName); - } - - initialized = true; - logger.info("DataSource registry initialized with {} codecs", codecs.size()); - - } catch (Exception e) { - logger.error("Failed to initialize DataSource registry", e); - throw new RuntimeException("Failed to initialize DataSource registry", e); - } - } - - /** - * Shutdown the registry and clean up resources. - */ - public synchronized void shutdown() { - logger.info("Shutting down DataSource registry"); - codecs.clear(); - initialized = false; - } - - /** - * Check if any codecs are available. - * - * @return true if codecs are available, false otherwise - */ - public boolean hasCodecs() { - return !codecs.isEmpty(); - } - - /** - * Get the names of all registered codecs. - * - * @return list of codec names - */ - public List getCodecNames() { - return new ArrayList<>(codecs.keySet()); - } - - /** - * Get the default codec (first available codec). 
- * - * @return the default codec, or null if none available - */ - public DataSourceCodec getDefaultEngine() { - if (codecs.isEmpty()) { - return null; - } - return codecs.values().iterator().next(); - } - - /** - * Get a codec by name. - * - * @param name the codec name - * @return the codec, or null if not found - */ - public DataSourceCodec getCodec(String name) { - return codecs.get(name); - } -} diff --git a/plugins/engine-datafusion/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec b/plugins/engine-datafusion/src/main/resources/META-INF/services/org.opensearch.vectorized.execution.spi.DataSourceCodec similarity index 100% rename from plugins/engine-datafusion/src/main/resources/META-INF/services/org.opensearch.datafusion.spi.DataSourceCodec rename to plugins/engine-datafusion/src/main/resources/META-INF/services/org.opensearch.vectorized.execution.spi.DataSourceCodec diff --git a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/TestDataFusionServiceTests.java b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/TestDataFusionServiceTests.java index e63e749b9cdf5..395e2fae52e2f 100644 --- a/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/TestDataFusionServiceTests.java +++ b/plugins/engine-datafusion/src/test/java/org/opensearch/datafusion/TestDataFusionServiceTests.java @@ -8,16 +8,11 @@ package org.opensearch.datafusion; -import org.junit.Before; -import org.junit.Test; -import org.junit.Assume; -import org.opensearch.datafusion.core.SessionContext; import org.opensearch.test.OpenSearchTestCase; +import java.util.Collections; import java.util.List; -import static org.junit.Assert.*; - /** * Unit tests for DataFusionService * @@ -29,14 +24,13 @@ public class TestDataFusionServiceTests extends OpenSearchTestCase { private DataFusionService service; - @Before + @Override public void setUp() throws Exception { super.setUp(); - service = new DataFusionService(); + service = new 
DataFusionService(Collections.emptyMap()); service.doStart(); } - @Test public void testGetVersion() { String version = service.getVersion(); assertNotNull(version); @@ -45,13 +39,17 @@ public void testGetVersion() { assertTrue("Version should contain CsvDataSourceCodec", version.contains("CsvDataSourceCodec")); } - @Test public void testCreateAndCloseContext() { - service.registerDirectory("/somedir", List.of("some.csv")); + service.registerDirectory("/Users/gbh/Documents", List.of("parquet-nested.csv")); long contextId = service.createSessionContext().join(); // Create context assertTrue(contextId > 0); service.getVersion(); } + + public void testCodecDiscovery() { + // Test that the CSV codec can be discovered via SPI + // TODO : test with dummy plugin and dummy codec + } } diff --git a/server/build.gradle b/server/build.gradle index 803d791295e71..100ff7be5b49b 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -71,6 +71,7 @@ dependencies { api project(":libs:opensearch-geo") api project(":libs:opensearch-telemetry") api project(":libs:opensearch-task-commons") + api project(':libs:opensearch-vectorized-exec-spi') compileOnly project(":libs:agent-sm:bootstrap") compileOnly project(':libs:opensearch-plugin-classloader') @@ -114,6 +115,7 @@ dependencies { api libs.protobuf api libs.jakartaannotation + // https://mvnrepository.com/artifact/org.roaringbitmap/RoaringBitmap api libs.roaringbitmap testImplementation 'org.awaitility:awaitility:4.3.0' diff --git a/server/src/main/java/org/opensearch/node/Node.java b/server/src/main/java/org/opensearch/node/Node.java index b972457ee085a..1c2701e95a72e 100644 --- a/server/src/main/java/org/opensearch/node/Node.java +++ b/server/src/main/java/org/opensearch/node/Node.java @@ -210,6 +210,8 @@ import org.opensearch.plugins.ClusterPlugin; import org.opensearch.plugins.CryptoKeyProviderPlugin; import org.opensearch.plugins.CryptoPlugin; +import org.opensearch.plugins.DataSourceAwarePlugin; +import 
org.opensearch.plugins.DataSourcePlugin; import org.opensearch.plugins.DiscoveryPlugin; import org.opensearch.plugins.EnginePlugin; import org.opensearch.plugins.ExtensionAwarePlugin; @@ -285,6 +287,7 @@ import org.opensearch.transport.client.Client; import org.opensearch.transport.client.node.NodeClient; import org.opensearch.usage.UsageService; +import org.opensearch.vectorized.execution.spi.DataSourceCodec; import org.opensearch.watcher.ResourceWatcherService; import org.opensearch.wlm.WorkloadGroupService; import org.opensearch.wlm.WorkloadGroupsStateAccessor; @@ -1093,10 +1096,38 @@ protected Node(final Environment initialEnvironment, Collection clas ).stream() ) .collect(Collectors.toList()); - // Add the telemetryAwarePlugin components to the existing pluginComponents collection. pluginComponents.addAll(telemetryAwarePluginComponents); + Map dataSourceCodecMap = new HashMap<>(); + for (DataSourcePlugin dataSourcePlugin : pluginsService.filterPlugins(DataSourcePlugin.class)) { + if (dataSourcePlugin.getDataSourceCodecs().isPresent()) { + dataSourceCodecMap.putAll(dataSourcePlugin.getDataSourceCodecs().get()); + } + } + + Collection dataSourceAwareComponents = pluginsService.filterPlugins(DataSourceAwarePlugin.class) + .stream() + .flatMap( + p -> p.createComponents( + client, + clusterService, + threadPool, + resourceWatcherService, + scriptService, + xContentRegistry, + environment, + nodeEnvironment, + namedWriteableRegistry, + clusterModule.getIndexNameExpressionResolver(), + repositoriesServiceReference::get, + dataSourceCodecMap + ).stream() + ) + .collect(Collectors.toList()); + + // Add all dataSourceAwarePlugin components to the existing pluginComponents + pluginComponents.addAll(dataSourceAwareComponents); List identityAwarePlugins = pluginsService.filterPlugins(IdentityAwarePlugin.class); identityService.initializeIdentityAwarePlugins(identityAwarePlugins); diff --git a/server/src/main/java/org/opensearch/plugins/DataSourceAwarePlugin.java 
b/server/src/main/java/org/opensearch/plugins/DataSourceAwarePlugin.java new file mode 100644 index 0000000000000..1b2a4d0d05e52 --- /dev/null +++ b/server/src/main/java/org/opensearch/plugins/DataSourceAwarePlugin.java @@ -0,0 +1,51 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.plugins; + +import org.opensearch.cluster.metadata.IndexNameExpressionResolver; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.core.common.io.stream.NamedWriteableRegistry; +import org.opensearch.core.xcontent.NamedXContentRegistry; +import org.opensearch.env.Environment; +import org.opensearch.env.NodeEnvironment; +import org.opensearch.repositories.RepositoriesService; +import org.opensearch.script.ScriptService; +import org.opensearch.threadpool.ThreadPool; +import org.opensearch.transport.client.Client; +import org.opensearch.vectorized.execution.spi.DataSourceCodec; +import org.opensearch.watcher.ResourceWatcherService; + +import java.util.Collection; +import java.util.Collections; +import java.util.Map; +import java.util.function.Supplier; + +public interface DataSourceAwarePlugin { + void registerDataSources(Map dataSourceCodecs); + + /** + * Make dataSourceCodecs available for the DataSourceAwarePlugin(s) + */ + default Collection createComponents( + Client client, + ClusterService clusterService, + ThreadPool threadPool, + ResourceWatcherService resourceWatcherService, + ScriptService scriptService, + NamedXContentRegistry xContentRegistry, + Environment environment, + NodeEnvironment nodeEnvironment, + NamedWriteableRegistry namedWriteableRegistry, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier repositoriesServiceSupplier, + Map dataSourceCodecs + ) { + return Collections.emptyList(); + } +} diff --git 
a/server/src/main/java/org/opensearch/plugins/DataSourcePlugin.java b/server/src/main/java/org/opensearch/plugins/DataSourcePlugin.java new file mode 100644 index 0000000000000..3118e3d1e7d90 --- /dev/null +++ b/server/src/main/java/org/opensearch/plugins/DataSourcePlugin.java @@ -0,0 +1,21 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.plugins; + +import org.opensearch.vectorized.execution.spi.DataSourceCodec; + +import java.util.Map; +import java.util.Optional; + +public interface DataSourcePlugin { + // TODO : move to vectorized exec specific plugin + default Optional> getDataSourceCodecs() { + return Optional.empty(); + } +}