Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions common/src/main/java/org/apache/comet/udf/CometUdfBridge.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.udf;

import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.arrow.c.ArrowArray;
import org.apache.arrow.c.ArrowSchema;
import org.apache.arrow.c.Data;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.ValueVector;

/**
* JNI entry point for native execution to invoke a {@link CometUDF}. Matches the static-method
* pattern used by CometScalarSubquery so the native side can dispatch via
* call_static_method_unchecked.
*/
public class CometUdfBridge {

  // Per-thread, bounded LRU of UDF instances keyed by class name. Comet
  // native execution threads (Tokio/DataFusion worker pool) are reused
  // across tasks within an executor, so the effective lifetime of cached
  // entries is the worker thread (i.e. the executor JVM). This is fine for
  // stateless UDFs like ArrayExistsUDF; future stateful UDFs would need
  // explicit per-task isolation.
  private static final int CACHE_CAPACITY = 64;

  // Access-ordered LinkedHashMap gives LRU eviction via removeEldestEntry.
  // ThreadLocal avoids synchronization: each native worker thread owns its
  // own cache, so there is no cross-thread sharing of CometUDF instances.
  private static final ThreadLocal<LinkedHashMap<String, CometUDF>> INSTANCES =
      ThreadLocal.withInitial(
          () ->
              new LinkedHashMap<String, CometUDF>(CACHE_CAPACITY, 0.75f, true) {
                @Override
                protected boolean removeEldestEntry(Map.Entry<String, CometUDF> eldest) {
                  return size() > CACHE_CAPACITY;
                }
              });

  /**
   * Called from native via JNI. Imports the input Arrow vectors through the C Data Interface,
   * invokes the UDF, validates the result, and exports it back through the pre-allocated output
   * structs. All imported inputs and the result vector are closed before returning, regardless of
   * success or failure; the pre-allocated FFI structs themselves are owned by the native side.
   *
   * @param udfClassName fully-qualified class name implementing CometUDF
   * @param inputArrayPtrs addresses of pre-allocated FFI_ArrowArray structs (one per input)
   * @param inputSchemaPtrs addresses of pre-allocated FFI_ArrowSchema structs (one per input)
   * @param outArrayPtr address of pre-allocated FFI_ArrowArray for the result
   * @param outSchemaPtr address of pre-allocated FFI_ArrowSchema for the result
   * @throws RuntimeException if the UDF cannot be instantiated, returns null or a non-FieldVector,
   *     or returns a vector whose length does not match the longest input
   */
  public static void evaluate(
      String udfClassName,
      long[] inputArrayPtrs,
      long[] inputSchemaPtrs,
      long outArrayPtr,
      long outSchemaPtr) {
    LinkedHashMap<String, CometUDF> cache = INSTANCES.get();
    CometUDF udf = cache.get(udfClassName);
    if (udf == null) {
      try {
        // Resolve via the executor's context classloader so user-supplied UDF jars
        // (added via spark.jars / --jars) are visible.
        ClassLoader cl = Thread.currentThread().getContextClassLoader();
        if (cl == null) {
          cl = CometUdfBridge.class.getClassLoader();
        }
        udf =
            (CometUDF) Class.forName(udfClassName, true, cl).getDeclaredConstructor().newInstance();
      } catch (ReflectiveOperationException e) {
        throw new RuntimeException("Failed to instantiate CometUDF: " + udfClassName, e);
      }
      cache.put(udfClassName, udf);
    }

    BufferAllocator allocator = org.apache.comet.package$.MODULE$.CometArrowAllocator();

    ValueVector[] inputs = new ValueVector[inputArrayPtrs.length];
    ValueVector result = null;
    try {
      for (int i = 0; i < inputArrayPtrs.length; i++) {
        ArrowArray inArr = ArrowArray.wrap(inputArrayPtrs[i]);
        ArrowSchema inSch = ArrowSchema.wrap(inputSchemaPtrs[i]);
        inputs[i] = Data.importVector(allocator, inArr, inSch, null);
      }

      result = udf.evaluate(inputs);
      // Check null first: the instanceof branch below formats the error message with
      // result.getClass(), which would throw NullPointerException and mask the real problem.
      if (result == null) {
        throw new RuntimeException(
            "CometUDF.evaluate() returned null for UDF: " + udfClassName);
      }
      if (!(result instanceof FieldVector)) {
        throw new RuntimeException(
            "CometUDF.evaluate() must return a FieldVector, got: " + result.getClass().getName());
      }
      // Result length must match the longest input. Scalar (length-1) inputs
      // are allowed to be shorter, but a vector input bounds the output.
      int expectedLen = 0;
      for (ValueVector v : inputs) {
        expectedLen = Math.max(expectedLen, v.getValueCount());
      }
      if (result.getValueCount() != expectedLen) {
        throw new RuntimeException(
            "CometUDF.evaluate() returned "
                + result.getValueCount()
                + " rows, expected "
                + expectedLen);
      }
      ArrowArray outArr = ArrowArray.wrap(outArrayPtr);
      ArrowSchema outSch = ArrowSchema.wrap(outSchemaPtr);
      Data.exportVector(allocator, (FieldVector) result, null, outArr, outSch);
    } finally {
      for (ValueVector v : inputs) {
        if (v != null) {
          try {
            v.close();
          } catch (RuntimeException ignored) {
            // do not mask the original throwable
          }
        }
      }
      if (result != null) {
        try {
          result.close();
        } catch (RuntimeException ignored) {
          // do not mask the original throwable
        }
      }
    }
  }
}
142 changes: 142 additions & 0 deletions common/src/main/scala/org/apache/comet/udf/ArrayExistsUDF.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.udf

import java.nio.charset.StandardCharsets

import org.apache.arrow.vector._
import org.apache.arrow.vector.complex.ListVector
import org.apache.spark.sql.catalyst.expressions.{ArrayExists, LambdaFunction, NamedLambdaVariable}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

import org.apache.comet.CometArrowAllocator

/**
* JVM UDF implementing Spark's `exists(array, x -> predicate(x))` higher-order function.
*
* Inputs:
* - inputs(0): ListVector (the array column)
* - inputs(1): VarCharVector length-1 scalar (registry key for the lambda expression)
*
* Output: BitVector (nullable boolean), same length as the input array vector.
*
* Implements Spark's three-valued logic:
* - true if any element satisfies the predicate
* - null if no element satisfies but the predicate returned null for at least one element
* - false if all elements produce false
*/
class ArrayExistsUDF extends CometUDF {

  /**
   * Evaluates `exists(array, x -> predicate(x))` over a batch.
   *
   * inputs(0) is the ListVector array column; inputs(1) is a length-1 VarCharVector holding the
   * registry key under which the planner stored the [[ArrayExists]] expression. Returns a
   * BitVector with one result per input row, applying Spark's three-valued logic.
   */
  override def evaluate(inputs: Array[ValueVector]): ValueVector = {
    require(inputs.length == 2, s"ArrayExistsUDF expects 2 inputs, got ${inputs.length}")
    val listVec = inputs(0).asInstanceOf[ListVector]
    val keyVec = inputs(1).asInstanceOf[VarCharVector]
    require(
      keyVec.getValueCount >= 1 && !keyVec.isNull(0),
      "ArrayExistsUDF requires a non-null scalar registry key")

    val registryKey = new String(keyVec.get(0), StandardCharsets.UTF_8)
    val arrayExistsExpr = CometLambdaRegistry.get(registryKey).asInstanceOf[ArrayExists]

    // Fail with a clear message instead of a bare MatchError if the lambda does not have the
    // expected single-variable shape.
    val elementVar = arrayExistsExpr.function match {
      case LambdaFunction(_, Seq(v: NamedLambdaVariable), _) => v
      case other =>
        throw new IllegalStateException(
          s"ArrayExistsUDF expected a single-argument lambda, got: $other")
    }
    val body = arrayExistsExpr.functionForEval
    val followThreeValuedLogic = arrayExistsExpr.followThreeValuedLogic
    val elementType = elementVar.dataType

    val dataVec = listVec.getDataVector
    val n = listVec.getValueCount
    val out = new BitVector("exists_result", CometArrowAllocator)
    // Close the output vector if anything below throws (e.g. an unsupported element type),
    // otherwise its buffers leak from CometArrowAllocator.
    try {
      out.allocateNew(n)

      var i = 0
      while (i < n) {
        if (listVec.isNull(i)) {
          out.setNull(i)
        } else {
          val startIdx = listVec.getElementStartIndex(i)
          val endIdx = listVec.getElementEndIndex(i)
          var exists = false
          var foundNull = false
          var j = startIdx
          while (j < endIdx && !exists) {
            if (dataVec.isNull(j)) {
              // Null element: evaluate the predicate with the lambda variable bound to null.
              elementVar.value.set(null)
              val ret = body.eval(null)
              if (ret == null) foundNull = true
              else if (ret.asInstanceOf[Boolean]) exists = true
            } else {
              val elem = getSparkValue(dataVec, j, elementType)
              elementVar.value.set(elem)
              val ret = body.eval(null)
              if (ret == null) foundNull = true
              else if (ret.asInstanceOf[Boolean]) exists = true
            }
            j += 1
          }
          // Spark semantics: true wins; otherwise a null predicate result yields null when
          // three-valued logic is enabled; otherwise false.
          if (exists) {
            out.set(i, 1)
          } else if (followThreeValuedLogic && foundNull) {
            out.setNull(i)
          } else {
            out.set(i, 0)
          }
        }
        i += 1
      }
      out.setValueCount(n)
      out
    } catch {
      case t: Throwable =>
        out.close()
        throw t
    }
  }

  /**
   * Reads element `index` from an Arrow vector and converts it to the Catalyst internal
   * representation expected by `Expression.eval` for the given Spark type.
   */
  private def getSparkValue(vec: ValueVector, index: Int, sparkType: DataType): Any = {
    sparkType match {
      case BooleanType =>
        vec.asInstanceOf[BitVector].get(index) == 1
      case ByteType =>
        vec.asInstanceOf[TinyIntVector].get(index).toByte
      case ShortType =>
        vec.asInstanceOf[SmallIntVector].get(index).toShort
      case IntegerType =>
        vec.asInstanceOf[IntVector].get(index)
      case LongType =>
        vec.asInstanceOf[BigIntVector].get(index)
      case FloatType =>
        vec.asInstanceOf[Float4Vector].get(index)
      case DoubleType =>
        vec.asInstanceOf[Float8Vector].get(index)
      case StringType =>
        // Catalyst strings are UTF8String, not java.lang.String.
        val bytes = vec.asInstanceOf[VarCharVector].get(index)
        UTF8String.fromBytes(bytes)
      case BinaryType =>
        vec.asInstanceOf[VarBinaryVector].get(index)
      case _: DecimalType =>
        val dt = sparkType.asInstanceOf[DecimalType]
        val decimal = vec.asInstanceOf[DecimalVector].getObject(index)
        Decimal(decimal, dt.precision, dt.scale)
      case DateType =>
        // Catalyst DateType is days-since-epoch, matching DateDayVector's int encoding.
        vec.asInstanceOf[DateDayVector].get(index)
      case TimestampType =>
        // NOTE(review): assumes timestamps arrive as microsecond-precision TZ vectors; a
        // non-TZ TimeStampMicroVector input would fail the cast — confirm against the writer.
        vec.asInstanceOf[TimeStampMicroTZVector].get(index)
      case _ =>
        throw new UnsupportedOperationException(
          s"ArrayExistsUDF does not yet support element type: $sparkType")
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.udf

import java.util.UUID
import java.util.concurrent.ConcurrentHashMap

import org.apache.spark.sql.catalyst.expressions.Expression

/**
* Thread-safe registry bridging plan-time Spark expressions to execution-time UDF lookup. At plan
* time the serde layer registers a lambda expression under a unique key; at execution time the
* UDF retrieves it by that key (passed as a scalar argument).
*/
object CometLambdaRegistry {

  // ConcurrentHashMap: registration happens on the planner thread while lookups come from
  // native execution threads, so the map must be safe for concurrent access.
  private val registry = new ConcurrentHashMap[String, Expression]()

  /**
   * Stores `expression` under a freshly generated UUID key and returns that key. The key is
   * later passed to the UDF as a scalar argument for execution-time lookup.
   */
  def register(expression: Expression): String = {
    val generatedKey = UUID.randomUUID().toString
    registry.put(generatedKey, expression)
    generatedKey
  }

  /**
   * Looks up the expression registered under `key`.
   *
   * @throws IllegalStateException if no expression was registered under `key`
   */
  def get(key: String): Expression =
    Option(registry.get(key)).getOrElse {
      throw new IllegalStateException(
        s"Lambda expression not found in registry for key: $key. " +
          "This indicates a lifecycle issue between plan creation and execution.")
    }

  /** Removes the entry for `key`, if present; no-op otherwise. */
  def remove(key: String): Unit = registry.remove(key)

  // Visible for testing
  def size(): Int = registry.size()
}
37 changes: 37 additions & 0 deletions common/src/main/scala/org/apache/comet/udf/CometUDF.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.udf

import org.apache.arrow.vector.ValueVector

/**
 * Scalar UDF invoked from native execution via JNI. Receives Arrow vectors as input and returns
 * an Arrow vector.
 *
 * Contract (enforced by the bridge that invokes implementations):
 *   - Vector arguments arrive at the row count of the current batch.
 *   - Scalar (literal-folded) arguments arrive as length-1 vectors and must be read at index 0.
 *   - The returned vector's length must match the longest input.
 *   - The returned vector must be a FieldVector (required for C Data Interface export) and must
 *     not be null.
 *
 * Ownership: the caller closes both the input vectors and the returned vector after export;
 * implementations must not retain references to them beyond the call.
 *
 * Implementations must have a public no-arg constructor and should be stateless: instances are
 * cached per executor thread for the lifetime of the JVM.
 */
trait CometUDF {
  def evaluate(inputs: Array[ValueVector]): ValueVector
}
Loading
Loading