From b96e6e1e2c96218db5766bb9cc0a55203ea6efc1 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Tue, 5 May 2026 13:22:07 -0700 Subject: [PATCH 1/3] feat: add support for url_encode, url_decode, and try_url_decode --- native/core/src/execution/jni_api.rs | 6 ++ .../org/apache/comet/serde/statics.scala | 34 ++++++++- .../expressions/url/try_url_decode.sql | 70 +++++++++++++++++++ .../sql-tests/expressions/url/url_decode.sql | 65 +++++++++++++++++ .../sql-tests/expressions/url/url_encode.sql | 53 ++++++++++++++ 5 files changed, 226 insertions(+), 2 deletions(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/url/try_url_decode.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/url/url_decode.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/url/url_encode.sql diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index 8da688563c..9f58e636a3 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -62,6 +62,9 @@ use datafusion_spark::function::string::char::CharFunc; use datafusion_spark::function::string::concat::SparkConcat; use datafusion_spark::function::string::luhn_check::SparkLuhnCheck; use datafusion_spark::function::string::space::SparkSpace; +use datafusion_spark::function::url::try_url_decode::TryUrlDecode as SparkTryUrlDecode; +use datafusion_spark::function::url::url_decode::UrlDecode as SparkUrlDecode; +use datafusion_spark::function::url::url_encode::UrlEncode as SparkUrlEncode; use futures::poll; use futures::stream::StreamExt; use futures::FutureExt; @@ -567,6 +570,9 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkArrayContains::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBin::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkStrToMap::default())); + 
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUrlDecode::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUrlEncode::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkTryUrlDecode::default())); } /// Prepares arrow arrays for output. diff --git a/spark/src/main/scala/org/apache/comet/serde/statics.scala b/spark/src/main/scala/org/apache/comet/serde/statics.scala index 9dbc6d169f..bff64e753a 100644 --- a/spark/src/main/scala/org/apache/comet/serde/statics.scala +++ b/spark/src/main/scala/org/apache/comet/serde/statics.scala @@ -19,11 +19,12 @@ package org.apache.comet.serde -import org.apache.spark.sql.catalyst.expressions.{Attribute, ExpressionImplUtils} +import org.apache.spark.sql.catalyst.expressions.{Attribute, ExpressionImplUtils, Literal, UrlCodec} import org.apache.spark.sql.catalyst.expressions.objects.StaticInvoke import org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils import org.apache.comet.CometSparkSessionExtensions.withInfo +import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto} object CometStaticInvoke extends CometExpressionSerde[StaticInvoke] { @@ -35,7 +36,9 @@ object CometStaticInvoke extends CometExpressionSerde[StaticInvoke] { Map( ("readSidePadding", classOf[CharVarcharCodegenUtils]) -> CometScalarFunction( "read_side_padding"), - ("isLuhnNumber", classOf[ExpressionImplUtils]) -> CometScalarFunction("luhn_check")) + ("isLuhnNumber", classOf[ExpressionImplUtils]) -> CometScalarFunction("luhn_check"), + ("encode", UrlCodec.getClass) -> CometUrlEncodeStaticInvoke, + ("decode", UrlCodec.getClass) -> CometUrlDecodeStaticInvoke) override def convert( expr: StaticInvoke, @@ -53,3 +56,30 @@ object CometStaticInvoke extends CometExpressionSerde[StaticInvoke] { } } } + +object CometUrlEncodeStaticInvoke extends CometExpressionSerde[StaticInvoke] { + override def convert( + expr: StaticInvoke, + inputs: Seq[Attribute], + binding: Boolean): 
Option[ExprOuterClass.Expr] = { + val childExpr = exprToProtoInternal(expr.children.head, inputs, binding) + val optExpr = scalarFunctionExprToProto("url_encode", childExpr) + optExprWithInfo(optExpr, expr, expr.children: _*) + } +} + +object CometUrlDecodeStaticInvoke extends CometExpressionSerde[StaticInvoke] { + override def convert( + expr: StaticInvoke, + inputs: Seq[Attribute], + binding: Boolean): Option[ExprOuterClass.Expr] = { + val failOnError = expr.children match { + case Seq(_, Literal(false, _)) => false + case _ => true + } + val funcName = if (failOnError) "url_decode" else "try_url_decode" + val childExpr = exprToProtoInternal(expr.children.head, inputs, binding) + val optExpr = scalarFunctionExprToProto(funcName, childExpr) + optExprWithInfo(optExpr, expr, expr.children: _*) + } +} diff --git a/spark/src/test/resources/sql-tests/expressions/url/try_url_decode.sql b/spark/src/test/resources/sql-tests/expressions/url/try_url_decode.sql new file mode 100644 index 0000000000..7916c2bf7e --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/try_url_decode.sql @@ -0,0 +1,70 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- try_url_decode is Spark 4.0+. 
It rewrites to UrlDecode(_, failOnError=false), +-- which becomes StaticInvoke(UrlCodec, "decode", [child, Literal(false)], ...). +-- CometUrlDecodeStaticInvoke detects failOnError=false and emits try_url_decode. + +-- MinSparkVersion: 4.0 + +statement +CREATE TABLE test_try_decode(s string) USING parquet + +statement +INSERT INTO test_try_decode VALUES + ('https%3A%2F%2Fspark.apache.org'), + ('hello+world'), + ('a%2Bb%3Dc%26d%3De'), + ('caf%C3%A9'), + (''), + (NULL), + ('no+encoding+needed'), + ('%21%40%23%24%25%5E%26%2A%28%29%5F%2B'), + ('%2a%2b%2c'), + ('http%3A%2F%2spark.apache.org') + +query +SELECT try_url_decode(s) FROM test_try_decode + +-- literal arguments +query +SELECT try_url_decode('https%3A%2F%2Fspark.apache.org') + +query +SELECT try_url_decode('hello+world') + +query +SELECT try_url_decode('') + +query +SELECT try_url_decode(NULL) + +-- roundtrip: encode then decode +query +SELECT try_url_decode(url_encode('hello world & goodbye')) + +-- multibyte UTF-8 +query +SELECT try_url_decode('%E6%97%A5%E6%9C%AC%E8%AA%9E%E3%83%86%E3%82%B9%E3%83%88') + +-- lowercase hex (RFC 3986 says hex digits are case-insensitive) +query +SELECT try_url_decode('%2a%2b%2c') + +-- malformed percent-encoding: try_url_decode returns NULL instead of erroring +query +SELECT try_url_decode('http%3A%2F%2spark.apache.org') diff --git a/spark/src/test/resources/sql-tests/expressions/url/url_decode.sql b/spark/src/test/resources/sql-tests/expressions/url/url_decode.sql new file mode 100644 index 0000000000..42b22f575f --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/url_decode.sql @@ -0,0 +1,65 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. 
The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- url_decode function +statement +CREATE TABLE test_decode(s string) USING parquet + +statement +INSERT INTO test_decode VALUES + ('https%3A%2F%2Fspark.apache.org'), + ('hello+world'), + ('a%2Bb%3Dc%26d%3De'), + ('caf%C3%A9'), + (''), + (NULL), + ('no+encoding+needed'), + ('%21%40%23%24%25%5E%26%2A%28%29%5F%2B'), + ('%2a%2b%2c') + +query +SELECT url_decode(s) FROM test_decode + +-- literal arguments +query +SELECT url_decode('https%3A%2F%2Fspark.apache.org') + +query +SELECT url_decode('hello+world') + +query +SELECT url_decode('') + +query +SELECT url_decode(NULL) + +-- roundtrip: encode then decode +query +SELECT url_decode(url_encode('hello world & goodbye')) + +-- multibyte UTF-8 +query +SELECT url_decode('%E6%97%A5%E6%9C%AC%E8%AA%9E%E3%83%86%E3%82%B9%E3%83%88') + +-- lowercase hex (RFC 3986 says hex digits are case-insensitive) +query +SELECT url_decode('%2a%2b%2c') + +-- malformed percent-encoding: both Spark and Comet must error and the bad +-- sequence must appear in the error message +query expect_error(%2s) +SELECT url_decode('http%3A%2F%2spark.apache.org') diff --git a/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql b/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql new file mode 100644 index 0000000000..378bd1c113 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql @@ -0,0 +1,53 @@ +-- Licensed to the 
Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- url_encode function +statement +CREATE TABLE test_encode(s string) USING parquet + +statement +INSERT INTO test_encode VALUES + ('https://spark.apache.org'), + ('hello world'), + ('a+b=c&d=e'), + (''), + (NULL), + ('foo bar/baz?x=1&y=2') + +query +SELECT url_encode(s) FROM test_encode + +-- literal arguments +query +SELECT url_encode('https://spark.apache.org') + +query +SELECT url_encode('hello world') + +query +SELECT url_encode('') + +query +SELECT url_encode(NULL) + +-- special characters +query +SELECT url_encode('a b+c&d=e/f') + +-- multibyte UTF-8 +query +SELECT url_encode('日本語テスト') From 86a234bab8bd5debbc52d7268fe1a42c17338cb0 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Tue, 5 May 2026 14:46:56 -0700 Subject: [PATCH 2/3] update doc --- docs/source/contributor-guide/spark_expressions_support.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 0c671c0403..0d10d08dac 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -587,9 +587,9 @@ - [ 
] parse_url - [ ] try_parse_url -- [ ] try_url_decode -- [ ] url_decode -- [ ] url_encode +- [x] try_url_decode +- [x] url_decode +- [x] url_encode ### window_funcs From 66aafd57ceacda60e483ae1a45512b00f2d59272 Mon Sep 17 00:00:00 2001 From: Parth Chandra Date: Tue, 5 May 2026 18:05:28 -0700 Subject: [PATCH 3/3] feat: add support for parse_url, url_encode, url_decode expressions (#4152) Squashed from PR #4152 (url-functions branch). Co-Authored-By: Andy Grove --- .../spark_expressions_support.md | 13 ++- native/core/src/execution/jni_api.rs | 4 + .../apache/comet/serde/QueryPlanSerde.scala | 5 +- .../scala/org/apache/comet/serde/url.scala | 48 +++++++++++ .../sql-tests/expressions/url/parse_url.sql | 37 ++++++++ .../expressions/url/parse_url_native.sql | 85 +++++++++++++++++++ .../expressions/url/try_parse_url.sql | 40 +++++++++ .../sql-tests/expressions/url/url_encode.sql | 18 +++- 8 files changed, 246 insertions(+), 4 deletions(-) create mode 100644 spark/src/main/scala/org/apache/comet/serde/url.scala create mode 100644 spark/src/test/resources/sql-tests/expressions/url/parse_url.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/url/try_parse_url.sql diff --git a/docs/source/contributor-guide/spark_expressions_support.md b/docs/source/contributor-guide/spark_expressions_support.md index 0d10d08dac..73c2710743 100644 --- a/docs/source/contributor-guide/spark_expressions_support.md +++ b/docs/source/contributor-guide/spark_expressions_support.md @@ -585,11 +585,20 @@ ### url_funcs -- [ ] parse_url -- [ ] try_parse_url +- [x] parse_url + - 3.4.3, 2026-04-29 + - 3.5.8, 2026-04-29 + - 4.0.1, 2026-04-29: marked Incompatible. Comet tracks the work at https://github.com/apache/datafusion-comet/issues/4156, with the divergences enumerated upstream at https://github.com/apache/datafusion/issues/21943. 
- [x] try_url_decode + - 4.0.1, 2026-05-05 - [x] url_decode + - 3.4.3, 2026-04-29 + - 3.5.8, 2026-04-29 + - 4.0.1, 2026-04-29 - [x] url_encode + - 3.4.3, 2026-04-29 + - 3.5.8, 2026-04-29 + - 4.0.1, 2026-04-29 ### window_funcs diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs index 9f58e636a3..b3509215ac 100644 --- a/native/core/src/execution/jni_api.rs +++ b/native/core/src/execution/jni_api.rs @@ -62,6 +62,8 @@ use datafusion_spark::function::string::char::CharFunc; use datafusion_spark::function::string::concat::SparkConcat; use datafusion_spark::function::string::luhn_check::SparkLuhnCheck; use datafusion_spark::function::string::space::SparkSpace; +use datafusion_spark::function::url::parse_url::ParseUrl as SparkParseUrl; +use datafusion_spark::function::url::try_parse_url::TryParseUrl as SparkTryParseUrl; use datafusion_spark::function::url::try_url_decode::TryUrlDecode as SparkTryUrlDecode; use datafusion_spark::function::url::url_decode::UrlDecode as SparkUrlDecode; use datafusion_spark::function::url::url_encode::UrlEncode as SparkUrlEncode; @@ -570,6 +572,8 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) { session_ctx.register_udf(ScalarUDF::new_from_impl(SparkArrayContains::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBin::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkStrToMap::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkParseUrl::default())); + session_ctx.register_udf(ScalarUDF::new_from_impl(SparkTryParseUrl::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUrlDecode::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkUrlEncode::default())); session_ctx.register_udf(ScalarUDF::new_from_impl(SparkTryUrlDecode::default())); diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 
1cfb58b88b..931e28183b 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -234,6 +234,9 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { private val conversionExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( classOf[Cast] -> CometCast) + private val urlExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( + classOf[ParseUrl] -> CometParseUrl) + private[comet] val miscExpressions: Map[Class[_ <: Expression], CometExpressionSerde[_]] = Map( // TODO PromotePrecision classOf[Alias] -> CometAlias, @@ -258,7 +261,7 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { mathExpressions ++ hashExpressions ++ stringExpressions ++ conditionalExpressions ++ mapExpressions ++ predicateExpressions ++ structExpressions ++ bitwiseExpressions ++ miscExpressions ++ arrayExpressions ++ - temporalExpressions ++ conversionExpressions + temporalExpressions ++ conversionExpressions ++ urlExpressions /** * Mapping of Spark aggregate expression class to Comet expression handler. diff --git a/spark/src/main/scala/org/apache/comet/serde/url.scala b/spark/src/main/scala/org/apache/comet/serde/url.scala new file mode 100644 index 0000000000..c00c65977f --- /dev/null +++ b/spark/src/main/scala/org/apache/comet/serde/url.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.serde + +import org.apache.spark.sql.catalyst.expressions.{Attribute, ParseUrl} + +import org.apache.comet.serde.QueryPlanSerde.{exprToProtoInternal, optExprWithInfo, scalarFunctionExprToProto} + +object CometParseUrl extends CometExpressionSerde[ParseUrl] { + + // The full list of edge-case divergences is tracked at + // https://github.com/apache/datafusion/issues/21943. + private val incompatibleReason = + "Native parse_url diverges from Spark on several edge cases. " + + "See https://github.com/apache/datafusion/issues/21943." + + override def getIncompatibleReasons(): Seq[String] = Seq(incompatibleReason) + + override def getSupportLevel(expr: ParseUrl): SupportLevel = + Incompatible(Some(incompatibleReason)) + + override def convert( + expr: ParseUrl, + inputs: Seq[Attribute], + binding: Boolean): Option[ExprOuterClass.Expr] = { + val funcName = if (expr.failOnError) "parse_url" else "try_parse_url" + val childExprs = expr.children.map(exprToProtoInternal(_, inputs, binding)) + val optExpr = scalarFunctionExprToProto(funcName, childExprs: _*) + optExprWithInfo(optExpr, expr, expr.children: _*) + } +} diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql new file mode 100644 index 0000000000..20e80a2d4a --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url.sql @@ -0,0 +1,37 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. 
See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership. The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied. See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- parse_url is marked Incompatible (see CometParseUrl). Known divergences from
+-- Spark, tracked upstream at https://github.com/apache/datafusion/issues/21943:
+-- 1. empty-string URL returns NULL instead of "" for any part
+-- 2. FILE on a URL without an explicit path returns "/?..." instead of "?..."
+-- 3. PATH on a URL with a bare trailing slash returns "" instead of "/"
+-- In the default configuration, Comet falls back to Spark. The queries below
+-- verify that normal-shape URLs take the fallback path, and also exercise one
+-- of the divergent shapes (trailing-slash PATH) to lock in that the fallback
+-- handles it correctly. See parse_url_native.sql for native-execution coverage.
+
+query expect_fallback(not fully compatible with Spark)
+SELECT parse_url('http://spark.apache.org/path?query=1', 'HOST')
+
+query expect_fallback(not fully compatible with Spark)
+SELECT parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query')
+
+-- Trailing-slash PATH: Spark returns "/", native impl returns "". Verifying
+-- the fallback path emits Spark's "/".
+query expect_fallback(not fully compatible with Spark) +SELECT parse_url('http://example.com/', 'PATH') diff --git a/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql b/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql new file mode 100644 index 0000000000..654506ebaf --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/parse_url_native.sql @@ -0,0 +1,85 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Exercises the native parse_url implementation. Inputs are restricted to +-- URLs with explicit paths because the native implementation diverges from +-- Spark for empty-string input and for FILE extraction on path-less URLs. +-- Tracked upstream at https://github.com/apache/datafusion/issues/21943. 
+ +-- Config: spark.comet.expression.ParseUrl.allowIncompatible=true +-- Config: spark.sql.ansi.enabled=true + +statement +CREATE TABLE test_urls_native(url string) USING parquet + +statement +INSERT INTO test_urls_native VALUES + ('http://spark.apache.org/path?query=1'), + ('http://user:password@host:8080/path?key=value&key2=value2#ref'), + ('http://example.com/path'), + (NULL) + +query +SELECT parse_url(url, 'HOST') FROM test_urls_native + +query +SELECT parse_url(url, 'PATH') FROM test_urls_native + +query +SELECT parse_url(url, 'QUERY') FROM test_urls_native + +query +SELECT parse_url(url, 'REF') FROM test_urls_native + +query +SELECT parse_url(url, 'PROTOCOL') FROM test_urls_native + +query +SELECT parse_url(url, 'FILE') FROM test_urls_native + +query +SELECT parse_url(url, 'AUTHORITY') FROM test_urls_native + +query +SELECT parse_url(url, 'USERINFO') FROM test_urls_native + +query +SELECT parse_url(url, 'QUERY', 'query') FROM test_urls_native + +query +SELECT parse_url(url, 'QUERY', 'key') FROM test_urls_native + +query +SELECT parse_url(url, 'QUERY', 'key2') FROM test_urls_native + +query +SELECT parse_url(url, 'QUERY', 'nonexistent') FROM test_urls_native + +query +SELECT parse_url('http://spark.apache.org/path?query=1', 'HOST') + +query +SELECT parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query') + +query +SELECT parse_url(NULL, 'HOST') + +-- ANSI-mode invalid URL: parse_url's failOnError is driven by spark.sql.ansi.enabled +-- (set above). Both Spark (INVALID_URL error class) and Comet's native impl +-- produce a message starting "The url is invalid". 
+query expect_error(The url is invalid) +SELECT parse_url('inva lid://user:pass@host/file', 'HOST') diff --git a/spark/src/test/resources/sql-tests/expressions/url/try_parse_url.sql b/spark/src/test/resources/sql-tests/expressions/url/try_parse_url.sql new file mode 100644 index 0000000000..e62dc329e4 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/url/try_parse_url.sql @@ -0,0 +1,40 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- try_parse_url is Spark 4.0+. It rewrites to ParseUrl(_, failOnError=false), +-- so Comet emits the native try_parse_url scalar function. parse_url is marked +-- Incompatible (see CometParseUrl), so this test opts in via allowIncompatible. + +-- MinSparkVersion: 4.0 +-- Config: spark.comet.expression.ParseUrl.allowIncompatible=true + +-- Valid URL: same answer as parse_url. +query +SELECT try_parse_url('http://spark.apache.org/path?query=1', 'HOST') + +query +SELECT try_parse_url('http://spark.apache.org/path?query=1', 'QUERY', 'query') + +-- Malformed URL with a scheme: Spark returns NULL, Comet's try_parse_url +-- returns NULL (failOnError=false propagates through CometParseUrl to the +-- native try_parse_url UDF). 
+query +SELECT try_parse_url('inva lid://user:pass@host/file', 'HOST') + +-- NULL input. +query +SELECT try_parse_url(NULL, 'HOST') diff --git a/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql b/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql index 378bd1c113..3cb5933223 100644 --- a/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql +++ b/spark/src/test/resources/sql-tests/expressions/url/url_encode.sql @@ -24,9 +24,13 @@ INSERT INTO test_encode VALUES ('https://spark.apache.org'), ('hello world'), ('a+b=c&d=e'), + ('café'), (''), (NULL), - ('foo bar/baz?x=1&y=2') + ('foo bar/baz?x=1&y=2'), + ('~*''()'), + ('a%20b'), + ('\t\n\r') query SELECT url_encode(s) FROM test_encode @@ -51,3 +55,15 @@ SELECT url_encode('a b+c&d=e/f') -- multibyte UTF-8 query SELECT url_encode('日本語テスト') + +-- boundary characters in the preserved set +query +SELECT url_encode('~*''()') + +-- already-encoded input (verify double-encoding of percent) +query +SELECT url_encode('a%20b') + +-- whitespace control characters +query +SELECT url_encode('\t\n\r')