From 60ca622512e7a2aea5394cc705e560f770e1ccd1 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 2 May 2026 08:16:53 -0600 Subject: [PATCH 1/3] test: add SQL test coverage for spark.sql.legacy.timeParserPolicy Audit every Spark expression that reads spark.sql.legacy.timeParserPolicy (date_format, from_unixtime, unix_timestamp, to_unix_timestamp, to_timestamp, to_date, and Spark 4's try_to_timestamp) and add CometSqlFileTestSuite coverage. For each expression provide: - a ConfigMatrix file exercising convergent inputs under LEGACY, CORRECTED, and EXCEPTION - per-policy files locking in divergent behavior (lenient parsing under LEGACY, null returns under CORRECTED, INCONSISTENT_BEHAVIOR_CROSS_VERSION under EXCEPTION) Also add docs/source/contributor-guide/spark_configs_support.md modeled on the expression audit log to track Spark configs that affect Comet behavior, with full audit notes for the timeParserPolicy entry. All 42 generated tests pass on Spark 3.4.3, 3.5.8, and 4.0.1. --- docs/source/contributor-guide/index.md | 1 + .../spark_configs_support.md | 109 ++++++++++++++++++ .../date_format_time_parser_policy.sql | 49 ++++++++ ...te_format_time_parser_policy_corrected.sql | 33 ++++++ ...te_format_time_parser_policy_exception.sql | 31 +++++ .../date_format_time_parser_policy_legacy.sql | 31 +++++ .../from_unix_time_time_parser_policy.sql | 49 ++++++++ ...unix_time_time_parser_policy_corrected.sql | 31 +++++ ...unix_time_time_parser_policy_exception.sql | 31 +++++ ...om_unix_time_time_parser_policy_legacy.sql | 31 +++++ .../datetime/to_date_time_parser_policy.sql | 38 ++++++ .../to_date_time_parser_policy_corrected.sql | 35 ++++++ .../to_date_time_parser_policy_exception.sql | 30 +++++ .../to_date_time_parser_policy_legacy.sql | 35 ++++++ .../to_timestamp_time_parser_policy.sql | 47 ++++++++ ...timestamp_time_parser_policy_corrected.sql | 35 ++++++ ...timestamp_time_parser_policy_exception.sql | 30 +++++ ...to_timestamp_time_parser_policy_legacy.sql | 35 ++++++ 
.../to_unix_timestamp_time_parser_policy.sql | 47 ++++++++ ...timestamp_time_parser_policy_corrected.sql | 35 ++++++ ...timestamp_time_parser_policy_exception.sql | 30 +++++ ...ix_timestamp_time_parser_policy_legacy.sql | 35 ++++++ .../try_to_timestamp_time_parser_policy.sql | 48 ++++++++ ...timestamp_time_parser_policy_corrected.sql | 36 ++++++ ...timestamp_time_parser_policy_exception.sql | 33 ++++++ ...to_timestamp_time_parser_policy_legacy.sql | 36 ++++++ .../unix_timestamp_time_parser_policy.sql | 50 ++++++++ ...timestamp_time_parser_policy_corrected.sql | 36 ++++++ ...timestamp_time_parser_policy_exception.sql | 31 +++++ ...ix_timestamp_time_parser_policy_legacy.sql | 36 ++++++ 30 files changed, 1134 insertions(+) create mode 100644 docs/source/contributor-guide/spark_configs_support.md create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_corrected.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_exception.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_legacy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_corrected.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_exception.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_legacy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_corrected.sql create mode 100644 
spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_exception.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_legacy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_corrected.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_exception.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_legacy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_corrected.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_exception.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_legacy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_corrected.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_exception.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_legacy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_corrected.sql create mode 100644 
spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_exception.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_legacy.sql diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md index 20e73c7428..2010335407 100644 --- a/docs/source/contributor-guide/index.md +++ b/docs/source/contributor-guide/index.md @@ -34,6 +34,7 @@ Benchmarking Guide Adding a New Operator Adding a New Expression Supported Spark Expressions +Supported Spark Configurations Tracing Profiling Comet SQL Tests diff --git a/docs/source/contributor-guide/spark_configs_support.md b/docs/source/contributor-guide/spark_configs_support.md new file mode 100644 index 0000000000..6ca4f99ecf --- /dev/null +++ b/docs/source/contributor-guide/spark_configs_support.md @@ -0,0 +1,109 @@ + + +# Supported Spark Configurations + +This document tracks Spark SQL configurations that affect Comet's behavior. For each +configuration we record which Comet expressions or operators are influenced, what +verification has been performed, and any known gaps. + +## How to Read This Document + +The status column uses these values: + +- **Supported** -- Comet runs the affected expressions natively under every value of + the config, and produces results matching Spark. +- **Partial** -- Comet runs natively for some values of the config but falls back to + Spark for others, or runs natively but with documented incompatibilities. +- **Falls back** -- Comet does not run the affected expressions natively under this + config and always defers to Spark. +- **Unaudited** -- the config's interaction with Comet has not yet been verified. 
+ +## Audited Configurations + +| Config | Default | Status | Affected expressions / operators | Spark Versions | Date | +| ------------------------------------ | ----------- | ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ---------- | +| `spark.sql.legacy.timeParserPolicy` | `EXCEPTION` | Partial (see notes) | `date_format`, `from_unixtime`, `unix_timestamp`, `to_unix_timestamp`, `to_timestamp`, `to_timestamp_ntz`, `to_date`, `try_to_timestamp` (Spark 4+) | 3.4.3, 3.5.8, 4.0.1 | 2026-05-02 | + +## Audit Notes + +### `spark.sql.legacy.timeParserPolicy` + +**Source.** `SQLConf.LEGACY_TIME_PARSER_POLICY` selects the formatter used by +`TimestampFormatter` and `DateFormatter`: + +- `LEGACY` -- `java.text.SimpleDateFormat` / `FastDateFormat`. Lenient parsing. +- `CORRECTED` -- `java.time.DateTimeFormatter` via `Iso8601TimestampFormatter`. Strict. +- `EXCEPTION` (default) -- same parser as `CORRECTED`, plus + `DateTimeFormatterHelper.checkParsedDiff` raises `SparkUpgradeException` + (`INCONSISTENT_BEHAVIOR_CROSS_VERSION`) when the new parser fails on input that the + legacy parser would have accepted. Pattern validation also raises + `SparkUpgradeException` when a pattern is recognized only by the legacy formatter + (this check applies under both `CORRECTED` and `EXCEPTION`). + +**Affected expressions.** Determined by tracing `TimestampFormatterHelper`, +`TimestampFormatter(...)`, and `DateFormatter(...)` usage in +`sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala` +across Spark 3.4, 3.5, 4.0, and 4.1.
Three expression classes mix in +`TimestampFormatterHelper`: + +- `DateFormatClass` -- `date_format` +- `FromUnixTime` -- `from_unixtime` +- `ToTimestamp` (abstract) -- `UnixTimestamp` (`unix_timestamp`), + `ToUnixTimestamp` (`to_unix_timestamp`), `GetTimestamp` (used by + `ParseToTimestamp` for `to_timestamp` / `to_timestamp_ntz`, `ParseToDate` for + `to_date`, and Spark 4's `try_to_timestamp`) + +`Cast` between strings and date / timestamp also reads the policy via the default +formatters but is tested separately by `CometCastSuite` and is out of scope here. + +**Comet status.** None of the listed expressions consult `legacyTimeParserPolicy` in +their Comet serde. The native implementations of `date_format`, `from_unixtime`, and +`unix_timestamp` use a fixed strftime-style mapping that does not vary with policy; +the remaining four (`to_unix_timestamp`, `to_timestamp`, `to_date`, +`try_to_timestamp`) have no native implementation and fall back to Spark. Today this +works because: + +- `date_format` is `Compatible` only for a small whitelist of formats under UTC; the + whitelisted formats happen to produce identical output under all three policies. +- `from_unixtime` is marked `Incompatible` and falls back unless + `spark.comet.expression.FromUnixTime.allowIncompatible=true` is set. +- `unix_timestamp()` does not call the formatter at all; the + string-input overload falls back. + +If a Comet contributor adds native string-format parsing or extends the date_format +whitelist, this audit should be revisited and the policy must be honored explicitly. + +**Test coverage.** `spark/src/test/resources/sql-tests/expressions/datetime/`: + +- One ConfigMatrix file per expression covering convergent inputs under + `LEGACY,CORRECTED,EXCEPTION` (`*_time_parser_policy.sql`). +- Per-policy files locking in divergent behavior: + - `_legacy.sql` -- lenient inputs (single-digit fields, out-of-range values, + trailing characters) and legacy-only pattern tokens (`'aaaa'`). 
+ - `_corrected.sql` -- the same inputs return null; legacy-only tokens raise + `INCONSISTENT_BEHAVIOR_CROSS_VERSION.DATETIME_PATTERN_RECOGNITION` at formatter + creation. + - `_exception.sql` -- the same inputs raise + `INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER` at parse time. + +**Findings.** All 42 generated test cases pass on Spark 3.4.3, 3.5.8, and 4.0.1. No +Comet bugs were uncovered by the audit. The tests use `query spark_answer_only` so +that result-correctness is enforced regardless of whether Comet runs the expression +natively or falls back. diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy.sql new file mode 100644 index 0000000000..a44dff6114 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy.sql @@ -0,0 +1,49 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Convergent date_format() behavior across all three timeParserPolicy values. +-- Patterns here produce identical output under LEGACY, CORRECTED, and EXCEPTION. 
+-- ConfigMatrix: spark.sql.legacy.timeParserPolicy=LEGACY,CORRECTED,EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_date_format_policy(ts timestamp) USING parquet + +statement +INSERT INTO test_date_format_policy VALUES (timestamp('2024-06-15 10:30:45')), (timestamp('1970-01-01 00:00:00')), (NULL) + +query spark_answer_only +SELECT date_format(ts, 'yyyy-MM-dd') FROM test_date_format_policy + +query spark_answer_only +SELECT date_format(ts, 'yyyy-MM-dd HH:mm:ss') FROM test_date_format_policy + +query spark_answer_only +SELECT date_format(ts, 'HH:mm:ss') FROM test_date_format_policy + +query spark_answer_only +SELECT date_format(ts, 'yyyyMMdd') FROM test_date_format_policy + +query spark_answer_only +SELECT date_format(ts, 'yyyyMM') FROM test_date_format_policy + +-- literal arguments +query spark_answer_only +SELECT date_format(timestamp('2024-06-15 10:30:45'), 'yyyy-MM-dd') + +query spark_answer_only +SELECT date_format(NULL, 'yyyy-MM-dd') diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_corrected.sql b/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_corrected.sql new file mode 100644 index 0000000000..700c571122 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_corrected.sql @@ -0,0 +1,33 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- date_format() under CORRECTED timeParserPolicy. +-- Patterns recognized only by the legacy formatter raise SparkUpgradeException at +-- formatter creation, even under CORRECTED, because validatePatternString is called +-- with checkLegacy=true. +-- Config: spark.sql.legacy.timeParserPolicy=CORRECTED +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_date_format_corrected(ts timestamp) USING parquet + +statement +INSERT INTO test_date_format_corrected VALUES (timestamp('2024-06-15 10:30:45')) + +-- 4-char am/pm marker: legacy accepts, new rejects, validation throws SparkUpgradeException. +query expect_error(INCONSISTENT_BEHAVIOR_CROSS_VERSION) +SELECT date_format(ts, 'yyyy-MM-dd aaaa') FROM test_date_format_corrected diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_exception.sql b/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_exception.sql new file mode 100644 index 0000000000..44398e6774 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_exception.sql @@ -0,0 +1,31 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- date_format() under EXCEPTION timeParserPolicy (the default). +-- Patterns rejected by the new formatter but accepted by legacy raise +-- SparkUpgradeException at formatter creation. +-- Config: spark.sql.legacy.timeParserPolicy=EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_date_format_exception(ts timestamp) USING parquet + +statement +INSERT INTO test_date_format_exception VALUES (timestamp('2024-06-15 10:30:45')) + +query expect_error(INCONSISTENT_BEHAVIOR_CROSS_VERSION) +SELECT date_format(ts, 'yyyy-MM-dd aaaa') FROM test_date_format_exception diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_legacy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_legacy.sql new file mode 100644 index 0000000000..7a8d4f6496 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/date_format_time_parser_policy_legacy.sql @@ -0,0 +1,31 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- date_format() under LEGACY timeParserPolicy. +-- Legacy SimpleDateFormat accepts patterns that the new java.time formatter rejects. +-- Config: spark.sql.legacy.timeParserPolicy=LEGACY +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_date_format_legacy(ts timestamp) USING parquet + +statement +INSERT INTO test_date_format_legacy VALUES (timestamp('2024-06-15 10:30:45')), (timestamp('1970-01-01 00:00:00')), (NULL) + +-- Legacy-only token: 4-char am/pm marker is invalid in the new formatter. +query spark_answer_only +SELECT date_format(ts, 'yyyy-MM-dd aaaa') FROM test_date_format_legacy diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy.sql new file mode 100644 index 0000000000..f4b58abb24 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy.sql @@ -0,0 +1,49 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Convergent from_unixtime() behavior across all three timeParserPolicy values. +-- Patterns here produce identical output under LEGACY, CORRECTED, and EXCEPTION. +-- ConfigMatrix: spark.sql.legacy.timeParserPolicy=LEGACY,CORRECTED,EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_from_unix_time_policy(t long) USING parquet + +statement +INSERT INTO test_from_unix_time_policy VALUES (0), (1718451045), (-1), (NULL), (2147483647) + +query spark_answer_only +SELECT from_unixtime(t) FROM test_from_unix_time_policy + +query spark_answer_only +SELECT from_unixtime(t, 'yyyy-MM-dd') FROM test_from_unix_time_policy + +query spark_answer_only +SELECT from_unixtime(t, 'yyyy-MM-dd HH:mm:ss') FROM test_from_unix_time_policy + +query spark_answer_only +SELECT from_unixtime(t, 'HH:mm:ss') FROM test_from_unix_time_policy + +-- literal arguments +query spark_answer_only +SELECT from_unixtime(0) + +query spark_answer_only +SELECT from_unixtime(1718451045, 'yyyy-MM-dd') + +query spark_answer_only +SELECT from_unixtime(NULL, 'yyyy-MM-dd') diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_corrected.sql b/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_corrected.sql new file mode 100644 index 0000000000..6cbdcf4607 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_corrected.sql @@ -0,0 +1,31 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor 
license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- from_unixtime() under CORRECTED timeParserPolicy. +-- Patterns recognized only by the legacy formatter raise SparkUpgradeException +-- at formatter creation, even under CORRECTED. +-- Config: spark.sql.legacy.timeParserPolicy=CORRECTED +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_from_unix_time_corrected(t long) USING parquet + +statement +INSERT INTO test_from_unix_time_corrected VALUES (1718451045) + +query expect_error(INCONSISTENT_BEHAVIOR_CROSS_VERSION) +SELECT from_unixtime(t, 'yyyy-MM-dd aaaa') FROM test_from_unix_time_corrected diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_exception.sql b/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_exception.sql new file mode 100644 index 0000000000..cc3fced087 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_exception.sql @@ -0,0 +1,31 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. 
The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- from_unixtime() under EXCEPTION timeParserPolicy (the default). +-- Patterns rejected by the new formatter but accepted by legacy raise +-- SparkUpgradeException at formatter creation. +-- Config: spark.sql.legacy.timeParserPolicy=EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_from_unix_time_exception(t long) USING parquet + +statement +INSERT INTO test_from_unix_time_exception VALUES (1718451045) + +query expect_error(INCONSISTENT_BEHAVIOR_CROSS_VERSION) +SELECT from_unixtime(t, 'yyyy-MM-dd aaaa') FROM test_from_unix_time_exception diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_legacy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_legacy.sql new file mode 100644 index 0000000000..0b2f63cb36 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/from_unix_time_time_parser_policy_legacy.sql @@ -0,0 +1,31 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- from_unixtime() under LEGACY timeParserPolicy. +-- Legacy SimpleDateFormat accepts patterns the new java.time formatter rejects. +-- Config: spark.sql.legacy.timeParserPolicy=LEGACY +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_from_unix_time_legacy(t long) USING parquet + +statement +INSERT INTO test_from_unix_time_legacy VALUES (0), (1718451045), (NULL) + +-- Legacy-only token: 4-char am/pm marker formats successfully under LEGACY. +query spark_answer_only +SELECT from_unixtime(t, 'yyyy-MM-dd aaaa') FROM test_from_unix_time_legacy diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy.sql new file mode 100644 index 0000000000..dc46bbfdda --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy.sql @@ -0,0 +1,38 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Convergent to_date(string, format) behavior across all three policies. +-- ConfigMatrix: spark.sql.legacy.timeParserPolicy=LEGACY,CORRECTED,EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_date_policy(s string) USING parquet + +statement +INSERT INTO test_to_date_policy VALUES ('2024-06-15'), ('1970-01-01'), (NULL), ('') + +query spark_answer_only +SELECT to_date(s, 'yyyy-MM-dd') FROM test_to_date_policy + +query spark_answer_only +SELECT to_date(s) FROM test_to_date_policy + +query spark_answer_only +SELECT to_date('2024-06-15', 'yyyy-MM-dd') + +query spark_answer_only +SELECT to_date(NULL, 'yyyy-MM-dd') diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_corrected.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_corrected.sql new file mode 100644 index 0000000000..b8ea693d2a --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_corrected.sql @@ -0,0 +1,35 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- to_date() under CORRECTED timeParserPolicy. +-- Strict java.time parsing returns null for inputs that legacy would accept. +-- Config: spark.sql.legacy.timeParserPolicy=CORRECTED +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_date_strict(s string) USING parquet + +statement +INSERT INTO test_to_date_strict VALUES + ('2024-1-1'), + ('2024-13-01'), + ('2024-02-30'), + ('2024-01-01garbage'), + ('2024') + +query spark_answer_only +SELECT s, to_date(s, 'yyyy-MM-dd') FROM test_to_date_strict ORDER BY s diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_exception.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_exception.sql new file mode 100644 index 0000000000..127eb463fa --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_exception.sql @@ -0,0 +1,30 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- to_date() under EXCEPTION timeParserPolicy (the default). +-- Inputs accepted by legacy but rejected by the new parser raise SparkUpgradeException. +-- Config: spark.sql.legacy.timeParserPolicy=EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_date_exception(s string) USING parquet + +statement +INSERT INTO test_to_date_exception VALUES ('2024-1-1') + +query expect_error(INCONSISTENT_BEHAVIOR_CROSS_VERSION) +SELECT to_date(s, 'yyyy-MM-dd') FROM test_to_date_exception diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_legacy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_legacy.sql new file mode 100644 index 0000000000..e7493e81c6 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_date_time_parser_policy_legacy.sql @@ -0,0 +1,35 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- to_date() under LEGACY timeParserPolicy. +-- Lenient SimpleDateFormat parsing accepts inputs that the new formatter rejects. +-- Config: spark.sql.legacy.timeParserPolicy=LEGACY +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_date_lenient(s string) USING parquet + +statement +INSERT INTO test_to_date_lenient VALUES + ('2024-1-1'), + ('2024-13-01'), + ('2024-02-30'), + ('2024-01-01garbage'), + ('2024') + +query spark_answer_only +SELECT s, to_date(s, 'yyyy-MM-dd') FROM test_to_date_lenient ORDER BY s diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy.sql new file mode 100644 index 0000000000..527f4d0d3a --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy.sql @@ -0,0 +1,47 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Convergent to_timestamp(string, format) behavior across all three policies. +-- ConfigMatrix: spark.sql.legacy.timeParserPolicy=LEGACY,CORRECTED,EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_ts_policy(s string) USING parquet + +statement +INSERT INTO test_to_ts_policy VALUES ('2024-06-15 10:30:45'), ('1970-01-01 00:00:00'), (NULL), ('') + +query spark_answer_only +SELECT to_timestamp(s, 'yyyy-MM-dd HH:mm:ss') FROM test_to_ts_policy + +query spark_answer_only +SELECT to_timestamp(s) FROM test_to_ts_policy + +statement +CREATE TABLE test_to_ts_date_policy(s string) USING parquet + +statement +INSERT INTO test_to_ts_date_policy VALUES ('2024-06-15'), ('1970-01-01'), (NULL) + +query spark_answer_only +SELECT to_timestamp(s, 'yyyy-MM-dd') FROM test_to_ts_date_policy + +query spark_answer_only +SELECT to_timestamp('2024-06-15', 'yyyy-MM-dd') + +query spark_answer_only +SELECT to_timestamp(NULL, 'yyyy-MM-dd') diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_corrected.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_corrected.sql new file mode 100644 index 0000000000..bb3da3c1ee --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_corrected.sql @@ -0,0 +1,35 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. 
See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- to_timestamp() under CORRECTED timeParserPolicy. +-- Strict java.time parsing returns null for inputs that legacy would accept. +-- Config: spark.sql.legacy.timeParserPolicy=CORRECTED +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_ts_strict(s string) USING parquet + +statement +INSERT INTO test_to_ts_strict VALUES + ('2024-1-1'), + ('2024-13-01'), + ('2024-02-30'), + ('2024-01-01garbage'), + ('2024') + +query spark_answer_only +SELECT s, to_timestamp(s, 'yyyy-MM-dd') FROM test_to_ts_strict ORDER BY s diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_exception.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_exception.sql new file mode 100644 index 0000000000..8280f88078 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_exception.sql @@ -0,0 +1,30 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. 
The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- to_timestamp() under EXCEPTION timeParserPolicy (the default). +-- Inputs accepted by legacy but rejected by the new parser raise SparkUpgradeException. +-- Config: spark.sql.legacy.timeParserPolicy=EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_ts_exception(s string) USING parquet + +statement +INSERT INTO test_to_ts_exception VALUES ('2024-1-1') + +query expect_error(INCONSISTENT_BEHAVIOR_CROSS_VERSION) +SELECT to_timestamp(s, 'yyyy-MM-dd') FROM test_to_ts_exception diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_legacy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_legacy.sql new file mode 100644 index 0000000000..438ae73ebb --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_timestamp_time_parser_policy_legacy.sql @@ -0,0 +1,35 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- to_timestamp() under LEGACY timeParserPolicy. +-- Lenient SimpleDateFormat parsing accepts inputs that the new formatter rejects. +-- Config: spark.sql.legacy.timeParserPolicy=LEGACY +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_ts_lenient(s string) USING parquet + +statement +INSERT INTO test_to_ts_lenient VALUES + ('2024-1-1'), + ('2024-13-01'), + ('2024-02-30'), + ('2024-01-01garbage'), + ('2024') + +query spark_answer_only +SELECT s, to_timestamp(s, 'yyyy-MM-dd') FROM test_to_ts_lenient ORDER BY s diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy.sql new file mode 100644 index 0000000000..a681d34c9c --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy.sql @@ -0,0 +1,47 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Convergent to_unix_timestamp(string, format) behavior across all three policies. +-- ConfigMatrix: spark.sql.legacy.timeParserPolicy=LEGACY,CORRECTED,EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_unix_ts_policy(s string) USING parquet + +statement +INSERT INTO test_to_unix_ts_policy VALUES ('2024-06-15 10:30:45'), ('1970-01-01 00:00:00'), (NULL), ('') + +query spark_answer_only +SELECT to_unix_timestamp(s, 'yyyy-MM-dd HH:mm:ss') FROM test_to_unix_ts_policy + +query spark_answer_only +SELECT to_unix_timestamp(s) FROM test_to_unix_ts_policy + +statement +CREATE TABLE test_to_unix_ts_date_policy(s string) USING parquet + +statement +INSERT INTO test_to_unix_ts_date_policy VALUES ('2024-06-15'), ('1970-01-01'), (NULL) + +query spark_answer_only +SELECT to_unix_timestamp(s, 'yyyy-MM-dd') FROM test_to_unix_ts_date_policy + +query spark_answer_only +SELECT to_unix_timestamp('2024-06-15', 'yyyy-MM-dd') + +query spark_answer_only +SELECT to_unix_timestamp(NULL, 'yyyy-MM-dd') diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_corrected.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_corrected.sql new file mode 100644 index 0000000000..aa235c712b --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_corrected.sql @@ -0,0 +1,35 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. 
See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- to_unix_timestamp() under CORRECTED timeParserPolicy. +-- Strict java.time parsing returns null for inputs that legacy would accept. +-- Config: spark.sql.legacy.timeParserPolicy=CORRECTED +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_unix_ts_strict(s string) USING parquet + +statement +INSERT INTO test_to_unix_ts_strict VALUES + ('2024-1-1'), + ('2024-13-01'), + ('2024-02-30'), + ('2024-01-01garbage'), + ('2024') + +query spark_answer_only +SELECT s, to_unix_timestamp(s, 'yyyy-MM-dd') FROM test_to_unix_ts_strict ORDER BY s diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_exception.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_exception.sql new file mode 100644 index 0000000000..25413bc86d --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_exception.sql @@ -0,0 +1,30 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. 
The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- to_unix_timestamp() under EXCEPTION timeParserPolicy (the default). +-- Inputs accepted by legacy but rejected by the new parser raise SparkUpgradeException. +-- Config: spark.sql.legacy.timeParserPolicy=EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_unix_ts_exception(s string) USING parquet + +statement +INSERT INTO test_to_unix_ts_exception VALUES ('2024-1-1') + +query expect_error(INCONSISTENT_BEHAVIOR_CROSS_VERSION) +SELECT to_unix_timestamp(s, 'yyyy-MM-dd') FROM test_to_unix_ts_exception diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_legacy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_legacy.sql new file mode 100644 index 0000000000..493d9c5b56 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/to_unix_timestamp_time_parser_policy_legacy.sql @@ -0,0 +1,35 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- to_unix_timestamp() under LEGACY timeParserPolicy. +-- Lenient SimpleDateFormat parsing accepts inputs that the new formatter rejects. +-- Config: spark.sql.legacy.timeParserPolicy=LEGACY +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_to_unix_ts_lenient(s string) USING parquet + +statement +INSERT INTO test_to_unix_ts_lenient VALUES + ('2024-1-1'), + ('2024-13-01'), + ('2024-02-30'), + ('2024-01-01garbage'), + ('2024') + +query spark_answer_only +SELECT s, to_unix_timestamp(s, 'yyyy-MM-dd') FROM test_to_unix_ts_lenient ORDER BY s diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy.sql new file mode 100644 index 0000000000..9c67d0466f --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy.sql @@ -0,0 +1,48 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Convergent try_to_timestamp(string, format) behavior across all three policies. +-- MinSparkVersion: 4.0 +-- ConfigMatrix: spark.sql.legacy.timeParserPolicy=LEGACY,CORRECTED,EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_try_to_ts_policy(s string) USING parquet + +statement +INSERT INTO test_try_to_ts_policy VALUES ('2024-06-15 10:30:45'), ('1970-01-01 00:00:00'), (NULL), ('') + +query spark_answer_only +SELECT try_to_timestamp(s, 'yyyy-MM-dd HH:mm:ss') FROM test_try_to_ts_policy + +query spark_answer_only +SELECT try_to_timestamp(s) FROM test_try_to_ts_policy + +statement +CREATE TABLE test_try_to_ts_date_policy(s string) USING parquet + +statement +INSERT INTO test_try_to_ts_date_policy VALUES ('2024-06-15'), ('1970-01-01'), (NULL) + +query spark_answer_only +SELECT try_to_timestamp(s, 'yyyy-MM-dd') FROM test_try_to_ts_date_policy + +query spark_answer_only +SELECT try_to_timestamp('2024-06-15', 'yyyy-MM-dd') + +query spark_answer_only +SELECT try_to_timestamp(NULL, 'yyyy-MM-dd') diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_corrected.sql b/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_corrected.sql new file mode 100644 index 0000000000..1597bfcd11 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_corrected.sql @@ -0,0 +1,36 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. 
See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- try_to_timestamp() under CORRECTED timeParserPolicy. +-- Strict java.time parsing returns null for inputs that legacy would accept. +-- MinSparkVersion: 4.0 +-- Config: spark.sql.legacy.timeParserPolicy=CORRECTED +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_try_to_ts_strict(s string) USING parquet + +statement +INSERT INTO test_try_to_ts_strict VALUES + ('2024-1-1'), + ('2024-13-01'), + ('2024-02-30'), + ('2024-01-01garbage'), + ('2024') + +query spark_answer_only +SELECT s, try_to_timestamp(s, 'yyyy-MM-dd') FROM test_try_to_ts_strict ORDER BY s diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_exception.sql b/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_exception.sql new file mode 100644 index 0000000000..5d1b91432e --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_exception.sql @@ -0,0 +1,33 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. 
The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- try_to_timestamp() under EXCEPTION timeParserPolicy (the default). +-- try_to_timestamp swallows DateTimeException/ParseException, but SparkUpgradeException +-- raised by checkParsedDiff propagates through the catch -- so the EXCEPTION case still +-- throws on legacy/new divergent inputs. +-- MinSparkVersion: 4.0 +-- Config: spark.sql.legacy.timeParserPolicy=EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_try_to_ts_exception(s string) USING parquet + +statement +INSERT INTO test_try_to_ts_exception VALUES ('2024-1-1') + +query expect_error(INCONSISTENT_BEHAVIOR_CROSS_VERSION) +SELECT try_to_timestamp(s, 'yyyy-MM-dd') FROM test_try_to_ts_exception diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_legacy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_legacy.sql new file mode 100644 index 0000000000..a72c8ae0b4 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/try_to_timestamp_time_parser_policy_legacy.sql @@ -0,0 +1,36 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. 
The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- try_to_timestamp() under LEGACY timeParserPolicy. +-- Lenient SimpleDateFormat parsing accepts inputs that the new formatter rejects. +-- MinSparkVersion: 4.0 +-- Config: spark.sql.legacy.timeParserPolicy=LEGACY +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_try_to_ts_lenient(s string) USING parquet + +statement +INSERT INTO test_try_to_ts_lenient VALUES + ('2024-1-1'), + ('2024-13-01'), + ('2024-02-30'), + ('2024-01-01garbage'), + ('2024') + +query spark_answer_only +SELECT s, try_to_timestamp(s, 'yyyy-MM-dd') FROM test_try_to_ts_lenient ORDER BY s diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy.sql new file mode 100644 index 0000000000..25737128e4 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy.sql @@ -0,0 +1,50 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Convergent unix_timestamp(string, format) behavior across all three policies. +-- Strictly-formatted inputs parse identically under LEGACY, CORRECTED, and EXCEPTION. +-- ConfigMatrix: spark.sql.legacy.timeParserPolicy=LEGACY,CORRECTED,EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_unix_ts_policy(s string) USING parquet + +statement +INSERT INTO test_unix_ts_policy VALUES ('2024-06-15 10:30:45'), ('1970-01-01 00:00:00'), (NULL), ('') + +query spark_answer_only +SELECT unix_timestamp(s, 'yyyy-MM-dd HH:mm:ss') FROM test_unix_ts_policy + +query spark_answer_only +SELECT unix_timestamp(s) FROM test_unix_ts_policy + +-- date-only input with date-only pattern +statement +CREATE TABLE test_unix_ts_date_policy(s string) USING parquet + +statement +INSERT INTO test_unix_ts_date_policy VALUES ('2024-06-15'), ('1970-01-01'), (NULL) + +query spark_answer_only +SELECT unix_timestamp(s, 'yyyy-MM-dd') FROM test_unix_ts_date_policy + +-- literal arguments +query spark_answer_only +SELECT unix_timestamp('2024-06-15', 'yyyy-MM-dd') + +query spark_answer_only +SELECT unix_timestamp(NULL, 'yyyy-MM-dd') diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_corrected.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_corrected.sql new file mode 100644 index 0000000000..71a4c6d380 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_corrected.sql @@ -0,0 +1,36 @@ +-- Licensed to 
the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- unix_timestamp() under CORRECTED timeParserPolicy. +-- The new java.time formatter is strict: lenient inputs return null without raising +-- SparkUpgradeException. +-- Config: spark.sql.legacy.timeParserPolicy=CORRECTED +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_unix_ts_strict(s string) USING parquet + +statement +INSERT INTO test_unix_ts_strict VALUES + ('2024-1-1'), + ('2024-13-01'), + ('2024-02-30'), + ('2024-01-01garbage'), + ('2024') + +query spark_answer_only +SELECT s, unix_timestamp(s, 'yyyy-MM-dd') FROM test_unix_ts_strict ORDER BY s diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_exception.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_exception.sql new file mode 100644 index 0000000000..7a90b44fe6 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_exception.sql @@ -0,0 +1,31 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. 
See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- unix_timestamp() under EXCEPTION timeParserPolicy (the default). +-- New parser fails on lenient inputs; legacy parser would have succeeded; +-- DateTimeFormatterHelper.checkParsedDiff converts the failure to SparkUpgradeException. +-- Config: spark.sql.legacy.timeParserPolicy=EXCEPTION +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_unix_ts_exception(s string) USING parquet + +statement +INSERT INTO test_unix_ts_exception VALUES ('2024-1-1') + +query expect_error(INCONSISTENT_BEHAVIOR_CROSS_VERSION) +SELECT unix_timestamp(s, 'yyyy-MM-dd') FROM test_unix_ts_exception diff --git a/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_legacy.sql b/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_legacy.sql new file mode 100644 index 0000000000..259a06d1f2 --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/datetime/unix_timestamp_time_parser_policy_legacy.sql @@ -0,0 +1,36 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. 
The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- unix_timestamp() under LEGACY timeParserPolicy. +-- SimpleDateFormat is lenient: single-digit fields, out-of-range values, and trailing +-- characters all parse successfully. +-- Config: spark.sql.legacy.timeParserPolicy=LEGACY +-- Config: spark.sql.session.timeZone=UTC + +statement +CREATE TABLE test_unix_ts_lenient(s string) USING parquet + +statement +INSERT INTO test_unix_ts_lenient VALUES + ('2024-1-1'), + ('2024-13-01'), + ('2024-02-30'), + ('2024-01-01garbage'), + ('2024') + +query spark_answer_only +SELECT s, unix_timestamp(s, 'yyyy-MM-dd') FROM test_unix_ts_lenient ORDER BY s From bf9c035ebe6f5697608efaaf187195c166734c60 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 2 May 2026 08:20:12 -0600 Subject: [PATCH 2/3] docs: replace config support table with bullet list --- docs/source/contributor-guide/spark_configs_support.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source/contributor-guide/spark_configs_support.md b/docs/source/contributor-guide/spark_configs_support.md index 6ca4f99ecf..fe1da3cd23 100644 --- a/docs/source/contributor-guide/spark_configs_support.md +++ b/docs/source/contributor-guide/spark_configs_support.md @@ -37,9 +37,12 @@ The status column uses these values: ## Audited Configurations -| Config | Default | Status | Affected expressions / operators | Spark Versions | Date | -| 
------------------------------------ | ----------- | ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | ---------- | -| `spark.sql.legacy.timeParserPolicy` | `EXCEPTION` | Falls back (see notes) | `date_format`, `from_unixtime`, `unix_timestamp`, `to_unix_timestamp`, `to_timestamp`, `to_timestamp_ntz`, `to_date`, `try_to_timestamp` (Spark 4+) | 3.4.3, 3.5.8, 4.0.1 | 2026-05-02 | +- `spark.sql.legacy.timeParserPolicy` + - Default: `EXCEPTION` + - Status: Falls back (see notes) + - Affected expressions: `date_format`, `from_unixtime`, `unix_timestamp`, `to_unix_timestamp`, `to_timestamp`, `to_timestamp_ntz`, `to_date`, `try_to_timestamp` (Spark 4+) + - Spark versions checked: 3.4.3, 3.5.8, 4.0.1 + - Date: 2026-05-02 ## Audit Notes From 6b06b01d1c28fc4ad10c6f6855b04481dc554ec4 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 2 May 2026 09:21:16 -0600 Subject: [PATCH 3/3] test: add Scala test coverage for spark.sql.optimizer.nestedSchemaPruning.enabled Add CometNestedSchemaPruningSuite, a focused Scala suite that runs each scenario across both Comet scan implementations (native_datafusion, native_iceberg_compat) under the V1 Parquet path. For each scenario the suite walks the executed plan, extracts requiredSchema from the Comet scan exec, and asserts the pruned (or unpruned) shape matches the expected catalogString, then compares results against Spark. Plain Parquet V2 is excluded because Comet's V2 scan rule only covers CSV and Iceberg, leaving Parquet V2 as plain BatchScanExec without a Comet scan to inspect. Scenarios cover top-level struct field, field inside array of struct, field inside map value, doubly-nested struct field, projection plus filter on nested field, and null at an intermediate struct level. Each scenario exercises both pruning-enabled and pruning-disabled behavior. 
Also append a second entry to docs/source/contributor-guide/spark_configs_support.md with the full audit notes for nestedSchemaPruning.enabled. All 12 generated test cases pass on Spark 3.4.3, 3.5.8, and 4.0.1. --- .../spark_configs_support.md | 48 ++++ .../CometNestedSchemaPruningSuite.scala | 233 ++++++++++++++++++ 2 files changed, 281 insertions(+) create mode 100644 spark/src/test/scala/org/apache/comet/parquet/CometNestedSchemaPruningSuite.scala diff --git a/docs/source/contributor-guide/spark_configs_support.md b/docs/source/contributor-guide/spark_configs_support.md index fe1da3cd23..aecfe6967d 100644 --- a/docs/source/contributor-guide/spark_configs_support.md +++ b/docs/source/contributor-guide/spark_configs_support.md @@ -43,6 +43,12 @@ The status column uses these values: - Affected expressions: `date_format`, `from_unixtime`, `unix_timestamp`, `to_unix_timestamp`, `to_timestamp`, `to_timestamp_ntz`, `to_date`, `try_to_timestamp` (Spark 4+) - Spark versions checked: 3.4.3, 3.5.8, 4.0.1 - Date: 2026-05-02 +- `spark.sql.optimizer.nestedSchemaPruning.enabled` + - Default: `true` + - Status: Supported + - Affected components: catalyst optimizer rules `SchemaPruning` and `NestedColumnAliasing`, datasource V2 push-down (`PushDownUtils`), Parquet readers (`ParquetReadSupport`, `ParquetFileFormat`, `ParquetScan`) + - Spark versions checked: 3.4.3, 3.5.8, 4.0.1 + - Date: 2026-05-02 ## Audit Notes @@ -110,3 +116,45 @@ whitelist, this audit should be revisited and the policy must be honored explici Comet bugs were uncovered by the audit. The tests use `query spark_answer_only` so that result-correctness is enforced regardless of whether Comet runs the expression natively or falls back. + +### `spark.sql.optimizer.nestedSchemaPruning.enabled` + +**Source.** When `true`, the catalyst optimizer rewrites projections of nested +fields so columnar readers fetch only the requested leaves of a struct, array, or +map column. 
Read sites verified on Spark 3.4.3, 3.5.8, 4.0.1: + +- `org.apache.spark.sql.catalyst.optimizer.SchemaPruning` and + `org.apache.spark.sql.catalyst.optimizer.NestedColumnAliasing` -- gated by + `nestedSchemaPruningEnabled`; rewrite the project list to expose only the leaves + that downstream operators consume. +- `org.apache.spark.sql.execution.datasources.v2.PushDownUtils.pruneColumns` -- + pushes the pruned schema into V2 scans only when the flag is `true`. +- `org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport`, + `ParquetFileFormat`, and `ParquetScan` -- propagate the flag into the Parquet + reader's requested schema. + +**Comet status.** `CometParquetFileFormat.populateConf` propagates the SQL conf +into the Hadoop conf so the Parquet read path honors it. Comet has no other +special handling -- native scans inherit Spark's already-pruned `requiredSchema` +and pass it through to the native reader. The separate +`spark.sql.optimizer.serializer.nestedSchemaPruning.enabled` (Encoder-level) is +out of scope. + +**Test coverage.** `spark/src/test/scala/org/apache/comet/parquet/CometNestedSchemaPruningSuite.scala`: + +- One Scala test per scenario, run across both `SCAN_NATIVE_DATAFUSION` and + `SCAN_NATIVE_ICEBERG_COMPAT` under the V1 Parquet path. Plain Parquet V2 is not + Comet-accelerated (Comet's V2 scan rule covers only CSV and Iceberg) so it is + excluded from the matrix. +- Each scenario inspects the executed plan via a small helper that walks Comet + scan execs (`CometScanExec`, `CometNativeScanExec`) and asserts the + `requiredSchema` matches the expected pruned (or unpruned) shape, then compares + results against Spark via `checkSparkAnswer`. +- Scenarios: top-level struct field, field inside array of struct, field inside + map value, doubly-nested struct field, projection plus filter on nested field, + null at intermediate struct level. 
Each scenario asserts both the + pruning-enabled and pruning-disabled behavior, except the null-intermediate + case which only varies the pruning-on path. + +**Findings.** All 12 generated test cases pass on Spark 3.4.3, 3.5.8, and 4.0.1. +No Comet bugs were uncovered. diff --git a/spark/src/test/scala/org/apache/comet/parquet/CometNestedSchemaPruningSuite.scala b/spark/src/test/scala/org/apache/comet/parquet/CometNestedSchemaPruningSuite.scala new file mode 100644 index 0000000000..e338876acf --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/parquet/CometNestedSchemaPruningSuite.scala @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.comet.parquet + +import org.scalactic.source.Position +import org.scalatest.Tag + +import org.apache.spark.sql.{CometTestBase, DataFrame} +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.comet.{CometNativeScanExec, CometScanExec} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.StructType + +import org.apache.comet.CometConf + +/** + * Verifies that Comet honors `spark.sql.optimizer.nestedSchemaPruning.enabled` end-to-end: the + * executed plan's required schema is pruned when the flag is `true`, the full schema is read when + * the flag is `false`, and results match Spark in both cases. + * + * Each test runs once per Comet scan implementation (`SCAN_NATIVE_DATAFUSION`, + * `SCAN_NATIVE_ICEBERG_COMPAT`). The V1 datasource path is pinned because plain Parquet V2 is not + * Comet-accelerated (only CSV and Iceberg V2 scans are). + */ +class CometNestedSchemaPruningSuite extends CometTestBase with AdaptiveSparkPlanHelper { + + private val scanImpls = + Seq(CometConf.SCAN_NATIVE_DATAFUSION, CometConf.SCAN_NATIVE_ICEBERG_COMPAT) + + override protected def test(testName: String, testTags: Tag*)(testFun: => Any)(implicit + pos: Position): Unit = { + scanImpls.foreach { scan => + super.test(s"$testName - $scan", testTags: _*) { + withSQLConf( + CometConf.COMET_ENABLED.key -> "true", + CometConf.COMET_EXEC_ENABLED.key -> "true", + CometConf.COMET_EXPLAIN_FALLBACK_ENABLED.key -> "false", + CometConf.COMET_NATIVE_SCAN_IMPL.key -> scan, + SQLConf.USE_V1_SOURCE_LIST.key -> "parquet") { + testFun + } + } + } + } + + /** + * Walks the executed plan, collects the required schema from any Comet scan exec, and asserts + * it matches `expected` (a catalyst-style schema string). Field nullability is ignored. 
+ */ + private def assertScanSchema(df: DataFrame, expected: String): Unit = { + val scanSchemas = collect(df.queryExecution.executedPlan) { + case scan: CometScanExec => scan.requiredSchema + case scan: CometNativeScanExec => scan.requiredSchema + } + assert( + scanSchemas.size == 1, + s"Expected exactly one Comet scan in plan, found ${scanSchemas.size}:\n" + + df.queryExecution.executedPlan.toString) + val expectedSchema = CatalystSqlParser.parseDataType(expected).asInstanceOf[StructType] + // Compare via catalogString which omits nullability flags so the assertions stay readable. + assert( + scanSchemas.head.catalogString == expectedSchema.catalogString, + s"Pruned schema mismatch.\n expected: ${expectedSchema.catalogString}\n" + + s" actual: ${scanSchemas.head.catalogString}") + } + + /** + * Writes a small `Contact` parquet dataset to a temp path and runs `body` with the path. + * Mirrors the case-class shape used by Spark's `SchemaPruningSuite` so the audit + * cross-references easily. + */ + private def withContactsParquet(body: String => Unit): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + import testImplicits._ + val df = Seq( + Contact( + id = 0, + name = FullName("Jane", "X.", "Doe"), + address = "123 Main Street", + pets = 1, + friends = Array(FullName("Susan", "Z.", "Smith")), + relatives = Map("brother" -> FullName("John", "Y.", "Doe")), + employer = Employer(0, Company("abc", "123 Business Street"))), + Contact( + id = 1, + name = FullName("John", "Y.", "Doe"), + address = "321 Wall Street", + pets = 3, + friends = Array(FullName("Alice", "A.", "Jones")), + relatives = Map("sister" -> FullName("Jane", "X.", "Doe")), + employer = null)).toDF() + df.write.parquet(path) + body(path) + } + } + + // Top-level field of a struct -- pruned schema retains only the projected leaf. 
+ test("prune top-level struct field") { + withContactsParquet { path => + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "true") { + val df = spark.read.parquet(path).selectExpr("name.first") + assertScanSchema(df, "struct<name:struct<first:string>>") + checkSparkAnswer(df) + } + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "false") { + val df = spark.read.parquet(path).selectExpr("name.first") + assertScanSchema(df, "struct<name:struct<first:string,middle:string,last:string>>") + checkSparkAnswer(df) + } + } + } + + // Field inside an array of struct. + test("prune field inside array of struct") { + withContactsParquet { path => + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "true") { + val df = spark.read.parquet(path).selectExpr("friends.first") + assertScanSchema(df, "struct<friends:array<struct<first:string>>>") + checkSparkAnswer(df) + } + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "false") { + val df = spark.read.parquet(path).selectExpr("friends.first") + assertScanSchema( + df, + "struct<friends:array<struct<first:string,middle:string,last:string>>>") + checkSparkAnswer(df) + } + } + } + + // Field inside a map value. + test("prune field inside map value") { + withContactsParquet { path => + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "true") { + val df = spark.read.parquet(path).selectExpr("relatives['brother'].first") + assertScanSchema(df, "struct<relatives:map<string,struct<first:string>>>") + checkSparkAnswer(df) + } + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "false") { + val df = spark.read.parquet(path).selectExpr("relatives['brother'].first") + assertScanSchema( + df, + "struct<relatives:map<string,struct<first:string,middle:string,last:string>>>") + checkSparkAnswer(df) + } + } + } + + // Doubly-nested struct: only the deep leaf is required. 
+ test("prune doubly-nested struct field") { + withContactsParquet { path => + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "true") { + val df = spark.read.parquet(path).selectExpr("employer.company.name") + assertScanSchema(df, "struct<employer:struct<company:struct<name:string>>>") + checkSparkAnswer(df) + } + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "false") { + val df = spark.read.parquet(path).selectExpr("employer.company.name") + assertScanSchema( + df, + "struct<employer:struct<id:int,company:struct<name:string,address:string>>>") + checkSparkAnswer(df) + } + } + } + + // Filter on a nested field plus a separate top-level projection. The required schema must + // include both the filtered leaf and the projected top-level column. + test("prune with filter on nested field") { + withContactsParquet { path => + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "true") { + val df = spark.read + .parquet(path) + .where("name.first = 'Jane'") + .selectExpr("id") + assertScanSchema(df, "struct<id:int,name:struct<first:string>>") + checkSparkAnswer(df) + } + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "false") { + val df = spark.read + .parquet(path) + .where("name.first = 'Jane'") + .selectExpr("id") + assertScanSchema(df, "struct<id:int,name:struct<first:string,middle:string,last:string>>") + checkSparkAnswer(df) + } + } + } + + // Pruning correctly returns null when the intermediate struct is null in the row. + // The second contact has employer = null; the projected leaf must round-trip as null and match + // Spark's behavior. 
+ test("prune with null at intermediate struct level") { + withContactsParquet { path => + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "true") { + val df = spark.read.parquet(path).selectExpr("employer.company.name") + assertScanSchema(df, "struct<employer:struct<company:struct<name:string>>>") + checkSparkAnswer(df) + } + } + } +} + +private case class FullName(first: String, middle: String, last: String) +private case class Company(name: String, address: String) +private case class Employer(id: Int, company: Company) +private case class Contact( + id: Int, + name: FullName, + address: String, + pets: Int, + friends: Array[FullName] = Array.empty, + relatives: Map[String, FullName] = Map.empty, + employer: Employer = null)