diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index fc95be5db5..2468606b48 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -18,10 +18,10 @@ use std::vec; use datafusion::arrow::datatypes::DataType; -use datafusion::logical_expr::{Expr, Operator}; +use datafusion::logical_expr::{Expr, Like, Operator}; use datafusion::scalar::ScalarValue; use iceberg::expr::{BinaryExpression, Predicate, PredicateOperator, Reference, UnaryExpression}; -use iceberg::spec::Datum; +use iceberg::spec::{Datum, PrimitiveLiteral}; // A datafusion expression could be an Iceberg predicate, column, or literal. enum TransformedResult { @@ -128,6 +128,56 @@ fn to_iceberg_predicate(expr: &Expr) -> TransformedResult { } to_iceberg_predicate(&c.expr) } + Expr::Like(Like { + negated, + expr, + pattern, + escape_char, + case_insensitive, + }) => { + // Only support simple prefix patterns (e.g., 'prefix%') + // Note: Iceberg's StartsWith operator is case-sensitive, so we cannot + // push down case-insensitive LIKE (ILIKE) patterns + // Escape characters are also not supported for pushdown + if escape_char.is_some() || *case_insensitive { + return TransformedResult::NotTransformed; + } + + // Extract the pattern string + let pattern_str = match to_iceberg_predicate(pattern) { + TransformedResult::Literal(d) => match d.literal() { + PrimitiveLiteral::String(s) => s.clone(), + _ => return TransformedResult::NotTransformed, + }, + _ => return TransformedResult::NotTransformed, + }; + + // Check if it's a simple prefix pattern (ends with % and no other wildcards) + if pattern_str.ends_with('%') + && !pattern_str[..pattern_str.len() - 1].contains(['%', '_']) + { + // Extract the prefix (remove trailing %) + let prefix = pattern_str[..pattern_str.len() - 1].to_string(); + + // Get the column reference + let column = match to_iceberg_predicate(expr) { + TransformedResult::Column(r) => r, + _ => return TransformedResult::NotTransformed, + }; + + // Create the appropriate predicate + let predicate = if *negated { + column.not_starts_with(Datum::string(prefix)) + } else { + column.starts_with(Datum::string(prefix)) + }; + + TransformedResult::Predicate(predicate) + } else { + // Complex LIKE patterns cannot be pushed down + TransformedResult::NotTransformed + } + } _ => TransformedResult::NotTransformed, } } @@ -458,4 +508,97 @@ mod tests { .and(Reference::new("bar").equal_to(Datum::binary(vec![1u8, 2u8]))); assert_eq!(predicate, expected_predicate); } + + #[test] + fn test_predicate_conversion_with_like_starts_with() { + let sql = "bar LIKE 'test%'"; + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + assert_eq!( + predicate, + Reference::new("bar").starts_with(Datum::string("test")) + ); + } + + #[test] + fn test_predicate_conversion_with_not_like_starts_with() { + let sql = "bar NOT LIKE 'test%'"; + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + assert_eq!( + predicate, + Reference::new("bar").not_starts_with(Datum::string("test")) + ); + } + + #[test] + fn test_predicate_conversion_with_like_empty_prefix() { + let sql = "bar LIKE '%'"; + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + assert_eq!( + predicate, + Reference::new("bar").starts_with(Datum::string("")) + ); + } + + #[test] + fn test_predicate_conversion_with_like_complex_pattern() { + // Patterns with wildcards in the middle cannot be pushed down + let sql = "bar LIKE 'te%st'"; + let predicate = convert_to_iceberg_predicate(sql); + assert_eq!(predicate, None); + } + + #[test] + fn test_predicate_conversion_with_like_underscore_wildcard() { + // Patterns with underscore wildcard cannot be pushed down + let sql = "bar LIKE 'test_'"; + let predicate = convert_to_iceberg_predicate(sql); + assert_eq!(predicate, None); + } + + #[test] + fn test_predicate_conversion_with_like_no_wildcard() { + // Patterns without trailing % cannot be pushed down as StartsWith + let sql = "bar LIKE 'test'"; + let predicate = convert_to_iceberg_predicate(sql); + assert_eq!(predicate, None); + } + + #[test] + fn test_predicate_conversion_with_ilike() { + // Case-insensitive LIKE (ILIKE) is not supported + let sql = "bar ILIKE 'test%'"; + let predicate = convert_to_iceberg_predicate(sql); + assert_eq!(predicate, None); + } + + #[test] + fn test_predicate_conversion_with_like_and_other_conditions() { + let sql = "bar LIKE 'test%' AND foo > 1"; + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + let expected_predicate = Predicate::and( + Reference::new("bar").starts_with(Datum::string("test")), + Reference::new("foo").greater_than(Datum::long(1)), + ); + assert_eq!(predicate, expected_predicate); + } + + #[test] + fn test_predicate_conversion_with_like_special_characters() { + // Test LIKE with special characters in prefix + let sql = "bar LIKE 'test-abc_123%'"; + let predicate = convert_to_iceberg_predicate(sql); + // This should not be pushed down because it contains underscore + assert_eq!(predicate, None); + } + + #[test] + fn test_predicate_conversion_with_like_unicode() { + // Test LIKE with unicode characters in prefix + let sql = "bar LIKE '测试%'"; + let predicate = convert_to_iceberg_predicate(sql).unwrap(); + assert_eq!( + predicate, + Reference::new("bar").starts_with(Datum::string("测试")) + ); + } } diff --git a/crates/sqllogictest/testdata/schedules/df_test.toml b/crates/sqllogictest/testdata/schedules/df_test.toml index 1a1a0ad334..8f4860043b 100644 --- a/crates/sqllogictest/testdata/schedules/df_test.toml +++ b/crates/sqllogictest/testdata/schedules/df_test.toml @@ -34,6 +34,10 @@ slt = "df_test/insert_into.slt" engine = "df" slt = "df_test/binary_predicate_pushdown.slt" +[[steps]] +engine = "df" +slt = "df_test/like_predicate_pushdown.slt" + [[steps]] engine = "df" slt = "df_test/drop_table.slt" diff --git a/crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt b/crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt new file mode 100644 index 0000000000..41d8ec0276 --- /dev/null +++ b/crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Test LIKE predicate pushdown for StartsWith patterns +# This validates that LIKE 'prefix%' patterns are converted to StartsWith operator +# Note: Iceberg's StartsWith operator is case-sensitive + +# Note: test_unpartitioned_table already contains data from insert_into.slt: +# (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'), (4, NULL) +# We'll insert additional test data for LIKE testing +query I +INSERT INTO default.default.test_unpartitioned_table VALUES (5, 'alice'), (6, 'Albert'), (7, 'Bobby'), (8, 'Carol') +---- +4 + +# Test basic LIKE prefix pattern pushdown - verify predicate is pushed to IcebergTableScan +query TT +EXPLAIN SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' +---- +logical_plan +01)Filter: default.default.test_unpartitioned_table.name LIKE Utf8("Al%") +02)--TableScan: default.default.test_unpartitioned_table projection=[id, name], partial_filters=[default.default.test_unpartitioned_table.name LIKE Utf8("Al%")] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: name@1 LIKE Al% +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +04)------CooperativeExec +05)--------IcebergTableScan projection:[id,name] predicate:[name STARTS WITH "Al"] + +# Test LIKE filtering with case-sensitive match +query IT rowsort +SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' +---- +1 Alice +6 Albert + +# Test NOT LIKE prefix pattern pushdown +query TT +EXPLAIN SELECT * FROM default.default.test_unpartitioned_table WHERE name NOT LIKE 'Al%' +---- +logical_plan +01)Filter: default.default.test_unpartitioned_table.name NOT LIKE Utf8("Al%") +02)--TableScan: default.default.test_unpartitioned_table projection=[id, name], partial_filters=[default.default.test_unpartitioned_table.name NOT LIKE Utf8("Al%")] +physical_plan +01)CoalesceBatchesExec: target_batch_size=8192 +02)--FilterExec: name@1 NOT LIKE Al% +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +04)------CooperativeExec +05)--------IcebergTableScan projection:[id,name] predicate:[name NOT STARTS WITH "Al"] + +# Test NOT LIKE filtering +query IT rowsort +SELECT * FROM default.default.test_unpartitioned_table WHERE name NOT LIKE 'Al%' +---- +2 Bob +3 Charlie +5 alice +7 Bobby +8 Carol + +# Test case sensitivity - lowercase 'al%' should not match 'Alice' or 'Albert' +query IT rowsort +SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'al%' +---- +5 alice + +# Test LIKE with empty prefix (matches everything except NULL) +query IT rowsort +SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE '%' +---- +1 Alice +2 Bob +3 Charlie +5 alice +6 Albert +7 Bobby +8 Carol + +# Test LIKE with single character prefix +query IT rowsort +SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'B%' +---- +2 Bob +7 Bobby + +# Test LIKE combined with other predicates +query IT rowsort +SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' AND id > 1 +---- +6 Albert + +# Test LIKE in OR expression +query IT rowsort +SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' OR name LIKE 'Bo%' +---- +1 Alice +2 Bob +6 Albert +7 Bobby