-
Notifications
You must be signed in to change notification settings - Fork 395
feat(datafusion): Add LIKE predicate pushdown for StartsWith patterns #2014
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,10 +18,10 @@ | |
| use std::vec; | ||
|
|
||
| use datafusion::arrow::datatypes::DataType; | ||
| use datafusion::logical_expr::{Expr, Operator}; | ||
| use datafusion::logical_expr::{Expr, Like, Operator}; | ||
| use datafusion::scalar::ScalarValue; | ||
| use iceberg::expr::{BinaryExpression, Predicate, PredicateOperator, Reference, UnaryExpression}; | ||
| use iceberg::spec::Datum; | ||
| use iceberg::spec::{Datum, PrimitiveLiteral}; | ||
|
|
||
| // A datafusion expression could be an Iceberg predicate, column, or literal. | ||
| enum TransformedResult { | ||
|
|
@@ -128,6 +128,56 @@ fn to_iceberg_predicate(expr: &Expr) -> TransformedResult { | |
| } | ||
| to_iceberg_predicate(&c.expr) | ||
| } | ||
| Expr::Like(Like { | ||
| negated, | ||
| expr, | ||
| pattern, | ||
| escape_char, | ||
| case_insensitive, | ||
| }) => { | ||
| // Only support simple prefix patterns (e.g., 'prefix%') | ||
| // Note: Iceberg's StartsWith operator is case-sensitive, so we cannot | ||
| // push down case-insensitive LIKE (ILIKE) patterns | ||
| // Escape characters are also not supported for pushdown | ||
| if escape_char.is_some() || *case_insensitive { | ||
| return TransformedResult::NotTransformed; | ||
| } | ||
|
|
||
| // Extract the pattern string | ||
| let pattern_str = match to_iceberg_predicate(pattern) { | ||
| TransformedResult::Literal(d) => match d.literal() { | ||
| PrimitiveLiteral::String(s) => s.clone(), | ||
| _ => return TransformedResult::NotTransformed, | ||
| }, | ||
| _ => return TransformedResult::NotTransformed, | ||
| }; | ||
|
|
||
| // Check if it's a simple prefix pattern (ends with % and no other wildcards) | ||
| if pattern_str.ends_with('%') | ||
| && !pattern_str[..pattern_str.len() - 1].contains(['%', '_']) | ||
| { | ||
| // Extract the prefix (remove trailing %) | ||
| let prefix = pattern_str[..pattern_str.len() - 1].to_string(); | ||
|
|
||
| // Get the column reference | ||
| let column = match to_iceberg_predicate(expr) { | ||
| TransformedResult::Column(r) => r, | ||
| _ => return TransformedResult::NotTransformed, | ||
| }; | ||
|
|
||
| // Create the appropriate predicate | ||
| let predicate = if *negated { | ||
| column.not_starts_with(Datum::string(prefix)) | ||
| } else { | ||
| column.starts_with(Datum::string(prefix)) | ||
| }; | ||
|
|
||
| TransformedResult::Predicate(predicate) | ||
| } else { | ||
| // Complex LIKE patterns cannot be pushed down | ||
| TransformedResult::NotTransformed | ||
| } | ||
| } | ||
| _ => TransformedResult::NotTransformed, | ||
| } | ||
| } | ||
|
|
@@ -458,4 +508,97 @@ mod tests { | |
| .and(Reference::new("bar").equal_to(Datum::binary(vec![1u8, 2u8]))); | ||
| assert_eq!(predicate, expected_predicate); | ||
| } | ||
|
|
||
| #[test] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We recently added support for sqllogictests; see https://github.com/liurenjie1024/iceberg-rust/blob/666a9fe1aaf1692583d6f44e4f7a1d52a688b217/crates/sqllogictest/testdata/schedules/df_test.toml#L19 It would be better to also include SQL logic tests for this change, in addition to the unit tests.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Okay, I added a test to the sqllogictests. |
||
| fn test_predicate_conversion_with_like_starts_with() { | ||
| let sql = "bar LIKE 'test%'"; | ||
| let predicate = convert_to_iceberg_predicate(sql).unwrap(); | ||
| assert_eq!( | ||
| predicate, | ||
| Reference::new("bar").starts_with(Datum::string("test")) | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_predicate_conversion_with_not_like_starts_with() { | ||
| let sql = "bar NOT LIKE 'test%'"; | ||
| let predicate = convert_to_iceberg_predicate(sql).unwrap(); | ||
| assert_eq!( | ||
| predicate, | ||
| Reference::new("bar").not_starts_with(Datum::string("test")) | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_predicate_conversion_with_like_empty_prefix() { | ||
| let sql = "bar LIKE '%'"; | ||
| let predicate = convert_to_iceberg_predicate(sql).unwrap(); | ||
| assert_eq!( | ||
| predicate, | ||
| Reference::new("bar").starts_with(Datum::string("")) | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_predicate_conversion_with_like_complex_pattern() { | ||
| // Patterns with wildcards in the middle cannot be pushed down | ||
| let sql = "bar LIKE 'te%st'"; | ||
| let predicate = convert_to_iceberg_predicate(sql); | ||
| assert_eq!(predicate, None); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_predicate_conversion_with_like_underscore_wildcard() { | ||
| // Patterns with underscore wildcard cannot be pushed down | ||
| let sql = "bar LIKE 'test_'"; | ||
| let predicate = convert_to_iceberg_predicate(sql); | ||
| assert_eq!(predicate, None); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_predicate_conversion_with_like_no_wildcard() { | ||
| // Patterns without trailing % cannot be pushed down as StartsWith | ||
| let sql = "bar LIKE 'test'"; | ||
| let predicate = convert_to_iceberg_predicate(sql); | ||
| assert_eq!(predicate, None); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_predicate_conversion_with_ilike() { | ||
| // Case-insensitive LIKE (ILIKE) is not supported | ||
| let sql = "bar ILIKE 'test%'"; | ||
| let predicate = convert_to_iceberg_predicate(sql); | ||
| assert_eq!(predicate, None); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_predicate_conversion_with_like_and_other_conditions() { | ||
| let sql = "bar LIKE 'test%' AND foo > 1"; | ||
| let predicate = convert_to_iceberg_predicate(sql).unwrap(); | ||
| let expected_predicate = Predicate::and( | ||
| Reference::new("bar").starts_with(Datum::string("test")), | ||
| Reference::new("foo").greater_than(Datum::long(1)), | ||
| ); | ||
| assert_eq!(predicate, expected_predicate); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_predicate_conversion_with_like_special_characters() { | ||
| // Test LIKE with special characters in prefix | ||
| let sql = "bar LIKE 'test-abc_123%'"; | ||
| let predicate = convert_to_iceberg_predicate(sql); | ||
| // This should not be pushed down because it contains underscore | ||
| assert_eq!(predicate, None); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_predicate_conversion_with_like_unicode() { | ||
| // Test LIKE with unicode characters in prefix | ||
| let sql = "bar LIKE '测试%'"; | ||
| let predicate = convert_to_iceberg_predicate(sql).unwrap(); | ||
| assert_eq!( | ||
| predicate, | ||
| Reference::new("bar").starts_with(Datum::string("测试")) | ||
| ); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| # Licensed to the Apache Software Foundation (ASF) under one | ||
| # or more contributor license agreements. See the NOTICE file | ||
| # distributed with this work for additional information | ||
| # regarding copyright ownership. The ASF licenses this file | ||
| # to you under the Apache License, Version 2.0 (the | ||
| # "License"); you may not use this file except in compliance | ||
| # with the License. You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, | ||
| # software distributed under the License is distributed on an | ||
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| # KIND, either express or implied. See the License for the | ||
| # specific language governing permissions and limitations | ||
| # under the License. | ||
|
|
||
| # Test LIKE predicate pushdown for StartsWith patterns | ||
| # This validates that LIKE 'prefix%' patterns are converted to StartsWith operator | ||
| # Note: Iceberg's StartsWith operator is case-sensitive | ||
|
|
||
| # Note: test_unpartitioned_table already contains data from insert_into.slt: | ||
| # (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'), (4, NULL) | ||
| # We'll insert additional test data for LIKE testing | ||
| query I | ||
| INSERT INTO default.default.test_unpartitioned_table VALUES (5, 'alice'), (6, 'Albert'), (7, 'Bobby'), (8, 'Carol') | ||
| ---- | ||
| 4 | ||
|
|
||
| # Test basic LIKE prefix pattern pushdown - verify predicate is pushed to IcebergTableScan | ||
| query TT | ||
| EXPLAIN SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' | ||
| ---- | ||
| logical_plan | ||
| 01)Filter: default.default.test_unpartitioned_table.name LIKE Utf8("Al%") | ||
| 02)--TableScan: default.default.test_unpartitioned_table projection=[id, name], partial_filters=[default.default.test_unpartitioned_table.name LIKE Utf8("Al%")] | ||
| physical_plan | ||
| 01)CoalesceBatchesExec: target_batch_size=8192 | ||
| 02)--FilterExec: name@1 LIKE Al% | ||
| 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | ||
| 04)------CooperativeExec | ||
| 05)--------IcebergTableScan projection:[id,name] predicate:[name STARTS WITH "Al"] | ||
|
|
||
| # Test LIKE filtering with case-sensitive match | ||
| query IT rowsort | ||
| SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' | ||
| ---- | ||
| 1 Alice | ||
| 6 Albert | ||
|
|
||
| # Test NOT LIKE prefix pattern pushdown | ||
| query TT | ||
| EXPLAIN SELECT * FROM default.default.test_unpartitioned_table WHERE name NOT LIKE 'Al%' | ||
| ---- | ||
| logical_plan | ||
| 01)Filter: default.default.test_unpartitioned_table.name NOT LIKE Utf8("Al%") | ||
| 02)--TableScan: default.default.test_unpartitioned_table projection=[id, name], partial_filters=[default.default.test_unpartitioned_table.name NOT LIKE Utf8("Al%")] | ||
| physical_plan | ||
| 01)CoalesceBatchesExec: target_batch_size=8192 | ||
| 02)--FilterExec: name@1 NOT LIKE Al% | ||
| 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | ||
| 04)------CooperativeExec | ||
| 05)--------IcebergTableScan projection:[id,name] predicate:[name NOT STARTS WITH "Al"] | ||
|
|
||
| # Test NOT LIKE filtering | ||
| query IT rowsort | ||
| SELECT * FROM default.default.test_unpartitioned_table WHERE name NOT LIKE 'Al%' | ||
| ---- | ||
| 2 Bob | ||
| 3 Charlie | ||
| 5 alice | ||
| 7 Bobby | ||
| 8 Carol | ||
|
|
||
| # Test case sensitivity - lowercase 'al%' should not match 'Alice' or 'Albert' | ||
| query IT rowsort | ||
| SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'al%' | ||
| ---- | ||
| 5 alice | ||
|
|
||
| # Test LIKE with empty prefix (matches everything except NULL) | ||
| query IT rowsort | ||
| SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE '%' | ||
| ---- | ||
| 1 Alice | ||
| 2 Bob | ||
| 3 Charlie | ||
| 5 alice | ||
| 6 Albert | ||
| 7 Bobby | ||
| 8 Carol | ||
|
|
||
| # Test LIKE with single character prefix | ||
| query IT rowsort | ||
| SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'B%' | ||
| ---- | ||
| 2 Bob | ||
| 7 Bobby | ||
|
|
||
| # Test LIKE combined with other predicates | ||
| query IT rowsort | ||
| SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' AND id > 1 | ||
| ---- | ||
| 6 Albert | ||
|
|
||
| # Test LIKE in OR expression | ||
| query IT rowsort | ||
| SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' OR name LIKE 'Bo%' | ||
| ---- | ||
| 1 Alice | ||
| 2 Bob | ||
| 6 Albert | ||
| 7 Bobby |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIRC, Iceberg's `StartsWith` operator is case-sensitive.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I updated the documentation here.