Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 145 additions & 2 deletions crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
use std::vec;

use datafusion::arrow::datatypes::DataType;
use datafusion::logical_expr::{Expr, Operator};
use datafusion::logical_expr::{Expr, Like, Operator};
use datafusion::scalar::ScalarValue;
use iceberg::expr::{BinaryExpression, Predicate, PredicateOperator, Reference, UnaryExpression};
use iceberg::spec::Datum;
use iceberg::spec::{Datum, PrimitiveLiteral};

// A datafusion expression could be an Iceberg predicate, column, or literal.
enum TransformedResult {
Expand Down Expand Up @@ -128,6 +128,56 @@ fn to_iceberg_predicate(expr: &Expr) -> TransformedResult {
}
to_iceberg_predicate(&c.expr)
}
Expr::Like(Like {
negated,
expr,
pattern,
escape_char,
case_insensitive,
}) => {
// Only support simple prefix patterns (e.g., 'prefix%')
// Note: Iceberg's StartsWith operator is case-sensitive, so we cannot
// push down case-insensitive LIKE (ILIKE) patterns
// Escape characters are also not supported for pushdown
if escape_char.is_some() || *case_insensitive {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIRC, Iceberg's `starts_with` is case-sensitive.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated the documentation here.

return TransformedResult::NotTransformed;
}

// Extract the pattern string
let pattern_str = match to_iceberg_predicate(pattern) {
TransformedResult::Literal(d) => match d.literal() {
PrimitiveLiteral::String(s) => s.clone(),
_ => return TransformedResult::NotTransformed,
},
_ => return TransformedResult::NotTransformed,
};

// Check if it's a simple prefix pattern (ends with % and no other wildcards)
if pattern_str.ends_with('%')
&& !pattern_str[..pattern_str.len() - 1].contains(['%', '_'])
{
// Extract the prefix (remove trailing %)
let prefix = pattern_str[..pattern_str.len() - 1].to_string();

// Get the column reference
let column = match to_iceberg_predicate(expr) {
TransformedResult::Column(r) => r,
_ => return TransformedResult::NotTransformed,
};

// Create the appropriate predicate
let predicate = if *negated {
column.not_starts_with(Datum::string(prefix))
} else {
column.starts_with(Datum::string(prefix))
};

TransformedResult::Predicate(predicate)
} else {
// Complex LIKE patterns cannot be pushed down
TransformedResult::NotTransformed
}
}
_ => TransformedResult::NotTransformed,
}
}
Expand Down Expand Up @@ -458,4 +508,97 @@ mod tests {
.and(Reference::new("bar").equal_to(Datum::binary(vec![1u8, 2u8])));
assert_eq!(predicate, expected_predicate);
}

#[test]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We recently added support for sqllogictests, see https://github.com/liurenjie1024/iceberg-rust/blob/666a9fe1aaf1692583d6f44e4f7a1d52a688b217/crates/sqllogictest/testdata/schedules/df_test.toml#L19

It would be better to also include such SQL logic tests in addition to the unit tests.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I added a test to the sqllogictests.

fn test_predicate_conversion_with_like_starts_with() {
    // A pattern of the form `prefix%` is expected to convert to `starts_with(prefix)`.
    let converted = convert_to_iceberg_predicate("bar LIKE 'test%'").unwrap();
    let expected = Reference::new("bar").starts_with(Datum::string("test"));
    assert_eq!(converted, expected);
}

#[test]
fn test_predicate_conversion_with_not_like_starts_with() {
    // The negated form of a `prefix%` pattern is expected to convert to
    // `not_starts_with(prefix)`.
    let converted = convert_to_iceberg_predicate("bar NOT LIKE 'test%'").unwrap();
    let expected = Reference::new("bar").not_starts_with(Datum::string("test"));
    assert_eq!(converted, expected);
}

#[test]
fn test_predicate_conversion_with_like_empty_prefix() {
    // A bare `%` leaves an empty prefix, so the expected predicate is
    // `starts_with("")`.
    let converted = convert_to_iceberg_predicate("bar LIKE '%'").unwrap();
    let expected = Reference::new("bar").starts_with(Datum::string(""));
    assert_eq!(converted, expected);
}

#[test]
fn test_predicate_conversion_with_like_complex_pattern() {
    // A `%` in the middle of the pattern is not a simple prefix match,
    // so no Iceberg predicate is expected.
    let converted = convert_to_iceberg_predicate("bar LIKE 'te%st'");
    assert_eq!(converted, None);
}

#[test]
fn test_predicate_conversion_with_like_underscore_wildcard() {
    // The single-character wildcard `_` cannot be pushed down,
    // so no Iceberg predicate is expected.
    let converted = convert_to_iceberg_predicate("bar LIKE 'test_'");
    assert_eq!(converted, None);
}

#[test]
fn test_predicate_conversion_with_like_no_wildcard() {
    // Without a trailing `%` the pattern is not a prefix match and cannot be
    // pushed down as StartsWith, so no Iceberg predicate is expected.
    let converted = convert_to_iceberg_predicate("bar LIKE 'test'");
    assert_eq!(converted, None);
}

#[test]
fn test_predicate_conversion_with_ilike() {
    // Case-insensitive matching (ILIKE) is not supported for pushdown,
    // so no Iceberg predicate is expected.
    let converted = convert_to_iceberg_predicate("bar ILIKE 'test%'");
    assert_eq!(converted, None);
}

#[test]
fn test_predicate_conversion_with_like_and_other_conditions() {
    // A prefix LIKE combined with a comparison via AND is expected to convert
    // to the conjunction of the two corresponding Iceberg predicates.
    let converted = convert_to_iceberg_predicate("bar LIKE 'test%' AND foo > 1").unwrap();
    let like_part = Reference::new("bar").starts_with(Datum::string("test"));
    let comparison_part = Reference::new("foo").greater_than(Datum::long(1));
    assert_eq!(converted, like_part.and(comparison_part));
}

#[test]
fn test_predicate_conversion_with_like_special_characters() {
    // The prefix mixes ordinary punctuation (`-`) with an underscore. Because
    // `_` is a LIKE wildcard, the pattern is not a plain prefix and no Iceberg
    // predicate is expected.
    let converted = convert_to_iceberg_predicate("bar LIKE 'test-abc_123%'");
    assert_eq!(converted, None);
}

#[test]
fn test_predicate_conversion_with_like_unicode() {
    // A multi-byte (non-ASCII) prefix followed by `%` is expected to convert
    // to `starts_with` with the prefix preserved unchanged.
    let converted = convert_to_iceberg_predicate("bar LIKE '测试%'").unwrap();
    let expected = Reference::new("bar").starts_with(Datum::string("测试"));
    assert_eq!(converted, expected);
}
}
4 changes: 4 additions & 0 deletions crates/sqllogictest/testdata/schedules/df_test.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ slt = "df_test/insert_into.slt"
engine = "df"
slt = "df_test/binary_predicate_pushdown.slt"

[[steps]]
engine = "df"
slt = "df_test/like_predicate_pushdown.slt"

[[steps]]
engine = "df"
slt = "df_test/drop_table.slt"
113 changes: 113 additions & 0 deletions crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Test LIKE predicate pushdown for StartsWith patterns
# This validates that LIKE 'prefix%' patterns are converted to StartsWith operator
# Note: Iceberg's StartsWith operator is case-sensitive

# Note: test_unpartitioned_table already contains data from insert_into.slt:
# (1, 'Alice'), (2, 'Bob'), (3, 'Charlie'), (4, NULL)
# We'll insert additional test data for LIKE testing
query I
INSERT INTO default.default.test_unpartitioned_table VALUES (5, 'alice'), (6, 'Albert'), (7, 'Bobby'), (8, 'Carol')
----
4

# Test basic LIKE prefix pattern pushdown - verify predicate is pushed to IcebergTableScan
query TT
EXPLAIN SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%'
----
logical_plan
01)Filter: default.default.test_unpartitioned_table.name LIKE Utf8("Al%")
02)--TableScan: default.default.test_unpartitioned_table projection=[id, name], partial_filters=[default.default.test_unpartitioned_table.name LIKE Utf8("Al%")]
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: name@1 LIKE Al%
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
04)------CooperativeExec
05)--------IcebergTableScan projection:[id,name] predicate:[name STARTS WITH "Al"]

# Test LIKE filtering with case-sensitive match
query IT rowsort
SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%'
----
1 Alice
6 Albert

# Test NOT LIKE prefix pattern pushdown
query TT
EXPLAIN SELECT * FROM default.default.test_unpartitioned_table WHERE name NOT LIKE 'Al%'
----
logical_plan
01)Filter: default.default.test_unpartitioned_table.name NOT LIKE Utf8("Al%")
02)--TableScan: default.default.test_unpartitioned_table projection=[id, name], partial_filters=[default.default.test_unpartitioned_table.name NOT LIKE Utf8("Al%")]
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: name@1 NOT LIKE Al%
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
04)------CooperativeExec
05)--------IcebergTableScan projection:[id,name] predicate:[name NOT STARTS WITH "Al"]

# Test NOT LIKE filtering
query IT rowsort
SELECT * FROM default.default.test_unpartitioned_table WHERE name NOT LIKE 'Al%'
----
2 Bob
3 Charlie
5 alice
7 Bobby
8 Carol

# Test case sensitivity - lowercase 'al%' should not match 'Alice' or 'Albert'
query IT rowsort
SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'al%'
----
5 alice

# Test LIKE with empty prefix (matches everything except NULL)
query IT rowsort
SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE '%'
----
1 Alice
2 Bob
3 Charlie
5 alice
6 Albert
7 Bobby
8 Carol

# Test LIKE with single character prefix
query IT rowsort
SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'B%'
----
2 Bob
7 Bobby

# Test LIKE combined with other predicates
query IT rowsort
SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' AND id > 1
----
6 Albert

# Test LIKE in OR expression
query IT rowsort
SELECT * FROM default.default.test_unpartitioned_table WHERE name LIKE 'Al%' OR name LIKE 'Bo%'
----
1 Alice
2 Bob
6 Albert
7 Bobby
Loading