Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion shared/api/image_transforms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,12 @@ struct Resize {
image_factor_ = patch_size_ * merge_size_;

// Perform Smart Resize if Set
auto [height, width] = smart_resize_ ? smart_resize(height_, width_) : std::make_tuple(height_, width_);
// When smart_resize is enabled, use the original image dimensions (h, w)
// to compute the target size that preserves the aspect ratio and snaps to
// the patch grid, matching the HuggingFace smart_resize behavior.
// When smart_resize is disabled, fall back to the fixed config dimensions.
auto [height, width] = smart_resize_ ? smart_resize(static_cast<int64_t>(h), static_cast<int64_t>(w))
: std::make_tuple(height_, width_);
h = static_cast<int>(height);
w = static_cast<int>(width);

Expand Down
101 changes: 101 additions & 0 deletions test/test_pp_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
pp_diagnosis = None
try:
from transformers import AutoImageProcessor, AutoTokenizer
from transformers import Qwen2VLImageProcessor
from onnxruntime_extensions import pp_api

is_pp_api_available = True
Expand Down Expand Up @@ -693,6 +694,106 @@ def test_gemma_3_image_processor(self):
a_image = regen_image(np.transpose(actual, (1, 2, 0)), phi4_image_mean, phi4_image_std)
a_image.save(f"{self.temp_dir}/a_{idx}_{i}.png")

@unittest.skipIf(is_pp_api_available is False, "pp_api is not available")
def test_qwen2_5_vl_image_processing(self):
    """Test Qwen2.5-VL image processing with smart_resize.

    Verifies that smart_resize uses the actual image dimensions (not fixed
    config values) to compute the target size, preserving the aspect ratio
    and snapping to the patch grid. Images with different aspect ratios
    must produce different output shapes.
    """
    image_list = [
        "data/processor/australia.jpg",   # 1300×876, landscape
        "data/processor/passport.png",    # 600×805, portrait
        "data/processor/exceltable.png",  # 487×206, wide landscape
    ]
    pil_images = [Image.open(util.get_test_data_file(f)) for f in image_list]

    # HuggingFace reference (slow processor for numpy support)
    hf_proc = Qwen2VLImageProcessor(
        min_pixels=3136, max_pixels=12845056,
        patch_size=14, merge_size=2, temporal_patch_size=2)

    ort_processor = pp_api.ImageProcessor(
        util.get_test_data_file("data/qwen2.5vl/vision_processor.json"))

    # Collect OrtX output shapes during the comparison loop so we do not
    # need a second (redundant, expensive) preprocessing pass afterwards.
    ort_shapes = set()
    for idx, (img, path) in enumerate(zip(pil_images, image_list)):
        hf_result = hf_proc.preprocess([img], return_tensors="np")
        hf_pv = hf_result["pixel_values"]      # (num_patches, patch_dim)
        hf_grid = hf_result["image_grid_thw"]  # (1, 3)

        ort_result = ort_processor.pre_process([util.get_test_data_file(path)])
        ort_pv = ort_processor.to_numpy(ort_result, 0)    # (1, num_patches, patch_dim)
        ort_grid = ort_processor.to_numpy(ort_result, 1)  # (1, 1, 3)
        ort_shapes.add(ort_pv.shape)

        # Shape check: num_patches and patch_dim must agree
        self.assertEqual(
            hf_pv.shape, ort_pv.squeeze(0).shape,
            f"Shape mismatch for {path}: HF={hf_pv.shape} vs OrtX={ort_pv.squeeze(0).shape}")

        # Grid check
        np.testing.assert_array_equal(
            hf_grid, ort_grid.squeeze(0),
            err_msg=f"Grid mismatch for {path}")

        # Pixel value MSE check
        mse = np.mean((hf_pv - ort_pv.squeeze(0)) ** 2)
        print(f"Qwen2.5-VL image {idx} ({path}): shape={hf_pv.shape}, MSE={mse:.6f}")
        self.assertLessEqual(mse, 1e-3, f"MSE too high for {path}: {mse}")

    # Verify different images yield different num_patches (smart_resize is aspect-aware)
    self.assertGreater(len(ort_shapes), 1,
                       "smart_resize should produce different shapes for images with different aspect ratios")

@unittest.skipIf(is_pp_api_available is False, "pp_api is not available")
def test_qwen3_vl_image_processing(self):
    """Test Qwen3-VL image processing (patch_size=16 vs Qwen2.5-VL's 14).

    Compares shapes, grids, and pixel values against the HuggingFace
    reference processor, and checks the flattened patch dimension
    (3 channels * temporal_patch_size 2 * 16 * 16 = 1536).
    """
    image_list = [
        "data/processor/australia.jpg",
        "data/processor/passport.png",
    ]
    pil_images = [Image.open(util.get_test_data_file(f)) for f in image_list]

    # HuggingFace reference (Qwen3-VL uses same processor class with patch_size=16)
    hf_proc = Qwen2VLImageProcessor(
        min_pixels=3136, max_pixels=12845056,
        patch_size=16, merge_size=2, temporal_patch_size=2)

    ort_processor = pp_api.ImageProcessor(
        util.get_test_data_file("data/qwen3vl/vision_processor.json"))

    # Capture patch_dim from the first image inside the loop instead of
    # re-running preprocessing on image_list[0] a second time afterwards.
    patch_dim = None
    for idx, (img, path) in enumerate(zip(pil_images, image_list)):
        hf_result = hf_proc.preprocess([img], return_tensors="np")
        hf_pv = hf_result["pixel_values"]
        hf_grid = hf_result["image_grid_thw"]

        ort_result = ort_processor.pre_process([util.get_test_data_file(path)])
        ort_pv = ort_processor.to_numpy(ort_result, 0)
        ort_grid = ort_processor.to_numpy(ort_result, 1)
        if patch_dim is None:
            patch_dim = ort_pv.shape[-1]

        self.assertEqual(
            hf_pv.shape, ort_pv.squeeze(0).shape,
            f"Shape mismatch for {path}: HF={hf_pv.shape} vs OrtX={ort_pv.squeeze(0).shape}")

        np.testing.assert_array_equal(
            hf_grid, ort_grid.squeeze(0),
            err_msg=f"Grid mismatch for {path}")

        mse = np.mean((hf_pv - ort_pv.squeeze(0)) ** 2)
        print(f"Qwen3-VL image {idx} ({path}): shape={hf_pv.shape}, MSE={mse:.6f}")
        self.assertLessEqual(mse, 1e-3, f"MSE too high for {path}: {mse}")

    # patch_dim should be 1536 for Qwen3-VL (3 * 2 * 16 * 16)
    self.assertEqual(patch_dim, 1536,
                     f"Qwen3-VL patch_dim should be 1536, got {patch_dim}")


# Allow running this test module directly (e.g. `python test_pp_api.py`)
# in addition to discovery by a pytest/CI harness.
if __name__ == "__main__":
    unittest.main()
Loading