Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions operators/vision/image_decoder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,29 @@ struct DecodeImage {
// Read the JPEG header to get image info
jpeg_read_header(&cinfo, TRUE);

// Security: explicitly reject CMYK/YCCK color spaces before decompression.
// These have 4 channels and downstream code assumes 3 channels (CVE-class: CWE-122).
if (cinfo.jpeg_color_space == JCS_CMYK || cinfo.jpeg_color_space == JCS_YCCK) {
jpeg_destroy_decompress(&cinfo);
return {kOrtxErrorInvalidArgument,
"[ImageDecoder]: Unsupported JPEG color space (CMYK/YCCK). Only RGB and grayscale are supported."};
}

// Force RGB output to ensure consistent 3-channel output regardless of input
// (e.g., grayscale JPEGs are expanded to RGB).
cinfo.out_color_space = JCS_RGB;

// Start decompression
jpeg_start_decompress(&cinfo);

// Safety net: verify 3-channel output after decompression.
if (cinfo.output_components != 3) {
jpeg_destroy_decompress(&cinfo);
return {kOrtxErrorInvalidArgument,
"[ImageDecoder]: Unexpected JPEG output channels. Expected 3 (RGB), got " +
std::to_string(cinfo.output_components) + "."};
}

// Allocate memory for the image
std::vector<int64_t> output_dimensions{cinfo.output_height, cinfo.output_width, cinfo.output_components};
uint8_t* imageBuffer = output.Allocate(output_dimensions);
Expand All @@ -149,6 +169,7 @@ struct DecodeImage {
}

if (srcManager.extError != kOrtxOK) {
jpeg_destroy_decompress(&cinfo);
return {kOrtxErrorInternal, "[ImageDecoder]: Failed to decode JPEG image."};
}

Expand Down
14 changes: 14 additions & 0 deletions operators/vision/image_decoder_darwin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,20 @@ struct DecodeImage {
return {kOrtxErrorInternal, "[ImageDecoder]: Failed to create CGImage."};
}

// Security: reject CMYK images explicitly for defense-in-depth.
// While CoreGraphics can convert CMYK to sRGB via CGContextDrawImage,
// we reject CMYK to match other platform decoders and prevent
// 4-channel data from reaching downstream code that assumes 3 channels (CWE-122).
{
CGColorSpaceRef imageColorSpace = CGImageGetColorSpace(image);
CGColorSpaceModel model = CGColorSpaceGetModel(imageColorSpace);
if (model == kCGColorSpaceModelCMYK) {
CGImageRelease(image);
return {kOrtxErrorInvalidArgument,
"[ImageDecoder]: Unsupported CMYK image color space. Only RGB and grayscale are supported."};
}
}

const int64_t width = static_cast<int64_t>(CGImageGetWidth(image));
const int64_t height = static_cast<int64_t>(CGImageGetHeight(image));
const int64_t channels = 3;
Expand Down
11 changes: 11 additions & 0 deletions operators/vision/image_decoder_win32.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,17 @@ struct DecodeImage {
const int width = static_cast<int>(uiWidth);
const int channels = 3; // Asks for RGB

// Security: reject CMYK pixel formats (e.g. CMYK JPEGs) before silent conversion.
// WICConvertBitmapSource can silently convert CMYK→RGB, hiding the 4-channel shape from
// downstream consumers that assume 3 channels, enabling heap overflow (CWE-122).
if (IsEqualGUID(pixelFormat, GUID_WICPixelFormat32bppCMYK) ||
IsEqualGUID(pixelFormat, GUID_WICPixelFormat64bppCMYK) ||
IsEqualGUID(pixelFormat, GUID_WICPixelFormat40bppCMYKAlpha) ||
IsEqualGUID(pixelFormat, GUID_WICPixelFormat80bppCMYKAlpha)) {
return {kOrtxErrorInvalidArgument,
"[ImageDecoder]: Unsupported JPEG color space. Only RGB and grayscale are supported."};
}

std::vector<int64_t> output_dimensions{height, width, channels};
uint8_t* decoded_image_data = output.Allocate(output_dimensions);
if (decoded_image_data == nullptr) {
Expand Down
18 changes: 17 additions & 1 deletion shared/api/image_transforms_phi_4.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ class Phi4VisionDynamicPreprocess {
return {kOrtxErrorInvalidArgument, "[Phi4VisionProcessor]: Only raw image formats"};
}

// Security: reject non-RGB inputs (e.g. CMYK JPEGs decoded as 4-channel tensors).
// Downstream allocation is hardcoded to 3 channels, so a 4-channel input would cause
// memcpy to write past the buffer end, enabling heap corruption / RCE (CWE-122).
int64_t c = ts_image.Shape()[2];
if (c != 3) {
return {kOrtxErrorInvalidArgument,
"[Phi4VisionProcessor]: Expected 3-channel RGB input, got " + std::to_string(c) + " channels"};
}
Comment on lines +27 to +34
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check protects Phi4VisionDynamicPreprocess, but Phi4VisionProcessor::Compute in the same header still hardcodes 3 channels when allocating/copying tensors. For defense-in-depth (and to match the PR’s security goal), consider adding the same channels == 3 validation in Phi4VisionProcessor::Compute so it remains safe even if a non-RGB tensor reaches it through another path.

Copilot uses AI. Check for mistakes.

/*
dyhd_base_resolution = 448

Expand All @@ -42,7 +51,6 @@ class Phi4VisionDynamicPreprocess {
const uint8_t* input_data = ts_image.Data();
int64_t h = ts_image.Shape()[0];
int64_t w = ts_image.Shape()[1];
int64_t c = ts_image.Shape()[2];
Imaging image = ImagingNew("RGB", ort_extensions::narrow<int>(w), ort_extensions::narrow<int>(h));
for (int64_t i = 0; i < h; ++i) {
for (int64_t j = 0; j < w; ++j) {
Expand Down Expand Up @@ -351,6 +359,14 @@ class Phi4VisionProcessor {
ortc::Tensor<int64_t>& image_sizes,
ortc::Tensor<int64_t>& returned_image_attention_mask,
ortc::Tensor<int64_t>& num_img_tokens) const {
// Defense-in-depth: reject non-3-channel inputs. Downstream code hardcodes 3 channels
// when allocating/copying tensors. A non-3-channel input would cause heap corruption (CWE-122).
if (hd_image.Shape().size() >= 3 && hd_image.Shape()[2] != 3) {
return {kOrtxErrorInvalidArgument,
"[Phi4VisionProcessor]: Expected 3-channel input, got " +
std::to_string(hd_image.Shape()[2]) + " channels"};
}

const int32_t base_resolution = ort_extensions::narrow<int32_t>(dyhd_base_resolution_);
const int64_t mask_resolution = base_resolution / 14;
/*
Expand Down
66 changes: 66 additions & 0 deletions test/pp_api_test/test_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -278,3 +278,69 @@ TEST(ProcessorTest, TestQwen3VLImageProcessing) {
ASSERT_EQ(patch_dim, 1536);
}

// Security regression test: CMYK JPEG must be rejected at decode time (CWE-122).
//
// A CMYK JPEG has 4 output channels. Phi4VisionDynamicPreprocess allocates a
// 3-channel output buffer but previously copied using the dynamic channel count,
// writing past the buffer end and enabling heap corruption / RCE.
// Both the JPEG decoder and the transform ops now validate channel count.
TEST(ProcessorTest, TestCMYKJpegRejected) {
// Minimal 4x4 CMYK JPEG generated by PIL (Image.new("CMYK", (4,4), (128,64,32,255)))
// Contains an APP14 Adobe marker indicating CMYK color space.
static const uint8_t cmyk_jpeg[] = {
0xff,0xd8,0xff,0xee,0x00,0x0e,0x41,0x64,0x6f,0x62,0x65,0x00,0x64,0x00,0x00,0x00,
0x00,0x00,0xff,0xdb,0x00,0x43,0x00,0x08,0x06,0x06,0x07,0x06,0x05,0x08,0x07,0x07,
0x07,0x09,0x09,0x08,0x0a,0x0c,0x14,0x0d,0x0c,0x0b,0x0b,0x0c,0x19,0x12,0x13,0x0f,
0x14,0x1d,0x1a,0x1f,0x1e,0x1d,0x1a,0x1c,0x1c,0x20,0x24,0x2e,0x27,0x20,0x22,0x2c,
0x23,0x1c,0x1c,0x28,0x37,0x29,0x2c,0x30,0x31,0x34,0x34,0x34,0x1f,0x27,0x39,0x3d,
0x38,0x32,0x3c,0x2e,0x33,0x34,0x32,0xff,0xc0,0x00,0x14,0x08,0x00,0x04,0x00,0x04,
0x04,0x43,0x11,0x00,0x4d,0x11,0x00,0x59,0x11,0x00,0x4b,0x11,0x00,0xff,0xc4,0x00,
0x1f,0x00,0x00,0x01,0x05,0x01,0x01,0x01,0x01,0x01,0x01,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xc4,
0x00,0xb5,0x10,0x00,0x02,0x01,0x03,0x03,0x02,0x04,0x03,0x05,0x05,0x04,0x04,0x00,
0x00,0x01,0x7d,0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,
0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,0x23,0x42,0xb1,0xc1,0x15,
0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,
0x26,0x27,0x28,0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,
0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x63,0x64,0x65,0x66,
0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,
0x87,0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,
0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,
0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,
0xda,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,
0xf6,0xf7,0xf8,0xf9,0xfa,0xff,0xda,0x00,0x0e,0x04,0x43,0x00,0x4d,0x00,0x59,0x00,
0x4b,0x00,0x00,0x3f,0x00,0x4a,0xef,0xeb,0xd7,0xeb,0xe7,0xfa,0xff,0xd9
};

// Write CMYK JPEG to a temp file
std::filesystem::path temp_path = std::filesystem::temp_directory_path() / "cmyk_test_security.jpg";
{
std::ofstream f(temp_path, std::ios::binary);
ASSERT_TRUE(f.is_open()) << "Failed to create temp CMYK JPEG file";
f.write(reinterpret_cast<const char*>(cmyk_jpeg), sizeof(cmyk_jpeg));
}

const std::string cmyk_path_string = temp_path.string();
const char* cmyk_path_str = cmyk_path_string.c_str();

// Attempt to load the CMYK JPEG - should be rejected by the decoder
OrtxObjectPtr<OrtxRawImages> raw_images{};
extError_t err = OrtxLoadImages(raw_images.ToBeAssigned(), &cmyk_path_str, 1, nullptr);

if (err == kOrtxOK) {
// If the image loaded (shouldn't with our decoder fix), verify the processor rejects it
OrtxObjectPtr<OrtxProcessor> processor;
err = OrtxCreateProcessor(processor.ToBeAssigned(), "data/models/phi-4/vision_processor.json");
if (err == kOrtxOK) {
OrtxObjectPtr<OrtxTensorResult> result;
err = OrtxImagePreProcess(processor.get(), raw_images.get(), result.ToBeAssigned());
}
}

// Clean up temp file
std::filesystem::remove(temp_path);

// The CMYK JPEG must be rejected somewhere in the pipeline (CWE-122 mitigation)
ASSERT_NE(err, kOrtxOK) << "CMYK JPEG must be rejected to prevent heap buffer overflow (CWE-122)";
}

Loading