From 6b72aab9682d9ec9ef2ac2f3aee767022e20fb05 Mon Sep 17 00:00:00 2001 From: jamjamjon Date: Mon, 2 Feb 2026 23:24:00 +0800 Subject: [PATCH 1/2] Add PP-Doclayout-V3 --- docs/model-zoo/ocr.md | 1 + examples/ocr/README.md | 5 ++ examples/ocr/main.rs | 13 ++++- examples/ocr/pp_doclayout.rs | 1 + src/models/vision/pp_doclayout/config.rs | 13 ++++- src/models/vision/pp_doclayout/impl.rs | 66 +++++++++++++++++++++++- 6 files changed, 95 insertions(+), 4 deletions(-) diff --git a/docs/model-zoo/ocr.md b/docs/model-zoo/ocr.md index 6343645..324cb6b 100644 --- a/docs/model-zoo/ocr.md +++ b/docs/model-zoo/ocr.md @@ -18,3 +18,4 @@ hide: | [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) | Object Detection | [demo](https://github.com/jamjamjon/usls/tree/main/examples/ocr) | ✅ | ❓ | ✅ | ✅ | ✅ | ❌ | ❌ | | [PP-DocLayout-v1-Plus-L](https://huggingface.co/PaddlePaddle/PP-DocLayout_plus-L) | Object Detection | [demo](https://github.com/jamjamjon/usls/tree/main/examples/ocr) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [PP-DocLayout-v2](https://huggingface.co/PaddlePaddle/PP-DocLayoutV2) | Object Detection | [demo](https://github.com/jamjamjon/usls/tree/main/examples/ocr) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [PP-DocLayout-v3](https://huggingface.co/PaddlePaddle/PP-DocLayoutV3) | Object Detection | [demo](https://github.com/jamjamjon/usls/tree/main/examples/ocr) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | diff --git a/examples/ocr/README.md b/examples/ocr/README.md index 1558beb..f40662f 100644 --- a/examples/ocr/README.md +++ b/examples/ocr/README.md @@ -36,6 +36,10 @@ cargo run -F cuda-full -F vlm --example ocr -- picodet-layout --device cuda:0 -- ## PP-DocLayout v1/v2 cargo run -F cuda-full -F vlm --example ocr -- pp-doclayout --device cuda:0 --processor-device cuda:0 --source images/academic.jpg --ver 1 --dtype fp32 + + +## PP-DocLayout v3 +cargo run -F cuda-full -F vlm --example ocr -- pp-doclayout --device cuda:0 --processor-device cuda:0 --source images/vl1.58.png --ver 3 --dtype fp32 ``` ### 
Table structure recognition @@ -53,3 +57,4 @@ cargo run -F cuda-full -F vlm --example ocr -- slanet --device cuda:0 --processo ![](https://github.com/jamjamjon/assets/releases/download/db/demo-table-ch.png) ![](https://github.com/jamjamjon/assets/releases/download/db/demo-sign.png) ![](https://github.com/jamjamjon/assets/releases/download/yolo/demo-doclayout-yolo.png) +![](https://github.com/jamjamjon/assets/releases/download/pp-doclayout/demo-v3.jpg) diff --git a/examples/ocr/main.rs b/examples/ocr/main.rs index 482db98..7338f95 100644 --- a/examples/ocr/main.rs +++ b/examples/ocr/main.rs @@ -214,7 +214,18 @@ fn main() -> Result<()> { } Commands::PpDoclayout(args) => { let config = pp_doclayout::config(args)?.commit()?; - let annotator = Annotator::default(); + let annotator = Annotator::default() + .with_hbb_style( + usls::HbbStyle::default() + .with_visible(false) + .with_text_visible(false), + ) + .with_mask_style( + usls::MaskStyle::default() + .with_visible(false) + .with_cutout(true) + .with_draw_obbs(true), // .with_draw_polygon_largest(true), + ); run::(config, &cli.source, &annotator, RUN_DOC_LAYOUT) } }?; diff --git a/examples/ocr/pp_doclayout.rs b/examples/ocr/pp_doclayout.rs index b9e9338..ccd65c2 100644 --- a/examples/ocr/pp_doclayout.rs +++ b/examples/ocr/pp_doclayout.rs @@ -41,6 +41,7 @@ pub fn config(args: &PPDoclayoutArgs) -> Result { let config = match args.ver { Version(1, _, _) => Config::pp_doclayout_v1_plus_l(), Version(2, _, _) => Config::pp_doclayout_v2(), + Version(3, _, _) => Config::pp_doclayout_v3(), _ => anyhow::bail!("Unsupported version"), } .with_dtype_all(args.dtype) diff --git a/src/models/vision/pp_doclayout/config.rs b/src/models/vision/pp_doclayout/config.rs index b4ea8f3..ddb21df 100644 --- a/src/models/vision/pp_doclayout/config.rs +++ b/src/models/vision/pp_doclayout/config.rs @@ -6,6 +6,7 @@ /// > /// > # Paper & Code /// > +/// > - **v3 Model**: https://huggingface.co/PaddlePaddle/PP-DocLayoutV3 /// > - **v2 Model**: 
https://huggingface.co/PaddlePaddle/PP-DocLayoutV2 /// > - **v1 Model**: https://huggingface.co/PaddlePaddle/PP-DocLayout_plus-L /// > - **GitHub**: [PaddlePaddle/PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) @@ -28,7 +29,7 @@ impl crate::Config { .with_model_ixx(1, 2, 800) .with_model_ixx(1, 3, 800) .with_model_ixx(2, 1, 2) // scale factors - .with_class_confs(&[0.5]) + .with_class_confs(&[0.35]) .with_resize_alg(crate::ResizeAlg::Interpolation( crate::ResizeFilter::Bilinear, )) @@ -50,4 +51,14 @@ impl crate::Config { .with_class_names(&crate::NAMES_PP_DOC_LAYOUT_V2_25) .with_model_file("v2.onnx") } + + /// PP-DocLayoutV3 configuration (25 classes with reading order) + /// PP-DocLayoutV3 is specifically engineered to handle non-planar document images. + /// It can directly predict multi-point bounding boxes for layout elements—as opposed to standard two-point boxes—and determine logical reading orders for skewed and curved surfaces within a single forward pass, significantly reducing cascading errors. 
+ pub fn pp_doclayout_v3() -> Self { + Self::pp_doclayout() + .with_version(3.into()) + .with_class_names(&crate::NAMES_PP_DOC_LAYOUT_V2_25) + .with_model_file("v3.onnx") + } } diff --git a/src/models/vision/pp_doclayout/impl.rs b/src/models/vision/pp_doclayout/impl.rs index 6d58199..da4d695 100644 --- a/src/models/vision/pp_doclayout/impl.rs +++ b/src/models/vision/pp_doclayout/impl.rs @@ -5,10 +5,10 @@ use rayon::prelude::*; use crate::{ elapsed_module, inputs, Config, DynConf, Engine, Engines, FromConfig, Hbb, Image, - ImageProcessor, Model, Module, Version, Xs, X, Y, + ImageProcessor, Mask, Model, Module, Ops, Version, Xs, X, Y, }; -/// RT-DETR: Real-Time Detection Transformer +/// PP-DocLayout-v1/v2/v3 #[derive(Debug, Builder)] pub struct PPDocLayout { pub height: usize, @@ -158,6 +158,68 @@ impl PPDocLayout { Ok(ys) } + Version(3, _, _) => { + // preds: [batch * max_det, 7],[label_index, score, xmin, ymin, xmax, ymax, reading_order] + let preds_reshaped = preds.to_shape([self.batch, self.max_det, 7])?; + let preds_masks = xs + .get::(2) + .ok_or_else(|| anyhow::anyhow!("Failed to get masks"))?; + let preds_masks = preds_masks.to_shape([self.batch, self.max_det, 200, 200])?; + + let ys: Vec = preds_reshaped + .axis_iter(Axis(0)) + .into_par_iter() + .zip(preds_masks.axis_iter(Axis(0)).into_par_iter()) + .enumerate() + .filter_map(|(idx, (preds, preds_masks))| { + let info = &self.processor.images_transform_info[idx]; + let (image_height, image_width) = (info.height_src, info.width_src); + + let mut items: Vec<(Hbb, Mask, usize)> = preds + .outer_iter() + .zip(preds_masks.outer_iter()) + .filter_map(|(pred_slice, pred_mask)| { + let slice = pred_slice.as_slice()?; + let hbb = f(slice)?; + let order = slice[6] as usize; + let (mh, mw) = (pred_mask.shape()[0], pred_mask.shape()[1]); + let (ih, iw) = (image_height, image_width); + let mask_f32: Vec = + pred_mask.iter().map(|&x| 1. / (1. 
+ (-x).exp())).collect(); + let mask_f32: Vec = Ops::interpolate_1d( + &mask_f32, mw as _, mh as _, iw as _, ih as _, false, + ) + .ok()?; + let mask_u8: Vec = mask_f32 + .into_iter() + .map(|x| if x <= 0.5 { 0 } else { 1 }) + .collect(); + + let mut mask = Mask::new(&mask_u8, iw as _, ih as _).ok()?; + + if let Some(id) = hbb.id() { + mask = mask.with_id(id); + } + if let Some(name) = hbb.name() { + mask = mask.with_name(name); + } + if let Some(confidence) = hbb.confidence() { + mask = mask.with_confidence(confidence); + } + + Some((hbb, mask, order)) + }) + .collect(); + + items.sort_by(|a, b| a.2.cmp(&b.2)); + let hbbs: Vec = items.iter().map(|(h, _, _)| h.clone()).collect(); + let masks: Vec = items.into_iter().map(|(_, m, _)| m).collect(); + Some(Y::default().with_hbbs(&hbbs).with_masks(&masks)) + }) + .collect(); + + Ok(ys) + } _ => anyhow::bail!("Unsupported version"), } } From 8fb50140648518409eb26edb0e9004a847fd83c9 Mon Sep 17 00:00:00 2001 From: jamjamjon Date: Mon, 2 Feb 2026 23:28:49 +0800 Subject: [PATCH 2/2] 1 --- examples/ocr/main.rs | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/examples/ocr/main.rs b/examples/ocr/main.rs index 7338f95..920e263 100644 --- a/examples/ocr/main.rs +++ b/examples/ocr/main.rs @@ -214,18 +214,21 @@ fn main() -> Result<()> { } Commands::PpDoclayout(args) => { let config = pp_doclayout::config(args)?.commit()?; - let annotator = Annotator::default() - .with_hbb_style( - usls::HbbStyle::default() - .with_visible(false) - .with_text_visible(false), - ) - .with_mask_style( - usls::MaskStyle::default() - .with_visible(false) - .with_cutout(true) - .with_draw_obbs(true), // .with_draw_polygon_largest(true), - ); + let annotator = match args.ver.0 { + 3 => Annotator::default() + .with_hbb_style( + usls::HbbStyle::default() + .with_visible(false) + .with_text_visible(false), + ) + .with_mask_style( + usls::MaskStyle::default() + .with_visible(false) + .with_cutout(true) + 
.with_draw_obbs(true), + ), + _ => Annotator::default(), + }; run::(config, &cli.source, &annotator, RUN_DOC_LAYOUT) } }?;