Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/model-zoo/ocr.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ hide:
| [DocLayout-YOLO](https://github.com/opendatalab/DocLayout-YOLO) | Object Detection | [demo](https://github.com/jamjamjon/usls/tree/main/examples/ocr) | ✅ | ❓ | ✅ | ✅ | ✅ | ❌ | ❌ |
| [PP-DocLayout-v1-Plus-L](https://huggingface.co/PaddlePaddle/PP-DocLayout_plus-L) | Object Detection | [demo](https://github.com/jamjamjon/usls/tree/main/examples/ocr) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [PP-DocLayout-v2](https://huggingface.co/PaddlePaddle/PP-DocLayoutV2) | Object Detection | [demo](https://github.com/jamjamjon/usls/tree/main/examples/ocr) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [PP-DocLayout-v3](https://huggingface.co/PaddlePaddle/PP-DocLayoutV3) | Object Detection | [demo](https://github.com/jamjamjon/usls/tree/main/examples/ocr) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
5 changes: 5 additions & 0 deletions examples/ocr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ cargo run -F cuda-full -F vlm --example ocr -- picodet-layout --device cuda:0 --

## PP-DocLayout v1/v2
cargo run -F cuda-full -F vlm --example ocr -- pp-doclayout --device cuda:0 --processor-device cuda:0 --source images/academic.jpg --ver 1 --dtype fp32


## PP-DocLayout v3
cargo run -F cuda-full -F vlm --example ocr -- pp-doclayout --device cuda:0 --processor-device cuda:0 --source images/vl1.58.png --ver 3 --dtype fp32
```

### Table structure recognition
Expand All @@ -53,3 +57,4 @@ cargo run -F cuda-full -F vlm --example ocr -- slanet --device cuda:0 --processo
![](https://github.com/jamjamjon/assets/releases/download/db/demo-table-ch.png)
![](https://github.com/jamjamjon/assets/releases/download/db/demo-sign.png)
![](https://github.com/jamjamjon/assets/releases/download/yolo/demo-doclayout-yolo.png)
![](https://github.com/jamjamjon/assets/releases/download/pp-doclayout/demo-v3.jpg)
16 changes: 15 additions & 1 deletion examples/ocr/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,21 @@ fn main() -> Result<()> {
}
Commands::PpDoclayout(args) => {
let config = pp_doclayout::config(args)?.commit()?;
let annotator = Annotator::default();
let annotator = match args.ver.0 {
3 => Annotator::default()
.with_hbb_style(
usls::HbbStyle::default()
.with_visible(false)
.with_text_visible(false),
)
.with_mask_style(
usls::MaskStyle::default()
.with_visible(false)
.with_cutout(true)
.with_draw_obbs(true),
),
_ => Annotator::default(),
};
run::<PPDocLayout>(config, &cli.source, &annotator, RUN_DOC_LAYOUT)
}
}?;
Expand Down
1 change: 1 addition & 0 deletions examples/ocr/pp_doclayout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ pub fn config(args: &PPDoclayoutArgs) -> Result<Config> {
let config = match args.ver {
Version(1, _, _) => Config::pp_doclayout_v1_plus_l(),
Version(2, _, _) => Config::pp_doclayout_v2(),
Version(3, _, _) => Config::pp_doclayout_v3(),
_ => anyhow::bail!("Unsupported version"),
}
.with_dtype_all(args.dtype)
Expand Down
13 changes: 12 additions & 1 deletion src/models/vision/pp_doclayout/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
/// >
/// > # Paper & Code
/// >
/// > - **v3 Model**: <https://huggingface.co/PaddlePaddle/PP-DocLayoutV3>
/// > - **v2 Model**: <https://huggingface.co/PaddlePaddle/PP-DocLayoutV2>
/// > - **v1 Model**: <https://huggingface.co/PaddlePaddle/PP-DocLayout_plus-L>
/// > - **GitHub**: [PaddlePaddle/PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR)
Expand All @@ -28,7 +29,7 @@ impl crate::Config {
.with_model_ixx(1, 2, 800)
.with_model_ixx(1, 3, 800)
.with_model_ixx(2, 1, 2) // scale factors
.with_class_confs(&[0.5])
.with_class_confs(&[0.35])
.with_resize_alg(crate::ResizeAlg::Interpolation(
crate::ResizeFilter::Bilinear,
))
Expand All @@ -50,4 +51,14 @@ impl crate::Config {
.with_class_names(&crate::NAMES_PP_DOC_LAYOUT_V2_25)
.with_model_file("v2.onnx")
}

/// PP-DocLayoutV3 configuration (25 classes with reading order).
///
/// PP-DocLayoutV3 is engineered to handle non-planar document images: it directly
/// predicts multi-point bounding boxes for layout elements (rather than standard
/// two-point boxes) and determines the logical reading order on skewed and curved
/// surfaces within a single forward pass, significantly reducing cascading errors.
///
/// Starts from the shared `pp_doclayout()` base, tags it as version 3, reuses the
/// v2 class-name table (`NAMES_PP_DOC_LAYOUT_V2_25`) and selects `v3.onnx` as the
/// model file.
pub fn pp_doclayout_v3() -> Self {
    let versioned = Self::pp_doclayout().with_version(3.into());
    let labelled = versioned.with_class_names(&crate::NAMES_PP_DOC_LAYOUT_V2_25);
    labelled.with_model_file("v3.onnx")
}
}
66 changes: 64 additions & 2 deletions src/models/vision/pp_doclayout/impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ use rayon::prelude::*;

use crate::{
elapsed_module, inputs, Config, DynConf, Engine, Engines, FromConfig, Hbb, Image,
ImageProcessor, Model, Module, Version, Xs, X, Y,
ImageProcessor, Mask, Model, Module, Ops, Version, Xs, X, Y,
};

/// RT-DETR: Real-Time Detection Transformer
/// PP-DocLayout v1/v2/v3
#[derive(Debug, Builder)]
pub struct PPDocLayout {
pub height: usize,
Expand Down Expand Up @@ -158,6 +158,68 @@ impl PPDocLayout {

Ok(ys)
}
Version(3, _, _) => {
// preds: [batch * max_det, 7]; each row is [label_index, score, xmin, ymin, xmax, ymax, reading_order]
let preds_reshaped = preds.to_shape([self.batch, self.max_det, 7])?;
let preds_masks = xs
.get::<f32>(2)
.ok_or_else(|| anyhow::anyhow!("Failed to get masks"))?;
let preds_masks = preds_masks.to_shape([self.batch, self.max_det, 200, 200])?;

let ys: Vec<Y> = preds_reshaped
.axis_iter(Axis(0))
.into_par_iter()
.zip(preds_masks.axis_iter(Axis(0)).into_par_iter())
.enumerate()
.filter_map(|(idx, (preds, preds_masks))| {
let info = &self.processor.images_transform_info[idx];
let (image_height, image_width) = (info.height_src, info.width_src);

let mut items: Vec<(Hbb, Mask, usize)> = preds
.outer_iter()
.zip(preds_masks.outer_iter())
.filter_map(|(pred_slice, pred_mask)| {
let slice = pred_slice.as_slice()?;
let hbb = f(slice)?;
let order = slice[6] as usize;
let (mh, mw) = (pred_mask.shape()[0], pred_mask.shape()[1]);
let (ih, iw) = (image_height, image_width);
let mask_f32: Vec<f32> =
pred_mask.iter().map(|&x| 1. / (1. + (-x).exp())).collect();
let mask_f32: Vec<f32> = Ops::interpolate_1d(
&mask_f32, mw as _, mh as _, iw as _, ih as _, false,
)
.ok()?;
let mask_u8: Vec<u8> = mask_f32
.into_iter()
.map(|x| if x <= 0.5 { 0 } else { 1 })
.collect();

let mut mask = Mask::new(&mask_u8, iw as _, ih as _).ok()?;

if let Some(id) = hbb.id() {
mask = mask.with_id(id);
}
if let Some(name) = hbb.name() {
mask = mask.with_name(name);
}
if let Some(confidence) = hbb.confidence() {
mask = mask.with_confidence(confidence);
}

Some((hbb, mask, order))
})
.collect();

items.sort_by(|a, b| a.2.cmp(&b.2));
let hbbs: Vec<Hbb> = items.iter().map(|(h, _, _)| h.clone()).collect();
let masks: Vec<Mask> = items.into_iter().map(|(_, m, _)| m).collect();
Some(Y::default().with_hbbs(&hbbs).with_masks(&masks))
})
.collect();

Ok(ys)
}
_ => anyhow::bail!("Unsupported version"),
}
}
Expand Down