diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a42ab5f..99542bb0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,17 @@ rewriter flushes remaining input bytes before propagating a handler error, preserving the response. Currently exposed via the Rust API only; the C API still uses the original behavior. +- Added `Settings::append_bail_out_handler()` and the matching `bail_out!` macro, + `BailOut` rewritable unit, and `BailOutHandler` / `BailOutHandlerSend` type aliases. + Bail-out handlers fire immediately before the raw flush of remaining unparsed input on a + graceful bail-out (memory or content-handler error). Handlers receive the + `RewritingError` and a `BailOut` through which they can append final bytes to the sink + via `BailOut::append(content, content_type)`. Intended for handlers that buffer state + across the document (e.g. text-buffering handlers that defer emission) and need to + flush that state on bail-out. +- Marked `RewritingError` `#[non_exhaustive]` so future error variants can be added without + a major version bump. External callers can still `match` on it, but must include a + catch-all `_ =>` arm. - Reworked `Settings`, `MemorySettings` and `RewriteStrSettings` to use a consuming-builder API. Fields are now private; construction is via `::new()` plus chained `with_*` setters and `append_*` methods for the content-handler vectors. This makes future field additions diff --git a/src/lib.rs b/src/lib.rs index e2ff3293..f7190761 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,10 +41,10 @@ mod transform_stream; use cfg_if::cfg_if; pub use self::rewriter::{ - AsciiCompatibleEncoding, CommentHandler, DoctypeHandler, DocumentContentHandlers, - ElementContentHandlers, ElementHandler, EndHandler, EndTagHandler, HandlerResult, HandlerTypes, - HtmlRewriter, LocalHandlerTypes, MemorySettings, RewriteStrSettings, Settings, TextHandler, - rewrite_str, + AsciiCompatibleEncoding, BailOutHandler, CommentHandler, DoctypeHandler, + DocumentContentHandlers, ElementContentHandlers, ElementHandler, EndHandler, EndTagHandler, + HandlerResult, HandlerTypes, HtmlRewriter, LocalHandlerTypes, MemorySettings, + RewriteStrSettings, Settings, TextHandler, rewrite_str, }; pub use self::selectors_vm::Selector; pub use self::transform_stream::OutputSink; @@ -56,9 +56,10 @@ pub use self::transform_stream::OutputSink; /// Rewriting is sequential, so there's no benefit from using the `Send`-compatible rewriter. pub mod send { pub use crate::rewriter::{ - CommentHandlerSend as CommentHandler, DoctypeHandlerSend as DoctypeHandler, - ElementHandlerSend as ElementHandler, EndHandlerSend as EndHandler, - EndTagHandlerSend as EndTagHandler, TextHandlerSend as TextHandler, + BailOutHandlerSend as BailOutHandler, CommentHandlerSend as CommentHandler, + DoctypeHandlerSend as DoctypeHandler, ElementHandlerSend as ElementHandler, + EndHandlerSend as EndHandler, EndTagHandlerSend as EndTagHandler, + TextHandlerSend as TextHandler, }; pub use crate::rewriter::{IntoHandler, SendHandlerTypes}; @@ -95,7 +96,7 @@ pub mod errors { /// HTML content descriptors that can be produced and modified by a rewriter. pub mod html_content { pub use super::rewritable_units::{ - Attribute, Comment, ContentType, Doctype, DocumentEnd, Element, EndTag, StartTag, + Attribute, BailOut, Comment, ContentType, Doctype, DocumentEnd, Element, EndTag, StartTag, StreamingHandler, StreamingHandlerSink, TextChunk, UserData, }; diff --git a/src/rewritable_units/bail_out.rs b/src/rewritable_units/bail_out.rs new file mode 100644 index 00000000..c8e07b03 --- /dev/null +++ b/src/rewritable_units/bail_out.rs @@ -0,0 +1,72 @@ +use super::{ContentType, StreamingHandlerSink}; +use crate::transform_stream::OutputSink; +use encoding_rs::Encoding; + +/// A rewritable unit that represents the moment the rewriter is about to abandon +/// processing through a graceful bail-out. +/// +/// Bail-out handlers registered via [`Settings::append_bail_out_handler()`] receive a +/// `&mut BailOut` and can emit final bytes into the output sink via [`append()`]. This +/// is the only opportunity for content other handlers have buffered (e.g. text withheld +/// pending a future chunk) to land in the response when the rewriter aborts. +/// +/// Bytes appended via this unit are written *before* the rewriter's own raw flush of +/// remaining unparsed input. The resulting sink order is: +/// +/// 1. Transformed bytes the rewriter already emitted normally. +/// 2. Bytes appended by bail-out handlers, in registration order. +/// 3. The rewriter's raw flush of the chunk's unparsed suffix. +/// +/// [`Settings::append_bail_out_handler()`]: +/// crate::Settings::append_bail_out_handler +/// [`append()`]: Self::append +pub struct BailOut<'a> { + output_sink: &'a mut dyn OutputSink, + encoding: &'static Encoding, +} + +impl<'a> BailOut<'a> { + #[inline] + #[must_use] + pub(crate) fn new(output_sink: &'a mut dyn OutputSink, encoding: &'static Encoding) -> Self { + Self { + output_sink, + encoding, + } + } + + /// Appends `content` at the bail-out point. + /// + /// Subsequent calls to this method append `content` to the previously inserted + /// content within the same bail-out invocation. When multiple bail-out handlers are + /// registered, their `append` calls are concatenated in registration order. + /// + /// `content_type` controls how the content is interpreted before being written to + /// the sink. See [`ContentType`]. + /// + /// # Example + /// + /// ``` + /// use lol_html::{bail_out, Settings}; + /// use lol_html::errors::RewritingError; + /// use lol_html::html_content::ContentType; + /// + /// // A handler that, on content-handler-error bail-out, drops a notice into the sink + /// // before the rewriter's own raw flush of remaining unparsed input. + /// let settings = Settings::new() + /// .with_graceful_bail_out_on_content_handler_error(true) + /// .append_bail_out_handler(bail_out!(|err, bail_out| { + /// if matches!(err, RewritingError::ContentHandlerError(_)) { + /// bail_out.append("", ContentType::Html); + /// } + /// })); + /// # let _ = settings; + /// ``` + #[inline] + pub fn append(&mut self, content: &str, content_type: ContentType) { + StreamingHandlerSink::new(self.encoding, &mut |c| { + self.output_sink.handle_chunk(c); + }) + .write_str(content, content_type); + } +} diff --git a/src/rewritable_units/mod.rs b/src/rewritable_units/mod.rs index c26dafb3..8c22fa35 100644 --- a/src/rewritable_units/mod.rs +++ b/src/rewritable_units/mod.rs @@ -4,6 +4,7 @@ pub(crate) use self::mutations::{Mutations, StringChunk}; pub(crate) use self::text_decoder::TextDecoder; pub(crate) use self::text_encoder::{IncompleteUtf8Resync, TextEncoder}; +pub use self::bail_out::*; pub use self::document_end::*; pub use self::element::*; pub use self::mutations::{ContentType, StreamingHandler}; @@ -83,6 +84,7 @@ macro_rules! impl_user_data { #[macro_use] mod mutations; +mod bail_out; mod document_end; mod element; mod streaming_sink; diff --git a/src/rewriter/mod.rs b/src/rewriter/mod.rs index a002a9d8..dbdf7476 100644 --- a/src/rewriter/mod.rs +++ b/src/rewriter/mod.rs @@ -69,8 +69,12 @@ impl TryFrom<&'static Encoding> for AsciiCompatibleEncoding { /// This error is unrecoverable. The rewriter instance will panic on attempt to use it after such an /// error. /// +/// This enum is marked `#[non_exhaustive]` so that future variants can be added in minor +/// releases. External `match` expressions on `RewritingError` must include a wildcard arm. +/// /// [`write`]: ../struct.HtmlRewriter.html#method.write /// [`end`]: ../struct.HtmlRewriter.html#method.end +#[non_exhaustive] #[derive(Error, Debug)] pub enum RewritingError { /// See [`MemoryLimitExceededError`]. @@ -922,9 +926,11 @@ mod tests { mod fatal_errors { use super::*; - use crate::html_content::Comment; + use crate::html_content::{Comment, ContentType}; use crate::memory::MemoryLimitExceededError; use crate::rewritable_units::{Element, TextChunk}; + use std::cell::Cell; + use std::rc::Rc; fn create_rewriter( max_allowed_memory_usage: usize, @@ -1558,6 +1564,233 @@ mod tests { ); } + // --- Bail-out handler tests --- + // + // The bail-out handler is invoked immediately before the raw flush of remaining + // unparsed input. Handlers can append final bytes to the sink via + // [`BailOut::append`] (`text_buffer`-style flushes in ROFL). + // + // The end()-path bail-out site is symmetric with the write() sites but is not + // reachable through normal input: memory errors fire during write()'s parse, and + // EOF-in-tag/attribute emits as text per HTML5 (so handlers don't fire from + // `parse(_, true)`). Tested implicitly by sharing the same code path with the + // write() sites. + + /// Verifies the hook runs and its output lands in the sink ahead of the raw flush, + /// so callers see `[transformed prefix] + [hook output] + [raw remainder]`. + #[test] + fn test_bail_out_handler_emits_before_raw_flush() { + const MAX: usize = 100; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings::new() + .with_memory_settings( + MemorySettings::new() + .with_max_allowed_memory_usage(MAX) + .with_preallocated_parsing_buffer_size(0) + .with_graceful_bail_out_on_memory_limit_exceeded(true), + ) + .append_document_content_handler(doc_comments!(|c| { + c.set_text("TRANSFORMED").unwrap(); + Ok(()) + })) + .append_bail_out_handler(bail_out!(|_err, bail_out| { + bail_out.append("HOOK", ContentType::Text); + })), + |c: &[u8]| output.extend_from_slice(c), + ); + + // chunk_1: a comment the handler transforms, plus an unfinished tag that gets + // buffered. chunk_2: trying to append this to the buffer exceeds the limit, so + // the Arena::append bail-out site fires. + let chunk_1 = format!("\"{}",", "r".repeat(50)); + + rewriter.write(chunk_1.as_bytes()).unwrap(); + let err = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + assert!(matches!(err, RewritingError::MemoryLimitExceeded(_))); + + let output_str = std::str::from_utf8(&output).unwrap(); + let transformed_idx = output_str + .find("") + .expect("transformed comment must be present"); + let hook_idx = output_str + .find("HOOK") + .expect("hook output must be present"); + let raw_idx = output_str + .find("\"")firstmiddle"; + let mut output = Vec::::new(); + let hook_called = Rc::new(Cell::new(false)); + let hook_called_clone = Rc::clone(&hook_called); + + let mut rewriter = HtmlRewriter::new( + Settings::new() + .with_graceful_bail_out_on_content_handler_error(true) + .append_element_content_handler(element!("stop", |_| Err( + "handler refused".into() + ))) + .append_bail_out_handler(bail_out!(move |err, bail_out| { + assert!( + matches!(err, RewritingError::ContentHandlerError(_)), + "expected ContentHandlerError in hook, got {err}", + ); + hook_called_clone.set(true); + bail_out.append("HOOK", ContentType::Text); + })), + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter.write(html).unwrap_err(); + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + assert!(hook_called.get(), "bail-out hook must have been called"); + + let output_str = std::str::from_utf8(&output).unwrap(); + assert!( + output_str.contains("HOOK"), + "hook output must appear in sink, got {output_str:?}", + ); + } + + /// Multiple bail-out handlers fire in registration order. The sink receives their + /// appended bytes in the same order. + #[test] + fn test_multiple_bail_out_handlers_fire_in_order() { + const MAX: usize = 100; + + let mut output = Vec::::new(); + let call_order = Rc::new(Cell::new(String::new())); + let order_a = Rc::clone(&call_order); + let order_b = Rc::clone(&call_order); + let order_c = Rc::clone(&call_order); + + let mut rewriter = HtmlRewriter::new( + Settings::new() + .with_memory_settings( + MemorySettings::new() + .with_max_allowed_memory_usage(MAX) + .with_preallocated_parsing_buffer_size(0) + .with_graceful_bail_out_on_memory_limit_exceeded(true), + ) + // Element handler forces lex mode (default tag-scanner mode would + // consume unterminated attributes as text without buffering). + .append_element_content_handler(element!("*", |_| Ok(()))) + .append_bail_out_handler(bail_out!(move |_err, b| { + let mut s = order_a.take(); + s.push('A'); + order_a.set(s); + b.append("A", ContentType::Text); + })) + .append_bail_out_handler(bail_out!(move |_err, b| { + let mut s = order_b.take(); + s.push('B'); + order_b.set(s); + b.append("B", ContentType::Text); + })) + .append_bail_out_handler(bail_out!(move |_err, b| { + let mut s = order_c.take(); + s.push('C'); + order_c.set(s); + b.append("C", ContentType::Text); + })), + |c: &[u8]| output.extend_from_slice(c), + ); + + let chunk_1 = format!("\"{}",", "r".repeat(MAX / 2)); + rewriter.write(chunk_1.as_bytes()).unwrap(); + let _ = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + assert_eq!( + call_order.take(), + "ABC", + "handlers must fire in registration order" + ); + + let output_str = std::str::from_utf8(&output).unwrap(); + let a_idx = output_str.find('A').expect("A in sink"); + let b_idx = output_str.find('B').expect("B in sink"); + let c_idx = output_str.find('C').expect("C in sink"); + + assert!( + a_idx < b_idx && b_idx < c_idx, + "appended bytes must appear in registration order, got {output_str:?}", + ); + } + + /// On normal completion (no error), the bail-out hook is never invoked. + #[test] + fn test_bail_out_handler_not_invoked_on_normal_completion() { + let hook_called = Rc::new(Cell::new(false)); + let hook_called_clone = Rc::clone(&hook_called); + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings::new().append_bail_out_handler(bail_out!(move |_err, _b| { + hook_called_clone.set(true); + })), + |c: &[u8]| output.extend_from_slice(c), + ); + + rewriter.write(b"

hello

").unwrap(); + rewriter.end().unwrap(); + + assert!( + !hook_called.get(), + "bail-out hook must not fire on normal completion", + ); + } + + /// When the graceful flag is off, an error still propagates but the bail-out hook + /// is not invoked. The hook is gated by `should_bail_out_for`, just like the raw + /// flush is. + #[test] + fn test_bail_out_handler_not_invoked_when_graceful_flag_disabled() { + let hook_called = Rc::new(Cell::new(false)); + let hook_called_clone = Rc::clone(&hook_called); + + let mut output = Vec::::new(); + // No `with_graceful_bail_out_on_content_handler_error(true)` — flag stays off. + let mut rewriter = HtmlRewriter::new( + Settings::new() + .append_element_content_handler(element!("stop", |_| Err( + "handler refused".into() + ))) + .append_bail_out_handler(bail_out!(move |_err, _b| { + hook_called_clone.set(true); + })), + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter + .write(b"firstmiddle") + .unwrap_err(); + + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + assert!( + !hook_called.get(), + "bail-out hook must not fire when graceful flag is off", + ); + } + #[test] fn content_handler_error_propagation() { fn assert_err<'h>( diff --git a/src/rewriter/rewrite_controller.rs b/src/rewriter/rewrite_controller.rs index b5cf9351..00095e0a 100644 --- a/src/rewriter/rewrite_controller.rs +++ b/src/rewriter/rewrite_controller.rs @@ -4,7 +4,7 @@ use crate::base::SharedEncoding; use crate::html::{LocalName, Namespace}; use crate::memory::SharedMemoryLimiter; use crate::parser::ActionError; -use crate::rewritable_units::{DocumentEnd, Token, TokenCaptureFlags}; +use crate::rewritable_units::{BailOut, DocumentEnd, Token, TokenCaptureFlags}; use crate::selectors_vm::{ Ast, AuxStartTagInfoRequest, DenseHashSet, ElementData, SelectorMatchingVm, VmError, }; @@ -35,6 +35,7 @@ impl ElementData for ElementDescriptor { pub(crate) struct HtmlRewriteController<'h, H: HandlerTypes> { handlers_dispatcher: ContentHandlersDispatcher<'h, H>, selector_matching_vm: Option>, + bail_out_handlers: Vec>, } impl<'h, H: HandlerTypes> HtmlRewriteController<'h, H> { @@ -83,17 +84,19 @@ impl<'h, H: HandlerTypes> HtmlRewriteController<'h, H> { None }; - Self::new(dispatcher, selector_matching_vm) + Self::new(dispatcher, selector_matching_vm, settings.bail_out_handlers) } #[inline] pub(crate) const fn new( handlers_dispatcher: ContentHandlersDispatcher<'h, H>, selector_matching_vm: Option>, + bail_out_handlers: Vec>, ) -> Self { HtmlRewriteController { handlers_dispatcher, selector_matching_vm, + bail_out_handlers, } } } @@ -188,4 +191,10 @@ impl TransformController for HtmlRewriteController<'_, H> { .handlers_dispatcher .has_matched_elements_with_removed_content() } + + fn handle_bail_out(&mut self, error: &RewritingError, bail_out: &mut BailOut<'_>) { + for handler in &mut self.bail_out_handlers { + handler(error, bail_out); + } + } } diff --git a/src/rewriter/settings.rs b/src/rewriter/settings.rs index eb64da4e..63ff8d95 100644 --- a/src/rewriter/settings.rs +++ b/src/rewriter/settings.rs @@ -1,7 +1,7 @@ -use crate::rewritable_units::{Comment, Doctype, DocumentEnd, Element, EndTag, TextChunk}; +use crate::rewritable_units::{BailOut, Comment, Doctype, DocumentEnd, Element, EndTag, TextChunk}; use crate::selectors_vm::Selector; // N.B. `use crate::` will break this because the constructor is not public, only the struct itself -use super::AsciiCompatibleEncoding; +use super::{AsciiCompatibleEncoding, RewritingError}; use std::borrow::Cow; use std::error::Error; @@ -35,6 +35,10 @@ pub trait HandlerTypes: Sized { type EndTagHandler<'handler>: FnOnce(&mut EndTag<'_>) -> HandlerResult + 'handler; /// Handler type for [`DocumentEnd`]. type EndHandler<'handler>: FnOnce(&mut DocumentEnd<'_>) -> HandlerResult + 'handler; + /// Handler type for [`BailOut`]: invoked when the rewriter triggers a graceful bail-out. + /// + /// See [`Settings::append_bail_out_handler()`] for details. + type BailOutHandler<'handler>: FnMut(&RewritingError, &mut BailOut<'_>) + 'handler; // Inside the HTML rewriter we need to create handlers, and they need to be the most constrained // possible version of a handler (i.e. if we have `Send` and non-`Send` handlers we need to @@ -71,6 +75,7 @@ impl HandlerTypes for LocalHandlerTypes { type ElementHandler<'h> = ElementHandler<'h>; type EndTagHandler<'h> = EndTagHandler<'h>; type EndHandler<'h> = EndHandler<'h>; + type BailOutHandler<'h> = BailOutHandler<'h>; fn new_end_tag_handler<'h>( handler: impl IntoHandler>, @@ -106,6 +111,7 @@ impl HandlerTypes for SendHandlerTypes { type ElementHandler<'h> = ElementHandlerSend<'h, Self>; type EndTagHandler<'h> = EndTagHandlerSend<'h>; type EndHandler<'h> = EndHandlerSend<'h>; + type BailOutHandler<'h> = BailOutHandlerSend<'h>; fn new_end_tag_handler<'h>( handler: impl IntoHandler>, @@ -148,6 +154,11 @@ pub type ElementHandler<'h, H = LocalHandlerTypes> = pub type EndTagHandler<'h> = Box) -> HandlerResult + 'h>; /// Boxed closure for handling the document end. This is called after the last chunk is processed. pub type EndHandler<'h> = Box) -> HandlerResult + 'h>; +/// Boxed closure for handling a graceful bail-out. Called once if the rewriter triggers a +/// bail-out before propagating the [`RewritingError`]. +/// +/// See [`Settings::append_bail_out_handler()`]. +pub type BailOutHandler<'h> = Box) + 'h>; /// [Sendable](crate::send) boxed closure for handling the [document type declaration]. /// @@ -174,6 +185,10 @@ pub type EndTagHandlerSend<'h> = Box) -> HandlerResul /// /// See also non-sendable [`EndHandler`](crate::EndHandler). pub type EndHandlerSend<'h> = Box) -> HandlerResult + Send + 'h>; +/// [Sendable](crate::send) boxed closure for handling a graceful bail-out. +/// +/// See also non-sendable [`BailOutHandler`](crate::BailOutHandler). +pub type BailOutHandlerSend<'h> = Box) + Send + 'h>; /// Trait that allows closures to be used as handlers #[diagnostic::on_unimplemented( @@ -271,6 +286,20 @@ impl<'h, F: FnOnce(&mut DocumentEnd<'_>) -> HandlerResult + Send + 'h> } } +impl<'h, F: FnMut(&RewritingError, &mut BailOut<'_>) + 'h> IntoHandler> for F { + fn into_handler(self) -> BailOutHandler<'h> { + Box::new(self) + } +} + +impl<'h, F: FnMut(&RewritingError, &mut BailOut<'_>) + Send + 'h> + IntoHandler> for F +{ + fn into_handler(self) -> BailOutHandlerSend<'h> { + Box::new(self) + } +} + /// Specifies element content handlers associated with a selector. pub struct ElementContentHandlers<'h, H: HandlerTypes = LocalHandlerTypes> { /// Element handler. See [`element!`](crate::element) and [`HandlerTypes::ElementHandler`]. @@ -746,6 +775,49 @@ macro_rules! end { }}; } +/// A convenience macro to construct a [bail-out handler](Settings::append_bail_out_handler) for +/// the graceful bail-out path. +/// +/// The handler receives a [`&RewritingError`](crate::errors::RewritingError) and a +/// `&mut `[`BailOut`](crate::html_content::BailOut) through which it can append final bytes +/// to the sink before the rewriter's own raw flush. +/// +/// # Example +/// ``` +/// use lol_html::{bail_out, rewrite_str, RewriteStrSettings}; +/// use lol_html::errors::RewritingError; +/// use lol_html::html_content::ContentType; +/// +/// let result = rewrite_str( +/// r#"foo"#, +/// RewriteStrSettings::new() +/// .append_bail_out_handler(bail_out!(|err, bail_out| { +/// if matches!(err, RewritingError::ContentHandlerError(_)) { +/// bail_out.append("", ContentType::Html); +/// } +/// })), +/// ) +/// .unwrap(); +/// +/// // No bail-out happened, so the handler never fired. +/// assert_eq!(result, "foo"); +/// ``` +#[macro_export(local_inner_macros)] +macro_rules! bail_out { + ($handler:expr) => {{ + // Without this rust won't be able to always infer the type of the handler. + #[inline(always)] + const fn type_hint(h: T) -> T + where + T: FnMut(&$crate::errors::RewritingError, &mut $crate::html_content::BailOut<'_>), + { + h + } + + type_hint($handler) + }}; +} + /// Specifies the memory settings for [`HtmlRewriter`]. /// /// Construct with [`MemorySettings::new()`] (or [`MemorySettings::default()`]) and configure the @@ -901,6 +973,7 @@ pub struct Settings<'handlers, 'selectors, H: HandlerTypes = LocalHandlerTypes> ElementContentHandlers<'handlers, H>, )>, pub(crate) document_content_handlers: Vec>, + pub(crate) bail_out_handlers: Vec>, pub(crate) encoding: AsciiCompatibleEncoding, pub(crate) memory_settings: MemorySettings, pub(crate) strict: bool, @@ -942,6 +1015,7 @@ impl<'handlers, 'selectors, H: HandlerTypes> Settings<'handlers, 'selectors, H> Settings { element_content_handlers: vec![], document_content_handlers: vec![], + bail_out_handlers: vec![], encoding: AsciiCompatibleEncoding(encoding_rs::UTF_8), memory_settings: MemorySettings::new(), strict: true, @@ -1014,6 +1088,55 @@ impl<'handlers, 'selectors, H: HandlerTypes> Settings<'handlers, 'selectors, H> self } + /// Appends a handler to be invoked when the rewriter triggers a graceful bail-out. + /// + /// Bail-out handlers fire when the rewriter is about to abort processing and propagate a + /// [`RewritingError`] through a graceful bail-out (i.e. when one of the + /// `graceful_bail_out_on_*` settings is enabled and the corresponding error fires). Each + /// handler receives the error and a [`BailOut`] through which it can append final bytes to + /// the sink via [`BailOut::append()`]. + /// + /// Handlers fire in registration order, *before* the rewriter's own raw flush of remaining + /// unparsed input. The resulting sink order is: + /// + /// 1. Transformed bytes the rewriter already emitted normally. + /// 2. Bytes appended by bail-out handlers, in registration order. + /// 3. The rewriter's raw flush of the chunk's unparsed suffix. + /// + /// Handlers do not return errors. Any cleanup they cannot complete must be silently + /// abandoned. + /// + /// ### Hint + /// + /// The [`bail_out!`] convenience macro returns a value of the expected type, so it can be + /// passed directly: + /// + /// ``` + /// use lol_html::{bail_out, Settings}; + /// use lol_html::errors::RewritingError; + /// use lol_html::html_content::ContentType; + /// + /// let settings = Settings::new() + /// .with_graceful_bail_out_on_content_handler_error(true) + /// .append_bail_out_handler(bail_out!(|err, bail_out| { + /// if matches!(err, RewritingError::ContentHandlerError(_)) { + /// bail_out.append("", ContentType::Html); + /// } + /// })); + /// # let _ = settings; + /// ``` + /// + /// [`bail_out!`]: macro.bail_out.html + #[inline] + #[must_use] + pub fn append_bail_out_handler( + mut self, + handler: impl IntoHandler>, + ) -> Self { + self.bail_out_handlers.push(handler.into_handler()); + self + } + /// Sets the [character encoding] for the input and the output of the rewriter. /// /// Can be a [label] for any of the web-compatible encodings with an exception for `UTF-16LE`, @@ -1187,6 +1310,7 @@ impl<'h, 's, H: HandlerTypes> From> for Settings<' Settings { element_content_handlers: settings.element_content_handlers, document_content_handlers: settings.document_content_handlers, + bail_out_handlers: settings.bail_out_handlers, strict: settings.strict, enable_esi_tags: settings.enable_esi_tags, ..Settings::new_for_handler_types() @@ -1223,6 +1347,7 @@ pub struct RewriteStrSettings<'handlers, 'selectors, H: HandlerTypes = LocalHand ElementContentHandlers<'handlers, H>, )>, pub(crate) document_content_handlers: Vec>, + pub(crate) bail_out_handlers: Vec>, pub(crate) strict: bool, pub(crate) enable_esi_tags: bool, } @@ -1260,6 +1385,7 @@ impl<'handlers, 'selectors, H: HandlerTypes> RewriteStrSettings<'handlers, 'sele RewriteStrSettings { element_content_handlers: vec![], document_content_handlers: vec![], + bail_out_handlers: vec![], strict: true, enable_esi_tags: true, } @@ -1326,6 +1452,19 @@ impl<'handlers, 'selectors, H: HandlerTypes> RewriteStrSettings<'handlers, 'sele self } + /// Appends a handler to be invoked when the rewriter triggers a graceful bail-out. + /// + /// See [`Settings::append_bail_out_handler()`] for full semantics. Same shape. + #[inline] + #[must_use] + pub fn append_bail_out_handler( + mut self, + handler: impl IntoHandler>, + ) -> Self { + self.bail_out_handlers.push(handler.into_handler()); + self + } + /// If set to `true` the rewriter bails out if it encounters markup that drives the HTML parser /// into ambiguous state. /// diff --git a/src/transform_stream/dispatcher.rs b/src/transform_stream/dispatcher.rs index ca021631..c8b11ddd 100644 --- a/src/transform_stream/dispatcher.rs +++ b/src/transform_stream/dispatcher.rs @@ -9,7 +9,7 @@ use crate::parser::{ }; use crate::rewritable_units::TextDecoder; use crate::rewritable_units::ToTokenResult; -use crate::rewritable_units::{DocumentEnd, Serialize, ToToken, Token, TokenCaptureFlags}; +use crate::rewritable_units::{BailOut, DocumentEnd, Serialize, ToToken, Token, TokenCaptureFlags}; use crate::rewriter::RewritingError; use encoding_rs::Encoding; @@ -44,6 +44,11 @@ pub trait TransformController: Sized { fn handle_token(&mut self, token: &mut Token<'_>) -> Result<(), RewritingError>; fn handle_end(&mut self, document_end: &mut DocumentEnd<'_>) -> Result<(), RewritingError>; fn should_emit_content(&self) -> bool; + + /// Invoked when the rewriter triggers a graceful bail-out. Default impl does nothing; + /// the production `HtmlRewriteController` overrides this to run the user-registered + /// bail-out handlers. + fn handle_bail_out(&mut self, _error: &RewritingError, _bail_out: &mut BailOut<'_>) {} } /// Defines an interface for the [`HtmlRewriter`]'s output. @@ -416,6 +421,19 @@ where self.delegate.remaining_content_start = 0; } + /// Invokes the transform controller's bail-out handlers (in registration order), + /// constructing a [`BailOut`] wrapper around the output sink and the current encoding. + /// Must be called *before* [`flush_for_bail_out()`] so that handler emissions land in + /// the sink ahead of the raw flush of remaining unparsed input. + /// + /// [`flush_for_bail_out()`]: Self::flush_for_bail_out + pub fn run_bail_out_handlers(&mut self, error: &RewritingError) { + let mut bail_out = BailOut::new(&mut self.delegate.output_sink, self.encoding.get()); + self.delegate + .transform_controller + .handle_bail_out(error, &mut bail_out); + } + pub fn finish(&mut self, input: &[u8]) -> Result<(), RewritingError> { self.delegate.finish(self.encoding.get(), input) } diff --git a/src/transform_stream/mod.rs b/src/transform_stream/mod.rs index 9d9fe932..1d4fa08a 100644 --- a/src/transform_stream/mod.rs +++ b/src/transform_stream/mod.rs @@ -107,13 +107,16 @@ where // previous calls. Neither chunk has been emitted to the sink yet, so on a // graceful bail-out we flush both as-is and let the caller continue the // response from where they were. - if self.graceful_bail_out_on_memory_limit_exceeded { + let err = RewritingError::MemoryLimitExceeded(e); + + if self.should_bail_out_for(&err) { let dispatcher = self.parser.get_dispatcher(); + dispatcher.run_bail_out_handlers(&err); dispatcher.flush_for_bail_out(self.buffer.bytes()); dispatcher.flush_for_bail_out(data); } - return Err(RewritingError::MemoryLimitExceeded(e)); + return Err(err); } } } else { @@ -131,7 +134,9 @@ where // between `emit_chunk_before_lexeme()` and `consume_lexeme()`). Flushing from // there preserves all bytes the caller fed us. if self.should_bail_out_for(&e) { - self.parser.get_dispatcher().flush_for_bail_out(chunk); + let dispatcher = self.parser.get_dispatcher(); + dispatcher.run_bail_out_handlers(&e); + dispatcher.flush_for_bail_out(chunk); } return Err(e); @@ -150,11 +155,15 @@ where // Parsing succeeded but we can't buffer the leftover bytes for the next // call. On a graceful bail-out we flush the leftover raw so the response // stays whole. - if self.graceful_bail_out_on_memory_limit_exceeded { - self.parser.get_dispatcher().flush_for_bail_out(unconsumed); + let err = RewritingError::MemoryLimitExceeded(e); + + if self.should_bail_out_for(&err) { + let dispatcher = self.parser.get_dispatcher(); + dispatcher.run_bail_out_handlers(&err); + dispatcher.flush_for_bail_out(unconsumed); } - return Err(RewritingError::MemoryLimitExceeded(e)); + return Err(err); } self.has_buffered_data = true; @@ -183,7 +192,9 @@ where // Same reasoning as in `write()`: if we can bail out gracefully, make sure the sink // has all the input bytes before propagating the error. if self.should_bail_out_for(&e) { - self.parser.get_dispatcher().flush_for_bail_out(chunk); + let dispatcher = self.parser.get_dispatcher(); + dispatcher.run_bail_out_handlers(&e); + dispatcher.flush_for_bail_out(chunk); } return Err(e);