diff --git a/CHANGELOG.md b/CHANGELOG.md index 520ea394..59e45c1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## v3.0.0 + +- Added `MemorySettings::graceful_bail_out_on_memory_limit_exceeded`: when set, the rewriter + flushes every input byte it has received but not yet emitted to the sink (as-is) before + returning `MemoryLimitExceededError`, so callers can continue the response by writing + subsequent bytes directly to their downstream sink instead of breaking it. +- Added `Settings::graceful_bail_out_on_content_handler_error`: symmetric to the memory flag + above, but for `RewritingError::ContentHandlerError`. When set, the rewriter flushes + remaining input bytes before propagating a handler error, preserving the response. + Currently exposed via the Rust API only; the C API still uses the original behavior. +- Adding new fields to `MemorySettings` and `Settings` is a SemVer-breaking change for + existing struct-literal construction, hence the major version bump. + ## v2.9.0 - Added `OutputSink::set_encoding` diff --git a/Cargo.toml b/Cargo.toml index 98622fc3..489b810a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lol_html" -version = "2.9.0" +version = "3.0.0" authors = ["Ivan Nikulin "] license = "BSD-3-Clause" description = "Streaming HTML rewriter/parser with CSS selector-based API" diff --git a/c-api/Cargo.lock b/c-api/Cargo.lock index af4d3cab..b2935794 100644 --- a/c-api/Cargo.lock +++ b/c-api/Cargo.lock @@ -137,7 +137,7 @@ checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "lol_html" -version = "2.9.0" +version = "3.0.0" dependencies = [ "bitflags", "cfg-if", diff --git a/c-api/include/lol_html.h b/c-api/include/lol_html.h index baa760d6..e36440cc 100644 --- a/c-api/include/lol_html.h +++ b/c-api/include/lol_html.h @@ -255,6 +255,27 @@ typedef struct { // `lol_html_rewriter_write` and `lol_html_rewriter_end` will return an error // if this limit is exceeded. size_t max_allowed_memory_usage; + // Controls how the rewriter recovers when `max_allowed_memory_usage` is + // exceeded. + // + // When `false` (the default), the rewriter aborts processing the response, + // returns an error, and leaves the output sink in a potentially inconsistent + // state (i.e. the sink will have received the transformed bytes the rewriter + // had already produced, but the remaining input bytes are lost). This + // typically results in a truncated, broken response. + // + // When `true`, before returning the error the rewriter flushes every input + // byte it has received but not yet emitted to the sink, *as-is* (i.e. + // without any transformation). The caller can then continue the response by + // writing any subsequent input bytes directly to its own downstream sink, + // bypassing the (now poisoned) rewriter. The resulting response will have + // the rewriter's transformations applied up to some boundary, followed by + // the original bytes after that boundary, but the response will not be + // broken. + // + // The rewriter is still poisoned after the error and must not be used again, + // regardless of this setting. + bool graceful_bail_out_on_memory_limit_exceeded; } lol_html_memory_settings_t; // Builds HTML-rewriter out of the provided builder. Can be called diff --git a/c-api/src/rewriter.rs b/c-api/src/rewriter.rs index 402a5ea7..9fdd8dc9 100644 --- a/c-api/src/rewriter.rs +++ b/c-api/src/rewriter.rs @@ -61,6 +61,10 @@ fn lol_html_rewriter_build_inner( strict, enable_esi_tags, adjust_charset_on_meta_tag: false, + // TODO: expose `graceful_bail_out_on_content_handler_error` through the C API. Adding + // a new parameter to `lol_html_rewriter_build()` is a breaking ABI change, so it + // belongs behind a new function variant or a settings struct. + graceful_bail_out_on_content_handler_error: false, }; let output_sink = ExternOutputSink::new(output_sink, output_sink_user_data); diff --git a/fuzz/test_case/src/lib.rs b/fuzz/test_case/src/lib.rs index 807f9586..66b269b8 100644 --- a/fuzz/test_case/src/lib.rs +++ b/fuzz/test_case/src/lib.rs @@ -176,6 +176,7 @@ fn run_rewriter_iter(data: &[u8], selector: &str, encoding: &'static Encoding) { memory_settings: MemorySettings::new(), strict: false, adjust_charset_on_meta_tag: false, + graceful_bail_out_on_content_handler_error: false, }, |_: &[u8]| {}, ); @@ -199,6 +200,7 @@ fn run_c_api_rewriter_iter(data: &[u8], encoding: &str) { lol_html_memory_settings_t { preallocated_parsing_buffer_size: 0, max_allowed_memory_usage: usize::MAX, + graceful_bail_out_on_memory_limit_exceeded: false, }, Some(empty_handler), output_data_ptr, diff --git a/js-api/Cargo.lock b/js-api/Cargo.lock index bf9ab83a..de0b9932 100644 --- a/js-api/Cargo.lock +++ b/js-api/Cargo.lock @@ -173,7 +173,7 @@ checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "lol-html" -version = "2.9.0" +version = "3.0.0" dependencies = [ "encoding_rs", "js-sys", @@ -186,7 +186,7 @@ dependencies = [ [[package]] name = "lol_html" -version = "2.9.0" +version = "3.0.0" dependencies = [ "bitflags", "cfg-if", diff --git a/js-api/Cargo.toml b/js-api/Cargo.toml index d74365fa..b4098110 100644 --- a/js-api/Cargo.toml +++ b/js-api/Cargo.toml @@ -2,7 +2,7 @@ name = "lol-html" description = "Streaming HTML rewriter/parser with CSS selector-based API" license = "BSD-3-Clause" -version = "2.9.0" +version = "3.0.0" authors = ["Ivan Nikulin ", "Gus Caplan "] repository = "https://github.com/cloudflare/lol-html" edition = "2024" @@ -14,7 +14,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] js-sys = "0.3.97" -lol_html_native = { package = "lol_html", path = "../", version = "2.9.0" } +lol_html_native = { package = "lol_html", path = "../", version = "3.0.0" } serde = { version = "1.0.228", features = ["derive"] } encoding_rs = "0.8.35" serde-wasm-bindgen = "0.6.5" diff --git a/src/rewriter/mod.rs b/src/rewriter/mod.rs index 1de85bfc..38934fd1 100644 --- a/src/rewriter/mod.rs +++ b/src/rewriter/mod.rs @@ -164,6 +164,11 @@ impl<'h, O: OutputSink, H: HandlerTypes> HtmlRewriter<'h, O, H> { pub fn new<'s>(settings: Settings<'h, 's, H>, output_sink: O) -> Self { let preallocated_parsing_buffer_size = settings.memory_settings.preallocated_parsing_buffer_size; + let graceful_bail_out_on_memory_limit_exceeded = settings + .memory_settings + .graceful_bail_out_on_memory_limit_exceeded; + let graceful_bail_out_on_content_handler_error = + settings.graceful_bail_out_on_content_handler_error; let strict = settings.strict; let encoding = settings.encoding; @@ -184,6 +189,8 @@ impl<'h, O: OutputSink, H: HandlerTypes> HtmlRewriter<'h, O, H> { encoding, next_encoding, strict, + graceful_bail_out_on_memory_limit_exceeded, + graceful_bail_out_on_content_handler_error, }); HtmlRewriter { @@ -976,6 +983,7 @@ mod tests { memory_settings: MemorySettings { max_allowed_memory_usage, preallocated_parsing_buffer_size: 0, + ..MemorySettings::new() }, ..Settings::new() }, @@ -1016,6 +1024,616 @@ mod tests { rewriter.end().unwrap_err(); } + fn create_rewriter_with_graceful_bail_out( + max_allowed_memory_usage: usize, + output_sink: O, + ) -> HtmlRewriter<'static, O> { + HtmlRewriter::new( + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: MemorySettings { + max_allowed_memory_usage, + preallocated_parsing_buffer_size: 0, + graceful_bail_out_on_memory_limit_exceeded: true, + }, + ..Settings::new() + }, + output_sink, + ) + } + + /// Exercises the bail-out path inside `Arena::append()`: with two chunks where the open + /// tag spans both, the parser can't consume chunk 1, so it gets buffered. Chunk 2's + /// append then exceeds the memory limit. The graceful bail-out should flush both chunks + /// to the sink as-is, so the caller can continue the response. + #[test] + fn test_graceful_bail_out_in_buffer_append() { + const MAX: usize = 100; + + let mut output = Vec::::new(); + let mut rewriter = create_rewriter_with_graceful_bail_out(MAX, |c: &[u8]| { + output.extend_from_slice(c); + }); + + let chunk_1 = format!("\"{}",", "r".repeat(MAX / 2)); + + rewriter.write(chunk_1.as_bytes()).unwrap(); + + let err = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + match err { + RewritingError::MemoryLimitExceeded(e) => assert_eq!(e, MemoryLimitExceededError), + _ => panic!("{}", err), + } + + let expected: Vec = [chunk_1.as_bytes(), chunk_2.as_bytes()].concat(); + + assert_eq!(output, expected); + } + + /// Exercises the bail-out path inside `Arena::init_with()`: with no buffered data, the + /// parser can't consume a chunk that ends with an unfinished tag *name* (the tag + /// scanner keeps `tag_start` set, so everything from there onwards is unconsumed). The + /// unconsumed bytes are bigger than the limit, so `init_with` fails. The graceful + /// bail-out should flush the entire chunk to the sink as-is. + #[test] + fn test_graceful_bail_out_in_buffer_init_with() { + const MAX: usize = 1; + + let mut output = Vec::::new(); + // No element handlers, so we avoid allocating the selectors VM stack which would + // fail first with such a tight limit. + let mut rewriter = HtmlRewriter::new( + Settings { + memory_settings: MemorySettings { + max_allowed_memory_usage: MAX, + preallocated_parsing_buffer_size: 0, + graceful_bail_out_on_memory_limit_exceeded: true, + }, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + // Unfinished tag name: the scanner can't call `finish_tag_name()` (no space or + // `>`), so `tag_start` stays set and the whole chunk becomes unconsumed. + let chunk = b" assert_eq!(e, MemoryLimitExceededError), + _ => panic!("{}", err), + } + + assert_eq!(output, chunk); + } + + /// Exercises the bail-out path inside `Parser::parse()`: the selectors VM stack push + /// exceeds the memory limit while processing the very first start tag, so the parser + /// returns an error mid-chunk. The graceful bail-out flushes everything from + /// `remaining_content_start` onwards, which (since no lexeme has been consumed yet) + /// covers the whole chunk. + #[test] + fn test_graceful_bail_out_in_parser() { + // Too small for even the initial selectors VM stack allocation. + const MAX: usize = 16; + + let mut output = Vec::::new(); + let mut rewriter = create_rewriter_with_graceful_bail_out(MAX, |c: &[u8]| { + output.extend_from_slice(c); + }); + + let chunk = b"
foo
"; + + let err = rewriter.write(chunk).unwrap_err(); + + match err { + RewritingError::MemoryLimitExceeded(e) => assert_eq!(e, MemoryLimitExceededError), + _ => panic!("{}", err), + } + + assert_eq!(output, chunk); + } + + /// Verifies that transformations applied to tokens processed before the failure point + /// are preserved, and that the rest of the input is flushed as-is. This mirrors the + /// contract the caller relies on: the sink contains the transformed prefix and the raw + /// suffix, and feeding the next chunk of the original response continues it correctly. + /// + /// Uses a document-level comment handler (no selectors VM, so we don't burn memory on + /// the VM stack) and the buffer-append bail-out path: chunk 1 contains a complete + /// comment that the handler transforms, plus the start of an unfinished tag that gets + /// buffered; chunk 2 then overflows the buffer. + #[test] + fn test_graceful_bail_out_preserves_prefix_transformations() { + const MAX: usize = 100; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + document_content_handlers: vec![doc_comments!(|c| { + let text = c.text(); + c.set_text(&format!("REWRITTEN-{text}")).unwrap(); + Ok(()) + })], + memory_settings: MemorySettings { + max_allowed_memory_usage: MAX, + preallocated_parsing_buffer_size: 0, + graceful_bail_out_on_memory_limit_exceeded: true, + }, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + // chunk_1: a complete comment that the handler will transform, followed by an + // unfinished tag whose remaining bytes get buffered for the next write. + let chunk_1 = format!("\"{}",", "r".repeat(50)); + + rewriter.write(chunk_1.as_bytes()).unwrap(); + + let err = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + assert!( + matches!(err, RewritingError::MemoryLimitExceeded(_)), + "expected MemoryLimitExceeded, got {err}", + ); + + let output_str = std::str::from_utf8(&output).unwrap(); + + // The comment must have been transformed (proves the prefix kept its handler + // changes). + assert!( + output_str.contains(""), + "expected transformed comment, got {output_str:?}", + ); + + // The unfinished tag's bytes must be present raw (proves the bail-out flushed the + // buffered + new bytes the caller had handed in). + assert!( + output_str.contains("\""),"), + "expected raw closing of the tag at the end, got {output_str:?}", + ); + + // Sanity check: the bytes from the original input (minus what the handler + // transformed) are all present. The transformed comment grew by some bytes; the + // rest is byte-for-byte. + let original_input_minus_comment = + format!("\"{}{}\"", "l".repeat(50), "r".repeat(50),); + + assert!( + output_str.contains(&original_input_minus_comment), + "expected original (raw) suffix in output, got {output_str:?}", + ); + } + + /// Sanity check: without the opt-in flag, the existing behavior is preserved (the + /// sink does NOT receive the unprocessed bytes after a memory error). + #[test] + fn test_no_graceful_bail_out_by_default() { + const MAX: usize = 100; + + let mut output = Vec::::new(); + // `create_rewriter` uses default MemorySettings: graceful bail-out is off. + let mut rewriter = create_rewriter(MAX, |c: &[u8]| output.extend_from_slice(c)); + + let chunk_1 = format!("\"{}",", "r".repeat(MAX / 2)); + + rewriter.write(chunk_1.as_bytes()).unwrap(); + let err = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + assert!(matches!(err, RewritingError::MemoryLimitExceeded(_))); + // Sink received nothing: chunk_1 was buffered (never emitted), chunk_2 couldn't be + // appended, and we didn't bail out gracefully. + assert!( + output.is_empty(), + "without graceful bail-out the sink should be empty, got {output:?}", + ); + } + + // --- Response reconstruction tests --- + // + // Each test below verifies that, after a `MemoryLimitExceeded` bail-out, the caller + // can reconstruct the complete response by concatenating: + // + // sink_output + unfed_remaining_bytes == original_html + // + // The handlers used are no-ops, so serialized tokens are byte-for-byte identical to + // the original input, and the assertion is an exact byte comparison. + // + // Note: CDATA sections (``) are not tested here because CDATA content + // is emitted incrementally by the lexer (it only needs to buffer the partial `]]>` + // closing marker, not the whole section), so it doesn't cause Arena growth. + + fn bail_out_settings(max_memory: usize) -> MemorySettings { + MemorySettings { + max_allowed_memory_usage: max_memory, + preallocated_parsing_buffer_size: 0, + graceful_bail_out_on_memory_limit_exceeded: true, + } + } + + /// Feeds `html` to a graceful-bail-out rewriter in `chunk_size`-byte pieces. When + /// `MemoryLimitExceeded` fires (during `write()` or `end()`), the remaining unfed + /// bytes are appended verbatim to the sink output, simulating what a caller would do: + /// stop using the poisoned rewriter and pipe the rest of the response directly. + /// + /// Panics if no `MemoryLimitExceeded` error fires (test misconfiguration). + fn reconstruct_response_on_oom( + html: &[u8], + chunk_size: usize, + settings: Settings<'_, '_>, + ) -> Vec { + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new(settings, |c: &[u8]| output.extend_from_slice(c)); + + let mut fed_bytes = 0; + let mut hit_limit = false; + + for chunk in html.chunks(chunk_size) { + match rewriter.write(chunk) { + Ok(()) => fed_bytes += chunk.len(), + Err(RewritingError::MemoryLimitExceeded(_)) => { + fed_bytes += chunk.len(); + hit_limit = true; + break; + } + Err(e) => panic!("unexpected error: {e}"), + } + } + + if !hit_limit { + // All writes succeeded; try `end()` which may trigger the error during final + // buffer processing. + if let Err(e) = rewriter.end() { + match e { + RewritingError::MemoryLimitExceeded(_) => hit_limit = true, + e => panic!("unexpected error: {e}"), + } + } + } + + assert!( + hit_limit, + "expected MemoryLimitExceeded but processing completed \ + (memory limit too generous for this test)", + ); + + // Append bytes we never fed to the rewriter. + output.extend_from_slice(&html[fed_bytes..]); + + output + } + + /// Tag with a huge base64-encoded attribute value, the shape that caused + /// INCIDENT-6638. The lexer buffers the entire tag until `>` is found; the buffer + /// exceeds the memory limit before that. + #[test] + fn test_bail_out_reconstruct_huge_attribute() { + let html = format!( + "

Hello

World

", + "A".repeat(16384), + ); + + let reconstructed = reconstruct_response_on_oom( + html.as_bytes(), + 512, + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: bail_out_settings(8192), + ..Settings::new() + }, + ); + + assert_eq!( + reconstructed, + html.as_bytes(), + "response with huge attribute must be reconstructable", + ); + } + + /// Tag with hundreds of small attributes whose total length exceeds the memory limit. + /// Same mechanism as the huge-attribute test (the lexer buffers the whole tag), just a + /// different real-world shape. + #[test] + fn test_bail_out_reconstruct_many_attributes() { + let attrs: String = (0..500) + .map(|i| format!(" data-attr-{i}=\"value-{i}\"")) + .collect(); + let html = format!("

Hello

inner

World

"); + + let reconstructed = reconstruct_response_on_oom( + html.as_bytes(), + 512, + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: bail_out_settings(8192), + ..Settings::new() + }, + ); + + assert_eq!( + reconstructed, + html.as_bytes(), + "response with many attributes must be reconstructable", + ); + } + + /// Huge HTML comment (``). The lexer buffers from ``, so a + /// comment body larger than the limit overflows the Arena the same way a huge tag does. + /// The comment handler puts the parser in lex mode for comments inside the outer + /// `
`. + #[test] + fn test_bail_out_reconstruct_huge_comment() { + let html = format!("
BeforeAfter
", "X".repeat(16384),); + + let reconstructed = reconstruct_response_on_oom( + html.as_bytes(), + 512, + Settings { + element_content_handlers: vec![comments!("div", |_| Ok(()))], + memory_settings: bail_out_settings(8192), + ..Settings::new() + }, + ); + + assert_eq!( + reconstructed, + html.as_bytes(), + "response with huge comment must be reconstructable", + ); + } + + /// Deeply nested non-void elements. Each `
` pushes a `StackItem` onto the + /// selectors-VM stack; eventually the `LimitedVec` growth exceeds the memory limit. + #[test] + fn test_bail_out_reconstruct_deeply_nested() { + let depth = 200; + let open_tags: String = (0..depth).map(|_| "
".to_owned()).collect(); + let close_tags: String = (0..depth).map(|_| "
".to_owned()).collect(); + let html = format!("{open_tags}leaf{close_tags}"); + + let reconstructed = reconstruct_response_on_oom( + html.as_bytes(), + 512, + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: bail_out_settings(4096), + ..Settings::new() + }, + ); + + assert_eq!( + reconstructed, + html.as_bytes(), + "deeply nested response must be reconstructable", + ); + } + + /// Many non-void start tags that are never closed: the selectors-VM stack grows + /// without any pops, the same as deep nesting but a pattern more likely seen in + /// broken or malicious HTML. + #[test] + fn test_bail_out_reconstruct_unclosed_tags() { + let html: String = (0..200) + .map(|i| format!("text ")) + .collect(); + + let reconstructed = reconstruct_response_on_oom( + html.as_bytes(), + 512, + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: bail_out_settings(4096), + ..Settings::new() + }, + ); + + assert_eq!( + reconstructed, + html.as_bytes(), + "response with unclosed tags must be reconstructable", + ); + } + + // --- Content-handler-error bail-out tests --- + // + // These mirror the memory bail-out tests but with handler errors as the trigger. The + // contract is the same: when `graceful_bail_out_on_content_handler_error = true`, the + // sink receives every input byte the rewriter had been given before the error. + + /// An element handler that returns `Err` aborts the rewriter. With graceful bail-out + /// enabled, the sink keeps every byte the caller had fed in, with the failing element + /// and everything after it flushed raw (no transformation), and earlier elements + /// transformed normally. + #[test] + fn test_graceful_bail_out_on_element_handler_error() { + let html = b"firstmiddlelast"; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![ + element!("a", |el| { + el.set_attribute("rewritten", "yes").unwrap(); + Ok(()) + }), + element!("stop", |_| Err("handler refused".into())), + ], + graceful_bail_out_on_content_handler_error: true, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter.write(html).unwrap_err(); + + assert!( + matches!(err, RewritingError::ContentHandlerError(_)), + "expected ContentHandlerError, got {err}", + ); + + // The full original bytes from `` onwards are present raw, and the `` tag + // before that has been transformed. + let output_str = std::str::from_utf8(&output).unwrap(); + + assert!( + output_str.starts_with("first"), + "expected transformed prefix, got {output_str:?}", + ); + assert!( + output_str.ends_with("middlelast"), + "expected raw bytes from the failing tag onwards, got {output_str:?}", + ); + } + + /// Without the opt-in flag, an element-handler error still aborts processing without + /// flushing remaining bytes (existing behavior preserved). + #[test] + fn test_no_graceful_bail_out_on_content_handler_error_by_default() { + let html = b"firstmiddle"; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![element!("stop", |_| Err( + "handler refused".into() + ))], + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter.write(html).unwrap_err(); + + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + assert!( + !output.ends_with(b"middle"), + "without graceful bail-out the sink must NOT contain the failing tag, got {output:?}", + ); + } + + /// A comment handler that returns `Err` is recoverable too. Comments live on the same + /// `lexeme_consumed` path as elements, so this exercises the same restructured ordering. + #[test] + fn test_graceful_bail_out_on_comment_handler_error() { + let html = b"
BeforeAfter
"; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![comments!("div", |_| { + Err("comment refused".into()) + })], + graceful_bail_out_on_content_handler_error: true, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter.write(html).unwrap_err(); + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + + // The whole document is in the sink (handler error doesn't lose bytes). + assert_eq!(output, html); + } + + /// A handler error from `handle_end` arrives after `flush_remaining_input` has already + /// emitted every input byte, so the sink already has the complete document. Bail-out + /// just propagates the error without losing anything. + #[test] + fn test_graceful_bail_out_on_end_handler_error() { + let html = b"
content
"; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + document_content_handlers: vec![end!(|_| Err("end refused".into()))], + graceful_bail_out_on_content_handler_error: true, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + rewriter.write(html).unwrap(); + + let err = rewriter.end().unwrap_err(); + + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + // All input bytes already in sink before `handle_end()` runs. + assert_eq!(output, html); + } + + /// Reconstruction test: when a handler in the middle of a document errors, the sink + /// output plus any unfed bytes must equal the original document. + #[test] + fn test_bail_out_reconstruct_handler_error_midstream() { + let html = b"

before

middle
after"; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![element!("div", |_| { + Err("div refused".into()) + })], + graceful_bail_out_on_content_handler_error: true, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter.write(html).unwrap_err(); + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + + assert_eq!( + output, html, + "response must be reconstructable byte-for-byte when handler errors midstream", + ); + } + + /// The two bail-out flags are independent: enabling content-handler bail-out does not + /// affect memory-limit behavior, and vice versa. + #[test] + fn test_bail_out_flags_independent() { + // Memory limit error with content-handler bail-out only: should NOT bail out. + const MAX: usize = 100; + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: MemorySettings { + max_allowed_memory_usage: MAX, + preallocated_parsing_buffer_size: 0, + graceful_bail_out_on_memory_limit_exceeded: false, + }, + graceful_bail_out_on_content_handler_error: true, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + let chunk_1 = format!("\"{}",", "r".repeat(MAX / 2)); + rewriter.write(chunk_1.as_bytes()).unwrap(); + let err = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + assert!(matches!(err, RewritingError::MemoryLimitExceeded(_))); + assert!( + output.is_empty(), + "content-handler flag must not enable memory bail-out, got {output:?}", + ); + } + #[test] fn content_handler_error_propagation() { fn assert_err<'h>( diff --git a/src/rewriter/settings.rs b/src/rewriter/settings.rs index 7cdc98c1..8adcbfde 100644 --- a/src/rewriter/settings.rs +++ b/src/rewriter/settings.rs @@ -831,6 +831,40 @@ pub struct MemorySettings { /// [`write`]: struct.HtmlRewriter.html#method.write /// [`end`]: struct.HtmlRewriter.html#method.end pub max_allowed_memory_usage: usize, + + /// Controls how the rewriter recovers when [`max_allowed_memory_usage`] is exceeded. + /// + /// When `false` (the default), the rewriter aborts processing the response, returns + /// [`MemoryLimitExceededError`], and leaves the output sink in a potentially inconsistent state + /// (i.e. the sink will have received the transformed bytes the rewriter had already produced, + /// but the remaining input bytes are lost). This typically results in a truncated, broken + /// response. + /// + /// When `true`, before returning [`MemoryLimitExceededError`] the rewriter flushes every input + /// byte it has received but not yet emitted to the sink, *as-is* (i.e. without any + /// transformation). The caller can then continue the response by writing any subsequent input + /// bytes directly to its own downstream sink, bypassing the (now poisoned) rewriter. The + /// resulting response will have the rewriter's transformations applied up to some boundary, + /// followed by the original bytes after that boundary, but the response will not be broken. + /// + /// The rewriter is still poisoned after the error and must not be used again, regardless of + /// this setting. + /// + /// ### Caveat + /// + /// If a handler was actively removing element content (e.g. via [`Element::remove()`]) at the + /// time the memory limit was exceeded, the removed content's surrounding tags can end up + /// mismatched in the resulting response. In practice removing content is uncommon, and a + /// well-formed-but-imperfect response is still much better than a truncated one. + /// + /// ### Default + /// + /// `false` when constructed with `MemorySettings::new()`. + /// + /// [`MemoryLimitExceededError`]: struct.MemoryLimitExceededError.html + /// [`max_allowed_memory_usage`]: #structfield.max_allowed_memory_usage + /// [`Element::remove()`]: html_content/struct.Element.html#method.remove + pub graceful_bail_out_on_memory_limit_exceeded: bool, } impl Default for MemorySettings { @@ -839,6 +873,7 @@ impl Default for MemorySettings { Self { preallocated_parsing_buffer_size: 1024, max_allowed_memory_usage: usize::MAX, + graceful_bail_out_on_memory_limit_exceeded: false, } } } @@ -999,6 +1034,54 @@ pub struct Settings<'handlers, 'selectors, H: HandlerTypes = LocalHandlerTypes> /// /// `false` when constructed with `Settings::new()`. pub adjust_charset_on_meta_tag: bool, + + /// Controls how the rewriter recovers when a content handler returns an `Err`. + /// + /// When `false` (the default), the rewriter aborts processing the response, returns the + /// handler's [`RewritingError::ContentHandlerError`], and leaves the output sink in a + /// potentially inconsistent state. Downstream this typically manifests as a truncated, + /// broken response. + /// + /// When `true`, before propagating [`RewritingError::ContentHandlerError`] the rewriter + /// flushes every input byte it has received but not yet emitted to the sink, *as-is*. The + /// caller can then continue the response by writing any subsequent input bytes directly + /// to its own downstream sink, bypassing the (now poisoned) rewriter. The resulting + /// response will have the rewriter's transformations applied up to some boundary, + /// followed by the original bytes after that boundary, but the response will not be + /// broken. + /// + /// The rewriter is still poisoned after the error and must not be used again, regardless + /// of this setting. + /// + /// This is symmetric with + /// [`MemorySettings::graceful_bail_out_on_memory_limit_exceeded`], but kept as a separate + /// flag because the underlying error has different semantics: a memory limit is an + /// environmental constraint, whereas a content handler returning `Err` is an explicit + /// signal from the application that something is wrong with the input. Some callers will + /// want graceful recovery for one but not the other. + /// + /// ### Caveats + /// + /// 1. If a handler was actively removing element content (e.g. via [`Element::remove()`]) + /// at the time of the bail-out, the surrounding tags can end up mismatched in the + /// resulting response. In practice removing content is uncommon, and a + /// well-formed-but-imperfect response is still much better than a truncated one. + /// 2. If a text content handler returns an error after some chunks of the same text node + /// have already been emitted (rare; typically only happens with multi-chunk + /// encoding-converted text), the bail-out flush will re-emit the input bytes raw, + /// duplicating the already-emitted chunks. The response is byte-bigger but not + /// truncated. + /// + /// ### Default + /// + /// `false` when constructed with `Settings::new()`. + /// + /// [`MemorySettings::graceful_bail_out_on_memory_limit_exceeded`]: + /// struct.MemorySettings.html#structfield.graceful_bail_out_on_memory_limit_exceeded + /// [`RewritingError::ContentHandlerError`]: + /// errors/enum.RewritingError.html#variant.ContentHandlerError + /// [`Element::remove()`]: html_content/struct.Element.html#method.remove + pub graceful_bail_out_on_content_handler_error: bool, } impl Default for Settings<'_, '_, LocalHandlerTypes> { @@ -1039,6 +1122,7 @@ impl Settings<'_, '_, H> { strict: true, enable_esi_tags: false, adjust_charset_on_meta_tag: false, + graceful_bail_out_on_content_handler_error: false, } } } diff --git a/src/selectors_vm/tests.rs b/src/selectors_vm/tests.rs index a65506d4..5da2b1e5 100644 --- a/src/selectors_vm/tests.rs +++ b/src/selectors_vm/tests.rs @@ -89,6 +89,8 @@ mod vm_tests { next_encoding: SharedEncoding::default(), memory_limiter: SharedMemoryLimiter::new(2048), strict: true, + graceful_bail_out_on_memory_limit_exceeded: false, + graceful_bail_out_on_content_handler_error: false, }); transform_stream.write(&html).unwrap(); diff --git a/src/transform_stream/dispatcher.rs b/src/transform_stream/dispatcher.rs index e1f52c34..ca021631 100644 --- a/src/transform_stream/dispatcher.rs +++ b/src/transform_stream/dispatcher.rs @@ -124,7 +124,18 @@ where Ok(()) } - fn lexeme_consumed(&mut self, lexeme: &Lexeme<'_, T>) { + /// Emit the raw chunk of input that lies between the previously emitted byte and the + /// start of `lexeme`, and move `remaining_content_start` to the start of `lexeme`. + /// + /// This is the first half of what used to be `lexeme_consumed()`. It is split out so that + /// the second half ([`consume_lexeme()`]) can be deferred until after the lexeme's token + /// has been successfully emitted; that way, if token emission fails (e.g. a content + /// handler returns an error), `remaining_content_start` still points at the failing + /// lexeme's start and a graceful bail-out can flush the lexeme bytes raw rather than + /// losing them. + /// + /// [`consume_lexeme()`]: Self::consume_lexeme + fn emit_chunk_before_lexeme(&mut self, lexeme: &Lexeme<'_, T>) { let lexeme_range = lexeme.raw_range(); let chunk_range = Range { @@ -138,7 +149,16 @@ where self.output_sink.handle_chunk(&chunk); } - self.remaining_content_start = lexeme_range.end; + self.remaining_content_start = lexeme_range.start; + } + + /// Advance `remaining_content_start` past the end of `lexeme`, marking it as committed. + /// Must only be called after the lexeme's token has been successfully emitted; see + /// [`emit_chunk_before_lexeme()`]. + /// + /// [`emit_chunk_before_lexeme()`]: Self::emit_chunk_before_lexeme + fn consume_lexeme(&mut self, lexeme: &Lexeme<'_, T>) { + self.remaining_content_start = lexeme.raw_range().end; } #[inline] @@ -234,13 +254,14 @@ where { match lexeme.to_token(&mut self.delegate.capture_flags, self.encoding.get()) { ToTokenResult::Token(token) => { - self.delegate.lexeme_consumed(lexeme); + self.delegate.emit_chunk_before_lexeme(lexeme); self.delegate.token_produced(token)?; + self.delegate.consume_lexeme(lexeme); // handler for may have changed encoding self.flush_encoding_change(); } ToTokenResult::Text(text_type) => { - self.delegate.lexeme_consumed(lexeme); + self.delegate.emit_chunk_before_lexeme(lexeme); self.last_text_type = text_type; self.text_decoder.feed_text( lexeme.spanned(), @@ -255,6 +276,7 @@ where ) }, )?; + self.delegate.consume_lexeme(lexeme); } ToTokenResult::None => {} } @@ -373,6 +395,27 @@ where .flush_remaining_input(input, consumed_byte_count); } + /// Flushes all input bytes the dispatcher has received but not yet emitted, *as-is*. Used + /// when `MemorySettings::graceful_bail_out_on_memory_limit_exceeded` is enabled and a memory + /// error is being propagated: the caller wants to preserve enough of the input in the sink + /// to be able to continue the response without breaking it. + /// + /// Unlike [`flush_remaining_input()`], this ignores `emission_enabled`. If a handler was + /// removing element content at the time of the bail-out, those bytes will still be flushed + /// raw. The alternative (skipping the flush) would lose bytes from the input chunk entirely, + /// which the caller cannot recover from since they don't buffer their input. + pub fn flush_for_bail_out(&mut self, input: &[u8]) { + let output = input + .get(self.delegate.remaining_content_start..) + .unwrap_or_default(); + + if !output.is_empty() { + self.delegate.output_sink.handle_chunk(output); + } + + self.delegate.remaining_content_start = 0; + } + pub fn finish(&mut self, input: &[u8]) -> Result<(), RewritingError> { self.delegate.finish(self.encoding.get(), input) } diff --git a/src/transform_stream/mod.rs b/src/transform_stream/mod.rs index 3b952db1..d42fafcb 100644 --- a/src/transform_stream/mod.rs +++ b/src/transform_stream/mod.rs @@ -23,6 +23,8 @@ where pub encoding: AsciiCompatibleEncoding, pub next_encoding: SharedEncoding, pub strict: bool, + pub graceful_bail_out_on_memory_limit_exceeded: bool, + pub graceful_bail_out_on_content_handler_error: bool, } // Pub only for integration tests @@ -34,6 +36,8 @@ where parser: Parser>, buffer: Arena, has_buffered_data: bool, + graceful_bail_out_on_memory_limit_exceeded: bool, + graceful_bail_out_on_content_handler_error: bool, } impl TransformStream @@ -70,6 +74,25 @@ where parser, buffer, has_buffered_data: false, + graceful_bail_out_on_memory_limit_exceeded: settings + .graceful_bail_out_on_memory_limit_exceeded, + graceful_bail_out_on_content_handler_error: settings + .graceful_bail_out_on_content_handler_error, + } + } + + /// Returns whether the current settings allow bailing out gracefully on `err`. Memory and + /// content-handler errors are gated by independent flags; parsing-ambiguity errors are + /// never recovered from (the whole point of strict mode is to refuse uncertain markup). + fn should_bail_out_for(&self, err: &RewritingError) -> bool { + match err { + RewritingError::MemoryLimitExceeded(_) => { + self.graceful_bail_out_on_memory_limit_exceeded + } + RewritingError::ContentHandlerError(_) => { + self.graceful_bail_out_on_content_handler_error + } + RewritingError::ParsingAmbiguity(_) => false, } } @@ -77,18 +100,43 @@ where trace!(@write data); let chunk = if self.has_buffered_data { - self.buffer - .append(data) - .map_err(RewritingError::MemoryLimitExceeded)?; - - self.buffer.bytes() + match self.buffer.append(data) { + Ok(()) => self.buffer.bytes(), + Err(e) => { + // We can't fit `data` next to the buffered (still-unparsed) bytes from + // previous calls. Neither chunk has been emitted to the sink yet, so on a + // graceful bail-out we flush both as-is and let the caller continue the + // response from where they were. + if self.graceful_bail_out_on_memory_limit_exceeded { + let dispatcher = self.parser.get_dispatcher(); + dispatcher.flush_for_bail_out(self.buffer.bytes()); + dispatcher.flush_for_bail_out(data); + } + + return Err(RewritingError::MemoryLimitExceeded(e)); + } + } } else { data }; trace!(@chunk chunk); - let consumed_byte_count = self.parser.parse(chunk, false)?; + let consumed_byte_count = match self.parser.parse(chunk, false) { + Ok(c) => c, + Err(e) => { + // The parser failed mid-chunk. The dispatcher's `remaining_content_start` + // points to the first byte of `chunk` that hasn't been emitted yet (memory + // errors happen before `lexeme_consumed()`; content handler errors happen + // between `emit_chunk_before_lexeme()` and `consume_lexeme()`). Flushing from + // there preserves all bytes the caller fed us. + if self.should_bail_out_for(&e) { + self.parser.get_dispatcher().flush_for_bail_out(chunk); + } + + return Err(e); + } + }; self.parser .get_dispatcher() @@ -98,9 +146,16 @@ where if self.has_buffered_data { self.buffer.shift(consumed_byte_count); } else if let Some(unconsumed) = data.get(consumed_byte_count..) { - self.buffer - .init_with(unconsumed) - .map_err(RewritingError::MemoryLimitExceeded)?; + if let Err(e) = self.buffer.init_with(unconsumed) { + // Parsing succeeded but we can't buffer the leftover bytes for the next + // call. On a graceful bail-out we flush the leftover raw so the response + // stays whole. + if self.graceful_bail_out_on_memory_limit_exceeded { + self.parser.get_dispatcher().flush_for_bail_out(unconsumed); + } + + return Err(RewritingError::MemoryLimitExceeded(e)); + } self.has_buffered_data = true; } else { @@ -124,7 +179,20 @@ where trace!(@chunk chunk); - self.parser.parse(chunk, true)?; + if let Err(e) = self.parser.parse(chunk, true) { + // Same reasoning as in `write()`: if we can bail out gracefully, make sure the sink + // has all the input bytes before propagating the error. + if self.should_bail_out_for(&e) { + self.parser.get_dispatcher().flush_for_bail_out(chunk); + } + + return Err(e); + } + + // `finish()` flushes any remaining input *first* and only then calls `handle_end()`, + // so a `ContentHandlerError` from the end handler arrives after the sink already has + // every input byte. No additional flush needed; the caller continues from where the + // rewriter left off. self.parser.get_dispatcher().finish(chunk) } diff --git a/tests/fixtures/token_capturing.rs b/tests/fixtures/token_capturing.rs index 082e1907..f3a1ccc1 100644 --- a/tests/fixtures/token_capturing.rs +++ b/tests/fixtures/token_capturing.rs @@ -106,6 +106,8 @@ pub fn parse( encoding, next_encoding: Default::default(), strict: true, + graceful_bail_out_on_memory_limit_exceeded: false, + graceful_bail_out_on_content_handler_error: false, }); let parser = transform_stream.parser(); diff --git a/tools/parser_trace/src/main.rs b/tools/parser_trace/src/main.rs index 82fe422c..c13840ab 100644 --- a/tools/parser_trace/src/main.rs +++ b/tools/parser_trace/src/main.rs @@ -104,6 +104,8 @@ fn main() { encoding: AsciiCompatibleEncoding::utf_8(), next_encoding: Default::default(), strict: true, + graceful_bail_out_on_memory_limit_exceeded: false, + graceful_bail_out_on_content_handler_error: false, }); let parser = transform_stream.parser();