From 4420d341a0a0bc279f4af5ac3ec8df29f81297f2 Mon Sep 17 00:00:00 2001 From: Diogo Sousa Date: Wed, 20 May 2026 17:08:28 +0100 Subject: [PATCH 1/3] Allow rewriter to gracefully bail out on memory limit exceeded. By default the rewriter aborts processing when `max_allowed_memory_usage` is hit, leaving the output sink in a potentially inconsistent state (i.e. the sink has received whatever transformed bytes the rewriter had already produced, but the remaining input bytes are lost). Downstream this typically manifests as a truncated, broken HTTP response. This change adds an opt-in `MemorySettings::graceful_bail_out_on_memory_limit_exceeded` flag. When set, before returning `MemoryLimitExceededError` the rewriter flushes every input byte it has received but not yet emitted to the sink, as-is. The caller can then continue the response by writing any subsequent input bytes directly to its own downstream sink, bypassing the (now poisoned) rewriter. The resulting response has the rewriter's transformations applied up to some boundary and the original bytes after that, but it is not broken. The flush is wired in at all three memory-failure sites in `TransformStream::write()`/`end()`: 1. `Arena::append()`: the buffered (still-unparsed) bytes from previous calls plus the new chunk can't fit. 2. `Parser::parse()`: the selectors VM stack push exceeds the limit mid-parse. The dispatcher's `remaining_content_start` correctly points at the first byte of the failing lexeme (the limit check runs before `lexeme_consumed()`), so flushing from there preserves everything. 3. `Arena::init_with()`: the leftover unconsumed bytes can't be buffered for the next call. A new `Dispatcher::flush_for_bail_out()` does the actual flushing. Unlike `flush_remaining_input()` it ignores `emission_enabled`, which means a caller that was actively removing element content at the time of the bail-out can end up with mismatched tags (the removed content's start tag was suppressed but the trailing bytes flushed raw may include the matching end tag). The alternative (skipping the flush when emission is disabled) would lose bytes from the input chunk entirely, which the caller can't recover from since it doesn't buffer its input. The caveat is documented on the new field. The C API gains the corresponding `bool graceful_bail_out_on_memory_limit_exceeded` field in `lol_html_memory_settings_t`. Existing C clients using designated initializers continue to work: the field defaults to `false` / `0`. Tested with 10 new tests in `rewriter::tests::fatal_errors`: 1. Three failure-site tests: one per memory-error origin (`test_graceful_bail_out_in_buffer_append`, `_in_buffer_init_with`, `_in_parser`). 2. `test_graceful_bail_out_preserves_prefix_transformations`: verifies handler transformations on tokens processed before the failure survive the bail-out. 3. `test_no_graceful_bail_out_by_default`: sanity check that the flag is opt-in. 4. Five response-reconstruction tests, each verifying `sink_output + unfed_remaining == original_html` for a different real-world high-memory shape: a tag with a huge base64 attribute (the INCIDENT-6638 shape), hundreds of small attributes on one tag, a huge HTML comment, deeply nested elements (selectors VM stack overflow), and many never-closed tags. --- CHANGELOG.md | 7 + c-api/include/lol_html.h | 21 ++ fuzz/test_case/src/lib.rs | 1 + src/rewriter/mod.rs | 427 +++++++++++++++++++++++++++++ src/rewriter/settings.rs | 35 +++ src/selectors_vm/tests.rs | 1 + src/transform_stream/dispatcher.rs | 21 ++ src/transform_stream/mod.rs | 68 ++++- tests/fixtures/token_capturing.rs | 1 + tools/parser_trace/src/main.rs | 1 + 10 files changed, 573 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 520ea394..7c67e9dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## Unreleased + +- Added `MemorySettings::graceful_bail_out_on_memory_limit_exceeded`: when set, the rewriter + flushes every input byte it has received but not yet emitted to the sink (as-is) before + returning `MemoryLimitExceededError`, so callers can continue the response by writing + subsequent bytes directly to their downstream sink instead of breaking it. + ## v2.9.0 - Added `OutputSink::set_encoding` diff --git a/c-api/include/lol_html.h b/c-api/include/lol_html.h index baa760d6..e36440cc 100644 --- a/c-api/include/lol_html.h +++ b/c-api/include/lol_html.h @@ -255,6 +255,27 @@ typedef struct { // `lol_html_rewriter_write` and `lol_html_rewriter_end` will return an error // if this limit is exceeded. size_t max_allowed_memory_usage; + // Controls how the rewriter recovers when `max_allowed_memory_usage` is + // exceeded. + // + // When `false` (the default), the rewriter aborts processing the response, + // returns an error, and leaves the output sink in a potentially inconsistent + // state (i.e. the sink will have received the transformed bytes the rewriter + // had already produced, but the remaining input bytes are lost). This + // typically results in a truncated, broken response. + // + // When `true`, before returning the error the rewriter flushes every input + // byte it has received but not yet emitted to the sink, *as-is* (i.e. + // without any transformation). The caller can then continue the response by + // writing any subsequent input bytes directly to its own downstream sink, + // bypassing the (now poisoned) rewriter. The resulting response will have + // the rewriter's transformations applied up to some boundary, followed by + // the original bytes after that boundary, but the response will not be + // broken. + // + // The rewriter is still poisoned after the error and must not be used again, + // regardless of this setting. + bool graceful_bail_out_on_memory_limit_exceeded; } lol_html_memory_settings_t; // Builds HTML-rewriter out of the provided builder. Can be called diff --git a/fuzz/test_case/src/lib.rs b/fuzz/test_case/src/lib.rs index 807f9586..8c570d8f 100644 --- a/fuzz/test_case/src/lib.rs +++ b/fuzz/test_case/src/lib.rs @@ -199,6 +199,7 @@ fn run_c_api_rewriter_iter(data: &[u8], encoding: &str) { lol_html_memory_settings_t { preallocated_parsing_buffer_size: 0, max_allowed_memory_usage: usize::MAX, + graceful_bail_out_on_memory_limit_exceeded: false, }, Some(empty_handler), output_data_ptr, diff --git a/src/rewriter/mod.rs b/src/rewriter/mod.rs index 1de85bfc..a034c91d 100644 --- a/src/rewriter/mod.rs +++ b/src/rewriter/mod.rs @@ -164,6 +164,9 @@ impl<'h, O: OutputSink, H: HandlerTypes> HtmlRewriter<'h, O, H> { pub fn new<'s>(settings: Settings<'h, 's, H>, output_sink: O) -> Self { let preallocated_parsing_buffer_size = settings.memory_settings.preallocated_parsing_buffer_size; + let graceful_bail_out_on_memory_limit_exceeded = settings + .memory_settings + .graceful_bail_out_on_memory_limit_exceeded; let strict = settings.strict; let encoding = settings.encoding; @@ -184,6 +187,7 @@ impl<'h, O: OutputSink, H: HandlerTypes> HtmlRewriter<'h, O, H> { encoding, next_encoding, strict, + graceful_bail_out_on_memory_limit_exceeded, }); HtmlRewriter { @@ -976,6 +980,7 @@ mod tests { memory_settings: MemorySettings { max_allowed_memory_usage, preallocated_parsing_buffer_size: 0, + ..MemorySettings::new() }, ..Settings::new() }, @@ -1016,6 +1021,428 @@ mod tests { rewriter.end().unwrap_err(); } + fn create_rewriter_with_graceful_bail_out( + max_allowed_memory_usage: usize, + output_sink: O, + ) -> HtmlRewriter<'static, O> { + HtmlRewriter::new( + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: MemorySettings { + max_allowed_memory_usage, + preallocated_parsing_buffer_size: 0, + graceful_bail_out_on_memory_limit_exceeded: true, + }, + ..Settings::new() + }, + output_sink, + ) + } + + /// Exercises the bail-out path inside `Arena::append()`: with two chunks where the open + /// tag spans both, the parser can't consume chunk 1, so it gets buffered. Chunk 2's + /// append then exceeds the memory limit. The graceful bail-out should flush both chunks + /// to the sink as-is, so the caller can continue the response. + #[test] + fn test_graceful_bail_out_in_buffer_append() { + const MAX: usize = 100; + + let mut output = Vec::::new(); + let mut rewriter = create_rewriter_with_graceful_bail_out(MAX, |c: &[u8]| { + output.extend_from_slice(c); + }); + + let chunk_1 = format!("\"{}",", "r".repeat(MAX / 2)); + + rewriter.write(chunk_1.as_bytes()).unwrap(); + + let err = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + match err { + RewritingError::MemoryLimitExceeded(e) => assert_eq!(e, MemoryLimitExceededError), + _ => panic!("{}", err), + } + + let expected: Vec = [chunk_1.as_bytes(), chunk_2.as_bytes()].concat(); + + assert_eq!(output, expected); + } + + /// Exercises the bail-out path inside `Arena::init_with()`: with no buffered data, the + /// parser can't consume a chunk that ends with an unfinished tag *name* (the tag + /// scanner keeps `tag_start` set, so everything from there onwards is unconsumed). The + /// unconsumed bytes are bigger than the limit, so `init_with` fails. The graceful + /// bail-out should flush the entire chunk to the sink as-is. + #[test] + fn test_graceful_bail_out_in_buffer_init_with() { + const MAX: usize = 1; + + let mut output = Vec::::new(); + // No element handlers, so we avoid allocating the selectors VM stack which would + // fail first with such a tight limit. + let mut rewriter = HtmlRewriter::new( + Settings { + memory_settings: MemorySettings { + max_allowed_memory_usage: MAX, + preallocated_parsing_buffer_size: 0, + graceful_bail_out_on_memory_limit_exceeded: true, + }, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + // Unfinished tag name: the scanner can't call `finish_tag_name()` (no space or + // `>`), so `tag_start` stays set and the whole chunk becomes unconsumed. + let chunk = b" assert_eq!(e, MemoryLimitExceededError), + _ => panic!("{}", err), + } + + assert_eq!(output, chunk); + } + + /// Exercises the bail-out path inside `Parser::parse()`: the selectors VM stack push + /// exceeds the memory limit while processing the very first start tag, so the parser + /// returns an error mid-chunk. The graceful bail-out flushes everything from + /// `remaining_content_start` onwards, which (since no lexeme has been consumed yet) + /// covers the whole chunk. + #[test] + fn test_graceful_bail_out_in_parser() { + // Too small for even the initial selectors VM stack allocation. + const MAX: usize = 16; + + let mut output = Vec::::new(); + let mut rewriter = create_rewriter_with_graceful_bail_out(MAX, |c: &[u8]| { + output.extend_from_slice(c); + }); + + let chunk = b"
foo
"; + + let err = rewriter.write(chunk).unwrap_err(); + + match err { + RewritingError::MemoryLimitExceeded(e) => assert_eq!(e, MemoryLimitExceededError), + _ => panic!("{}", err), + } + + assert_eq!(output, chunk); + } + + /// Verifies that transformations applied to tokens processed before the failure point + /// are preserved, and that the rest of the input is flushed as-is. This mirrors the + /// contract the caller relies on: the sink contains the transformed prefix and the raw + /// suffix, and feeding the next chunk of the original response continues it correctly. + /// + /// Uses a document-level comment handler (no selectors VM, so we don't burn memory on + /// the VM stack) and the buffer-append bail-out path: chunk 1 contains a complete + /// comment that the handler transforms, plus the start of an unfinished tag that gets + /// buffered; chunk 2 then overflows the buffer. + #[test] + fn test_graceful_bail_out_preserves_prefix_transformations() { + const MAX: usize = 100; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + document_content_handlers: vec![doc_comments!(|c| { + let text = c.text(); + c.set_text(&format!("REWRITTEN-{text}")).unwrap(); + Ok(()) + })], + memory_settings: MemorySettings { + max_allowed_memory_usage: MAX, + preallocated_parsing_buffer_size: 0, + graceful_bail_out_on_memory_limit_exceeded: true, + }, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + // chunk_1: a complete comment that the handler will transform, followed by an + // unfinished tag whose remaining bytes get buffered for the next write. + let chunk_1 = format!("\"{}",", "r".repeat(50)); + + rewriter.write(chunk_1.as_bytes()).unwrap(); + + let err = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + assert!( + matches!(err, RewritingError::MemoryLimitExceeded(_)), + "expected MemoryLimitExceeded, got {err}", + ); + + let output_str = std::str::from_utf8(&output).unwrap(); + + // The comment must have been transformed (proves the prefix kept its handler + // changes). + assert!( + output_str.contains(""), + "expected transformed comment, got {output_str:?}", + ); + + // The unfinished tag's bytes must be present raw (proves the bail-out flushed the + // buffered + new bytes the caller had handed in). + assert!( + output_str.contains("\""),"), + "expected raw closing of the tag at the end, got {output_str:?}", + ); + + // Sanity check: the bytes from the original input (minus what the handler + // transformed) are all present. The transformed comment grew by some bytes; the + // rest is byte-for-byte. + let original_input_minus_comment = + format!("\"{}{}\"", "l".repeat(50), "r".repeat(50),); + + assert!( + output_str.contains(&original_input_minus_comment), + "expected original (raw) suffix in output, got {output_str:?}", + ); + } + + /// Sanity check: without the opt-in flag, the existing behavior is preserved (the + /// sink does NOT receive the unprocessed bytes after a memory error). + #[test] + fn test_no_graceful_bail_out_by_default() { + const MAX: usize = 100; + + let mut output = Vec::::new(); + // `create_rewriter` uses default MemorySettings: graceful bail-out is off. + let mut rewriter = create_rewriter(MAX, |c: &[u8]| output.extend_from_slice(c)); + + let chunk_1 = format!("\"{}",", "r".repeat(MAX / 2)); + + rewriter.write(chunk_1.as_bytes()).unwrap(); + let err = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + assert!(matches!(err, RewritingError::MemoryLimitExceeded(_))); + // Sink received nothing: chunk_1 was buffered (never emitted), chunk_2 couldn't be + // appended, and we didn't bail out gracefully. + assert!( + output.is_empty(), + "without graceful bail-out the sink should be empty, got {output:?}", + ); + } + + // --- Response reconstruction tests --- + // + // Each test below verifies that, after a `MemoryLimitExceeded` bail-out, the caller + // can reconstruct the complete response by concatenating: + // + // sink_output + unfed_remaining_bytes == original_html + // + // The handlers used are no-ops, so serialized tokens are byte-for-byte identical to + // the original input, and the assertion is an exact byte comparison. + // + // Note: CDATA sections (``) are not tested here because CDATA content + // is emitted incrementally by the lexer (it only needs to buffer the partial `]]>` + // closing marker, not the whole section), so it doesn't cause Arena growth. + + fn bail_out_settings(max_memory: usize) -> MemorySettings { + MemorySettings { + max_allowed_memory_usage: max_memory, + preallocated_parsing_buffer_size: 0, + graceful_bail_out_on_memory_limit_exceeded: true, + } + } + + /// Feeds `html` to a graceful-bail-out rewriter in `chunk_size`-byte pieces. When + /// `MemoryLimitExceeded` fires (during `write()` or `end()`), the remaining unfed + /// bytes are appended verbatim to the sink output, simulating what a caller would do: + /// stop using the poisoned rewriter and pipe the rest of the response directly. + /// + /// Panics if no `MemoryLimitExceeded` error fires (test misconfiguration). + fn reconstruct_response_on_oom( + html: &[u8], + chunk_size: usize, + settings: Settings<'_, '_>, + ) -> Vec { + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new(settings, |c: &[u8]| output.extend_from_slice(c)); + + let mut fed_bytes = 0; + let mut hit_limit = false; + + for chunk in html.chunks(chunk_size) { + match rewriter.write(chunk) { + Ok(()) => fed_bytes += chunk.len(), + Err(RewritingError::MemoryLimitExceeded(_)) => { + fed_bytes += chunk.len(); + hit_limit = true; + break; + } + Err(e) => panic!("unexpected error: {e}"), + } + } + + if !hit_limit { + // All writes succeeded; try `end()` which may trigger the error during final + // buffer processing. + if let Err(e) = rewriter.end() { + match e { + RewritingError::MemoryLimitExceeded(_) => hit_limit = true, + e => panic!("unexpected error: {e}"), + } + } + } + + assert!( + hit_limit, + "expected MemoryLimitExceeded but processing completed \ + (memory limit too generous for this test)", + ); + + // Append bytes we never fed to the rewriter. + output.extend_from_slice(&html[fed_bytes..]); + + output + } + + /// Tag with a huge base64-encoded attribute value, the shape that caused + /// INCIDENT-6638. The lexer buffers the entire tag until `>` is found; the buffer + /// exceeds the memory limit before that. + #[test] + fn test_bail_out_reconstruct_huge_attribute() { + let html = format!( + "

Hello

World

", + "A".repeat(16384), + ); + + let reconstructed = reconstruct_response_on_oom( + html.as_bytes(), + 512, + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: bail_out_settings(8192), + ..Settings::new() + }, + ); + + assert_eq!( + reconstructed, + html.as_bytes(), + "response with huge attribute must be reconstructable", + ); + } + + /// Tag with hundreds of small attributes whose total length exceeds the memory limit. + /// Same mechanism as the huge-attribute test (the lexer buffers the whole tag), just a + /// different real-world shape. + #[test] + fn test_bail_out_reconstruct_many_attributes() { + let attrs: String = (0..500) + .map(|i| format!(" data-attr-{i}=\"value-{i}\"")) + .collect(); + let html = format!("

Hello

inner

World

"); + + let reconstructed = reconstruct_response_on_oom( + html.as_bytes(), + 512, + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: bail_out_settings(8192), + ..Settings::new() + }, + ); + + assert_eq!( + reconstructed, + html.as_bytes(), + "response with many attributes must be reconstructable", + ); + } + + /// Huge HTML comment (``). The lexer buffers from ``, so a + /// comment body larger than the limit overflows the Arena the same way a huge tag does. + /// The comment handler puts the parser in lex mode for comments inside the outer + /// `
`. + #[test] + fn test_bail_out_reconstruct_huge_comment() { + let html = format!("
BeforeAfter
", "X".repeat(16384),); + + let reconstructed = reconstruct_response_on_oom( + html.as_bytes(), + 512, + Settings { + element_content_handlers: vec![comments!("div", |_| Ok(()))], + memory_settings: bail_out_settings(8192), + ..Settings::new() + }, + ); + + assert_eq!( + reconstructed, + html.as_bytes(), + "response with huge comment must be reconstructable", + ); + } + + /// Deeply nested non-void elements. Each `
` pushes a `StackItem` onto the + /// selectors-VM stack; eventually the `LimitedVec` growth exceeds the memory limit. + #[test] + fn test_bail_out_reconstruct_deeply_nested() { + let depth = 200; + let open_tags: String = (0..depth).map(|_| "
".to_owned()).collect(); + let close_tags: String = (0..depth).map(|_| "
".to_owned()).collect(); + let html = format!("{open_tags}leaf{close_tags}"); + + let reconstructed = reconstruct_response_on_oom( + html.as_bytes(), + 512, + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: bail_out_settings(4096), + ..Settings::new() + }, + ); + + assert_eq!( + reconstructed, + html.as_bytes(), + "deeply nested response must be reconstructable", + ); + } + + /// Many non-void start tags that are never closed: the selectors-VM stack grows + /// without any pops, the same as deep nesting but a pattern more likely seen in + /// broken or malicious HTML. + #[test] + fn test_bail_out_reconstruct_unclosed_tags() { + let html: String = (0..200) + .map(|i| format!("text ")) + .collect(); + + let reconstructed = reconstruct_response_on_oom( + html.as_bytes(), + 512, + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: bail_out_settings(4096), + ..Settings::new() + }, + ); + + assert_eq!( + reconstructed, + html.as_bytes(), + "response with unclosed tags must be reconstructable", + ); + } + #[test] fn content_handler_error_propagation() { fn assert_err<'h>( diff --git a/src/rewriter/settings.rs b/src/rewriter/settings.rs index 7cdc98c1..0ab39782 100644 --- a/src/rewriter/settings.rs +++ b/src/rewriter/settings.rs @@ -831,6 +831,40 @@ pub struct MemorySettings { /// [`write`]: struct.HtmlRewriter.html#method.write /// [`end`]: struct.HtmlRewriter.html#method.end pub max_allowed_memory_usage: usize, + + /// Controls how the rewriter recovers when [`max_allowed_memory_usage`] is exceeded. + /// + /// When `false` (the default), the rewriter aborts processing the response, returns + /// [`MemoryLimitExceededError`], and leaves the output sink in a potentially inconsistent state + /// (i.e. the sink will have received the transformed bytes the rewriter had already produced, + /// but the remaining input bytes are lost). This typically results in a truncated, broken + /// response. + /// + /// When `true`, before returning [`MemoryLimitExceededError`] the rewriter flushes every input + /// byte it has received but not yet emitted to the sink, *as-is* (i.e. without any + /// transformation). The caller can then continue the response by writing any subsequent input + /// bytes directly to its own downstream sink, bypassing the (now poisoned) rewriter. The + /// resulting response will have the rewriter's transformations applied up to some boundary, + /// followed by the original bytes after that boundary, but the response will not be broken. + /// + /// The rewriter is still poisoned after the error and must not be used again, regardless of + /// this setting. + /// + /// ### Caveat + /// + /// If a handler was actively removing element content (e.g. via [`Element::remove()`]) at the + /// time the memory limit was exceeded, the removed content's surrounding tags can end up + /// mismatched in the resulting response. In practice removing content is uncommon, and a + /// well-formed-but-imperfect response is still much better than a truncated one. + /// + /// ### Default + /// + /// `false` when constructed with `MemorySettings::new()`. + /// + /// [`MemoryLimitExceededError`]: struct.MemoryLimitExceededError.html + /// [`max_allowed_memory_usage`]: #structfield.max_allowed_memory_usage + /// [`Element::remove()`]: html_content/struct.Element.html#method.remove + pub graceful_bail_out_on_memory_limit_exceeded: bool, } impl Default for MemorySettings { @@ -839,6 +873,7 @@ impl Default for MemorySettings { Self { preallocated_parsing_buffer_size: 1024, max_allowed_memory_usage: usize::MAX, + graceful_bail_out_on_memory_limit_exceeded: false, } } } diff --git a/src/selectors_vm/tests.rs b/src/selectors_vm/tests.rs index a65506d4..0f9c8f8c 100644 --- a/src/selectors_vm/tests.rs +++ b/src/selectors_vm/tests.rs @@ -89,6 +89,7 @@ mod vm_tests { next_encoding: SharedEncoding::default(), memory_limiter: SharedMemoryLimiter::new(2048), strict: true, + graceful_bail_out_on_memory_limit_exceeded: false, }); transform_stream.write(&html).unwrap(); diff --git a/src/transform_stream/dispatcher.rs b/src/transform_stream/dispatcher.rs index e1f52c34..6b49db59 100644 --- a/src/transform_stream/dispatcher.rs +++ b/src/transform_stream/dispatcher.rs @@ -373,6 +373,27 @@ where .flush_remaining_input(input, consumed_byte_count); } + /// Flushes all input bytes the dispatcher has received but not yet emitted, *as-is*. Used + /// when `MemorySettings::graceful_bail_out_on_memory_limit_exceeded` is enabled and a memory + /// error is being propagated: the caller wants to preserve enough of the input in the sink + /// to be able to continue the response without breaking it. + /// + /// Unlike [`flush_remaining_input()`], this ignores `emission_enabled`. If a handler was + /// removing element content at the time of the bail-out, those bytes will still be flushed + /// raw. The alternative (skipping the flush) would lose bytes from the input chunk entirely, + /// which the caller cannot recover from since they don't buffer their input. + pub fn flush_for_bail_out(&mut self, input: &[u8]) { + let output = input + .get(self.delegate.remaining_content_start..) + .unwrap_or_default(); + + if !output.is_empty() { + self.delegate.output_sink.handle_chunk(output); + } + + self.delegate.remaining_content_start = 0; + } + pub fn finish(&mut self, input: &[u8]) -> Result<(), RewritingError> { self.delegate.finish(self.encoding.get(), input) } diff --git a/src/transform_stream/mod.rs b/src/transform_stream/mod.rs index 3b952db1..001c9779 100644 --- a/src/transform_stream/mod.rs +++ b/src/transform_stream/mod.rs @@ -23,6 +23,7 @@ where pub encoding: AsciiCompatibleEncoding, pub next_encoding: SharedEncoding, pub strict: bool, + pub graceful_bail_out_on_memory_limit_exceeded: bool, } // Pub only for integration tests @@ -34,6 +35,7 @@ where parser: Parser>, buffer: Arena, has_buffered_data: bool, + graceful_bail_out_on_memory_limit_exceeded: bool, } impl TransformStream @@ -70,6 +72,8 @@ where parser, buffer, has_buffered_data: false, + graceful_bail_out_on_memory_limit_exceeded: settings + .graceful_bail_out_on_memory_limit_exceeded, } } @@ -77,18 +81,44 @@ where trace!(@write data); let chunk = if self.has_buffered_data { - self.buffer - .append(data) - .map_err(RewritingError::MemoryLimitExceeded)?; - - self.buffer.bytes() + match self.buffer.append(data) { + Ok(()) => self.buffer.bytes(), + Err(e) => { + // We can't fit `data` next to the buffered (still-unparsed) bytes from + // previous calls. Neither chunk has been emitted to the sink yet, so on a + // graceful bail-out we flush both as-is and let the caller continue the + // response from where they were. + if self.graceful_bail_out_on_memory_limit_exceeded { + let dispatcher = self.parser.get_dispatcher(); + dispatcher.flush_for_bail_out(self.buffer.bytes()); + dispatcher.flush_for_bail_out(data); + } + + return Err(RewritingError::MemoryLimitExceeded(e)); + } + } } else { data }; trace!(@chunk chunk); - let consumed_byte_count = self.parser.parse(chunk, false)?; + let consumed_byte_count = match self.parser.parse(chunk, false) { + Ok(c) => c, + Err(e) => { + // The parser ran out of memory mid-chunk (e.g. the selectors VM stack hit the + // limit). The dispatcher's `remaining_content_start` points to the first byte of + // `chunk` that hasn't been emitted yet, so on a graceful bail-out flushing from + // there preserves all bytes the caller fed us. + if matches!(e, RewritingError::MemoryLimitExceeded(_)) + && self.graceful_bail_out_on_memory_limit_exceeded + { + self.parser.get_dispatcher().flush_for_bail_out(chunk); + } + + return Err(e); + } + }; self.parser .get_dispatcher() @@ -98,9 +128,16 @@ where if self.has_buffered_data { self.buffer.shift(consumed_byte_count); } else if let Some(unconsumed) = data.get(consumed_byte_count..) { - self.buffer - .init_with(unconsumed) - .map_err(RewritingError::MemoryLimitExceeded)?; + if let Err(e) = self.buffer.init_with(unconsumed) { + // Parsing succeeded but we can't buffer the leftover bytes for the next + // call. On a graceful bail-out we flush the leftover raw so the response + // stays whole. + if self.graceful_bail_out_on_memory_limit_exceeded { + self.parser.get_dispatcher().flush_for_bail_out(unconsumed); + } + + return Err(RewritingError::MemoryLimitExceeded(e)); + } self.has_buffered_data = true; } else { @@ -124,7 +161,18 @@ where trace!(@chunk chunk); - self.parser.parse(chunk, true)?; + if let Err(e) = self.parser.parse(chunk, true) { + // Same reasoning as in `write()`: if we can bail out gracefully, make sure the sink + // has all the input bytes before propagating the error. + if matches!(e, RewritingError::MemoryLimitExceeded(_)) + && self.graceful_bail_out_on_memory_limit_exceeded + { + self.parser.get_dispatcher().flush_for_bail_out(chunk); + } + + return Err(e); + } + self.parser.get_dispatcher().finish(chunk) } diff --git a/tests/fixtures/token_capturing.rs b/tests/fixtures/token_capturing.rs index 082e1907..419f5553 100644 --- a/tests/fixtures/token_capturing.rs +++ b/tests/fixtures/token_capturing.rs @@ -106,6 +106,7 @@ pub fn parse( encoding, next_encoding: Default::default(), strict: true, + graceful_bail_out_on_memory_limit_exceeded: false, }); let parser = transform_stream.parser(); diff --git a/tools/parser_trace/src/main.rs b/tools/parser_trace/src/main.rs index 82fe422c..4cb79f1b 100644 --- a/tools/parser_trace/src/main.rs +++ b/tools/parser_trace/src/main.rs @@ -104,6 +104,7 @@ fn main() { encoding: AsciiCompatibleEncoding::utf_8(), next_encoding: Default::default(), strict: true, + graceful_bail_out_on_memory_limit_exceeded: false, }); let parser = transform_stream.parser(); From 559fb99482e8e8b7ae6896e5bd31c23f22420509 Mon Sep 17 00:00:00 2001 From: Diogo Sousa Date: Wed, 20 May 2026 18:07:17 +0100 Subject: [PATCH 2/3] Allow rewriter to gracefully bail out on content handler errors. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the symmetric companion to the previous `MemoryLimitExceeded` bail-out. Until now, a content handler returning `Err` would abort the rewriter and leave the output sink in a potentially inconsistent state (the sink has whatever transformed bytes the rewriter had already produced, but the remaining input bytes are lost). Downstream this typically manifests as a truncated, broken HTTP response. This change adds an opt-in `Settings::graceful_bail_out_on_content_handler_error` flag. When set, before propagating `RewritingError::ContentHandlerError` the rewriter flushes every input byte it has received but not yet emitted to the sink, as-is. The caller can then continue the response by writing subsequent bytes directly to its own downstream sink, bypassing the (now poisoned) rewriter. The flag lives on `Settings` rather than `MemorySettings` because the underlying error has nothing to do with memory: a handler returning `Err` is an explicit signal from the application, not an environmental constraint. The two flags are intentionally independent so callers can pick the recovery posture per error class. The implementation requires a small but important refactor of the dispatcher. Previously, `lexeme_consumed()` did two things: emit the chunk of input before the lexeme, and advance `remaining_content_start` past the lexeme. The second step happened before the lexeme's token was actually serialized to the sink, so a handler error from `token_produced()` (or text emission) left `remaining_content_start` past a lexeme whose bytes had never been emitted. Flushing from there as-is would lose the lexeme. We split that into: 1. `emit_chunk_before_lexeme()`: emit the raw input chunk between the previously emitted byte and the start of the lexeme, advance `remaining_content_start` to the lexeme's start. 2. `consume_lexeme()`: advance `remaining_content_start` past the lexeme's end. Called only after the lexeme's token has been successfully emitted. With this split, on a `ContentHandlerError` mid-lexeme `remaining_content_start` still points at the failing lexeme's start, and a graceful bail-out can flush the lexeme + the rest of the chunk raw. The bail-out decision in `TransformStream::write()` and `end()` is centralized in `should_bail_out_for()`, which considers each `RewritingError` variant against its own flag. `ParsingAmbiguity` is never recovered from (the whole point of strict mode is to refuse uncertain markup). The C API is left on the original behavior for now. The previous memory flag fit cleanly into `MemorySettings` (a `#[repr(C)]` struct), so it was an additive change to the C ABI. The new flag lives on `Settings`, which is not `repr(C)`, so exposing it through the C API needs either a new function signature or a new settings struct — a maintainer call. A TODO captures this in `c-api/src/rewriter.rs`, and the CHANGELOG notes it. Tested with 6 new tests in `rewriter::tests::fatal_errors`: 1. `test_graceful_bail_out_on_element_handler_error`: element handler returns `Err`, sink ends up with the transformed prefix + the failing element and everything after it raw. 2. `test_no_graceful_bail_out_on_content_handler_error_by_default`: sanity check that the flag is opt-in. 3. `test_graceful_bail_out_on_comment_handler_error`: comment handler error path (exercises the same `lexeme_consumed` restructure). 4. `test_graceful_bail_out_on_end_handler_error`: `handle_end()` returns `Err` after `flush_remaining_input` has already emitted every input byte; sink already has the complete document. 5. `test_bail_out_reconstruct_handler_error_midstream`: end-to-end reconstruction (`sink_output == original_html`). 6. `test_bail_out_flags_independent`: enabling content-handler bail-out doesn't accidentally enable memory bail-out, and vice versa. Caveats documented on the new field's doc comment: 1. Active content removal at the time of the bail-out can produce mismatched tags. Same caveat as the memory flag; uncommon in practice and not byte-truncating. 2. Multi-chunk text emission where a later chunk's handler errors will re-emit the raw input bytes, duplicating the already-emitted chunks. Rare (only on encoding-converted text); the response is byte-bigger but not truncated. --- CHANGELOG.md | 4 + c-api/src/rewriter.rs | 4 + fuzz/test_case/src/lib.rs | 1 + src/rewriter/mod.rs | 191 +++++++++++++++++++++++++++++ src/rewriter/settings.rs | 49 ++++++++ src/selectors_vm/tests.rs | 1 + src/transform_stream/dispatcher.rs | 30 ++++- src/transform_stream/mod.rs | 38 ++++-- tests/fixtures/token_capturing.rs | 1 + tools/parser_trace/src/main.rs | 1 + 10 files changed, 307 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7c67e9dd..0db4639a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ flushes every input byte it has received but not yet emitted to the sink (as-is) before returning `MemoryLimitExceededError`, so callers can continue the response by writing subsequent bytes directly to their downstream sink instead of breaking it. +- Added `Settings::graceful_bail_out_on_content_handler_error`: symmetric to the memory flag + above, but for `RewritingError::ContentHandlerError`. When set, the rewriter flushes + remaining input bytes before propagating a handler error, preserving the response. + Currently exposed via the Rust API only; the C API still uses the original behavior. ## v2.9.0 diff --git a/c-api/src/rewriter.rs b/c-api/src/rewriter.rs index 402a5ea7..9fdd8dc9 100644 --- a/c-api/src/rewriter.rs +++ b/c-api/src/rewriter.rs @@ -61,6 +61,10 @@ fn lol_html_rewriter_build_inner( strict, enable_esi_tags, adjust_charset_on_meta_tag: false, + // TODO: expose `graceful_bail_out_on_content_handler_error` through the C API. Adding + // a new parameter to `lol_html_rewriter_build()` is a breaking ABI change, so it + // belongs behind a new function variant or a settings struct. + graceful_bail_out_on_content_handler_error: false, }; let output_sink = ExternOutputSink::new(output_sink, output_sink_user_data); diff --git a/fuzz/test_case/src/lib.rs b/fuzz/test_case/src/lib.rs index 8c570d8f..66b269b8 100644 --- a/fuzz/test_case/src/lib.rs +++ b/fuzz/test_case/src/lib.rs @@ -176,6 +176,7 @@ fn run_rewriter_iter(data: &[u8], selector: &str, encoding: &'static Encoding) { memory_settings: MemorySettings::new(), strict: false, adjust_charset_on_meta_tag: false, + graceful_bail_out_on_content_handler_error: false, }, |_: &[u8]| {}, ); diff --git a/src/rewriter/mod.rs b/src/rewriter/mod.rs index a034c91d..38934fd1 100644 --- a/src/rewriter/mod.rs +++ b/src/rewriter/mod.rs @@ -167,6 +167,8 @@ impl<'h, O: OutputSink, H: HandlerTypes> HtmlRewriter<'h, O, H> { let graceful_bail_out_on_memory_limit_exceeded = settings .memory_settings .graceful_bail_out_on_memory_limit_exceeded; + let graceful_bail_out_on_content_handler_error = + settings.graceful_bail_out_on_content_handler_error; let strict = settings.strict; let encoding = settings.encoding; @@ -188,6 +190,7 @@ impl<'h, O: OutputSink, H: HandlerTypes> HtmlRewriter<'h, O, H> { next_encoding, strict, graceful_bail_out_on_memory_limit_exceeded, + graceful_bail_out_on_content_handler_error, }); HtmlRewriter { @@ -1443,6 +1446,194 @@ mod tests { ); } + // --- Content-handler-error bail-out tests --- + // + // These mirror the memory bail-out tests but with handler errors as the trigger. The + // contract is the same: when `graceful_bail_out_on_content_handler_error = true`, the + // sink receives every input byte the rewriter had been given before the error. + + /// An element handler that returns `Err` aborts the rewriter. With graceful bail-out + /// enabled, the sink keeps every byte the caller had fed in, with the failing element + /// and everything after it flushed raw (no transformation), and earlier elements + /// transformed normally. + #[test] + fn test_graceful_bail_out_on_element_handler_error() { + let html = b"firstmiddlelast"; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![ + element!("a", |el| { + el.set_attribute("rewritten", "yes").unwrap(); + Ok(()) + }), + element!("stop", |_| Err("handler refused".into())), + ], + graceful_bail_out_on_content_handler_error: true, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter.write(html).unwrap_err(); + + assert!( + matches!(err, RewritingError::ContentHandlerError(_)), + "expected ContentHandlerError, got {err}", + ); + + // The full original bytes from `` onwards are present raw, and the `` tag + // before that has been transformed. + let output_str = std::str::from_utf8(&output).unwrap(); + + assert!( + output_str.starts_with("first"), + "expected transformed prefix, got {output_str:?}", + ); + assert!( + output_str.ends_with("middlelast"), + "expected raw bytes from the failing tag onwards, got {output_str:?}", + ); + } + + /// Without the opt-in flag, an element-handler error still aborts processing without + /// flushing remaining bytes (existing behavior preserved). + #[test] + fn test_no_graceful_bail_out_on_content_handler_error_by_default() { + let html = b"firstmiddle"; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![element!("stop", |_| Err( + "handler refused".into() + ))], + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter.write(html).unwrap_err(); + + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + assert!( + !output.ends_with(b"middle"), + "without graceful bail-out the sink must NOT contain the failing tag, got {output:?}", + ); + } + + /// A comment handler that returns `Err` is recoverable too. Comments live on the same + /// `lexeme_consumed` path as elements, so this exercises the same restructured ordering. + #[test] + fn test_graceful_bail_out_on_comment_handler_error() { + let html = b"
BeforeAfter
"; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![comments!("div", |_| { + Err("comment refused".into()) + })], + graceful_bail_out_on_content_handler_error: true, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter.write(html).unwrap_err(); + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + + // The whole document is in the sink (handler error doesn't lose bytes). + assert_eq!(output, html); + } + + /// A handler error from `handle_end` arrives after `flush_remaining_input` has already + /// emitted every input byte, so the sink already has the complete document. Bail-out + /// just propagates the error without losing anything. + #[test] + fn test_graceful_bail_out_on_end_handler_error() { + let html = b"
content
"; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + document_content_handlers: vec![end!(|_| Err("end refused".into()))], + graceful_bail_out_on_content_handler_error: true, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + rewriter.write(html).unwrap(); + + let err = rewriter.end().unwrap_err(); + + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + // All input bytes already in sink before `handle_end()` runs. + assert_eq!(output, html); + } + + /// Reconstruction test: when a handler in the middle of a document errors, the sink + /// output plus any unfed bytes must equal the original document. + #[test] + fn test_bail_out_reconstruct_handler_error_midstream() { + let html = b"

before

middle
after"; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![element!("div", |_| { + Err("div refused".into()) + })], + graceful_bail_out_on_content_handler_error: true, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter.write(html).unwrap_err(); + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + + assert_eq!( + output, html, + "response must be reconstructable byte-for-byte when handler errors midstream", + ); + } + + /// The two bail-out flags are independent: enabling content-handler bail-out does not + /// affect memory-limit behavior, and vice versa. + #[test] + fn test_bail_out_flags_independent() { + // Memory limit error with content-handler bail-out only: should NOT bail out. + const MAX: usize = 100; + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings { + element_content_handlers: vec![element!("*", |_| Ok(()))], + memory_settings: MemorySettings { + max_allowed_memory_usage: MAX, + preallocated_parsing_buffer_size: 0, + graceful_bail_out_on_memory_limit_exceeded: false, + }, + graceful_bail_out_on_content_handler_error: true, + ..Settings::new() + }, + |c: &[u8]| output.extend_from_slice(c), + ); + + let chunk_1 = format!("\"{}",", "r".repeat(MAX / 2)); + rewriter.write(chunk_1.as_bytes()).unwrap(); + let err = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + assert!(matches!(err, RewritingError::MemoryLimitExceeded(_))); + assert!( + output.is_empty(), + "content-handler flag must not enable memory bail-out, got {output:?}", + ); + } + #[test] fn content_handler_error_propagation() { fn assert_err<'h>( diff --git a/src/rewriter/settings.rs b/src/rewriter/settings.rs index 0ab39782..8adcbfde 100644 --- a/src/rewriter/settings.rs +++ b/src/rewriter/settings.rs @@ -1034,6 +1034,54 @@ pub struct Settings<'handlers, 'selectors, H: HandlerTypes = LocalHandlerTypes> /// /// `false` when constructed with `Settings::new()`. pub adjust_charset_on_meta_tag: bool, + + /// Controls how the rewriter recovers when a content handler returns an `Err`. + /// + /// When `false` (the default), the rewriter aborts processing the response, returns the + /// handler's [`RewritingError::ContentHandlerError`], and leaves the output sink in a + /// potentially inconsistent state. Downstream this typically manifests as a truncated, + /// broken response. + /// + /// When `true`, before propagating [`RewritingError::ContentHandlerError`] the rewriter + /// flushes every input byte it has received but not yet emitted to the sink, *as-is*. The + /// caller can then continue the response by writing any subsequent input bytes directly + /// to its own downstream sink, bypassing the (now poisoned) rewriter. The resulting + /// response will have the rewriter's transformations applied up to some boundary, + /// followed by the original bytes after that boundary, but the response will not be + /// broken. + /// + /// The rewriter is still poisoned after the error and must not be used again, regardless + /// of this setting. + /// + /// This is symmetric with + /// [`MemorySettings::graceful_bail_out_on_memory_limit_exceeded`], but kept as a separate + /// flag because the underlying error has different semantics: a memory limit is an + /// environmental constraint, whereas a content handler returning `Err` is an explicit + /// signal from the application that something is wrong with the input. Some callers will + /// want graceful recovery for one but not the other. + /// + /// ### Caveats + /// + /// 1. If a handler was actively removing element content (e.g. via [`Element::remove()`]) + /// at the time of the bail-out, the surrounding tags can end up mismatched in the + /// resulting response. In practice removing content is uncommon, and a + /// well-formed-but-imperfect response is still much better than a truncated one. + /// 2. If a text content handler returns an error after some chunks of the same text node + /// have already been emitted (rare; typically only happens with multi-chunk + /// encoding-converted text), the bail-out flush will re-emit the input bytes raw, + /// duplicating the already-emitted chunks. The response is byte-bigger but not + /// truncated. + /// + /// ### Default + /// + /// `false` when constructed with `Settings::new()`. + /// + /// [`MemorySettings::graceful_bail_out_on_memory_limit_exceeded`]: + /// struct.MemorySettings.html#structfield.graceful_bail_out_on_memory_limit_exceeded + /// [`RewritingError::ContentHandlerError`]: + /// errors/enum.RewritingError.html#variant.ContentHandlerError + /// [`Element::remove()`]: html_content/struct.Element.html#method.remove + pub graceful_bail_out_on_content_handler_error: bool, } impl Default for Settings<'_, '_, LocalHandlerTypes> { @@ -1074,6 +1122,7 @@ impl Settings<'_, '_, H> { strict: true, enable_esi_tags: false, adjust_charset_on_meta_tag: false, + graceful_bail_out_on_content_handler_error: false, } } } diff --git a/src/selectors_vm/tests.rs b/src/selectors_vm/tests.rs index 0f9c8f8c..5da2b1e5 100644 --- a/src/selectors_vm/tests.rs +++ b/src/selectors_vm/tests.rs @@ -90,6 +90,7 @@ mod vm_tests { memory_limiter: SharedMemoryLimiter::new(2048), strict: true, graceful_bail_out_on_memory_limit_exceeded: false, + graceful_bail_out_on_content_handler_error: false, }); transform_stream.write(&html).unwrap(); diff --git a/src/transform_stream/dispatcher.rs b/src/transform_stream/dispatcher.rs index 6b49db59..ca021631 100644 --- a/src/transform_stream/dispatcher.rs +++ b/src/transform_stream/dispatcher.rs @@ -124,7 +124,18 @@ where Ok(()) } - fn lexeme_consumed(&mut self, lexeme: &Lexeme<'_, T>) { + /// Emit the raw chunk of input that lies between the previously emitted byte and the + /// start of `lexeme`, and move `remaining_content_start` to the start of `lexeme`. + /// + /// This is the first half of what used to be `lexeme_consumed()`. It is split out so that + /// the second half ([`consume_lexeme()`]) can be deferred until after the lexeme's token + /// has been successfully emitted; that way, if token emission fails (e.g. a content + /// handler returns an error), `remaining_content_start` still points at the failing + /// lexeme's start and a graceful bail-out can flush the lexeme bytes raw rather than + /// losing them. + /// + /// [`consume_lexeme()`]: Self::consume_lexeme + fn emit_chunk_before_lexeme(&mut self, lexeme: &Lexeme<'_, T>) { let lexeme_range = lexeme.raw_range(); let chunk_range = Range { @@ -138,7 +149,16 @@ where self.output_sink.handle_chunk(&chunk); } - self.remaining_content_start = lexeme_range.end; + self.remaining_content_start = lexeme_range.start; + } + + /// Advance `remaining_content_start` past the end of `lexeme`, marking it as committed. + /// Must only be called after the lexeme's token has been successfully emitted; see + /// [`emit_chunk_before_lexeme()`]. + /// + /// [`emit_chunk_before_lexeme()`]: Self::emit_chunk_before_lexeme + fn consume_lexeme(&mut self, lexeme: &Lexeme<'_, T>) { + self.remaining_content_start = lexeme.raw_range().end; } #[inline] @@ -234,13 +254,14 @@ where { match lexeme.to_token(&mut self.delegate.capture_flags, self.encoding.get()) { ToTokenResult::Token(token) => { - self.delegate.lexeme_consumed(lexeme); + self.delegate.emit_chunk_before_lexeme(lexeme); self.delegate.token_produced(token)?; + self.delegate.consume_lexeme(lexeme); // handler for may have changed encoding self.flush_encoding_change(); } ToTokenResult::Text(text_type) => { - self.delegate.lexeme_consumed(lexeme); + self.delegate.emit_chunk_before_lexeme(lexeme); self.last_text_type = text_type; self.text_decoder.feed_text( lexeme.spanned(), @@ -255,6 +276,7 @@ where ) }, )?; + self.delegate.consume_lexeme(lexeme); } ToTokenResult::None => {} } diff --git a/src/transform_stream/mod.rs b/src/transform_stream/mod.rs index 001c9779..d42fafcb 100644 --- a/src/transform_stream/mod.rs +++ b/src/transform_stream/mod.rs @@ -24,6 +24,7 @@ where pub next_encoding: SharedEncoding, pub strict: bool, pub graceful_bail_out_on_memory_limit_exceeded: bool, + pub graceful_bail_out_on_content_handler_error: bool, } // Pub only for integration tests @@ -36,6 +37,7 @@ where buffer: Arena, has_buffered_data: bool, graceful_bail_out_on_memory_limit_exceeded: bool, + graceful_bail_out_on_content_handler_error: bool, } impl TransformStream @@ -74,6 +76,23 @@ where has_buffered_data: false, graceful_bail_out_on_memory_limit_exceeded: settings .graceful_bail_out_on_memory_limit_exceeded, + graceful_bail_out_on_content_handler_error: settings + .graceful_bail_out_on_content_handler_error, + } + } + + /// Returns whether the current settings allow bailing out gracefully on `err`. Memory and + /// content-handler errors are gated by independent flags; parsing-ambiguity errors are + /// never recovered from (the whole point of strict mode is to refuse uncertain markup). + fn should_bail_out_for(&self, err: &RewritingError) -> bool { + match err { + RewritingError::MemoryLimitExceeded(_) => { + self.graceful_bail_out_on_memory_limit_exceeded + } + RewritingError::ContentHandlerError(_) => { + self.graceful_bail_out_on_content_handler_error + } + RewritingError::ParsingAmbiguity(_) => false, } } @@ -106,13 +125,12 @@ where let consumed_byte_count = match self.parser.parse(chunk, false) { Ok(c) => c, Err(e) => { - // The parser ran out of memory mid-chunk (e.g. the selectors VM stack hit the - // limit). The dispatcher's `remaining_content_start` points to the first byte of - // `chunk` that hasn't been emitted yet, so on a graceful bail-out flushing from + // The parser failed mid-chunk. The dispatcher's `remaining_content_start` + // points to the first byte of `chunk` that hasn't been emitted yet (memory + // errors happen before `lexeme_consumed()`; content handler errors happen + // between `emit_chunk_before_lexeme()` and `consume_lexeme()`). Flushing from // there preserves all bytes the caller fed us. - if matches!(e, RewritingError::MemoryLimitExceeded(_)) - && self.graceful_bail_out_on_memory_limit_exceeded - { + if self.should_bail_out_for(&e) { self.parser.get_dispatcher().flush_for_bail_out(chunk); } @@ -164,15 +182,17 @@ where if let Err(e) = self.parser.parse(chunk, true) { // Same reasoning as in `write()`: if we can bail out gracefully, make sure the sink // has all the input bytes before propagating the error. - if matches!(e, RewritingError::MemoryLimitExceeded(_)) - && self.graceful_bail_out_on_memory_limit_exceeded - { + if self.should_bail_out_for(&e) { self.parser.get_dispatcher().flush_for_bail_out(chunk); } return Err(e); } + // `finish()` flushes any remaining input *first* and only then calls `handle_end()`, + // so a `ContentHandlerError` from the end handler arrives after the sink already has + // every input byte. No additional flush needed; the caller continues from where the + // rewriter left off. self.parser.get_dispatcher().finish(chunk) } diff --git a/tests/fixtures/token_capturing.rs b/tests/fixtures/token_capturing.rs index 419f5553..f3a1ccc1 100644 --- a/tests/fixtures/token_capturing.rs +++ b/tests/fixtures/token_capturing.rs @@ -107,6 +107,7 @@ pub fn parse( next_encoding: Default::default(), strict: true, graceful_bail_out_on_memory_limit_exceeded: false, + graceful_bail_out_on_content_handler_error: false, }); let parser = transform_stream.parser(); diff --git a/tools/parser_trace/src/main.rs b/tools/parser_trace/src/main.rs index 4cb79f1b..c13840ab 100644 --- a/tools/parser_trace/src/main.rs +++ b/tools/parser_trace/src/main.rs @@ -105,6 +105,7 @@ fn main() { next_encoding: Default::default(), strict: true, graceful_bail_out_on_memory_limit_exceeded: false, + graceful_bail_out_on_content_handler_error: false, }); let parser = transform_stream.parser(); From 402a19ac5f81a71740d4435b130f6964356e8317 Mon Sep 17 00:00:00 2001 From: Diogo Sousa Date: Wed, 20 May 2026 20:55:34 +0100 Subject: [PATCH 3/3] Bump to v3.0.0. The two preceding commits added public fields to `MemorySettings` and `Settings`, externally-constructible structs without `#[non_exhaustive]`. `cargo-semver-checks` flags this as requiring a major version bump. Also bumps `js-api` to match. `lol_html_c_api` keeps its own version cadence; the maintainer can bump it separately if desired. --- CHANGELOG.md | 4 +++- Cargo.toml | 2 +- c-api/Cargo.lock | 2 +- js-api/Cargo.lock | 4 ++-- js-api/Cargo.toml | 4 ++-- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0db4639a..59e45c1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## Unreleased +## v3.0.0 - Added `MemorySettings::graceful_bail_out_on_memory_limit_exceeded`: when set, the rewriter flushes every input byte it has received but not yet emitted to the sink (as-is) before @@ -10,6 +10,8 @@ above, but for `RewritingError::ContentHandlerError`. When set, the rewriter flushes remaining input bytes before propagating a handler error, preserving the response. Currently exposed via the Rust API only; the C API still uses the original behavior. +- Adding new fields to `MemorySettings` and `Settings` is a SemVer-breaking change for + existing struct-literal construction, hence the major version bump. ## v2.9.0 diff --git a/Cargo.toml b/Cargo.toml index 98622fc3..489b810a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lol_html" -version = "2.9.0" +version = "3.0.0" authors = ["Ivan Nikulin "] license = "BSD-3-Clause" description = "Streaming HTML rewriter/parser with CSS selector-based API" diff --git a/c-api/Cargo.lock b/c-api/Cargo.lock index af4d3cab..b2935794 100644 --- a/c-api/Cargo.lock +++ b/c-api/Cargo.lock @@ -137,7 +137,7 @@ checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "lol_html" -version = "2.9.0" +version = "3.0.0" dependencies = [ "bitflags", "cfg-if", diff --git a/js-api/Cargo.lock b/js-api/Cargo.lock index bf9ab83a..de0b9932 100644 --- a/js-api/Cargo.lock +++ b/js-api/Cargo.lock @@ -173,7 +173,7 @@ checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" [[package]] name = "lol-html" -version = "2.9.0" +version = "3.0.0" dependencies = [ "encoding_rs", "js-sys", @@ -186,7 +186,7 @@ dependencies = [ [[package]] name = "lol_html" -version = "2.9.0" +version = "3.0.0" dependencies = [ "bitflags", "cfg-if", diff --git a/js-api/Cargo.toml b/js-api/Cargo.toml index d74365fa..b4098110 100644 --- a/js-api/Cargo.toml +++ b/js-api/Cargo.toml @@ -2,7 +2,7 @@ name = "lol-html" description = "Streaming HTML rewriter/parser with CSS selector-based API" license = "BSD-3-Clause" -version = "2.9.0" +version = "3.0.0" authors = ["Ivan Nikulin ", "Gus Caplan "] repository = "https://github.com/cloudflare/lol-html" edition = "2024" @@ -14,7 +14,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] js-sys = "0.3.97" -lol_html_native = { package = "lol_html", path = "../", version = "2.9.0" } +lol_html_native = { package = "lol_html", path = "../", version = "3.0.0" } serde = { version = "1.0.228", features = ["derive"] } encoding_rs = "0.8.35" serde-wasm-bindgen = "0.6.5"