From 6f8c6d5448f72353dfcf3a57f74a9bc0887272e1 Mon Sep 17 00:00:00 2001 From: Iain McGinniss <309153+iainmcgin@users.noreply.github.com> Date: Fri, 26 Jun 2026 20:53:12 +0000 Subject: [PATCH] types: make WirePayload opaque so to_str reaches the slack-aware validator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WirePayload was a public-variant enum (Borrowed(&[u8]) / Owned(Bytes)). read_field_payload pre-sliced to &chunk[..len] before constructing Borrowed, so to_str() could only call plain validate_str — the custom-ProtoString decode path (decode_string_to) was the one string site #241's slack integration didn't reach. This makes WirePayload an opaque struct over a private repr enum whose Borrowed variant carries (chunk, len) with the invariant len <= chunk.len(); to_str() now calls validate_str_in(chunk, len) and reaches verify_with_slack whenever the chunk has SLACK readable bytes after the field. read_field_payload passes the full remaining chunk instead of pre-slicing. Public surface: same accessors (as_slice, to_str, into_bytes) plus new len / is_empty / is_owned, and borrowed(&[u8]) / owned(Bytes) constructors. Manual Debug prints only the field bytes, never the surrounding tail. from_wire impls that use the accessors are unaffected (all in-tree ones do); construct sites lowercase the constructor; match-on-variant moves to is_owned() + accessors. Breaking (0.9.0-tier): WirePayload shipped in 0.8.0 one day ago, and making it opaque now means the next repr change (single-chunk-Bytes zero-copy) is non-breaking. --- .../breaking-changes-20260626-204917.yaml | 17 ++ DESIGN.md | 4 +- buffa/src/types.rs | 169 ++++++++++++++---- examples/buffa-smolstr/src/lib.rs | 8 +- 4 files changed, 162 insertions(+), 36 deletions(-) create mode 100644 .changes/unreleased/breaking-changes-20260626-204917.yaml diff --git a/.changes/unreleased/breaking-changes-20260626-204917.yaml b/.changes/unreleased/breaking-changes-20260626-204917.yaml new file mode 100644 index 00000000..2e0f90b8 --- /dev/null +++ b/.changes/unreleased/breaking-changes-20260626-204917.yaml @@ -0,0 +1,17 @@ +kind: Breaking changes +body: |- + **`WirePayload` is now an opaque struct.** The 0.8.0 public-variant + enum (`Borrowed(&[u8])` / `Owned(Bytes)`) is replaced by a struct with + private fields, the same accessors (`as_slice`, `to_str`, `into_bytes`, + plus new `len` / `is_empty` / `is_owned`), and + `WirePayload::borrowed(&[u8])` / `WirePayload::owned(Bytes)` + constructors. `ProtoString` / `ProtoBytes` `from_wire` implementations + that use the accessors are unaffected; code that *constructed* + `WirePayload::Borrowed(..)` / `::Owned(..)` migrates by lowercasing to + `::borrowed(..)` / `::owned(..)`; code that *matched* on the variants + moves to the accessors — `is_owned()` covers the + take-the-`Bytes`-only-when-free pattern. The reshape lets a borrowed + payload carry the surrounding wire-buffer tail, so `to_str` now reaches + the slack-aware UTF-8 validator on the custom-`ProtoString` decode path + (it was the one `string` decode site #241 didn't cover). +time: 2026-06-26T20:49:17.028558817Z diff --git a/DESIGN.md b/DESIGN.md index 06cdc05f..a13bc856 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -532,9 +532,9 @@ By default a `string` field generates as `String` and a `bytes` field as `Vec) -> Result`, atop the supertraits `Clone + PartialEq + Default + Debug + Send + Sync`, plus `Deref` + `AsRef` + `From` + `From<&str>` for strings, and `Deref` + `AsRef<[u8]>` + `From>` for bytes. The `Deref` bound is load-bearing rather than cosmetic: generated code borrows a field as `&str` / `&[u8]` by plain reference coercion (`&self.field`), so a representation must `Deref` to the slice type, not merely offer `AsRef`. (`ProtoBytes` deliberately omits `From<&[u8]>`, because `bytes::Bytes` implements that only for `&'static [u8]`; requiring it would exclude `Bytes` itself.) There is deliberately **no blanket impl**: a representation implements the trait (a foreign type via a local newtype), and buffa ships the built-in impls for `String`, `Vec`, and `bytes::Bytes`. The earlier design *was* a blanket impl over `From` / `From>`; it was tightened to `from_wire` because the blanket path always pays an allocate-and-copy and a transient heap allocation — even for a short string an inline type could keep on the stack — so it disadvantaged every custom type. -**`from_wire` owns validation and borrow-vs-own; the defaults keep their specialized paths.** `WirePayload` is the decoder's gift to the representation: `Borrowed(&'a [u8])` when the field is contiguous in the input chunk (the common case for slice- and `Bytes`-backed sources) or `Owned(Bytes)` otherwise, exposing `as_slice()` (zero-copy borrow) and `into_bytes()` (zero-copy from a `Bytes`-backed source). So an inline string type validates `as_slice()` and stores it without a heap allocation, and a refcounted bytes type takes ownership with `into_bytes()`. `decode_string_to::()` / `decode_bytes_to::()` read one length-delimited field, build the payload, and call `S::from_wire` / `B::from_wire`. The default representations keep their hand-tuned fast paths: `String` reuses its buffer in place via `merge_string`, `Vec` via `merge_bytes` (`bytes::Bytes` and every other type go through `from_wire`). Clearing a field resets it to `Default::default()` rather than `.clear()`, because a substituted type may be immutable. The `From` / `From<&str>` supertraits remain, serving the JSON, text-format, and view→owned paths. The `arbitrary` support is type-agnostic: codegen attaches a generic builder (`arbitrary_proto_string::` / `arbitrary_proto_bytes::`) that materializes the canonical `String` / `Vec` and converts via `From`, so a substituted type needs no native `Arbitrary` impl. +**`from_wire` owns validation and borrow-vs-own; the defaults keep their specialized paths.** `WirePayload` is the decoder's gift to the representation: an opaque struct holding the field's bytes either borrowed from the input chunk (the common case for slice- and `Bytes`-backed sources) or owned as `Bytes` otherwise, exposing `as_slice()` (zero-copy borrow), `to_str()` (UTF-8-validated borrow, slack-aware on the borrowed path), `into_bytes()` (zero-copy when owned), and `is_owned()` to branch on which is free. So an inline string type validates via `to_str()` and stores it without a heap allocation, and a refcounted bytes type takes ownership with `into_bytes()`. `decode_string_to::()` / `decode_bytes_to::()` read one length-delimited field, build the payload, and call `S::from_wire` / `B::from_wire`. The default representations keep their hand-tuned fast paths: `String` reuses its buffer in place via `merge_string`, `Vec` via `merge_bytes` (`bytes::Bytes` and every other type go through `from_wire`). Clearing a field resets it to `Default::default()` rather than `.clear()`, because a substituted type may be immutable. The `From` / `From<&str>` supertraits remain, serving the JSON, text-format, and view→owned paths. The `arbitrary` support is type-agnostic: codegen attaches a generic builder (`arbitrary_proto_string::` / `arbitrary_proto_bytes::`) that materializes the canonical `String` / `Vec` and converts via `From`, so a substituted type needs no native `Arbitrary` impl. -A note on what is *not* yet optimal: a `Borrowed` payload (single-chunk source — the common case) copies into an `Owned(Bytes)` only when a representation calls `into_bytes()`, and the `Owned` variant is currently produced only for multi-chunk sources. So a refcounted bytes representation does not yet zero-copy *share* a single `Bytes` input buffer — it is no worse than `Vec` today, and a per-type `PREFERS_OWNED_BYTES` preference is a future, additive (non-breaking) enhancement. Inline string representations already get their full win (zero heap for short strings) regardless. +A note on what is *not* yet optimal: a borrowed payload (single-chunk source — the common case) copies into an owned `Bytes` only when a representation calls `into_bytes()`, and an owned payload is currently produced only for multi-chunk sources. So a refcounted bytes representation does not yet zero-copy *share* a single `Bytes` input buffer — it is no worse than `Vec` today, and a per-type `PREFERS_OWNED_BYTES` preference is a future, additive (non-breaking) enhancement. Inline string representations already get their full win (zero heap for short strings) regardless. **Foreign types reach the seam by newtype.** A foreign type cannot implement `ProtoString` / `ProtoBytes` directly (orphan rule), so it wraps in a local newtype that implements the trait — a thin `from_wire` plus the supertrait forwards. The [`examples/custom-types/`](examples/custom-types/) crate is the end-to-end reference (one newtype per knob, round-tripped through binary and JSON); the **`buffa-smolstr`** crate is the minimal single-type template: `pub struct SmolStr(smol_str::SmolStr)` with an inline, allocation-free `from_wire` (validate `as_slice()`, build via `SmolStr::from(&str)`), and an optional `serde` feature so it works in `optional` / `repeated` JSON fields. The preset crates and downstream fixtures follow the same shape. There is one caveat, specific to a custom type used as the element of a **`repeated`** field or as a **`map`** value: codegen must emit per-element trait impls that buffa cannot blanket-provide for a foreign type. Vtable reflection needs `impl ReflectElement` (so the field's reflective `get` can return `ValueRef::List` / `ValueRef::Map`), and the JSON `bytes` element path needs `impl ProtoElemJson` (base64); both have no blanket impl (they would collide with the concrete scalar impls under coherence, see §11), so codegen emits them — and an emitted `impl ForeignTrait for ForeignType` only compiles when the element type is **local** to the generating crate. (A custom `string` element also serializes its JSON through the element's own `Serialize`/`Deserialize`, which a local newtype derives.) So a custom `repeated` element or `map` value must be a crate-local type; every other use — singular / optional / oneof fields, bridge-mode reflection, non-reflective use — works with a foreign type directly. (A custom `bytes` map value is honored just like the built-in `bytes::Bytes`; only the `map` carve-out keeps `Vec` values.) diff --git a/buffa/src/types.rs b/buffa/src/types.rs index 83715de2..b5f1f487 100644 --- a/buffa/src/types.rs +++ b/buffa/src/types.rs @@ -701,41 +701,121 @@ pub fn merge_string(value: &mut String, buf: &mut impl Buf) -> Result<(), Decode /// construct itself directly from the wire — validating (or skipping validation) /// and choosing borrow-vs-own on its own terms. /// -/// The decoder hands over `Borrowed` when the field's bytes are contiguous in -/// the current input chunk (the common case for slice- and `Bytes`-backed -/// sources) and `Owned` only otherwise (e.g. a field straddling a `Chain` -/// boundary). A representation validates-and-borrows with +/// The decoder hands over a borrowed payload when the field's bytes are +/// contiguous in the current input chunk (the common case for slice- and +/// `Bytes`-backed sources) and an owned [`Bytes`] only otherwise (e.g. a field +/// straddling a `Chain` boundary). A representation validates-and-borrows with /// [`to_str`](Self::to_str), reads the raw bytes with /// [`as_slice`](Self::as_slice) (always zero-copy), or takes ownership with -/// [`into_bytes`](Self::into_bytes) (zero-copy only for an `Owned` payload — -/// see that method). -#[derive(Debug, Clone)] -pub enum WirePayload<'a> { - /// The field's bytes borrowed directly from the input buffer. - Borrowed(&'a [u8]), - /// The field's bytes owned as `Bytes` (reference-counted). Produced today - /// only for multi-chunk sources; a single-chunk source — including a single - /// `Bytes` buffer — currently arrives as `Borrowed`. +/// [`into_bytes`](Self::into_bytes) (zero-copy only for an owned payload — see +/// that method). +/// +/// Opaque so that a borrowed payload can carry the surrounding wire-buffer tail +/// for the slack-aware UTF-8 validator without exposing it to consumers; use the +/// accessors above and the [`borrowed`](Self::borrowed) / [`owned`](Self::owned) +/// constructors. +#[derive(Clone)] +pub struct WirePayload<'a>(WirePayloadRepr<'a>); + +impl core::fmt::Debug for WirePayload<'_> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + // Print only the field bytes, never the surrounding tail (which for a + // decoder-constructed payload is subsequent fields' wire bytes). + f.debug_struct("WirePayload") + .field("bytes", &self.as_slice()) + .field("owned", &self.is_owned()) + .finish() + } +} + +#[derive(Clone)] +enum WirePayloadRepr<'a> { + /// `chunk[..len]` is the field; `chunk[len..]` is the surrounding + /// wire-buffer tail (empty when constructed via `WirePayload::borrowed`). + /// Invariant: `len <= chunk.len()`. + Borrowed { + chunk: &'a [u8], + len: usize, + }, Owned(Bytes), } -impl WirePayload<'_> { +impl<'a> WirePayload<'a> { + /// A payload borrowing exactly `field` (no surrounding wire buffer). + /// + /// Tests and hand-built payloads use this; the decoder uses an internal + /// constructor that also carries the surrounding tail so + /// [`to_str`](Self::to_str) can take the slack-aware fast path. + #[inline] + #[must_use] + pub const fn borrowed(field: &'a [u8]) -> Self { + Self(WirePayloadRepr::Borrowed { + chunk: field, + len: field.len(), + }) + } + + /// A payload owning `bytes`. + #[inline] + #[must_use] + pub const fn owned(bytes: Bytes) -> Self { + Self(WirePayloadRepr::Owned(bytes)) + } + + /// `chunk[..len]` is the field; `chunk[len..]` is readable tail. The caller + /// has established `len <= chunk.len()` (the EOF check before every call). + #[inline] + pub(crate) fn borrowed_in(chunk: &'a [u8], len: usize) -> Self { + debug_assert!(len <= chunk.len()); + Self(WirePayloadRepr::Borrowed { chunk, len }) + } + /// Borrow the field's bytes (zero-copy in both variants). #[inline] #[must_use] pub fn as_slice(&self) -> &[u8] { - match self { - WirePayload::Borrowed(s) => s, - WirePayload::Owned(b) => b, + match &self.0 { + WirePayloadRepr::Borrowed { chunk, len } => &chunk[..*len], + WirePayloadRepr::Owned(b) => b, } } + /// Field length in bytes. + #[inline] + #[must_use] + pub fn len(&self) -> usize { + match &self.0 { + WirePayloadRepr::Borrowed { len, .. } => *len, + WirePayloadRepr::Owned(b) => b.len(), + } + } + + /// Whether the field is empty. + #[inline] + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Whether this payload owns its bytes (and so [`into_bytes`](Self::into_bytes) + /// is zero-copy). A `from_wire` impl that wants the [`Bytes`] only when it's + /// free can branch on this and fall back to [`as_slice`](Self::as_slice) + /// otherwise. + #[inline] + #[must_use] + pub fn is_owned(&self) -> bool { + matches!(self.0, WirePayloadRepr::Owned(_)) + } + /// Borrow the payload as a `&str` if it is valid UTF-8. /// /// Convenience for [`ProtoString::from_wire`] implementations; uses /// buffa's UTF-8 validator (so it picks up the `fast-utf8` feature when /// enabled) and returns the same [`DecodeError::InvalidUtf8`] the - /// built-in `String` representation would. + /// built-in `String` representation would. For a borrowed payload that + /// carries surrounding wire-buffer tail (the decoder's common case) this + /// reaches the slack-aware fast path; the `borrowed`/`owned` constructors + /// have no tail, so they take the plain validator. /// /// # Errors /// @@ -743,23 +823,30 @@ impl WirePayload<'_> { #[inline] #[must_use = "the validated `&str` is the only way to use the payload as a string"] pub fn to_str(&self) -> Result<&str, DecodeError> { - validate_str(self.as_slice()) + match &self.0 { + // SAFETY: the `Borrowed` invariant is `len <= chunk.len()`, + // satisfying `validate_str_in`'s precondition. Established at + // `borrowed_in` (debug-asserted) by the decoder's preceding EOF + // check, and at `borrowed` by `len = field.len()`. + WirePayloadRepr::Borrowed { chunk, len } => unsafe { validate_str_in(chunk, *len) }, + WirePayloadRepr::Owned(b) => validate_str(b), + } } /// Take ownership of the field's bytes as [`Bytes`]. /// - /// Zero-copy only for an `Owned` payload, which today is produced only for + /// Zero-copy only for an owned payload, which today is produced only for /// multi-chunk sources — a single-chunk source (including a single `Bytes` - /// buffer) arrives as `Borrowed` and is copied here. For a guaranteed - /// zero-copy `bytes` field path use the built-in `bytes::Bytes` representation + /// buffer) arrives borrowed and is copied here. For a guaranteed zero-copy + /// `bytes` field path use the built-in `bytes::Bytes` representation /// ([`decode_bytes_to_bytes`]); single-chunk-`Bytes` zero-copy for custom /// types is a planned additive enhancement. #[inline] #[must_use] pub fn into_bytes(self) -> Bytes { - match self { - WirePayload::Borrowed(s) => Bytes::copy_from_slice(s), - WirePayload::Owned(b) => b, + match self.0 { + WirePayloadRepr::Borrowed { chunk, len } => Bytes::copy_from_slice(&chunk[..len]), + WirePayloadRepr::Owned(b) => b, } } } @@ -793,14 +880,15 @@ pub(crate) fn read_field_payload( } let chunk = buf.chunk(); if chunk.len() >= len { - // Whole field is contiguous: hand over a borrowed slice (zero-copy). - let r = f(WirePayload::Borrowed(&chunk[..len]))?; + // Whole field is contiguous: hand over a borrowed payload carrying the + // full remaining chunk (so `to_str` can take the slack-aware path). + let r = f(WirePayload::borrowed_in(chunk, len))?; buf.advance(len); Ok(r) } else { // Field straddles chunk boundaries: take an owned `Bytes` (zero-copy // when `buf` is `Bytes`-backed, a copy otherwise). - f(WirePayload::Owned(buf.copy_to_bytes(len))) + f(WirePayload::owned(buf.copy_to_bytes(len))) } } @@ -1558,11 +1646,32 @@ mod tests { #[test] fn wire_payload_to_str() { - assert_eq!(WirePayload::Borrowed(b"abc").to_str().unwrap(), "abc"); + assert_eq!(WirePayload::borrowed(b"abc").to_str().unwrap(), "abc"); + assert!(matches!( + WirePayload::borrowed(&[0xFF]).to_str(), + Err(DecodeError::InvalidUtf8) + )); + // Decoder-constructed payload with surrounding tail: `to_str` reaches + // `validate_str_in` and the tail is excluded from the result. + let wire = b"abc\x00\x00\x00\x00\x00\x00\x00\x00trailing"; + assert_eq!(WirePayload::borrowed_in(wire, 3).to_str().unwrap(), "abc"); + assert_eq!(WirePayload::borrowed_in(wire, 3).as_slice(), b"abc"); + assert_eq!(WirePayload::borrowed_in(wire, 3).len(), 3); + // Tail bytes after an invalid field don't rescue it. assert!(matches!( - WirePayload::Borrowed(&[0xFF]).to_str(), + WirePayload::borrowed_in(b"\xFFtailtailtail", 1).to_str(), Err(DecodeError::InvalidUtf8) )); + // Owned path is unchanged. + let p = WirePayload::owned(Bytes::from_static(b"hi")); + assert_eq!(p.to_str().unwrap(), "hi"); + assert!(p.is_owned()); + assert!(!WirePayload::borrowed(b"hi").is_owned()); + assert!(WirePayload::borrowed(b"").is_empty()); + // Debug never prints the surrounding tail. + let dbg = alloc::format!("{:?}", WirePayload::borrowed_in(wire, 3)); + assert!(dbg.contains("[97, 98, 99]"), "{dbg}"); + assert!(!dbg.contains("trailing"), "{dbg}"); } /// A custom string representation that enforces an extra invariant in diff --git a/examples/buffa-smolstr/src/lib.rs b/examples/buffa-smolstr/src/lib.rs index 1db06a77..37523eaa 100644 --- a/examples/buffa-smolstr/src/lib.rs +++ b/examples/buffa-smolstr/src/lib.rs @@ -102,7 +102,7 @@ mod tests { fn short_string_decodes_inline_without_heap() { // 5 bytes is well within smol_str's 23-byte inline capacity, so // `from_wire` must produce an inline value with no heap allocation. - let s = SmolStr::from_wire(WirePayload::Borrowed(b"hello")).unwrap(); + let s = SmolStr::from_wire(WirePayload::borrowed(b"hello")).unwrap(); assert_eq!(s.as_ref(), "hello"); assert!( !s.0.is_heap_allocated(), @@ -113,18 +113,18 @@ mod tests { #[test] fn long_string_roundtrips() { let long = "x".repeat(64); - let s = SmolStr::from_wire(WirePayload::Borrowed(long.as_bytes())).unwrap(); + let s = SmolStr::from_wire(WirePayload::borrowed(long.as_bytes())).unwrap(); assert_eq!(s.as_ref(), long.as_str()); } #[test] fn invalid_utf8_is_rejected() { - assert!(SmolStr::from_wire(WirePayload::Borrowed(&[0xff, 0xfe])).is_err()); + assert!(SmolStr::from_wire(WirePayload::borrowed(&[0xff, 0xfe])).is_err()); } #[test] fn owned_payload_decodes() { - let payload = WirePayload::Owned(::buffa::bytes::Bytes::from_static(b"hi")); + let payload = WirePayload::owned(::buffa::bytes::Bytes::from_static(b"hi")); let s = SmolStr::from_wire(payload).unwrap(); assert_eq!(s.as_ref(), "hi"); assert!(!s.0.is_heap_allocated());