From b2c588e6423b8d5a7e060786cd52c0d4e11e790a Mon Sep 17 00:00:00 2001 From: OmarAlJarrah Date: Tue, 16 Jun 2026 16:18:40 +0300 Subject: [PATCH 1/3] feat: charset-aware, binary-safe body previews in request/response logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Body previews in the default sync and async instrumentation steps decoded every captured payload as UTF-8. That produced mojibake for a text body declared in another charset (a latin-1 response logged `café` as `caf?`) and a run of U+FFFD replacement characters for a binary body (gzip, an image), which is both unreadable and capable of corrupting log viewers. Add a shared `BodyPreview` renderer that the two steps delegate to: - Text bodies are decoded with the charset declared on the body's `MediaType`, falling back to UTF-8 when the media type omits a charset or names one the JVM cannot resolve. - A short text/binary sniff (NUL byte, or a high ratio of non-whitespace control bytes in a leading sample) classifies binary bodies, which are rendered as a size-only `[binary N bytes]` summary instead of being decoded as text. Decoding still substitutes the replacement character for malformed input rather than throwing, so a snapshot truncated mid-multibyte-sequence by the bounded capture cannot crash the log line. The bounded/streaming capture behaviour of the loggable bodies is unchanged. --- .../core/http/pipeline/steps/BodyPreview.kt | 127 ++++++++++++++++ .../steps/DefaultAsyncInstrumentationStep.kt | 13 +- .../steps/DefaultInstrumentationStep.kt | 13 +- .../http/pipeline/steps/BodyPreviewTest.kt | 85 +++++++++++ .../pipeline/steps/InstrumentationStepTest.kt | 137 +++++++++++++++++- 5 files changed, 352 insertions(+), 23 deletions(-) create mode 100644 sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt create mode 100644 sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt new file mode 100644 index 00000000..dfb8146e --- /dev/null +++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2026 dexpace and Omar Aljarrah + * + * Licensed under the MIT License. See LICENSE in the project root. + * SPDX-License-Identifier: MIT + */ + +package org.dexpace.sdk.core.http.pipeline.steps + +import org.dexpace.sdk.core.http.common.MediaType +import java.nio.charset.Charset + +/** + * Renders captured request/response body bytes into a short, log-safe preview string. + * + * The default instrumentation steps previously decoded every preview as UTF-8, which produced + * mojibake for a text body declared in another charset (e.g. ISO-8859-1) and a stream of + * `U+FFFD` replacement characters for a binary body (a gzip payload, an image, …). This helper + * makes previews: + * + * - **Charset-aware** — a text body is decoded with the charset declared on its [MediaType] + * (`Content-Type: text/plain;charset=ISO-8859-1`). When the media type omits a charset, or + * names one the JVM does not recognise, the preview falls back to [DEFAULT_CHARSET] (UTF-8), + * matching the HTTP default for most text types and the previous behaviour. + * - **Binary-safe** — a body that does not look like text is never decoded. Instead it is + * summarised as `[binary N bytes]`, so an operator sees a stable, greppable marker rather + * than a line of replacement characters that can corrupt log viewers. + * + * Decoding itself never throws: [Charset]-based decoding substitutes the replacement character + * for malformed input, so a snapshot that ends mid-multibyte-sequence (a real possibility on a + * bounded capture) still yields a usable preview rather than crashing the log line. + * + * This is an `internal` seam shared by [DefaultInstrumentationStep] and + * [DefaultAsyncInstrumentationStep]; it has no public API surface. + */ +internal object BodyPreview { + /** Charset used when a text body declares no charset, or names an unknown one. */ + internal val DEFAULT_CHARSET: Charset = Charsets.UTF_8 + + /** + * Number of leading bytes inspected by [isProbablyText]. A small fixed sample keeps the + * heuristic O(1) regardless of body size while still catching the common binary signatures + * (NUL bytes, control-byte runs) that appear near the start of a payload. + */ + private const val SNIFF_SAMPLE_BYTES = 1024 + + /** + * Fraction of sampled bytes that may be non-text control bytes before the body is judged + * binary. Real text can carry the odd control byte (an ESC sequence, a stray form feed), so + * the cutoff sits well above zero rather than rejecting on the first control byte. + */ + private const val MAX_CONTROL_BYTE_RATIO = 0.30 + + // Byte values used by the text/binary heuristic, named so the rule logic reads clearly and + // so the values are not flagged as inline magic numbers. + private const val UNSIGNED_BYTE_MASK = 0xFF + private const val NUL = 0x00 + private const val FIRST_PRINTABLE = 0x20 // space; bytes below this are C0 control bytes + private const val TAB = 0x09 + private const val LINE_FEED = 0x0A + private const val CARRIAGE_RETURN = 0x0D + private const val DEL = 0x7F + + /** + * Renders [bytes] as a preview string. + * + * Empty input yields the empty string. A body that does not pass [isProbablyText] is rendered + * as a size-only `[binary N bytes]` summary. Otherwise the bytes are decoded with the charset + * from [mediaType] (or [DEFAULT_CHARSET] when absent/unknown). + */ + internal fun render( + bytes: ByteArray, + mediaType: MediaType?, + ): String { + if (bytes.isEmpty()) return "" + if (!isProbablyText(bytes)) return binarySummary(bytes.size) + return previewText(bytes, mediaType) + } + + /** + * Decodes [bytes] using the charset declared on [mediaType], falling back to + * [DEFAULT_CHARSET] when the media type is null, declares no charset, or names a charset the + * JVM cannot resolve ([MediaType.charset] returns null in the latter two cases). Invalid byte + * sequences are replaced rather than throwing. + */ + internal fun previewText( + bytes: ByteArray, + mediaType: MediaType?, + ): String { + if (bytes.isEmpty()) return "" + val charset = mediaType?.charset ?: DEFAULT_CHARSET + return String(bytes, charset) + } + + /** + * Heuristically decides whether [bytes] is text. Samples the first [SNIFF_SAMPLE_BYTES]: + * a single NUL byte is treated as a strong binary signal, and a control-byte ratio above + * [MAX_CONTROL_BYTE_RATIO] (excluding the common text whitespace `\t`, `\n`, `\r`) also marks + * the body binary. Bytes >= 0x80 are not counted against the body — they are legitimate in + * UTF-8 multibyte sequences and in single-byte charsets such as ISO-8859-1. + */ + @Suppress("ReturnCount") + internal fun isProbablyText(bytes: ByteArray): Boolean { + if (bytes.isEmpty()) return true + val sample = minOf(bytes.size, SNIFF_SAMPLE_BYTES) + var controlBytes = 0 + for (i in 0 until sample) { + val b = bytes[i].toInt() and UNSIGNED_BYTE_MASK + // A NUL byte essentially never appears in real text and is the canonical marker of a + // binary payload (gzip, images, protobuf, …); reject immediately. + if (b == NUL) return false + if (isControlByte(b)) controlBytes++ + } + return controlBytes.toDouble() / sample <= MAX_CONTROL_BYTE_RATIO + } + + /** + * True for an ASCII C0 control byte that is not one of the whitespace characters routinely + * found in text ([TAB], [LINE_FEED], [CARRIAGE_RETURN]), or for the [DEL] byte. High bytes + * (>= 0x80) are deliberately excluded — see [isProbablyText]. + */ + private fun isControlByte(b: Int): Boolean = + (b < FIRST_PRINTABLE && b != TAB && b != LINE_FEED && b != CARRIAGE_RETURN) || b == DEL + + /** The size-only summary emitted for a binary body. */ + private fun binarySummary(size: Int): String = "[binary $size bytes]" +} diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultAsyncInstrumentationStep.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultAsyncInstrumentationStep.kt index 9d481d35..99d02131 100644 --- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultAsyncInstrumentationStep.kt +++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultAsyncInstrumentationStep.kt @@ -308,13 +308,13 @@ public class DefaultAsyncInstrumentationStep if (requestBody != null) { val preview = requestBody.snapshot(options.bodyPreviewMaxBytes) ev.field("request.body.size", preview.size.toLong()) - .field("request.body.preview", utf8Preview(preview)) + .field("request.body.preview", BodyPreview.render(preview, requestBody.mediaType())) } val responseBody = response.body if (responseBody is LoggableResponseBody) { val preview = responseBody.snapshot(options.bodyPreviewMaxBytes) ev.field("response.body.size", preview.size.toLong()) - .field("response.body.preview", utf8Preview(preview)) + .field("response.body.preview", BodyPreview.render(preview, responseBody.mediaType())) responseBody.captureException?.let { ev.field("response.body.drain_error", it.javaClass.simpleName ?: "Throwable") } @@ -346,7 +346,7 @@ public class DefaultAsyncInstrumentationStep if (shouldCaptureBody() && requestBody != null) { val preview = requestBody.snapshot(options.bodyPreviewMaxBytes) ev.field("request.body.size", preview.size.toLong()) - .field("request.body.preview", utf8Preview(preview)) + .field("request.body.preview", BodyPreview.render(preview, requestBody.mediaType())) } ev.log() } catch (t: Throwable) { @@ -416,13 +416,6 @@ public class DefaultAsyncInstrumentationStep private fun elapsedMillis(startNanos: Long): Double = (clock.monotonic() - startNanos) / NANOS_PER_MILLI_DOUBLE - private fun utf8Preview(bytes: ByteArray): String { - if (bytes.isEmpty()) return "" - // Defensive: a snapshot that ends mid-UTF-8 codepoint shouldn't crash the log line. - // String(bytes, charset) replaces invalid sequences rather than throwing. - return String(bytes, Charsets.UTF_8) - } - private fun emitInstrumentationError( event: String, phase: String, diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultInstrumentationStep.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultInstrumentationStep.kt index 1dfbc908..2b7e84b1 100644 --- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultInstrumentationStep.kt +++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultInstrumentationStep.kt @@ -212,13 +212,13 @@ public class DefaultInstrumentationStep if (requestBody != null) { val preview = requestBody.snapshot(options.bodyPreviewMaxBytes) ev.field("request.body.size", preview.size.toLong()) - .field("request.body.preview", utf8Preview(preview)) + .field("request.body.preview", BodyPreview.render(preview, requestBody.mediaType())) } val responseBody = response.body if (responseBody is LoggableResponseBody) { val preview = responseBody.snapshot(options.bodyPreviewMaxBytes) ev.field("response.body.size", preview.size.toLong()) - .field("response.body.preview", utf8Preview(preview)) + .field("response.body.preview", BodyPreview.render(preview, responseBody.mediaType())) responseBody.captureException?.let { ev.field("response.body.drain_error", it.javaClass.simpleName ?: "Throwable") } @@ -250,7 +250,7 @@ public class DefaultInstrumentationStep if (shouldCaptureBody() && requestBody != null) { val preview = requestBody.snapshot(options.bodyPreviewMaxBytes) ev.field("request.body.size", preview.size.toLong()) - .field("request.body.preview", utf8Preview(preview)) + .field("request.body.preview", BodyPreview.render(preview, requestBody.mediaType())) } ev.log() } catch (t: Throwable) { @@ -320,13 +320,6 @@ public class DefaultInstrumentationStep private fun elapsedMillis(startNanos: Long): Double = (clock.monotonic() - startNanos) / NANOS_PER_MILLI_DOUBLE - private fun utf8Preview(bytes: ByteArray): String { - if (bytes.isEmpty()) return "" - // Defensive: a snapshot that ends mid-UTF-8 codepoint shouldn't crash the log line. - // String(bytes, charset) replaces invalid sequences rather than throwing. - return String(bytes, Charsets.UTF_8) - } - private fun emitInstrumentationError( event: String, phase: String, diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt new file mode 100644 index 00000000..d1939776 --- /dev/null +++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2026 dexpace and Omar Aljarrah + * + * Licensed under the MIT License. See LICENSE in the project root. + * SPDX-License-Identifier: MIT + */ + +package org.dexpace.sdk.core.http.pipeline.steps + +import org.dexpace.sdk.core.http.common.MediaType +import java.nio.charset.StandardCharsets +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertFalse +import kotlin.test.assertTrue + +class BodyPreviewTest { + @Test + fun `empty bytes render to empty string`() { + assertEquals("", BodyPreview.render(ByteArray(0), MediaType.parse("text/plain"))) + } + + @Test + fun `text body declared as ISO-8859-1 is decoded with that charset`() { + // 0xE9 is 'é' in ISO-8859-1. Decoded as UTF-8 it would be the U+FFFD replacement char. + val bytes = "café".toByteArray(StandardCharsets.ISO_8859_1) + val preview = BodyPreview.render(bytes, MediaType.parse("text/plain;charset=ISO-8859-1")) + assertEquals("café", preview) + assertFalse(preview.contains('�'), "latin-1 body must not be decoded as mojibake") + } + + @Test + fun `text body declared as UTF-8 is decoded with UTF-8`() { + val bytes = "naïve — text".toByteArray(StandardCharsets.UTF_8) + assertEquals("naïve — text", BodyPreview.render(bytes, MediaType.parse("text/plain;charset=utf-8"))) + } + + @Test + fun `text body with no declared charset falls back to UTF-8`() { + val bytes = "résumé".toByteArray(StandardCharsets.UTF_8) + // No charset parameter on the media type → documented default is UTF-8. + assertEquals("résumé", BodyPreview.render(bytes, MediaType.parse("text/plain"))) + // A null media type also falls back to UTF-8. + assertEquals("résumé", BodyPreview.render(bytes, null)) + } + + @Test + fun `unknown declared charset falls back to UTF-8`() { + val bytes = "hello".toByteArray(StandardCharsets.UTF_8) + // MediaType.charset returns null for an unrecognised charset name; preview falls back. + assertEquals("hello", BodyPreview.render(bytes, MediaType.parse("text/plain;charset=not-a-real-charset"))) + } + + @Test + fun `binary body with NUL byte is summarised as size only and not decoded`() { + // gzip magic header (0x1f 0x8b 0x08) followed by a NUL byte — a representative binary body. + val bytes = byteArrayOf(0x1f, 0x8b.toByte(), 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03) + val preview = BodyPreview.render(bytes, MediaType.parse("application/gzip")) + assertEquals("[binary ${bytes.size} bytes]", preview) + assertFalse(preview.contains('�'), "binary body must not be rendered as replacement chars") + } + + @Test + fun `binary body declared with a text charset is still detected as binary`() { + // Even if a server mislabels a gzip stream as text, the NUL byte keeps it from being decoded. + val bytes = byteArrayOf(0x1f, 0x8b.toByte(), 0x00, 0x42, 0x00, 0x13) + val preview = BodyPreview.render(bytes, MediaType.parse("text/plain;charset=utf-8")) + assertEquals("[binary ${bytes.size} bytes]", preview) + } + + @Test + fun `isProbablyText accepts plain ASCII and latin-1 high bytes`() { + assertTrue(BodyPreview.isProbablyText("plain ascii text".toByteArray(StandardCharsets.US_ASCII))) + assertTrue(BodyPreview.isProbablyText("café".toByteArray(StandardCharsets.ISO_8859_1))) + // Common text whitespace control bytes do not flip the verdict. + assertTrue(BodyPreview.isProbablyText("line1\r\n\tline2".toByteArray(StandardCharsets.UTF_8))) + } + + @Test + fun `isProbablyText rejects a NUL byte and a control-byte-heavy body`() { + assertFalse(BodyPreview.isProbablyText(byteArrayOf(0x41, 0x00, 0x42))) + // A run of non-whitespace C0 control bytes exceeds the ratio cutoff. + assertFalse(BodyPreview.isProbablyText(byteArrayOf(0x01, 0x02, 0x03, 0x04, 0x05))) + } +} diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt index 6658ea18..f97108c3 100644 --- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt +++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt @@ -617,9 +617,9 @@ class InstrumentationStepTest { } @Test - fun `empty body produces empty utf8 preview`() { - // utf8Preview takes a fast empty branch on a zero-length array. Send a request that - // results in a zero-byte response body so the preview hits the empty path. + fun `empty body produces empty preview`() { + // The preview renderer takes a fast empty branch on a zero-length array. Send a request + // that results in a zero-byte response body so the preview hits the empty path. val fake = FakeHttpClient().enqueue { status(200).body("", MediaType.parse("text/plain")) } val pipeline = HttpPipelineBuilder(fake) @@ -635,6 +635,119 @@ class InstrumentationStepTest { response.close() } + @Test + fun `response body preview honours the declared ISO-8859-1 charset`() { + // 0xE9 is 'é' in ISO-8859-1. Decoded as UTF-8 (the old hardcoded assumption) it would be + // the U+FFFD replacement char, so this pins charset-aware decoding end to end. + val fakeSlf4j = FakeSlf4jLogger("test.instrumentation") + val clientLogger = ClientLogger.forTesting(fakeSlf4j) + val latin1 = MediaType.parse("text/plain;charset=ISO-8859-1") + val bytes = "café".toByteArray(Charsets.ISO_8859_1) + val fake = + FakeHttpClient().enqueue { + status(200).body(ResponseBody.create(Io.provider.source(bytes), latin1, bytes.size.toLong())) + } + val pipeline = + HttpPipelineBuilder(fake) + .append( + DefaultInstrumentationStep( + HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS), + FixedClock(), + clientLogger, + ), + ) + .build() + + val response = pipeline.send(getRequest("https://api.example.com/x")) + response.close() + + assertEquals("café", responsePreview(fakeSlf4j)) + } + + @Test + fun `response body preview without a declared charset defaults to UTF-8`() { + val fakeSlf4j = FakeSlf4jLogger("test.instrumentation") + val clientLogger = ClientLogger.forTesting(fakeSlf4j) + // No charset parameter — documented default is UTF-8. body(String, ...) encodes as UTF-8. + val fake = FakeHttpClient().enqueue { status(200).body("résumé", MediaType.parse("text/plain")) } + val pipeline = + HttpPipelineBuilder(fake) + .append( + DefaultInstrumentationStep( + HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS), + FixedClock(), + clientLogger, + ), + ) + .build() + + val response = pipeline.send(getRequest("https://api.example.com/x")) + response.close() + + assertEquals("résumé", responsePreview(fakeSlf4j)) + } + + @Test + fun `binary response body is summarised by size and not rendered as text`() { + val fakeSlf4j = FakeSlf4jLogger("test.instrumentation") + val clientLogger = ClientLogger.forTesting(fakeSlf4j) + // gzip-shaped bytes including a NUL — must not be decoded into replacement characters. + val binary = byteArrayOf(0x1f, 0x8b.toByte(), 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03) + val mediaType = MediaType.parse("application/gzip") + val fake = + FakeHttpClient().enqueue { + status(200).body(ResponseBody.create(Io.provider.source(binary), mediaType, binary.size.toLong())) + } + val pipeline = + HttpPipelineBuilder(fake) + .append( + DefaultInstrumentationStep( + HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS), + FixedClock(), + clientLogger, + ), + ) + .build() + + val response = pipeline.send(getRequest("https://api.example.com/x")) + response.close() + + val preview = responsePreview(fakeSlf4j) + assertEquals("[binary ${binary.size} bytes]", preview) + assertFalse(preview.contains('�'), "binary body must not be logged as replacement characters") + } + + @Test + fun `request body preview honours the declared ISO-8859-1 charset`() { + val fakeSlf4j = FakeSlf4jLogger("test.instrumentation") + val clientLogger = ClientLogger.forTesting(fakeSlf4j) + val latin1 = MediaType.parse("text/plain;charset=ISO-8859-1") + val bytes = "naïve".toByteArray(Charsets.ISO_8859_1) + val fake = FakeHttpClient().enqueue { status(200).body("ok", MediaType.parse("text/plain")) } + val draining = DrainingClient(fake) + val pipeline = + HttpPipelineBuilder(draining) + .append( + DefaultInstrumentationStep( + HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS), + FixedClock(), + clientLogger, + ), + ) + .build() + + val req = + Request.builder() + .method(Method.POST) + .url("https://api.example.com/echo") + .body(RequestBody.create(bytes, latin1)) + .build() + val response = pipeline.send(req) + response.close() + + assertEquals("naïve", requestPreview(fakeSlf4j)) + } + @Test fun `emit_request_failed instrumentation error is logged when contentLength on the body throws`() { // Drive the catch branch in emitRequestEvent: a request whose body.contentLength() @@ -791,6 +904,24 @@ class InstrumentationStepTest { // -- Helpers -------------------------------------------------------------------------------- + /** Extracts the `response.body.preview` field value from the latest `http.response` event. */ + private fun responsePreview(slf4j: FakeSlf4jLogger): String = previewField(slf4j, "response.body.preview") + + /** Extracts the `request.body.preview` field value from the latest `http.response` event. */ + private fun requestPreview(slf4j: FakeSlf4jLogger): String = previewField(slf4j, "request.body.preview") + + private fun previewField( + slf4j: FakeSlf4jLogger, + key: String, + ): String { + val record = + slf4j.records.last { rec -> + rec.keyValues.any { it.key == "event" && it.value == "http.response" } + } + return record.keyValues.firstOrNull { it.key == key }?.value as? String + ?: fail("expected '$key' field on the http.response event") + } + private fun getRequest(url: String): Request = Request.builder() .method(Method.GET) From 3f5bcba62057b121ab35acac81c2b63a86fc54a2 Mon Sep 17 00:00:00 2001 From: OmarAlJarrah Date: Tue, 16 Jun 2026 22:26:32 +0300 Subject: [PATCH 2/3] feat: decode declared wide-charset bodies in previews; tighten BodyPreview seam The binary heuristic treated any NUL byte as a definitive binary signal and ran before the declared charset was consulted, so a body declared as a fixed-width multi-byte charset (UTF-16/UTF-32) was rendered as [binary N bytes] even though its NUL padding is legitimate text. render() now consults the declared charset first: when the MediaType names a resolvable charset whose ASCII encoding contains NUL bytes, the body is decoded with that charset and the NUL-based heuristic is skipped. Single-byte charsets and bodies with no declared charset still run the heuristic. Document the wide-charset handling and the best-effort nature of the content sniff in the BodyPreview KDoc. Mark previewText and DEFAULT_CHARSET private so the shared/tested seam is exactly render + isProbablyText. Add an async end-to-end test mirroring the sync ISO-8859-1 response-preview assertion so the async step's mediaType() pass-through is covered. --- .../core/http/pipeline/steps/BodyPreview.kt | 63 +++++++++++++++---- .../steps/AsyncInstrumentationStepTest.kt | 41 ++++++++++++ .../http/pipeline/steps/BodyPreviewTest.kt | 16 +++++ 3 files changed, 108 insertions(+), 12 deletions(-) diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt index dfb8146e..b2f4c347 100644 --- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt +++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt @@ -30,12 +30,32 @@ import java.nio.charset.Charset * for malformed input, so a snapshot that ends mid-multibyte-sequence (a real possibility on a * bounded capture) still yields a usable preview rather than crashing the log line. * + * ## Multi-byte charsets and the binary heuristic + * The text/binary heuristic ([isProbablyText]) normally treats a NUL byte as a definitive binary + * signal. That rule misfires for a fixed-width multi-byte charset such as UTF-16 or UTF-32, where + * ASCII content is padded with NUL bytes (`'A'` is `0x00 0x41` in UTF-16BE). To avoid rendering + * legitimate wide-charset text as `[binary N bytes]`, [render] consults the declared charset + * first: when the [MediaType] explicitly declares a *resolvable* charset whose encoding is not + * NUL-free for ASCII (UTF-16/UTF-32 and friends), the body is decoded with that charset and the + * NUL-based heuristic is skipped. When no charset is declared, or the declared one is single-byte + * (US-ASCII, ISO-8859-1, UTF-8), the heuristic still runs so a genuinely binary body is summarised + * rather than decoded into noise. + * + * ## Best-effort detection + * The heuristic is **best-effort**, not a guarantee. It samples only the first + * [SNIFF_SAMPLE_BYTES] and counts C0 control bytes; bytes `>= 0x80` are not counted (they are + * legitimate in UTF-8 and single-byte charsets). A binary payload whose leading bytes happen to + * contain no NUL and few control bytes — e.g. a region dominated by high bytes — can therefore + * slip through as text and decode to replacement-character noise. Most real binary formats carry + * a NUL or control run early, so the practical miss rate is low, but callers should treat the + * preview as a diagnostic aid rather than a reliable content-type classifier. + * * This is an `internal` seam shared by [DefaultInstrumentationStep] and * [DefaultAsyncInstrumentationStep]; it has no public API surface. */ internal object BodyPreview { /** Charset used when a text body declares no charset, or names an unknown one. */ - internal val DEFAULT_CHARSET: Charset = Charsets.UTF_8 + private val DEFAULT_CHARSET: Charset = Charsets.UTF_8 /** * Number of leading bytes inspected by [isProbablyText]. A small fixed sample keeps the @@ -64,34 +84,53 @@ internal object BodyPreview { /** * Renders [bytes] as a preview string. * - * Empty input yields the empty string. A body that does not pass [isProbablyText] is rendered - * as a size-only `[binary N bytes]` summary. Otherwise the bytes are decoded with the charset - * from [mediaType] (or [DEFAULT_CHARSET] when absent/unknown). + * Empty input yields the empty string. When [mediaType] explicitly declares a resolvable + * multi-byte charset (UTF-16/UTF-32 and friends), the bytes are decoded with that charset and + * the NUL-based binary heuristic is skipped — see the class KDoc. Otherwise a body that does + * not pass [isProbablyText] is rendered as a size-only `[binary N bytes]` summary, and a body + * that does is decoded with the charset from [mediaType] (or [DEFAULT_CHARSET] when + * absent/unknown). */ internal fun render( bytes: ByteArray, mediaType: MediaType?, ): String { if (bytes.isEmpty()) return "" + val declared = mediaType?.charset + if (declared != null && encodesAsciiWithNul(declared)) { + // A declared wide charset (UTF-16/UTF-32) pads ASCII with NUL bytes, so the NUL-based + // heuristic would misclassify it as binary. Trust the explicit declaration and decode. + return previewText(bytes, declared) + } if (!isProbablyText(bytes)) return binarySummary(bytes.size) - return previewText(bytes, mediaType) + return previewText(bytes, declared ?: DEFAULT_CHARSET) } /** - * Decodes [bytes] using the charset declared on [mediaType], falling back to - * [DEFAULT_CHARSET] when the media type is null, declares no charset, or names a charset the - * JVM cannot resolve ([MediaType.charset] returns null in the latter two cases). Invalid byte - * sequences are replaced rather than throwing. + * Decodes [bytes] using [charset]. Invalid byte sequences are replaced rather than throwing. */ - internal fun previewText( + private fun previewText( bytes: ByteArray, - mediaType: MediaType?, + charset: Charset, ): String { if (bytes.isEmpty()) return "" - val charset = mediaType?.charset ?: DEFAULT_CHARSET return String(bytes, charset) } + /** + * True when [charset] encodes a plain ASCII character to a byte sequence that contains a NUL + * byte — the signature of a fixed-width multi-byte charset such as UTF-16 or UTF-32, where + * `'A'` becomes e.g. `0x00 0x41`. Single-byte charsets (US-ASCII, ISO-8859-1) and UTF-8 + * encode ASCII without NUL padding and return false. The probe is computed from the charset's + * own encoder, so it covers any such charset the JVM knows, not a hard-coded name list. + */ + private fun encodesAsciiWithNul(charset: Charset): Boolean = + try { + "A".toByteArray(charset).any { it.toInt() == NUL } + } catch (_: Exception) { + false + } + /** * Heuristically decides whether [bytes] is text. Samples the first [SNIFF_SAMPLE_BYTES]: * a single NUL byte is treated as a strong binary signal, and a control-byte ratio above diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/AsyncInstrumentationStepTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/AsyncInstrumentationStepTest.kt index 82ae2cda..a17709da 100644 --- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/AsyncInstrumentationStepTest.kt +++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/AsyncInstrumentationStepTest.kt @@ -131,6 +131,47 @@ class AsyncInstrumentationStepTest { response.close() } + @Test + fun `response body preview honours the declared ISO-8859-1 charset`() { + // Mirrors the sync InstrumentationStepTest assertion: 0xE9 is 'é' in ISO-8859-1; decoded + // as UTF-8 (the old hardcoded assumption) it would be U+FFFD. This pins that the async + // step passes mediaType() through to BodyPreview rather than regressing to UTF-8. + val fakeSlf4j = FakeSlf4jLogger("test.async.instrumentation.charset") + val clientLogger = ClientLogger.forTesting(fakeSlf4j) + val latin1 = MediaType.parse("text/plain;charset=ISO-8859-1") + val bytes = "café".toByteArray(Charsets.ISO_8859_1) + val fakeAsync = + AsyncHttpClient { request -> + CompletableFuture.completedFuture( + Response.builder() + .request(request) + .protocol(Protocol.HTTP_1_1) + .status(Status.OK) + .body(ResponseBody.create(Io.provider.source(bytes), latin1, bytes.size.toLong())) + .build(), + ) + } + val pipeline = + AsyncHttpPipelineBuilder(fakeAsync) + .append( + DefaultAsyncInstrumentationStep( + options = HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS), + logger = clientLogger, + ), + ) + .build() + + val response = pipeline.sendAsync(getRequest("https://api.example.com/x")).join() + response.close() + + val responseRecord = + fakeSlf4j.records.first { rec -> + rec.keyValues.any { it.key == "event" && it.value == "http.response" } + } + val preview = responseRecord.keyValues.first { it.key == "response.body.preview" }.value + assertEquals("café", preview) + } + @Test fun `known-length response body is wrapped and bounded to the preview cap`() { // A known-length body larger than bodyPreviewMaxBytes is wrapped: the capture is bounded diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt index d1939776..b9939570 100644 --- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt +++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt @@ -68,6 +68,22 @@ class BodyPreviewTest { assertEquals("[binary ${bytes.size} bytes]", preview) } + @Test + fun `text body declared as UTF-16 is decoded despite NUL padding`() { + // UTF-16BE pads ASCII with NUL bytes ('A' = 0x00 0x41), which the NUL heuristic would + // otherwise treat as binary. An explicitly declared wide charset must still decode. + val bytes = "café".toByteArray(StandardCharsets.UTF_16BE) + val preview = BodyPreview.render(bytes, MediaType.parse("text/plain;charset=UTF-16BE")) + assertEquals("café", preview) + assertFalse(preview.contains("[binary"), "declared UTF-16 text must not be summarised as binary") + } + + @Test + fun `text body declared as UTF-16LE is decoded despite NUL padding`() { + val bytes = "hello".toByteArray(StandardCharsets.UTF_16LE) + assertEquals("hello", BodyPreview.render(bytes, MediaType.parse("text/plain;charset=UTF-16LE"))) + } + @Test fun `isProbablyText accepts plain ASCII and latin-1 high bytes`() { assertTrue(BodyPreview.isProbablyText("plain ascii text".toByteArray(StandardCharsets.US_ASCII))) From d0d76f89cdae7d3f9cf48841d180c0d083bbca9b Mon Sep 17 00:00:00 2001 From: OmarAlJarrah Date: Tue, 16 Jun 2026 23:45:31 +0300 Subject: [PATCH 3/3] refactor: memoise the wide-charset probe and clarify the binary summary Polish the BodyPreview seam in response to review: - Memoise encodesAsciiWithNul per Charset in a ConcurrentHashMap so the ASCII probe encodes at most once per distinct charset instead of on every render; subsequent renders are a plain map lookup. - Rename the binary summary to [binary N bytes captured]. The byte count is the bounded preview snapshot, not necessarily the full body length, so the wording no longer implies a size it never observed. - Drop the unreachable empty-array branch in previewText; render already returns early for empty input before any decode path is reached. --- .../core/http/pipeline/steps/BodyPreview.kt | 47 +++++++++++++------ .../http/pipeline/steps/BodyPreviewTest.kt | 4 +- .../pipeline/steps/InstrumentationStepTest.kt | 2 +- 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt index b2f4c347..4954eae0 100644 --- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt +++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt @@ -9,6 +9,7 @@ package org.dexpace.sdk.core.http.pipeline.steps import org.dexpace.sdk.core.http.common.MediaType import java.nio.charset.Charset +import java.util.concurrent.ConcurrentHashMap /** * Renders captured request/response body bytes into a short, log-safe preview string. @@ -23,8 +24,10 @@ import java.nio.charset.Charset * names one the JVM does not recognise, the preview falls back to [DEFAULT_CHARSET] (UTF-8), * matching the HTTP default for most text types and the previous behaviour. * - **Binary-safe** — a body that does not look like text is never decoded. Instead it is - * summarised as `[binary N bytes]`, so an operator sees a stable, greppable marker rather - * than a line of replacement characters that can corrupt log viewers. + * summarised as `[binary N bytes captured]`, so an operator sees a stable, greppable marker + * rather than a line of replacement characters that can corrupt log viewers. `N` is the number + * of bytes in the (already bounded) preview snapshot — i.e. how much was captured, not + * necessarily the full body length — so the summary cannot imply a size it didn't observe. * * Decoding itself never throws: [Charset]-based decoding substitutes the replacement character * for malformed input, so a snapshot that ends mid-multibyte-sequence (a real possibility on a @@ -38,8 +41,9 @@ import java.nio.charset.Charset * first: when the [MediaType] explicitly declares a *resolvable* charset whose encoding is not * NUL-free for ASCII (UTF-16/UTF-32 and friends), the body is decoded with that charset and the * NUL-based heuristic is skipped. When no charset is declared, or the declared one is single-byte - * (US-ASCII, ISO-8859-1, UTF-8), the heuristic still runs so a genuinely binary body is summarised - * rather than decoded into noise. + * (US-ASCII, ISO-8859-1, UTF-8), the heuristic still runs so a genuinely binary body is + * summarised rather than decoded into noise. The wide-charset probe is memoised per [Charset], so + * the cost is a single small encode the first time a charset is seen and a map lookup thereafter. * * ## Best-effort detection * The heuristic is **best-effort**, not a guarantee. It samples only the first @@ -87,8 +91,8 @@ internal object BodyPreview { * Empty input yields the empty string. When [mediaType] explicitly declares a resolvable * multi-byte charset (UTF-16/UTF-32 and friends), the bytes are decoded with that charset and * the NUL-based binary heuristic is skipped — see the class KDoc. Otherwise a body that does - * not pass [isProbablyText] is rendered as a size-only `[binary N bytes]` summary, and a body - * that does is decoded with the charset from [mediaType] (or [DEFAULT_CHARSET] when + * not pass [isProbablyText] is rendered as a size-only `[binary N bytes captured]` summary, and + * a body that does is decoded with the charset from [mediaType] (or [DEFAULT_CHARSET] when * absent/unknown). */ internal fun render( @@ -108,23 +112,34 @@ internal object BodyPreview { /** * Decodes [bytes] using [charset]. Invalid byte sequences are replaced rather than throwing. + * [render] guarantees a non-empty array before calling this. */ private fun previewText( bytes: ByteArray, charset: Charset, - ): String { - if (bytes.isEmpty()) return "" - return String(bytes, charset) - } + ): String = String(bytes, charset) + + /** Memoised results of [computeEncodesAsciiWithNul], keyed by [Charset] (a JVM singleton). */ + private val asciiWithNulByCharset = ConcurrentHashMap() /** * True when [charset] encodes a plain ASCII character to a byte sequence that contains a NUL * byte — the signature of a fixed-width multi-byte charset such as UTF-16 or UTF-32, where * `'A'` becomes e.g. `0x00 0x41`. Single-byte charsets (US-ASCII, ISO-8859-1) and UTF-8 - * encode ASCII without NUL padding and return false. The probe is computed from the charset's - * own encoder, so it covers any such charset the JVM knows, not a hard-coded name list. + * encode ASCII without NUL padding and return false. + * + * The result is memoised per charset so the encode runs at most once per distinct [Charset]; + * subsequent renders for that charset are a plain map lookup with no per-call allocation. */ private fun encodesAsciiWithNul(charset: Charset): Boolean = + asciiWithNulByCharset.computeIfAbsent(charset) { computeEncodesAsciiWithNul(it) } + + /** + * Computes the [encodesAsciiWithNul] verdict from the charset's own encoder, so it covers any + * such charset the JVM knows rather than a hard-coded name list. An encoder that rejects the + * probe is treated as non-wide (the heuristic path then applies). + */ + private fun computeEncodesAsciiWithNul(charset: Charset): Boolean = try { "A".toByteArray(charset).any { it.toInt() == NUL } } catch (_: Exception) { @@ -161,6 +176,10 @@ internal object BodyPreview { private fun isControlByte(b: Int): Boolean = (b < FIRST_PRINTABLE && b != TAB && b != LINE_FEED && b != CARRIAGE_RETURN) || b == DEL - /** The size-only summary emitted for a binary body. */ - private fun binarySummary(size: Int): String = "[binary $size bytes]" + /** + * The size-only summary emitted for a binary body. [size] is the captured/preview byte count + * (the input is an already-bounded snapshot), so the wording says "captured" rather than + * implying it observed the full body length. + */ + private fun binarySummary(size: Int): String = "[binary $size bytes captured]" } diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt index b9939570..44356c26 100644 --- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt +++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt @@ -56,7 +56,7 @@ class BodyPreviewTest { // gzip magic header (0x1f 0x8b 0x08) followed by a NUL byte — a representative binary body. val bytes = byteArrayOf(0x1f, 0x8b.toByte(), 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03) val preview = BodyPreview.render(bytes, MediaType.parse("application/gzip")) - assertEquals("[binary ${bytes.size} bytes]", preview) + assertEquals("[binary ${bytes.size} bytes captured]", preview) assertFalse(preview.contains('�'), "binary body must not be rendered as replacement chars") } @@ -65,7 +65,7 @@ class BodyPreviewTest { // Even if a server mislabels a gzip stream as text, the NUL byte keeps it from being decoded. val bytes = byteArrayOf(0x1f, 0x8b.toByte(), 0x00, 0x42, 0x00, 0x13) val preview = BodyPreview.render(bytes, MediaType.parse("text/plain;charset=utf-8")) - assertEquals("[binary ${bytes.size} bytes]", preview) + assertEquals("[binary ${bytes.size} bytes captured]", preview) } @Test diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt index f97108c3..82eca054 100644 --- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt +++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt @@ -713,7 +713,7 @@ class InstrumentationStepTest { response.close() val preview = responsePreview(fakeSlf4j) - assertEquals("[binary ${binary.size} bytes]", preview) + assertEquals("[binary ${binary.size} bytes captured]", preview) assertFalse(preview.contains('�'), "binary body must not be logged as replacement characters") }