From b2c588e6423b8d5a7e060786cd52c0d4e11e790a Mon Sep 17 00:00:00 2001
From: OmarAlJarrah <o.mazari.om63@gmail.com>
Date: Tue, 16 Jun 2026 16:18:40 +0300
Subject: [PATCH 1/3] feat: charset-aware, binary-safe body previews in
 request/response logging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Body previews in the default sync and async instrumentation steps decoded
every captured payload as UTF-8. That produced mojibake for a text body
declared in another charset (a latin-1 response logged `café` as `caf?`)
and a run of U+FFFD replacement characters for a binary body (gzip, an
image), which is both unreadable and capable of corrupting log viewers.

Add a shared `BodyPreview` renderer that the two steps delegate to:

- Text bodies are decoded with the charset declared on the body's
  `MediaType`, falling back to UTF-8 when the media type omits a charset
  or names one the JVM cannot resolve.
- A short text/binary sniff (NUL byte, or a high ratio of non-whitespace
  control bytes in a leading sample) classifies binary bodies, which are
  rendered as a size-only `[binary N bytes]` summary instead of being
  decoded as text.

Decoding still substitutes the replacement character for malformed input
rather than throwing, so a snapshot truncated mid-multibyte-sequence by the
bounded capture cannot crash the log line. The bounded/streaming capture
behaviour of the loggable bodies is unchanged.
---
 .../core/http/pipeline/steps/BodyPreview.kt   | 127 ++++++++++++++++
 .../steps/DefaultAsyncInstrumentationStep.kt  |  13 +-
 .../steps/DefaultInstrumentationStep.kt       |  13 +-
 .../http/pipeline/steps/BodyPreviewTest.kt    |  85 +++++++++++
 .../pipeline/steps/InstrumentationStepTest.kt | 137 +++++++++++++++++-
 5 files changed, 352 insertions(+), 23 deletions(-)
 create mode 100644 sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt
 create mode 100644 sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt

diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt
new file mode 100644
index 00000000..dfb8146e
--- /dev/null
+++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2026 dexpace and Omar Aljarrah
+ *
+ * Licensed under the MIT License. See LICENSE in the project root.
+ * SPDX-License-Identifier: MIT
+ */
+
+package org.dexpace.sdk.core.http.pipeline.steps
+
+import org.dexpace.sdk.core.http.common.MediaType
+import java.nio.charset.Charset
+
+/**
+ * Renders captured request/response body bytes into a short, log-safe preview string.
+ *
+ * The default instrumentation steps previously decoded every preview as UTF-8, which produced
+ * mojibake for a text body declared in another charset (e.g. ISO-8859-1) and a stream of
+ * `U+FFFD` replacement characters for a binary body (a gzip payload, an image, …). This helper
+ * makes previews:
+ *
+ * - **Charset-aware** — a text body is decoded with the charset declared on its [MediaType]
+ *   (`Content-Type: text/plain;charset=ISO-8859-1`). When the media type omits a charset, or
+ *   names one the JVM does not recognise, the preview falls back to [DEFAULT_CHARSET] (UTF-8),
+ *   matching the HTTP default for most text types and the previous behaviour.
+ * - **Binary-safe** — a body that does not look like text is never decoded. Instead it is
+ *   summarised as `[binary N bytes]`, so an operator sees a stable, greppable marker rather
+ *   than a line of replacement characters that can corrupt log viewers.
+ *
+ * Decoding itself never throws: [Charset]-based decoding substitutes the replacement character
+ * for malformed input, so a snapshot that ends mid-multibyte-sequence (a real possibility on a
+ * bounded capture) still yields a usable preview rather than crashing the log line.
+ *
+ * This is an `internal` seam shared by [DefaultInstrumentationStep] and
+ * [DefaultAsyncInstrumentationStep]; it has no public API surface.
+ */
+internal object BodyPreview {
+    /** Charset used when a text body declares no charset, or names an unknown one. */
+    internal val DEFAULT_CHARSET: Charset = Charsets.UTF_8
+
+    /**
+     * Number of leading bytes inspected by [isProbablyText]. A small fixed sample keeps the
+     * heuristic O(1) regardless of body size while still catching the common binary signatures
+     * (NUL bytes, control-byte runs) that appear near the start of a payload.
+     */
+    private const val SNIFF_SAMPLE_BYTES = 1024
+
+    /**
+     * Fraction of sampled bytes that may be non-text control bytes before the body is judged
+     * binary. Real text can carry the odd control byte (an ESC sequence, a stray form feed), so
+     * the cutoff sits well above zero rather than rejecting on the first control byte.
+     */
+    private const val MAX_CONTROL_BYTE_RATIO = 0.30
+
+    // Byte values used by the text/binary heuristic, named so the rule logic reads clearly and
+    // so the values are not flagged as inline magic numbers.
+    private const val UNSIGNED_BYTE_MASK = 0xFF
+    private const val NUL = 0x00
+    private const val FIRST_PRINTABLE = 0x20 // space; bytes below this are C0 control bytes
+    private const val TAB = 0x09
+    private const val LINE_FEED = 0x0A
+    private const val CARRIAGE_RETURN = 0x0D
+    private const val DEL = 0x7F
+
+    /**
+     * Renders [bytes] as a preview string.
+     *
+     * Empty input yields the empty string. A body that does not pass [isProbablyText] is rendered
+     * as a size-only `[binary N bytes]` summary. Otherwise the bytes are decoded with the charset
+     * from [mediaType] (or [DEFAULT_CHARSET] when absent/unknown).
+     */
+    internal fun render(
+        bytes: ByteArray,
+        mediaType: MediaType?,
+    ): String {
+        if (bytes.isEmpty()) return ""
+        if (!isProbablyText(bytes)) return binarySummary(bytes.size)
+        return previewText(bytes, mediaType)
+    }
+
+    /**
+     * Decodes [bytes] using the charset declared on [mediaType], falling back to
+     * [DEFAULT_CHARSET] when the media type is null, declares no charset, or names a charset the
+     * JVM cannot resolve ([MediaType.charset] returns null in the latter two cases). Invalid byte
+     * sequences are replaced rather than throwing.
+     */
+    internal fun previewText(
+        bytes: ByteArray,
+        mediaType: MediaType?,
+    ): String {
+        if (bytes.isEmpty()) return ""
+        val charset = mediaType?.charset ?: DEFAULT_CHARSET
+        return String(bytes, charset)
+    }
+
+    /**
+     * Heuristically decides whether [bytes] is text. Samples the first [SNIFF_SAMPLE_BYTES]:
+     * a single NUL byte is treated as a strong binary signal, and a control-byte ratio above
+     * [MAX_CONTROL_BYTE_RATIO] (excluding the common text whitespace `\t`, `\n`, `\r`) also marks
+     * the body binary. Bytes >= 0x80 are not counted against the body — they are legitimate in
+     * UTF-8 multibyte sequences and in single-byte charsets such as ISO-8859-1.
+     */
+    @Suppress("ReturnCount")
+    internal fun isProbablyText(bytes: ByteArray): Boolean {
+        if (bytes.isEmpty()) return true
+        val sample = minOf(bytes.size, SNIFF_SAMPLE_BYTES)
+        var controlBytes = 0
+        for (i in 0 until sample) {
+            val b = bytes[i].toInt() and UNSIGNED_BYTE_MASK
+            // A NUL byte essentially never appears in real text and is the canonical marker of a
+            // binary payload (gzip, images, protobuf, …); reject immediately.
+            if (b == NUL) return false
+            if (isControlByte(b)) controlBytes++
+        }
+        return controlBytes.toDouble() / sample <= MAX_CONTROL_BYTE_RATIO
+    }
+
+    /**
+     * True for an ASCII C0 control byte that is not one of the whitespace characters routinely
+     * found in text ([TAB], [LINE_FEED], [CARRIAGE_RETURN]), or for the [DEL] byte. High bytes
+     * (>= 0x80) are deliberately excluded — see [isProbablyText].
+     */
+    private fun isControlByte(b: Int): Boolean =
+        (b < FIRST_PRINTABLE && b != TAB && b != LINE_FEED && b != CARRIAGE_RETURN) || b == DEL
+
+    /** The size-only summary emitted for a binary body. */
+    private fun binarySummary(size: Int): String = "[binary $size bytes]"
+}
diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultAsyncInstrumentationStep.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultAsyncInstrumentationStep.kt
index 9d481d35..99d02131 100644
--- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultAsyncInstrumentationStep.kt
+++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultAsyncInstrumentationStep.kt
@@ -308,13 +308,13 @@ public class DefaultAsyncInstrumentationStep
                     if (requestBody != null) {
                         val preview = requestBody.snapshot(options.bodyPreviewMaxBytes)
                         ev.field("request.body.size", preview.size.toLong())
-                            .field("request.body.preview", utf8Preview(preview))
+                            .field("request.body.preview", BodyPreview.render(preview, requestBody.mediaType()))
                     }
                     val responseBody = response.body
                     if (responseBody is LoggableResponseBody) {
                         val preview = responseBody.snapshot(options.bodyPreviewMaxBytes)
                         ev.field("response.body.size", preview.size.toLong())
-                            .field("response.body.preview", utf8Preview(preview))
+                            .field("response.body.preview", BodyPreview.render(preview, responseBody.mediaType()))
                         responseBody.captureException?.let {
                             ev.field("response.body.drain_error", it.javaClass.simpleName ?: "Throwable")
                         }
@@ -346,7 +346,7 @@ public class DefaultAsyncInstrumentationStep
                 if (shouldCaptureBody() && requestBody != null) {
                     val preview = requestBody.snapshot(options.bodyPreviewMaxBytes)
                     ev.field("request.body.size", preview.size.toLong())
-                        .field("request.body.preview", utf8Preview(preview))
+                        .field("request.body.preview", BodyPreview.render(preview, requestBody.mediaType()))
                 }
                 ev.log()
             } catch (t: Throwable) {
@@ -416,13 +416,6 @@ public class DefaultAsyncInstrumentationStep
 
         private fun elapsedMillis(startNanos: Long): Double = (clock.monotonic() - startNanos) / NANOS_PER_MILLI_DOUBLE
 
-        private fun utf8Preview(bytes: ByteArray): String {
-            if (bytes.isEmpty()) return ""
-            // Defensive: a snapshot that ends mid-UTF-8 codepoint shouldn't crash the log line.
-            // String(bytes, charset) replaces invalid sequences rather than throwing.
-            return String(bytes, Charsets.UTF_8)
-        }
-
         private fun emitInstrumentationError(
             event: String,
             phase: String,
diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultInstrumentationStep.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultInstrumentationStep.kt
index 1dfbc908..2b7e84b1 100644
--- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultInstrumentationStep.kt
+++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/DefaultInstrumentationStep.kt
@@ -212,13 +212,13 @@ public class DefaultInstrumentationStep
                     if (requestBody != null) {
                         val preview = requestBody.snapshot(options.bodyPreviewMaxBytes)
                         ev.field("request.body.size", preview.size.toLong())
-                            .field("request.body.preview", utf8Preview(preview))
+                            .field("request.body.preview", BodyPreview.render(preview, requestBody.mediaType()))
                     }
                     val responseBody = response.body
                     if (responseBody is LoggableResponseBody) {
                         val preview = responseBody.snapshot(options.bodyPreviewMaxBytes)
                         ev.field("response.body.size", preview.size.toLong())
-                            .field("response.body.preview", utf8Preview(preview))
+                            .field("response.body.preview", BodyPreview.render(preview, responseBody.mediaType()))
                         responseBody.captureException?.let {
                             ev.field("response.body.drain_error", it.javaClass.simpleName ?: "Throwable")
                         }
@@ -250,7 +250,7 @@ public class DefaultInstrumentationStep
                 if (shouldCaptureBody() && requestBody != null) {
                     val preview = requestBody.snapshot(options.bodyPreviewMaxBytes)
                     ev.field("request.body.size", preview.size.toLong())
-                        .field("request.body.preview", utf8Preview(preview))
+                        .field("request.body.preview", BodyPreview.render(preview, requestBody.mediaType()))
                 }
                 ev.log()
             } catch (t: Throwable) {
@@ -320,13 +320,6 @@ public class DefaultInstrumentationStep
 
         private fun elapsedMillis(startNanos: Long): Double = (clock.monotonic() - startNanos) / NANOS_PER_MILLI_DOUBLE
 
-        private fun utf8Preview(bytes: ByteArray): String {
-            if (bytes.isEmpty()) return ""
-            // Defensive: a snapshot that ends mid-UTF-8 codepoint shouldn't crash the log line.
-            // String(bytes, charset) replaces invalid sequences rather than throwing.
-            return String(bytes, Charsets.UTF_8)
-        }
-
         private fun emitInstrumentationError(
             event: String,
             phase: String,
diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt
new file mode 100644
index 00000000..d1939776
--- /dev/null
+++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2026 dexpace and Omar Aljarrah
+ *
+ * Licensed under the MIT License. See LICENSE in the project root.
+ * SPDX-License-Identifier: MIT
+ */
+
+package org.dexpace.sdk.core.http.pipeline.steps
+
+import org.dexpace.sdk.core.http.common.MediaType
+import java.nio.charset.StandardCharsets
+import kotlin.test.Test
+import kotlin.test.assertEquals
+import kotlin.test.assertFalse
+import kotlin.test.assertTrue
+
+class BodyPreviewTest {
+    @Test
+    fun `empty bytes render to empty string`() {
+        assertEquals("", BodyPreview.render(ByteArray(0), MediaType.parse("text/plain")))
+    }
+
+    @Test
+    fun `text body declared as ISO-8859-1 is decoded with that charset`() {
+        // 0xE9 is 'é' in ISO-8859-1. Decoded as UTF-8 it would be the U+FFFD replacement char.
+        val bytes = "café".toByteArray(StandardCharsets.ISO_8859_1)
+        val preview = BodyPreview.render(bytes, MediaType.parse("text/plain;charset=ISO-8859-1"))
+        assertEquals("café", preview)
+        assertFalse(preview.contains('�'), "latin-1 body must not be decoded as mojibake")
+    }
+
+    @Test
+    fun `text body declared as UTF-8 is decoded with UTF-8`() {
+        val bytes = "naïve — text".toByteArray(StandardCharsets.UTF_8)
+        assertEquals("naïve — text", BodyPreview.render(bytes, MediaType.parse("text/plain;charset=utf-8")))
+    }
+
+    @Test
+    fun `text body with no declared charset falls back to UTF-8`() {
+        val bytes = "résumé".toByteArray(StandardCharsets.UTF_8)
+        // No charset parameter on the media type → documented default is UTF-8.
+        assertEquals("résumé", BodyPreview.render(bytes, MediaType.parse("text/plain")))
+        // A null media type also falls back to UTF-8.
+        assertEquals("résumé", BodyPreview.render(bytes, null))
+    }
+
+    @Test
+    fun `unknown declared charset falls back to UTF-8`() {
+        val bytes = "hello".toByteArray(StandardCharsets.UTF_8)
+        // MediaType.charset returns null for an unrecognised charset name; preview falls back.
+        assertEquals("hello", BodyPreview.render(bytes, MediaType.parse("text/plain;charset=not-a-real-charset")))
+    }
+
+    @Test
+    fun `binary body with NUL byte is summarised as size only and not decoded`() {
+        // gzip magic header (0x1f 0x8b 0x08) followed by a NUL byte — a representative binary body.
+        val bytes = byteArrayOf(0x1f, 0x8b.toByte(), 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03)
+        val preview = BodyPreview.render(bytes, MediaType.parse("application/gzip"))
+        assertEquals("[binary ${bytes.size} bytes]", preview)
+        assertFalse(preview.contains('�'), "binary body must not be rendered as replacement chars")
+    }
+
+    @Test
+    fun `binary body declared with a text charset is still detected as binary`() {
+        // Even if a server mislabels a gzip stream as text, the NUL byte keeps it from being decoded.
+        val bytes = byteArrayOf(0x1f, 0x8b.toByte(), 0x00, 0x42, 0x00, 0x13)
+        val preview = BodyPreview.render(bytes, MediaType.parse("text/plain;charset=utf-8"))
+        assertEquals("[binary ${bytes.size} bytes]", preview)
+    }
+
+    @Test
+    fun `isProbablyText accepts plain ASCII and latin-1 high bytes`() {
+        assertTrue(BodyPreview.isProbablyText("plain ascii text".toByteArray(StandardCharsets.US_ASCII)))
+        assertTrue(BodyPreview.isProbablyText("café".toByteArray(StandardCharsets.ISO_8859_1)))
+        // Common text whitespace control bytes do not flip the verdict.
+        assertTrue(BodyPreview.isProbablyText("line1\r\n\tline2".toByteArray(StandardCharsets.UTF_8)))
+    }
+
+    @Test
+    fun `isProbablyText rejects a NUL byte and a control-byte-heavy body`() {
+        assertFalse(BodyPreview.isProbablyText(byteArrayOf(0x41, 0x00, 0x42)))
+        // A run of non-whitespace C0 control bytes exceeds the ratio cutoff.
+        assertFalse(BodyPreview.isProbablyText(byteArrayOf(0x01, 0x02, 0x03, 0x04, 0x05)))
+    }
+}
diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt
index 6658ea18..f97108c3 100644
--- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt
+++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt
@@ -617,9 +617,9 @@ class InstrumentationStepTest {
     }
 
     @Test
-    fun `empty body produces empty utf8 preview`() {
-        // utf8Preview takes a fast empty branch on a zero-length array. Send a request that
-        // results in a zero-byte response body so the preview hits the empty path.
+    fun `empty body produces empty preview`() {
+        // The preview renderer takes a fast empty branch on a zero-length array. Send a request
+        // that results in a zero-byte response body so the preview hits the empty path.
         val fake = FakeHttpClient().enqueue { status(200).body("", MediaType.parse("text/plain")) }
         val pipeline =
             HttpPipelineBuilder(fake)
@@ -635,6 +635,119 @@ class InstrumentationStepTest {
         response.close()
     }
 
+    @Test
+    fun `response body preview honours the declared ISO-8859-1 charset`() {
+        // 0xE9 is 'é' in ISO-8859-1. Decoded as UTF-8 (the old hardcoded assumption) it would be
+        // the U+FFFD replacement char, so this pins charset-aware decoding end to end.
+        val fakeSlf4j = FakeSlf4jLogger("test.instrumentation")
+        val clientLogger = ClientLogger.forTesting(fakeSlf4j)
+        val latin1 = MediaType.parse("text/plain;charset=ISO-8859-1")
+        val bytes = "café".toByteArray(Charsets.ISO_8859_1)
+        val fake =
+            FakeHttpClient().enqueue {
+                status(200).body(ResponseBody.create(Io.provider.source(bytes), latin1, bytes.size.toLong()))
+            }
+        val pipeline =
+            HttpPipelineBuilder(fake)
+                .append(
+                    DefaultInstrumentationStep(
+                        HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS),
+                        FixedClock(),
+                        clientLogger,
+                    ),
+                )
+                .build()
+
+        val response = pipeline.send(getRequest("https://api.example.com/x"))
+        response.close()
+
+        assertEquals("café", responsePreview(fakeSlf4j))
+    }
+
+    @Test
+    fun `response body preview without a declared charset defaults to UTF-8`() {
+        val fakeSlf4j = FakeSlf4jLogger("test.instrumentation")
+        val clientLogger = ClientLogger.forTesting(fakeSlf4j)
+        // No charset parameter — documented default is UTF-8. body(String, ...) encodes as UTF-8.
+        val fake = FakeHttpClient().enqueue { status(200).body("résumé", MediaType.parse("text/plain")) }
+        val pipeline =
+            HttpPipelineBuilder(fake)
+                .append(
+                    DefaultInstrumentationStep(
+                        HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS),
+                        FixedClock(),
+                        clientLogger,
+                    ),
+                )
+                .build()
+
+        val response = pipeline.send(getRequest("https://api.example.com/x"))
+        response.close()
+
+        assertEquals("résumé", responsePreview(fakeSlf4j))
+    }
+
+    @Test
+    fun `binary response body is summarised by size and not rendered as text`() {
+        val fakeSlf4j = FakeSlf4jLogger("test.instrumentation")
+        val clientLogger = ClientLogger.forTesting(fakeSlf4j)
+        // gzip-shaped bytes including a NUL — must not be decoded into replacement characters.
+        val binary = byteArrayOf(0x1f, 0x8b.toByte(), 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03)
+        val mediaType = MediaType.parse("application/gzip")
+        val fake =
+            FakeHttpClient().enqueue {
+                status(200).body(ResponseBody.create(Io.provider.source(binary), mediaType, binary.size.toLong()))
+            }
+        val pipeline =
+            HttpPipelineBuilder(fake)
+                .append(
+                    DefaultInstrumentationStep(
+                        HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS),
+                        FixedClock(),
+                        clientLogger,
+                    ),
+                )
+                .build()
+
+        val response = pipeline.send(getRequest("https://api.example.com/x"))
+        response.close()
+
+        val preview = responsePreview(fakeSlf4j)
+        assertEquals("[binary ${binary.size} bytes]", preview)
+        assertFalse(preview.contains('�'), "binary body must not be logged as replacement characters")
+    }
+
+    @Test
+    fun `request body preview honours the declared ISO-8859-1 charset`() {
+        val fakeSlf4j = FakeSlf4jLogger("test.instrumentation")
+        val clientLogger = ClientLogger.forTesting(fakeSlf4j)
+        val latin1 = MediaType.parse("text/plain;charset=ISO-8859-1")
+        val bytes = "naïve".toByteArray(Charsets.ISO_8859_1)
+        val fake = FakeHttpClient().enqueue { status(200).body("ok", MediaType.parse("text/plain")) }
+        val draining = DrainingClient(fake)
+        val pipeline =
+            HttpPipelineBuilder(draining)
+                .append(
+                    DefaultInstrumentationStep(
+                        HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS),
+                        FixedClock(),
+                        clientLogger,
+                    ),
+                )
+                .build()
+
+        val req =
+            Request.builder()
+                .method(Method.POST)
+                .url("https://api.example.com/echo")
+                .body(RequestBody.create(bytes, latin1))
+                .build()
+        val response = pipeline.send(req)
+        response.close()
+
+        assertEquals("naïve", requestPreview(fakeSlf4j))
+    }
+
     @Test
     fun `emit_request_failed instrumentation error is logged when contentLength on the body throws`() {
         // Drive the catch branch in emitRequestEvent: a request whose body.contentLength()
@@ -791,6 +904,24 @@ class InstrumentationStepTest {
 
     // -- Helpers --------------------------------------------------------------------------------
 
+    /** Extracts the `response.body.preview` field value from the latest `http.response` event. */
+    private fun responsePreview(slf4j: FakeSlf4jLogger): String = previewField(slf4j, "response.body.preview")
+
+    /** Extracts the `request.body.preview` field value from the latest `http.response` event. */
+    private fun requestPreview(slf4j: FakeSlf4jLogger): String = previewField(slf4j, "request.body.preview")
+
+    private fun previewField(
+        slf4j: FakeSlf4jLogger,
+        key: String,
+    ): String {
+        val record =
+            slf4j.records.last { rec ->
+                rec.keyValues.any { it.key == "event" && it.value == "http.response" }
+            }
+        return record.keyValues.firstOrNull { it.key == key }?.value as? String
+            ?: fail("expected '$key' field on the http.response event")
+    }
+
     private fun getRequest(url: String): Request =
         Request.builder()
             .method(Method.GET)

From 3f5bcba62057b121ab35acac81c2b63a86fc54a2 Mon Sep 17 00:00:00 2001
From: OmarAlJarrah <o.mazari.om63@gmail.com>
Date: Tue, 16 Jun 2026 22:26:32 +0300
Subject: [PATCH 2/3] feat: decode declared wide-charset bodies in previews;
 tighten BodyPreview seam

The binary heuristic treated any NUL byte as a definitive binary signal and
ran before the declared charset was consulted, so a body declared as a
fixed-width multi-byte charset (UTF-16/UTF-32) was rendered as
[binary N bytes] even though its NUL padding is legitimate text. render()
now consults the declared charset first: when the MediaType names a
resolvable charset whose ASCII encoding contains NUL bytes, the body is
decoded with that charset and the NUL-based heuristic is skipped. Single-byte
charsets and bodies with no declared charset still run the heuristic.

Document the wide-charset handling and the best-effort nature of the
content sniff in the BodyPreview KDoc. Mark previewText and DEFAULT_CHARSET
private so the shared/tested seam is exactly render + isProbablyText. Add an
async end-to-end test mirroring the sync ISO-8859-1 response-preview
assertion so the async step's mediaType() pass-through is covered.
---
 .../core/http/pipeline/steps/BodyPreview.kt   | 63 +++++++++++++++----
 .../steps/AsyncInstrumentationStepTest.kt     | 41 ++++++++++++
 .../http/pipeline/steps/BodyPreviewTest.kt    | 16 +++++
 3 files changed, 108 insertions(+), 12 deletions(-)

diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt
index dfb8146e..b2f4c347 100644
--- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt
+++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt
@@ -30,12 +30,32 @@ import java.nio.charset.Charset
  * for malformed input, so a snapshot that ends mid-multibyte-sequence (a real possibility on a
  * bounded capture) still yields a usable preview rather than crashing the log line.
  *
+ * ## Multi-byte charsets and the binary heuristic
+ * The text/binary heuristic ([isProbablyText]) normally treats a NUL byte as a definitive binary
+ * signal. That rule misfires for a fixed-width multi-byte charset such as UTF-16 or UTF-32, where
+ * ASCII content is padded with NUL bytes (`'A'` is `0x00 0x41` in UTF-16BE). To avoid rendering
+ * legitimate wide-charset text as `[binary N bytes]`, [render] consults the declared charset
+ * first: when the [MediaType] explicitly declares a *resolvable* charset whose encoding is not
+ * NUL-free for ASCII (UTF-16/UTF-32 and friends), the body is decoded with that charset and the
+ * NUL-based heuristic is skipped. When no charset is declared, or the declared one is single-byte
+ * (US-ASCII, ISO-8859-1, UTF-8), the heuristic still runs so a genuinely binary body is summarised
+ * rather than decoded into noise.
+ *
+ * ## Best-effort detection
+ * The heuristic is **best-effort**, not a guarantee. It samples only the first
+ * [SNIFF_SAMPLE_BYTES] and counts C0 control bytes; bytes `>= 0x80` are not counted (they are
+ * legitimate in UTF-8 and single-byte charsets). A binary payload whose leading bytes happen to
+ * contain no NUL and few control bytes — e.g. a region dominated by high bytes — can therefore
+ * slip through as text and decode to replacement-character noise. Most real binary formats carry
+ * a NUL or control run early, so the practical miss rate is low, but callers should treat the
+ * preview as a diagnostic aid rather than a reliable content-type classifier.
+ *
  * This is an `internal` seam shared by [DefaultInstrumentationStep] and
  * [DefaultAsyncInstrumentationStep]; it has no public API surface.
  */
 internal object BodyPreview {
     /** Charset used when a text body declares no charset, or names an unknown one. */
-    internal val DEFAULT_CHARSET: Charset = Charsets.UTF_8
+    private val DEFAULT_CHARSET: Charset = Charsets.UTF_8
 
     /**
      * Number of leading bytes inspected by [isProbablyText]. A small fixed sample keeps the
@@ -64,34 +84,53 @@ internal object BodyPreview {
     /**
      * Renders [bytes] as a preview string.
      *
-     * Empty input yields the empty string. A body that does not pass [isProbablyText] is rendered
-     * as a size-only `[binary N bytes]` summary. Otherwise the bytes are decoded with the charset
-     * from [mediaType] (or [DEFAULT_CHARSET] when absent/unknown).
+     * Empty input yields the empty string. When [mediaType] explicitly declares a resolvable
+     * multi-byte charset (UTF-16/UTF-32 and friends), the bytes are decoded with that charset and
+     * the NUL-based binary heuristic is skipped — see the class KDoc. Otherwise a body that does
+     * not pass [isProbablyText] is rendered as a size-only `[binary N bytes]` summary, and a body
+     * that does is decoded with the charset from [mediaType] (or [DEFAULT_CHARSET] when
+     * absent/unknown).
      */
     internal fun render(
         bytes: ByteArray,
         mediaType: MediaType?,
     ): String {
         if (bytes.isEmpty()) return ""
+        val declared = mediaType?.charset
+        if (declared != null && encodesAsciiWithNul(declared)) {
+            // A declared wide charset (UTF-16/UTF-32) pads ASCII with NUL bytes, so the NUL-based
+            // heuristic would misclassify it as binary. Trust the explicit declaration and decode.
+            return previewText(bytes, declared)
+        }
         if (!isProbablyText(bytes)) return binarySummary(bytes.size)
-        return previewText(bytes, mediaType)
+        return previewText(bytes, declared ?: DEFAULT_CHARSET)
     }
 
     /**
-     * Decodes [bytes] using the charset declared on [mediaType], falling back to
-     * [DEFAULT_CHARSET] when the media type is null, declares no charset, or names a charset the
-     * JVM cannot resolve ([MediaType.charset] returns null in the latter two cases). Invalid byte
-     * sequences are replaced rather than throwing.
+     * Decodes [bytes] using [charset]. Invalid byte sequences are replaced rather than throwing.
      */
-    internal fun previewText(
+    private fun previewText(
         bytes: ByteArray,
-        mediaType: MediaType?,
+        charset: Charset,
     ): String {
         if (bytes.isEmpty()) return ""
-        val charset = mediaType?.charset ?: DEFAULT_CHARSET
         return String(bytes, charset)
     }
 
+    /**
+     * True when [charset] encodes a plain ASCII character to a byte sequence that contains a NUL
+     * byte — the signature of a fixed-width multi-byte charset such as UTF-16 or UTF-32, where
+     * `'A'` becomes e.g. `0x00 0x41`. Single-byte charsets (US-ASCII, ISO-8859-1) and UTF-8
+     * encode ASCII without NUL padding and return false. The probe is computed from the charset's
+     * own encoder, so it covers any such charset the JVM knows, not a hard-coded name list.
+     */
+    private fun encodesAsciiWithNul(charset: Charset): Boolean =
+        try {
+            "A".toByteArray(charset).any { it.toInt() == NUL }
+        } catch (_: Exception) {
+            false
+        }
+
     /**
      * Heuristically decides whether [bytes] is text. Samples the first [SNIFF_SAMPLE_BYTES]:
      * a single NUL byte is treated as a strong binary signal, and a control-byte ratio above
diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/AsyncInstrumentationStepTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/AsyncInstrumentationStepTest.kt
index 82ae2cda..a17709da 100644
--- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/AsyncInstrumentationStepTest.kt
+++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/AsyncInstrumentationStepTest.kt
@@ -131,6 +131,47 @@ class AsyncInstrumentationStepTest {
         response.close()
     }
 
+    @Test
+    fun `response body preview honours the declared ISO-8859-1 charset`() {
+        // Mirrors the sync InstrumentationStepTest assertion: 0xE9 is 'é' in ISO-8859-1; decoded
+        // as UTF-8 (the old hardcoded assumption) it would be U+FFFD. This pins that the async
+        // step passes mediaType() through to BodyPreview rather than regressing to UTF-8.
+        val fakeSlf4j = FakeSlf4jLogger("test.async.instrumentation.charset")
+        val clientLogger = ClientLogger.forTesting(fakeSlf4j)
+        val latin1 = MediaType.parse("text/plain;charset=ISO-8859-1")
+        val bytes = "café".toByteArray(Charsets.ISO_8859_1)
+        val fakeAsync =
+            AsyncHttpClient { request ->
+                CompletableFuture.completedFuture(
+                    Response.builder()
+                        .request(request)
+                        .protocol(Protocol.HTTP_1_1)
+                        .status(Status.OK)
+                        .body(ResponseBody.create(Io.provider.source(bytes), latin1, bytes.size.toLong()))
+                        .build(),
+                )
+            }
+        val pipeline =
+            AsyncHttpPipelineBuilder(fakeAsync)
+                .append(
+                    DefaultAsyncInstrumentationStep(
+                        options = HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS),
+                        logger = clientLogger,
+                    ),
+                )
+                .build()
+
+        val response = pipeline.sendAsync(getRequest("https://api.example.com/x")).join()
+        response.close()
+
+        val responseRecord =
+            fakeSlf4j.records.first { rec ->
+                rec.keyValues.any { it.key == "event" && it.value == "http.response" }
+            }
+        val preview = responseRecord.keyValues.first { it.key == "response.body.preview" }.value
+        assertEquals("café", preview)
+    }
+
     @Test
     fun `known-length response body is wrapped and bounded to the preview cap`() {
         // A known-length body larger than bodyPreviewMaxBytes is wrapped: the capture is bounded
diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt
index d1939776..b9939570 100644
--- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt
+++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt
@@ -68,6 +68,22 @@ class BodyPreviewTest {
         assertEquals("[binary ${bytes.size} bytes]", preview)
     }
 
+    @Test
+    fun `text body declared as UTF-16 is decoded despite NUL padding`() {
+        // UTF-16BE pads ASCII with NUL bytes ('A' = 0x00 0x41), which the NUL heuristic would
+        // otherwise treat as binary. An explicitly declared wide charset must still decode.
+        val bytes = "café".toByteArray(StandardCharsets.UTF_16BE)
+        val preview = BodyPreview.render(bytes, MediaType.parse("text/plain;charset=UTF-16BE"))
+        assertEquals("café", preview)
+        assertFalse(preview.contains("[binary"), "declared UTF-16 text must not be summarised as binary")
+    }
+
+    @Test
+    fun `text body declared as UTF-16LE is decoded despite NUL padding`() {
+        val bytes = "hello".toByteArray(StandardCharsets.UTF_16LE)
+        assertEquals("hello", BodyPreview.render(bytes, MediaType.parse("text/plain;charset=UTF-16LE")))
+    }
+
     @Test
     fun `isProbablyText accepts plain ASCII and latin-1 high bytes`() {
         assertTrue(BodyPreview.isProbablyText("plain ascii text".toByteArray(StandardCharsets.US_ASCII)))

From d0d76f89cdae7d3f9cf48841d180c0d083bbca9b Mon Sep 17 00:00:00 2001
From: OmarAlJarrah <o.mazari.om63@gmail.com>
Date: Tue, 16 Jun 2026 23:45:31 +0300
Subject: [PATCH 3/3] refactor: memoise the wide-charset probe and clarify the
 binary summary

Polish the BodyPreview seam in response to review:

- Memoise encodesAsciiWithNul per Charset in a ConcurrentHashMap so the
  ASCII probe encodes at most once per distinct charset instead of on
  every render; subsequent renders are a plain map lookup.
- Rename the binary summary to [binary N bytes captured]. The byte count
  is the bounded preview snapshot, not necessarily the full body length,
  so the wording no longer implies a size it never observed.
- Drop the unreachable empty-array branch in previewText; render already
  returns early for empty input before any decode path is reached.
---
 .../core/http/pipeline/steps/BodyPreview.kt   | 47 +++++++++++++------
 .../http/pipeline/steps/BodyPreviewTest.kt    |  4 +-
 .../pipeline/steps/InstrumentationStepTest.kt |  2 +-
 3 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt
index b2f4c347..4954eae0 100644
--- a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt
+++ b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt
@@ -9,6 +9,7 @@ package org.dexpace.sdk.core.http.pipeline.steps
 
 import org.dexpace.sdk.core.http.common.MediaType
 import java.nio.charset.Charset
+import java.util.concurrent.ConcurrentHashMap
 
 /**
  * Renders captured request/response body bytes into a short, log-safe preview string.
@@ -23,8 +24,10 @@ import java.nio.charset.Charset
  *   names one the JVM does not recognise, the preview falls back to [DEFAULT_CHARSET] (UTF-8),
  *   matching the HTTP default for most text types and the previous behaviour.
  * - **Binary-safe** — a body that does not look like text is never decoded. Instead it is
- *   summarised as `[binary N bytes]`, so an operator sees a stable, greppable marker rather
- *   than a line of replacement characters that can corrupt log viewers.
+ *   summarised as `[binary N bytes captured]`, so an operator sees a stable, greppable marker
+ *   rather than a line of replacement characters that can corrupt log viewers. `N` is the number
+ *   of bytes in the (already bounded) preview snapshot — i.e. how much was captured, not
+ *   necessarily the full body length — so the summary cannot imply a size it didn't observe.
  *
  * Decoding itself never throws: [Charset]-based decoding substitutes the replacement character
  * for malformed input, so a snapshot that ends mid-multibyte-sequence (a real possibility on a
@@ -38,8 +41,9 @@ import java.nio.charset.Charset
  * first: when the [MediaType] explicitly declares a *resolvable* charset whose encoding is not
  * NUL-free for ASCII (UTF-16/UTF-32 and friends), the body is decoded with that charset and the
  * NUL-based heuristic is skipped. When no charset is declared, or the declared one is single-byte
- * (US-ASCII, ISO-8859-1, UTF-8), the heuristic still runs so a genuinely binary body is summarised
- * rather than decoded into noise.
+ * (US-ASCII, ISO-8859-1, UTF-8), the heuristic still runs so a genuinely binary body is
+ * summarised rather than decoded into noise. The wide-charset probe is memoised per [Charset], so
+ * the cost is a single small encode the first time a charset is seen and a map lookup thereafter.
  *
  * ## Best-effort detection
  * The heuristic is **best-effort**, not a guarantee. It samples only the first
@@ -87,8 +91,8 @@ internal object BodyPreview {
      * Empty input yields the empty string. When [mediaType] explicitly declares a resolvable
      * multi-byte charset (UTF-16/UTF-32 and friends), the bytes are decoded with that charset and
      * the NUL-based binary heuristic is skipped — see the class KDoc. Otherwise a body that does
-     * not pass [isProbablyText] is rendered as a size-only `[binary N bytes]` summary, and a body
-     * that does is decoded with the charset from [mediaType] (or [DEFAULT_CHARSET] when
+     * not pass [isProbablyText] is rendered as a size-only `[binary N bytes captured]` summary, and
+     * a body that does is decoded with the charset from [mediaType] (or [DEFAULT_CHARSET] when
      * absent/unknown).
      */
     internal fun render(
@@ -108,23 +112,34 @@ internal object BodyPreview {
 
     /**
      * Decodes [bytes] using [charset]. Invalid byte sequences are replaced rather than throwing.
+     * [render] guarantees a non-empty array before calling this.
      */
     private fun previewText(
         bytes: ByteArray,
         charset: Charset,
-    ): String {
-        if (bytes.isEmpty()) return ""
-        return String(bytes, charset)
-    }
+    ): String = String(bytes, charset)
+
+    /** Memoised results of [computeEncodesAsciiWithNul], keyed by [Charset] (a JVM singleton). */
+    private val asciiWithNulByCharset = ConcurrentHashMap<Charset, Boolean>()
 
     /**
      * True when [charset] encodes a plain ASCII character to a byte sequence that contains a NUL
      * byte — the signature of a fixed-width multi-byte charset such as UTF-16 or UTF-32, where
      * `'A'` becomes e.g. `0x00 0x41`. Single-byte charsets (US-ASCII, ISO-8859-1) and UTF-8
-     * encode ASCII without NUL padding and return false. The probe is computed from the charset's
-     * own encoder, so it covers any such charset the JVM knows, not a hard-coded name list.
+     * encode ASCII without NUL padding and return false.
+     *
+     * The result is memoised per charset so the encode runs at most once per distinct [Charset];
+     * subsequent renders for that charset are a plain map lookup with no per-call allocation.
      */
     private fun encodesAsciiWithNul(charset: Charset): Boolean =
+        asciiWithNulByCharset.computeIfAbsent(charset) { computeEncodesAsciiWithNul(it) }
+
+    /**
+     * Computes the [encodesAsciiWithNul] verdict from the charset's own encoder, so it covers any
+     * such charset the JVM knows rather than a hard-coded name list. An encoder that rejects the
+     * probe is treated as non-wide (the heuristic path then applies).
+     */
+    private fun computeEncodesAsciiWithNul(charset: Charset): Boolean =
         try {
             "A".toByteArray(charset).any { it.toInt() == NUL }
         } catch (_: Exception) {
@@ -161,6 +176,10 @@ internal object BodyPreview {
     private fun isControlByte(b: Int): Boolean =
         (b < FIRST_PRINTABLE && b != TAB && b != LINE_FEED && b != CARRIAGE_RETURN) || b == DEL
 
-    /** The size-only summary emitted for a binary body. */
-    private fun binarySummary(size: Int): String = "[binary $size bytes]"
+    /**
+     * The size-only summary emitted for a binary body. [size] is the captured/preview byte count
+     * (the input is an already-bounded snapshot), so the wording says "captured" rather than
+     * implying it observed the full body length.
+     */
+    private fun binarySummary(size: Int): String = "[binary $size bytes captured]"
 }
diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt
index b9939570..44356c26 100644
--- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt
+++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt
@@ -56,7 +56,7 @@ class BodyPreviewTest {
         // gzip magic header (0x1f 0x8b 0x08) followed by a NUL byte — a representative binary body.
         val bytes = byteArrayOf(0x1f, 0x8b.toByte(), 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03)
         val preview = BodyPreview.render(bytes, MediaType.parse("application/gzip"))
-        assertEquals("[binary ${bytes.size} bytes]", preview)
+        assertEquals("[binary ${bytes.size} bytes captured]", preview)
         assertFalse(preview.contains('�'), "binary body must not be rendered as replacement chars")
     }
 
@@ -65,7 +65,7 @@ class BodyPreviewTest {
         // Even if a server mislabels a gzip stream as text, the NUL byte keeps it from being decoded.
         val bytes = byteArrayOf(0x1f, 0x8b.toByte(), 0x00, 0x42, 0x00, 0x13)
         val preview = BodyPreview.render(bytes, MediaType.parse("text/plain;charset=utf-8"))
-        assertEquals("[binary ${bytes.size} bytes]", preview)
+        assertEquals("[binary ${bytes.size} bytes captured]", preview)
     }
 
     @Test
diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt
index f97108c3..82eca054 100644
--- a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt
+++ b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/InstrumentationStepTest.kt
@@ -713,7 +713,7 @@ class InstrumentationStepTest {
         response.close()
 
         val preview = responsePreview(fakeSlf4j)
-        assertEquals("[binary ${binary.size} bytes]", preview)
+        assertEquals("[binary ${binary.size} bytes captured]", preview)
         assertFalse(preview.contains('�'), "binary body must not be logged as replacement characters")
     }