openrewrite · XiaoSK · May 11, 2026 · May 11, 2026
diff --git a/rewrite-core/src/main/java/org/openrewrite/internal/EncodingDetectingInputStream.java b/rewrite-core/src/main/java/org/openrewrite/internal/EncodingDetectingInputStream.java
@@ -148,7 +148,21 @@ public synchronized String toString() {
                 bos.write(buffer, 0, n);
             }
 
-            return bos.toString();
+            Charset detectedCharset = getCharset();
+            String result = bos.toString();
+
+            // If the detected charset can't re-encode the content, the detection
+            // was likely wrong — fall back to UTF-8. This catches cases where
+            // UTF-8 files with CJK content are misdetected as Windows-1252,
+            // since Windows-1252 has undefined byte positions that produce
+            // U+FFFD (which can't be re-encoded as Windows-1252).
+            if (detectedCharset != StandardCharsets.UTF_8 &&
+                    !detectedCharset.newEncoder().canEncode(result)) {
+                charset = StandardCharsets.UTF_8;
+                return new String(bos.toByteArray(), StandardCharsets.UTF_8);
+            }
+
+            return result;
         } catch (IOException e) {
             throw new UncheckedIOException(e);
         }

diff --git a/rewrite-core/src/test/java/org/openrewrite/internal/EncodingDetectingInputStreamTest.java b/rewrite-core/src/test/java/org/openrewrite/internal/EncodingDetectingInputStreamTest.java
@@ -172,6 +172,52 @@ void readFullyDecodesIso8859Correctly() {
         assertThat(result).isEqualTo("Hütte");
     }
 
+    @Test
+    void utf8WithCjkAndStrayByteReDecodedAsUtf8() {
+        // "配" is E9 85 8D in UTF-8; this test relies on 0x8D being undefined in
+        // Windows-1252. A stray 0xAA (a lone continuation byte, invalid UTF-8) is added.
+        // Without the fix, this input would be detected as Windows-1252, causing
+        // garbled content and UnmappableCharacterException on write-back.
+        // With the fix, readFully() notices the text cannot be re-encoded and
+        // re-decodes as UTF-8; the stray byte stays in the input, hence `contains` below.
+        String content = "# app配置，详见 http://example.com\n";
+        byte[] utf8Bytes = content.getBytes(UTF_8);
+        // Insert the stray 0xAA at byte offset 5, right after the ASCII prefix "# app",
+        // so that no multi-byte UTF-8 sequence is split by the insertion
+        byte[] withStrayByte = new byte[utf8Bytes.length + 1];
+        System.arraycopy(utf8Bytes, 0, withStrayByte, 0, 5);
+        withStrayByte[5] = (byte) 0xAA; // the stray byte, placed just before the first CJK character
+        System.arraycopy(utf8Bytes, 5, withStrayByte, 6, utf8Bytes.length - 5);
+
+        EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(withStrayByte));
+        String result = is.readFully();
+
+        assertThat(is.getCharset()).isEqualTo(UTF_8);
+        assertThat(result).contains("配置");
+    }
+
+    @Test
+    void utf8WithCjkContentDetectedAsUtf8() {
+        // Baseline: CJK text encoded as UTF-8 with no stray bytes must be detected as UTF-8 and decode unchanged
+        String cjk = "# 配置，详见 http://example.com\n# 关闭内置tomcat\n";
+        byte[] bytes = cjk.getBytes(UTF_8);
+        EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(bytes));
+        String result = is.readFully();
+        assertThat(is.getCharset()).isEqualTo(UTF_8);
+        assertThat(result).isEqualTo(cjk);
+    }
+
+    @Test
+    void genuineWindows1252StillDetectedAsWindows1252() {
+        // Regression guard: Western European text encoded as Windows-1252 must keep that charset and decode unchanged
+        String french = "Café résumé naïve";
+        byte[] bytes = french.getBytes(WINDOWS_1252);
+        EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(bytes));
+        String result = is.readFully();
+        assertThat(is.getCharset()).isEqualTo(WINDOWS_1252);
+        assertThat(result).isEqualTo(french);
+    }
+
     private static byte[] parseHex(String hex) {
         String[] parts = hex.trim().split("\\s+");
         byte[] bytes = new byte[parts.length];