diff --git a/rewrite-core/src/main/java/org/openrewrite/internal/EncodingDetectingInputStream.java b/rewrite-core/src/main/java/org/openrewrite/internal/EncodingDetectingInputStream.java
index 8b0924460c9..7ed411de26d 100644
--- a/rewrite-core/src/main/java/org/openrewrite/internal/EncodingDetectingInputStream.java
+++ b/rewrite-core/src/main/java/org/openrewrite/internal/EncodingDetectingInputStream.java
@@ -148,7 +148,21 @@ public synchronized String toString() {
                 bos.write(buffer, 0, n);
             }
 
-            return bos.toString();
+            Charset detectedCharset = getCharset();
+            String result = bos.toString();
+
+            // If the detected charset can't re-encode the content, the detection
+            // was likely wrong — fall back to UTF-8. This catches cases where
+            // UTF-8 files with CJK content are misdetected as Windows-1252,
+            // since Windows-1252 has undefined byte positions that produce
+            // U+FFFD (which can't be re-encoded as Windows-1252).
+            if (!StandardCharsets.UTF_8.equals(detectedCharset) &&
+                    !detectedCharset.newEncoder().canEncode(result)) {
+                charset = StandardCharsets.UTF_8;
+                return new String(bos.toByteArray(), StandardCharsets.UTF_8);
+            }
+
+            return result;
         } catch (IOException e) {
             throw new UncheckedIOException(e);
         }
diff --git a/rewrite-core/src/test/java/org/openrewrite/internal/EncodingDetectingInputStreamTest.java b/rewrite-core/src/test/java/org/openrewrite/internal/EncodingDetectingInputStreamTest.java
index 0724f4dd2e0..de3e3d5bad9 100644
--- a/rewrite-core/src/test/java/org/openrewrite/internal/EncodingDetectingInputStreamTest.java
+++ b/rewrite-core/src/test/java/org/openrewrite/internal/EncodingDetectingInputStreamTest.java
@@ -172,6 +172,52 @@ void readFullyDecodesIso8859Correctly() {
         assertThat(result).isEqualTo("Hütte");
     }
 
+    @Test
+    void utf8WithCjkAndStrayByteReDecodedAsUtf8() {
+        // UTF-8 Chinese "配置" contains byte 0x8D which is undefined in Windows-1252,
+        // plus a stray 0xAA byte (invalid in UTF-8).
+        // Without the fix, this would be detected as Windows-1252, causing
+        // garbled content and UnmappableCharacterException on write-back.
+        // With the fix, readFully() detects the canEncode failure and
+        // re-decodes as UTF-8, producing correct Chinese content.
+        String content = "# app配置,详见 http://example.com\n";
+        byte[] utf8Bytes = content.getBytes(UTF_8);
+        // Insert stray 0xAA byte at the boundary between ASCII and CJK (position 5)
+        // to avoid splitting a UTF-8 multi-byte sequence
+        byte[] withStrayByte = new byte[utf8Bytes.length + 1];
+        System.arraycopy(utf8Bytes, 0, withStrayByte, 0, 5);
+        withStrayByte[5] = (byte) 0xAA; // insert stray byte
+        System.arraycopy(utf8Bytes, 5, withStrayByte, 6, utf8Bytes.length - 5);
+
+        EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(withStrayByte));
+        String result = is.readFully();
+
+        assertThat(is.getCharset()).isEqualTo(UTF_8);
+        assertThat(result).contains("配置");
+    }
+
+    @Test
+    void utf8WithCjkContentDetectedAsUtf8() {
+        // Pure UTF-8 CJK content should be detected as UTF-8
+        String cjk = "# 配置,详见 http://example.com\n# 关闭内置tomcat\n";
+        byte[] bytes = cjk.getBytes(UTF_8);
+        EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(bytes));
+        String result = is.readFully();
+        assertThat(is.getCharset()).isEqualTo(UTF_8);
+        assertThat(result).isEqualTo(cjk);
+    }
+
+    @Test
+    void genuineWindows1252StillDetectedAsWindows1252() {
+        // Genuine Windows-1252 content (Western European) should remain Windows-1252
+        String french = "Café résumé naïve";
+        byte[] bytes = french.getBytes(WINDOWS_1252);
+        EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(bytes));
+        String result = is.readFully();
+        assertThat(is.getCharset()).isEqualTo(WINDOWS_1252);
+        assertThat(result).isEqualTo(french);
+    }
+
     private static byte[] parseHex(String hex) {
         String[] parts = hex.trim().split("\\s+");
         byte[] bytes = new byte[parts.length];