Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,21 @@ public synchronized String toString() {
bos.write(buffer, 0, n);
}

return bos.toString();
Charset detectedCharset = getCharset();
String result = bos.toString();

// If the detected charset can't re-encode the content, the detection
// was likely wrong — fall back to UTF-8. This catches cases where
// UTF-8 files with CJK content are misdetected as Windows-1252,
// since Windows-1252 has undefined byte positions that produce
// U+FFFD (which can't be re-encoded as Windows-1252).
if (detectedCharset != StandardCharsets.UTF_8 &&
!detectedCharset.newEncoder().canEncode(result)) {
charset = StandardCharsets.UTF_8;
return new String(bos.toByteArray(), StandardCharsets.UTF_8);
}

return result;
} catch (IOException e) {
throw new UncheckedIOException(e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,52 @@ void readFullyDecodesIso8859Correctly() {
assertThat(result).isEqualTo("Hütte");
}

@Test
void utf8WithCjkAndStrayByteReDecodedAsUtf8() {
    // Scenario: a UTF-8 file with Chinese text plus one corrupt byte.
    //  - "配置" encodes to bytes that include 0x8D, a position Windows-1252 leaves undefined.
    //  - The extra 0xAA byte is not valid UTF-8 on its own.
    // The expectation is that readFully() ends up reporting UTF-8 and that the
    // Chinese characters survive decoding instead of being garbled by a
    // Windows-1252 interpretation (which would also break writing the text back).
    String content = "# app配置,详见 http://example.com\n";
    byte[] encoded = content.getBytes(UTF_8);

    // Offset 5 is right after the ASCII prefix "# app", i.e. just before the first
    // CJK character, so the stray byte never lands inside a multi-byte sequence.
    int insertAt = 5;
    byte[] corrupted = new byte[encoded.length + 1];
    for (int i = 0; i < corrupted.length; i++) {
        if (i < insertAt) {
            corrupted[i] = encoded[i];
        } else if (i == insertAt) {
            corrupted[i] = (byte) 0xAA; // the stray byte
        } else {
            corrupted[i] = encoded[i - 1];
        }
    }

    ByteArrayInputStream source = new ByteArrayInputStream(corrupted);
    EncodingDetectingInputStream detecting = new EncodingDetectingInputStream(source);
    String decoded = detecting.readFully();

    assertThat(detecting.getCharset()).isEqualTo(UTF_8);
    assertThat(decoded).contains("配置");
}

@Test
void utf8WithCjkContentDetectedAsUtf8() {
    // Well-formed UTF-8 containing CJK characters must be reported as UTF-8
    // and must round-trip to exactly the original text.
    String expected = "# 配置,详见 http://example.com\n# 关闭内置tomcat\n";
    ByteArrayInputStream source = new ByteArrayInputStream(expected.getBytes(UTF_8));
    EncodingDetectingInputStream detecting = new EncodingDetectingInputStream(source);

    String decoded = detecting.readFully();

    assertThat(detecting.getCharset()).isEqualTo(UTF_8);
    assertThat(decoded).isEqualTo(expected);
}

@Test
void genuineWindows1252StillDetectedAsWindows1252() {
    // Guard against over-correcting: Western European text that really is
    // Windows-1252 encoded must keep that charset and decode unchanged.
    String expected = "Café résumé naïve";
    ByteArrayInputStream source = new ByteArrayInputStream(expected.getBytes(WINDOWS_1252));
    EncodingDetectingInputStream detecting = new EncodingDetectingInputStream(source);

    String decoded = detecting.readFully();

    assertThat(detecting.getCharset()).isEqualTo(WINDOWS_1252);
    assertThat(decoded).isEqualTo(expected);
}

private static byte[] parseHex(String hex) {
String[] parts = hex.trim().split("\\s+");
byte[] bytes = new byte[parts.length];
Expand Down
Loading