diff --git a/src/pymax/formatting/markdown.py b/src/pymax/formatting/markdown.py
index f228688..f076699 100644
--- a/src/pymax/formatting/markdown.py
+++ b/src/pymax/formatting/markdown.py
@@ -2,6 +2,10 @@
 
 
 class Formatter:
+    # Characters above this value are encoded as surrogate pairs in UTF-16,
+    # occupying 2 code units instead of 1.
+    BMP_MAX = 0xFFFF
+
     MARKERS = {
         "```": "CODE",
         "**": "STRONG",
@@ -14,6 +18,15 @@ class Formatter:
 
     MARKER_ORDER = ["```", "**", "__", "~~", "`", "_", "*"]
 
+    @staticmethod
+    def _code_units_len(text: str) -> int:
+        """Return the length of ``text`` in UTF-16 code units.
+
+        ``surrogatepass`` makes a lone surrogate count as one code unit
+        instead of raising ``UnicodeEncodeError``.
+        """
+        return len(text.encode("utf-16-le", "surrogatepass")) // 2
+
     @staticmethod
     def _parse_link(
         text: str,
@@ -64,15 +77,16 @@ def format_markdown(text: str) -> tuple[str, list[Element]]:
                     label, url, next_i = parsed_link
 
                     start = clean_pos
+                    utf16_label_len = Formatter._code_units_len(label)
 
                     clean_text += label
-                    clean_pos += len(label)
+                    clean_pos += utf16_label_len
 
                     entities.append(
                         Element(
                             type="LINK",
                             from_=start,
-                            length=len(label),
+                            length=utf16_label_len,
                             attributes=ElementAttributes(url=url),
                         )
                     )
@@ -93,9 +107,10 @@ def format_markdown(text: str) -> tuple[str, list[Element]]:
                 start = clean_pos
 
                 while i < len(text) and text[i] != "\n":
-                    clean_text += text[i]
+                    ch = text[i]
+                    clean_text += ch
                     i += 1
-                    clean_pos += 1
+                    clean_pos += 2 if ord(ch) > Formatter.BMP_MAX else 1
 
                 length = clean_pos - start
 
@@ -123,9 +138,10 @@ def format_markdown(text: str) -> tuple[str, list[Element]]:
                 start = clean_pos
 
                 while i < len(text) and text[i] != "\n":
-                    clean_text += text[i]
+                    ch = text[i]
+                    clean_text += ch
                     i += 1
-                    clean_pos += 1
+                    clean_pos += 2 if ord(ch) > Formatter.BMP_MAX else 1
 
                 length = clean_pos - start
 
@@ -211,10 +227,11 @@ def format_markdown(text: str) -> tuple[str, list[Element]]:
                 line_start = False
                 continue
 
-            clean_text += text[i]
-            line_start = text[i] == "\n"
+            ch = text[i]
+            clean_text += ch
+            line_start = ch == "\n"
             i += 1
-            clean_pos += 1
+            clean_pos += 2 if ord(ch) > Formatter.BMP_MAX else 1
 
         return clean_text, entities
 