Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 74 additions & 52 deletions gapic/utils/rst.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,53 @@
# limitations under the License.

import re
from typing import Optional
from typing import Optional, Dict

import pypandoc # type: ignore

from gapic.utils.lines import wrap

# Cache for the few complex items we actually send to pandoc
_RAW_RST_CACHE: Dict[str, str] = {}

def _tuned_fast_convert(text: str) -> Optional[str]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This new conversion logic in _tuned_fast_convert is a critical part of the performance improvement, but it lacks unit tests. Additionally, the changes in the rst function appear to break the existing tests in tests/unit/utils/test_rst.py (e.g., test_rst_formatted), as they now bypass the mocked pypandoc.convert_text call.

It's crucial to:

  1. Add comprehensive unit tests for _tuned_fast_convert covering all the regex conversions and edge cases (links, code blocks, headings, lists).
  2. Update the existing tests for the rst function to reflect the new logic, including caching and the fast-path converter.

Without proper tests, it's hard to verify the correctness of these performance optimizations and prevent future regressions.

"""
Converts Markdown to RST using pure Python.
Only falls back to Pandoc for Tables and Images.
"""
# --- 1. FALLBACKS ---
# Tables (pipe surrounded by spaces) or Images (![).
# We allow "][" (Reference Links) to be handled by Python now.
if (re.search(r" \| ", text) or re.search(r"\|\n", text)) or "![" in text:
return None

# --- 2. CONVERSION ---

# A. CODE BLOCKS: `code` -> ``code``
# CRITICAL: Run this FIRST. This ensures we handle existing backticks
# before we create NEW backticks for links.
# (?<!:) ensures we don't break Sphinx roles like :class:`MyClass`
converted = re.sub(r"(?<!:|`)`([^`]+)`(?!`)", r"``\1``", text)

# B. REFERENCE LINKS: [Text][Ref] -> `Text <Ref>`__
# We fix the broken documentation by converting these to valid RST links.
# Since step A is done, these new backticks will NOT be doubled.
converted = re.sub(r"\[([^\]]+)\]\[([^\]]+)\]", r"`\1 <\2>`__", converted)

# C. STANDARD LINKS: [Text](URL) -> `Text <URL>`__
converted = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r"`\1 <\2>`__", converted)

# D. BOLD/ITALICS:
converted = re.sub(r"(?<!_)\b_([^_]+)_\b(?!_)", r"*\1*", converted)

# E. HEADINGS: # Heading -> Heading\n=======
converted = re.sub(r"^# (.*)$", r"\1\n" + "=" * 10, converted, flags=re.MULTILINE)
converted = re.sub(r"^## (.*)$", r"\1\n" + "-" * 10, converted, flags=re.MULTILINE)
Comment on lines +56 to +57
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The RST specification requires that heading underlines are at least as long as the heading text. Using a fixed length of 10 characters for the underline can result in invalid RST for headings longer than 10 characters, which would break the documentation build. The replacement in re.sub should be a function that calculates the correct underline length based on the matched heading text.

Suggested change
converted = re.sub(r"^# (.*)$", r"\1\n" + "=" * 10, converted, flags=re.MULTILINE)
converted = re.sub(r"^## (.*)$", r"\1\n" + "-" * 10, converted, flags=re.MULTILINE)
converted = re.sub(r"^# (.*)$", lambda m: f"{m.group(1)}\n{'=' * len(m.group(1))}", converted, flags=re.MULTILINE)
converted = re.sub(r"^## (.*)$", lambda m: f"{m.group(1)}\n{'-' * len(m.group(1))}", converted, flags=re.MULTILINE)


# F. LISTS: Markdown (- item) needs a preceding newline for RST.
converted = re.sub(r"(\n[^-*].*)\n\s*([-*] )", r"\1\n\n\2", converted)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The regex for list conversion only handles unordered lists starting with - or *. It does not handle numbered lists (e.g., 1. item). This will cause numbered lists to be rendered incorrectly, as they also require a blank line preceding them in RST. The logic should be updated to also handle numbered lists.


return converted

def rst(
text: str,
Expand All @@ -27,60 +68,41 @@ def rst(
nl: Optional[bool] = None,
source_format: str = "commonmark",
):
"""Convert the given text to ReStructured Text.

Args:
text (str): The text to convert.
width (int): The number of columns.
indent (int): The number of columns to indent each line of text
(except the first).
nl (bool): Whether to append a trailing newline.
Defaults to appending a newline if the result is more than
one line long.
source_format (str): The source format. This is ``commonmark`` by
default, which is what is used by convention in protocol buffers.

Returns:
str: The same text, in RST format.
"""
# Quick check: If the text block does not appear to have any formatting,
# do not convert it.
# (This makes code generation significantly faster; calling out to pandoc
# is by far the most expensive thing we do.)
if not re.search(r"[|*`_[\]]", text):
answer = wrap(
text,
indent=indent,
offset=indent + 3,
width=width - indent,
)
# 1. Super Fast Path: No special chars? Just wrap.
if not re.search(r"[|*`_[\]#]", text):
answer = wrap(text, indent=indent, offset=indent + 3, width=width - indent)
return _finalize(answer, nl, indent)

# 2. Check Cache
if text in _RAW_RST_CACHE:
raw_rst = _RAW_RST_CACHE[text]
else:
# Convert from CommonMark to ReStructured Text.
answer = (
pypandoc.convert_text(
text,
"rst",
format=source_format,
verify_format=False,
extra_args=["--columns=%d" % (width - indent)],
)
.strip()
.replace("\n", f"\n{' ' * indent}")
)

# Add a newline to the end of the document if any line breaks are
# already present.
#
# This causes the closing """ to be on the subsequent line only when
# appropriate.
# 3. Try Tuned Python Convert (Fastest)
fast_result = _tuned_fast_convert(text)

if fast_result is not None:
raw_rst = fast_result.strip()
else:
# 4. Fallback to Pandoc (Only for Tables/Images)
raw_rst = pypandoc.convert_text(
text, "rst", format=source_format, extra_args=["--columns=1000"]
).strip()

_RAW_RST_CACHE[text] = raw_rst

# 5. Python Formatting
if "::" in raw_rst or ".. code" in raw_rst:
answer = raw_rst.replace("\n", f"\n{' ' * indent}")
else:
answer = wrap(raw_rst, indent=indent, offset=indent, width=width - indent)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The call to wrap here uses offset=indent, while the "Super Fast Path" on line 73 uses offset=indent + 3. This inconsistency can lead to different wrapping for plain text versus text with markdown, resulting in inconsistent formatting. For consistent output, the offset should be the same in both places.

Suggested change
answer = wrap(raw_rst, indent=indent, offset=indent, width=width - indent)
answer = wrap(raw_rst, indent=indent, offset=indent + 3, width=width - indent)


return _finalize(answer, nl, indent)


def _finalize(answer, nl, indent):
"""Helper to handle trailing newlines and quotes."""
if nl or ("\n" in answer and nl is None):
answer += "\n" + " " * indent

# If the text ends in a double-quote, append a period.
# This ensures that we do not get a parse error when this output is
# followed by triple-quotes.
if answer.endswith('"'):
answer += "."

# Done; return the answer.
return answer
Loading