typeshedding_cpython_docs/utils.py at main · guoci/typeshedding_cpython_docs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import json
import ast
import enum
from typing import Final, Literal
import pathlib
import re
import tomllib

# Load the project settings from `config.toml`, resolved against the current
# working directory. `tomllib.load` needs a binary file handle, hence "rb".
with open("config.toml", "rb") as f:
    config = tomllib.load(f)

# Values read from the config file; `~` in the two directory paths is expanded.
# `cpython_docs_dir` is the root under which `library/*.rst` is scanned below;
# `pyrefly_dir` is the root of the checkout holding `crates/pyrefly_bundled`.
cpython_docs_dir = pathlib.Path(config["cpython_docs_dir"]).expanduser()
cpython_branch = config["cpython_branch"]
pyrefly_dir = pathlib.Path(config["pyrefly_dir"]).expanduser()
modify_docs = config["modify_docs"]  # not used in this file; presumably read by importers

# Single-member enum whose only value, `MISSING`, is exported as a constant.
# NOTE(review): presumably a "no value supplied" sentinel that stays distinct
# from `None`; it is not used in this file, so confirm against the importers.
_Missing = enum.Enum("_Missing", "MISSING")
MISSING: Final = _Missing.MISSING

# rst directive types for Python APIs
# The list must stay alphabetically sorted (asserted further down in this file).
directive_types: Final = [
    "attribute",
    "awaitablefunction",
    "awaitablemethod",
    "class",
    "classmethod",
    "data",
    "decorator",
    "exception",
    "function",
    "method",
    "monitoring-event",  # only in `sys.monitoring.rst`
    # Implemented in: https://github.com/python/cpython/blob/main/Doc/tools/extensions/pyspecific.py
    "staticmethod",
    # "describe", # ~53 usages in cpython docs, majority in `stdtypes.rst`
    # Documentation: https://www.sphinx-doc.org/en/master/usage/domains/standard.html#directive-describe
]

# Regex alternation ("a|b|c") of the escaped directive names above.
directive_types_regex = "|".join(map(re.escape, directive_types))

# Same alternation, extended with the two module-scoping directives.
directive_types_regex_with_module = "|".join(
    map(re.escape, directive_types + ["module", "currentmodule"])
)

# Older pattern: matches an unindented `.. <directive>:: ` head and then, lazily
# and across newlines, everything up to a newline that is followed either by a
# non-whitespace character (i.e. the next line starting in column 0) or by the
# end of the text.
rec_directive_v0 = re.compile(
    rf"""^(?P<api_type_head>\.\. (?P<directive_type>{directive_types_regex})::\s+)(?s:.+?)\n(?=\S|\Z)""",
    re.MULTILINE,
)  # match .. directive:: ... \n\n ... until next .. directive:: or end of file


# Matches a (possibly indented) API directive together with its body.
# Groups:
#   indent          leading spaces before the `..`
#   api_type_head   the `.. <directive>:: ` prefix including trailing spaces
#   directive_type  the directive name (one of `directive_types`)
#   first_api       the remainder of the directive line
# The trailing repetition consumes following lines that are either indented by
# at least one space more than `indent`, or consist only of spaces (or are empty).
rec_directive = re.compile(
    rf"^(?P<indent> *)(?P<api_type_head>\.\. (?P<directive_type>{directive_types_regex}):: +)(?P<first_api>.+)"
    r"(?:\n(?:(?P=indent) .+| *)$)*",
    re.MULTILINE,
)

# Same shape as `rec_directive`, but also accepts `module` / `currentmodule`.
rec_directive_with_module = re.compile(
    rf"^(?P<indent> *)(?P<api_type_head>\.\. (?P<directive_type>{directive_types_regex_with_module}):: +)(?P<first_api>.+)"
    r"(?:\n(?:(?P=indent) .+| *)$)*",
    re.MULTILINE,
)

# Guard: `directive_types` must be kept in sorted order.
assert sorted(directive_types) == directive_types


def dedent(text):
    """Strip the leading whitespace shared by every non-blank line of `text`.

    Works like a dedent for triple-quoted strings, with one addition: the
    result is a 2-tuple ``(dedented_text, margin)`` where `margin` is the
    number of leading characters that were removed from each line.

    Only spaces and tabs count as indentation, and they are compared
    character by character, so a line indented with two spaces and a line
    indented with one tab share no common margin.

    Lines made up solely of whitespace do not influence the margin and are
    emitted as empty lines.

    Raises TypeError when `text` cannot be split like a str.
    """
    try:
        rows = text.split("\n")
    except (AttributeError, TypeError):
        raise TypeError(
            f"expected str object, not {type(text).__qualname__!r}"
        ) from None

    # The lexicographically smallest and largest content lines bracket all
    # the others, so the prefix they share is shared by every content line
    # (same trick as ``os.path.commonprefix()``).
    content_rows = [row for row in rows if row and not row.isspace()]
    lowest = min(content_rows, default="")
    highest = max(content_rows, default="")

    margin = 0
    for low_ch, high_ch in zip(lowest, highest):
        if low_ch != high_ch or low_ch not in " \t":
            break
        margin += 1

    stripped_rows = ["" if row.isspace() else row[margin:] for row in rows]
    return "\n".join(stripped_rows), margin


# Collect every directive name used in the top-level `library/*.rst` files of
# the CPython docs checkout. A directive is recognised as `.. <name>:: ` at the
# start of a line (after optional spaces) and must be followed by a space.
all_directives = set()
rec = re.compile(r"^ *?\.\. ([^ ]+?):: ", re.MULTILINE)
for e in sorted((cpython_docs_dir / "library").glob("*.rst")):
    all_directives.update(rec.findall(e.read_text(encoding="utf-8")))
# Every directive type handled by this tool must actually occur in the docs.
assert set(directive_types) - set(all_directives) == set(), (
    f"some directive types are not used in cpython docs, {set(directive_types) - set(all_directives)}"
)


# directive types known to be used in CPython docs as of 2026-02-12 on the `main` branch
directives_last_good = """attribute audit-event availability awaitablefunction awaitablemethod
caution class classmethod cmdoption code code-block currentmodule
data decorator deprecated deprecated-removed describe doctest
envvar exception figure function highlight hint image impl-detail include index
list-table literalinclude method module monitoring-event note
only opcode option pdbcommand productionlist program rubric raw
seealso sidebar staticmethod
tabularcolumns testcleanup testcode testoutput testsetup versionadded versionchanged warning"""

# once a new directive type is added to cpython, we should decide whether to add it to `directive_types` or not
# An unknown directive aborts the import so that the decision is not skipped.
new_directives = all_directives - set(directives_last_good.split())
assert new_directives == set(), (
    f"new directive types added in cpython docs {sorted(new_directives)}"
)
# Directives that disappeared from the docs are only reported, not treated as an error.
removed_directives = set(directives_last_good.split()) - set(all_directives)
if removed_directives:
    print("directive types removed in cpython docs:")
    print(sorted(removed_directives))

# Informational: directives found in the docs that are not Python API directives.
print("non python api directive types:")
print(sorted(set(all_directives) - set(directive_types)))


def check_trailing_spaces():
    for rst_path in sorted((cpython_docs_dir / "library").glob("**/*.rst")):
        a = re.compile(r"[ \t\r\f\v]+$", re.MULTILINE).finditer(
            rst_path.read_text(encoding="utf-8")
        )
        for e in a:
            print(f"trailing spaces found in {rst_path}")
            sp = e.span()
            print(repr(e[0]))
            print(repr(e.string[sp[0] - 10 : sp[1] + 10]))
            print(repr(e.string[sp[0] - 100 : sp[1] + 100]))
            raise AssertionError("trailing spaces found")


# Run the check at import time; it raises AssertionError on the first trailing whitespace found.
check_trailing_spaces()


def check_module_directive_format():
    rec = re.compile(r"^( *)\.\. (?:currentmodule|module)::( *)(.*)$", re.MULTILINE)
    for rst_path in sorted((cpython_docs_dir / "library").glob("**/*.rst")):
        txt = rst_path.read_text(encoding="utf-8")
        for e in rec.findall(txt):
            ind, spaces, mod_name = e
            assert ind == "", (
                f"unexpected indent before .. currentmodule or .. module in {rst_path}: {e!r}"
            )
            assert spaces != "", (
                f"expected space after .. currentmodule or .. module in {rst_path}: {e!r}"
            )
            assert mod_name != "", (
                f"expected module name after .. currentmodule or .. module in {rst_path}: {e!r}"
            )


# Run the check at import time; a malformed `.. module::` / `.. currentmodule::` line aborts the import.
check_module_directive_format()


def check_for_tabs():
    for rst_path in sorted((cpython_docs_dir / "library").glob("**/*.rst")):
        try:
            txt = rst_path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            print(f"cannot read {rst_path} as utf-8")
            continue
        if "\t" in txt:
            print(f"tab found in {rst_path}")


# Run the advisory tab check at import time; findings are printed, nothing is raised for them.
check_for_tabs()


def get_github_sha(owner: str, repo: str, *, use_hardcoded: bool = True) -> str:
    """Return a commit SHA for the GitHub repository ``owner/repo``.

    By default no network access happens: a reminder is printed and the
    pinned, abbreviated SHA ``"2b8928"`` is returned, which keeps generated
    output stable between runs.

    Args:
        owner: GitHub account or organisation that owns the repository.
        repo: Repository name.
        use_hardcoded: When true (the default), print the reminder and
            return the pinned value. When false, ask the GitHub REST API for
            the head commit of the repository's default branch instead.

    Returns:
        The pinned abbreviated SHA, or the full SHA reported by GitHub.

    Raises:
        urllib.error.URLError: (live lookup only) if a request fails.
    """
    if use_hardcoded:
        # The literal's own indentation is part of the printed text.
        # NOTE(review): the reminder quotes f689b10e... while the value
        # returned is "2b8928" -- confirm both are still the intended pins.
        print("""
    check here for updates to the bundled typeshed used by pyrefly:
    https://github.com/facebook/pyrefly/tree/main/crates/pyrefly_bundled/third_party/typeshed
    using the hardcoded sha to reduce diffs
    f689b10e67021a7313e7cb2c21de1ccc03a4c66d
    """)
        return "2b8928"

    # Imported lazily: only the opt-in live lookup needs urllib.
    from urllib.request import Request, urlopen

    headers = {
        "Accept": "application/vnd.github+json",
        "User-Agent": "python-urllib",
    }
    base_url = f"https://api.github.com/repos/{owner}/{repo}"
    # First request: discover the name of the default branch.
    with urlopen(Request(base_url, headers=headers)) as r:
        default_branch = json.load(r)["default_branch"]

    url = f"{base_url}/branches/{default_branch}"

    # Second request: read the head commit of that branch.
    with urlopen(Request(url, headers=headers)) as resp:
        data = json.load(resp)

    return data["commit"]["sha"]


# pyrefly_revision = get_github_sha("facebook", "pyrefly")

# Read the metadata file shipped next to pyrefly's bundled typeshed and keep
# the last path component of its "url" entry (presumably the typeshed commit
# the bundle was taken from -- confirm against the metadata file).
# The encoding is given explicitly, like every other read in this module, so
# the result does not depend on the platform's locale.
with (
    pyrefly_dir
    / "crates"
    / "pyrefly_bundled"
    / "third_party"
    / "typeshed_metadata.json"
).open("r", encoding="utf-8") as f:
    typeshed_revision = json.load(f)["url"].rsplit("/", 1)[-1]


def remove_self_parameter(s: str) -> str:
    """
    Remove the annotated 'self' parameter from a method signature string.

    Thin wrapper around `remove_self_or_cls_parameter` with the parameter
    name fixed to "self". A signature without an annotated `self:` parameter
    is returned unchanged.

    Examples:
    >>> remove_self_parameter('method(self: int, a, b)')
    'method(a, b)'
    >>> remove_self_parameter('classmethod(self: classmethod[_T, _P, _R_co], f: (type[_T], ParamSpec(_P)) -> _R_co, /) -> None')
    'classmethod(f: (type[_T], ParamSpec(_P)) -> _R_co, /) -> None'
    >>> remove_self_parameter('args[_T](self: partial[_T]) -> tuple[Any, ...]')
    'args[_T]() -> tuple[Any, ...]'
    >>> remove_self_parameter("ArgumentParser.add_subparsers(self: _ArgumentParserT, *, title: str = 'subcommands', description: str | None = None, prog: str | None = None, action: type[Action] = ..., option_string: str = ..., dest: str | None = None, required: bool = False, help: str | None = None, metavar: str | None = None) -> _SubParsersAction[_ArgumentParserT]")
    "ArgumentParser.add_subparsers(*, title: str = 'subcommands', description: str | None = None, prog: str | None = None, action: type[Action] = ..., option_string: str = ..., dest: str | None = None, required: bool = False, help: str | None = None, metavar: str | None = None) -> _SubParsersAction[_ArgumentParserT]"
    """
    return remove_self_or_cls_parameter(s, "self")


def remove_cls_parameter(s: str) -> str:
    """Remove the annotated 'cls' parameter from a method signature string.

    Thin wrapper around `remove_self_or_cls_parameter` with the parameter
    name fixed to "cls".
    """
    return remove_self_or_cls_parameter(s, "cls")


def remove_self_or_cls_parameter(s: str, pname: Literal["self", "cls"]) -> str:
    """Strip the annotated `pname` parameter from a signature string.

    The parameter is located by the text ``"<pname>:"``. Because the
    annotation may itself contain commas and parentheses, its end is found
    by growing the candidate up to each following ``)`` or ``,`` until
    ``def f(<candidate>): ...`` parses as valid Python.

    Args:
        s: Signature text, e.g. ``"method(self: int, a, b)"``.
        pname: Name of the parameter to drop.

    Returns:
        `s` without the parameter (and without the ``", "`` that followed
        it). `s` is returned unchanged when it contains no ``"<pname>:"``.

    Raises:
        AssertionError: if the text after the parameter starts with neither
            ``")"`` nor ``", "``. Raised explicitly so the check also runs
            under ``python -O``.
    """
    # Anchor on a word boundary so that longer names ending in `pname`
    # (e.g. "metacls:" or "myself:") are not mistaken for the parameter.
    found = re.search(rf"\b{re.escape(pname)}:", s)
    if found is None:
        return s
    param_start = found.start()
    search_from = found.end()
    # Initialised up front so that a signature ending right after "<pname>:"
    # reaches the sanity check below instead of failing on an unbound local.
    param_end = search_from
    while search_from < len(s):
        delimiters = [
            pos
            for pos in (s.find(")", search_from), s.find(",", search_from))
            if pos != -1
        ]
        param_end = min(delimiters, default=len(s))
        try:
            # A candidate that parses as a lone parameter is the whole parameter.
            ast.parse("def f(" + s[param_start:param_end] + "):...")
        except SyntaxError:
            # The delimiter was inside the annotation; look past it.
            search_from = param_end + 1
        else:
            break

    if not s.startswith((")", ", "), param_end):
        raise AssertionError(
            f"unexpected: {s!r} idx={param_end!r} s[idx:]={s[param_end:]!r}"
        )
    if s.startswith(", ", param_end):
        param_end += 2
    return s[:param_start] + s[param_end:]