diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index b630e13b968c7..6479cf172bcc1 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -43,6 +43,7 @@ import subprocess import sys import traceback +from typing import List from urllib.request import urlopen from urllib.request import Request from urllib.error import HTTPError @@ -700,77 +701,275 @@ def resolve_jira_issues(title, merge_branches, comment): resolve_jira_issue(merge_branches, comment, jira_id) -def standardize_jira_ref(text): - """ - Standardize the [SPARK-XXXXX] [MODULE] prefix - Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK XXX [MLLIB]: Issue" to - "[SPARK-XXX][MLLIB] Issue" - - >>> standardize_jira_ref( - ... "[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful") - '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is successful' - >>> standardize_jira_ref( - ... "[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests") - '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull requests' - >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key") - '[SPARK-5954][MLLIB] Top by key' - >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl") - '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl' - >>> standardize_jira_ref( - ... "SPARK-1094 Support MiMa for reporting binary compatibility across versions.") - '[SPARK-1094] Support MiMa for reporting binary compatibility across versions.' - >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark") - '[SPARK-1146][WIP] Vagrant support for Spark' - >>> standardize_jira_ref( - ... "SPARK-1032. If Yarn app fails before registering, app master stays aroun...") - '[SPARK-1032] If Yarn app fails before registering, app master stays aroun...' - >>> standardize_jira_ref( - ... "[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.") - '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.' - >>> standardize_jira_ref( - ... 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"') - 'Revert "[SPARK-48591][PYTHON] Simplify the if-else branches with F.lit"' - >>> standardize_jira_ref("Additional information for users building from source code") - 'Additional information for users building from source code' + +class Component: + """A Spark PR-title tag, paired with its canonical JIRA component name. + + ``jira_name`` is the canonical name of the SPARK JIRA component (e.g. + "Documentation"); empty for status markers like [MINOR] that are not + JIRA components but are still recognized in PR titles. + + ``tag`` is the preferred PR-title abbreviation (uppercase, no brackets, + e.g. "DOC"). ``aliases`` lists other accepted spellings that resolve to + the same component (e.g. "DOCS", "DOCUMENTATION" -> "DOC"). + + ``primary`` marks components whose presence alone satisfies the merge-time + requirement. Non-primary JIRA components (e.g. [TEST], [PS], [SHUFFLE]) + remain recognized — they normalize and pass through validation — but + they must be paired with a primary tag (e.g. [SQL][TEST]). Status + markers are never primary. [WIP] is intentionally absent from the + registry: a WIP PR should be aborted at the earlier WIP warning, not + merged. """ - jira_refs = [] - components = [] - # If this is a Revert PR, no need to process any further - if text.startswith('Revert "') and text.endswith('"'): - return text + def __init__(self, tag, aliases=(), primary=False, jira_name=""): + self.tag = tag + self.aliases = frozenset(aliases) + self.primary = primary + self.jira_name = jira_name - # If the string is compliant, no need to process any further - if re.search(r"^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+", text): - return text + def matches(self, token): + return token == self.tag or token in self.aliases - # Extract JIRA ref(s): - pattern = re.compile(r"(SPARK[-\s]*[0-9]{3,6})+", re.IGNORECASE) - for ref in pattern.findall(text): - # Add brackets, replace spaces with a dash, & convert to uppercase - jira_refs.append("[" + re.sub(r"\s+", "-", ref.upper()) + "]") - text = text.replace(ref, "") + @classmethod + def find(cls, token): + """Return the Component matching ``token`` (case-insensitive), or None.""" + if token is None: + return None + token = token.strip().upper() + for c in COMPONENTS: + if c.matches(token): + return c + return None - # Extract spark component(s): - # Look for alphanumeric chars, spaces, dashes, periods, and/or commas - pattern = re.compile(r"(\[[\w\s,.-]+\])", re.IGNORECASE) - for component in pattern.findall(text): - components.append(component.upper()) - text = text.replace(component, "") - # Cleanup any remaining symbols: - pattern = re.compile(r"^\W+(.*)", re.IGNORECASE) - if pattern.search(text) is not None: - text = pattern.search(text).groups()[0] - # Assemble full text (JIRA ref(s), module(s), remaining text) - clean_text = "".join(jira_refs).strip() + "".join(components).strip() + " " + text.strip() +# Full SPARK JIRA component list (sorted alphabetically by tag), followed +# by status markers. Keep in sync with the components in JIRA — fetch the +# current list with: +# curl -s https://issues.apache.org/jira/rest/api/2/project/SPARK/components +# A `primary=True` marker indicates the tag alone satisfies the merge-time +# component requirement; non-primary JIRA components must be paired with a +# primary one (e.g. [SQL][TEST], [PYTHON][PS], [CORE][SHUFFLE]). Status +# markers leave `jira_name` empty. +COMPONENTS = ( + Component("BLOCK MANAGER", ("BLOCK_MANAGER",), jira_name="Block Manager"), + Component("BUILD", primary=True, jira_name="Build"), + Component("CONNECT", primary=True, jira_name="Connect"), + Component("CORE", ("SPARK CORE", "SPARK_CORE"), primary=True, jira_name="Spark Core"), + Component("DEPLOY", jira_name="Deploy"), + Component("DOC", ("DOCS", "DOCUMENTATION"), primary=True, jira_name="Documentation"), + Component("DOCKER", primary=True, jira_name="Spark Docker"), + Component("EC2", jira_name="EC2"), + Component("EXAMPLE", ("EXAMPLES",), jira_name="Examples"), + Component("GRAPHX", primary=True, jira_name="GraphX"), + Component("INFRA", ("PROJECT INFRA", "PROJECT_INFRA"), primary=True, jira_name="Project Infra"), + Component("IO", jira_name="Input/Output"), + Component("JAVA", ("JAVA API", "JAVA_API", "JAVAAPI"), jira_name="Java API"), + Component("K8S", ("KUBERNETES",), primary=True, jira_name="Kubernetes"), + Component("MESOS", jira_name="Mesos"), + Component("ML", primary=True, jira_name="ML"), + Component("MLLIB", primary=True, jira_name="MLlib"), + Component("OPTIMIZER", jira_name="Optimizer"), + Component("PROTOBUF", jira_name="Protobuf"), + Component("PS", primary=True, jira_name="Pandas API on Spark"), + Component("PYTHON", ("PYSPARK",), primary=True, jira_name="PySpark"), + Component("R", ("SPARKR",), primary=True, jira_name="R"), + Component("REPL", ("SHELL", "SPARK SHELL", "SPARK_SHELL"), jira_name="Spark Shell"), + Component("SCHEDULER", jira_name="Scheduler"), + Component("SDP", ("PIPELINES",), primary=True, jira_name="Declarative Pipelines"), + Component("SECURITY", primary=True, jira_name="Security"), + Component("SHUFFLE", jira_name="Shuffle"), + # Component("SPARKR", jira_name="SparkR"), # SPARKR is treated as an alias of R above + Component("SQL", primary=True, jira_name="SQL"), + Component("SS", primary=True, jira_name="Structured Streaming"), + Component("STREAMING", ("DSTREAM", "DSTREAMS"), primary=True, jira_name="DStreams"), + Component("SUBMIT", jira_name="Spark Submit"), + Component("TEST", ("TESTS", "TEST-ONLY", "TESTS-ONLY"), jira_name="Tests"), + Component("UI", ("WEBUI", "WEB UI", "WEB_UI"), primary=True, jira_name="Web UI"), + Component("WINDOWS", primary=True, jira_name="Windows"), + Component("YARN", primary=True, jira_name="YARN"), + # Status markers — recognized in PR titles, but not JIRA components. + Component("FOLLOWUP", ("FOLLOW-UP", "FOLLOW UP")), + Component("MINOR"), + Component("TRIVIAL"), +) + + +_BRACKET_TAG_RE = re.compile(r"\[\s*([A-Za-z0-9._-]+)\s*\]") +_SPARK_ID_RE = re.compile(r"^SPARK-\d+$", re.IGNORECASE) +_VERSION_TAG_RE = re.compile(r"^\d+\.(\d+|X)$") +_LEADING_TAGS = frozenset({"MINOR", "TRIVIAL"}) + + +class Title: + """Structured PR title: SPARK refs, component tags, and body. + + ``leading`` — SPARK-NNNNN IDs and [MINOR]/[TRIVIAL] markers, in order. + ``components`` — all other bracket tags, in order. + ``text`` — body text following the bracket sequence. + + >>> t = Title.parse("[SPARK-1234][SQL] Fix something") + >>> t.leading, t.components, t.text + (['SPARK-1234'], ['SQL'], 'Fix something') + >>> str(t) + '[SPARK-1234][SQL] Fix something' + >>> t = Title.parse("[SPARK-1234][SQL][FOLLOWUP] Fix something") + >>> t.leading, t.components, t.text + (['SPARK-1234'], ['SQL', 'FOLLOWUP'], 'Fix something') + >>> str(t) + '[SPARK-1234][SQL][FOLLOWUP] Fix something' + """ - # Replace multiple spaces with a single space, e.g. if no jira refs and/or components were - # included - clean_text = re.sub(r"\s+", " ", clean_text.strip()) + def __init__( + self, + leading: List[str], + components: List[str], + text: str, + ) -> None: + self.leading = leading + self.components = components + self.text = text + + @classmethod + def parse(cls, raw: str) -> "Title": + """Parse a PR title string into a :class:`Title`. + + A title must open with a leading tag ([SPARK-NNNNN], [MINOR], or + [TRIVIAL]); otherwise :exc:`ValueError` is raised. Subsequent bracket + tokens (spaces trimmed, separated by optional whitespace) go to + ``components``. The remainder is ``text``. + + >>> t = Title.parse("[SPARK-1234][SQL][TESTS] Fix something") + >>> t.leading, t.components, t.text + (['SPARK-1234'], ['SQL', 'TESTS'], 'Fix something') + >>> t = Title.parse(" [ SPARK-1234 ] [ SQL ] [ TESTS ] Fix something") + >>> t.leading, t.components, t.text + (['SPARK-1234'], ['SQL', 'TESTS'], 'Fix something') + >>> t = Title.parse("[SPARK-1234 ][ sql ][ followup ] Fix") + >>> t.leading, t.components, t.text + (['SPARK-1234'], ['SQL', 'FOLLOWUP'], 'Fix') + >>> str(t) + '[SPARK-1234][SQL][FOLLOWUP] Fix' + >>> Title.parse("[MINOR] Fix typo").leading + ['MINOR'] + >>> t = Title.parse("[spark-1234][sql][followup] Fix") + >>> t.leading, t.components + (['SPARK-1234'], ['SQL', 'FOLLOWUP']) + >>> Title.parse("[SPARK-1234][SPARK-5678][SQL] Fix").leading + ['SPARK-1234', 'SPARK-5678'] + >>> Title.parse("[SPARK-1234][4.X][SQL] Fix").components + ['4.X', 'SQL'] + >>> Title.parse("[SPARK-1234][SQL][4.2] Fix").components + ['SQL', '4.2'] + >>> Title.parse("[SQL] Fix") + Traceback (most recent call last): + ... + ValueError: title must start with [SPARK-NNNNN], [MINOR], or [TRIVIAL]: '[SQL] Fix' + >>> Title.parse("No brackets") + Traceback (most recent call last): + ... + ValueError: title must start with [SPARK-NNNNN], [MINOR], or [TRIVIAL]: 'No brackets' + >>> Title.parse("[SPARK-1234][SQL][SPARK-123] Fix") + Traceback (most recent call last): + ... + ValueError: [SPARK-NNNNN] tags must all appear before other tags: '[SPARK-1234][SQL][SPARK-123] Fix' + >>> Title.parse("[SPARK-1234][MINOR][SQL] Fix") + Traceback (most recent call last): + ... + ValueError: [SPARK-NNNNN], [MINOR], and [TRIVIAL] cannot coexist + >>> Title.parse("[MINOR][TRIVIAL][SQL] Fix") + Traceback (most recent call last): + ... + ValueError: [SPARK-NNNNN], [MINOR], and [TRIVIAL] cannot coexist + """ + leading: List[str] = [] + components: List[str] = [] + + raw = raw.strip() + m0 = _BRACKET_TAG_RE.match(raw) + first = m0.group(1).upper() if m0 else "" + if not (_SPARK_ID_RE.match(first) or first in _LEADING_TAGS): + raise ValueError( + "title must start with [SPARK-NNNNN], [MINOR], or [TRIVIAL]: %r" % raw + ) - return clean_text + past_leading = False + pos = 0 + while pos < len(raw): + m = _BRACKET_TAG_RE.match(raw, pos) + if not m: + break + tag = m.group(1).upper() + if _SPARK_ID_RE.match(tag): + if past_leading: + raise ValueError( + "[SPARK-NNNNN] tags must all appear before other tags: %r" % raw + ) + leading.append(tag) + elif tag in _LEADING_TAGS: + leading.append(tag) + else: + components.append(tag) + past_leading = True + pos = m.end() + while pos < len(raw) and raw[pos] == " ": + pos += 1 + + text = raw[pos:].lstrip() + markers = [t for t in leading if t in _LEADING_TAGS] + if len(markers) > 1 or (markers and len(leading) > len(markers)): + raise ValueError( + "[SPARK-NNNNN], [MINOR], and [TRIVIAL] cannot coexist" + ) + return cls(leading, components, text) + + def __str__(self) -> str: + parts = "".join("[%s]" % t for t in self.leading) + parts += "".join("[%s]" % c for c in self.components) + if not self.text: + return parts + return parts + (" " if parts else "") + self.text + + +def prompt_for_components(): + """ + Prompt the committer for component(s) when the PR title lacks a primary + component. Each entered token is normalized via Component.find + (e.g. "DOCS" -> "DOC", "PYSPARK" -> "PYTHON"). Unrecognized tokens are + passed through as-is. Re-prompts until at least one entered token resolves + to a tag in PRIMARY_COMPONENTS. Returns an uppercase list of tags in + insertion order. + """ + print("PR title is missing a primary [COMPONENT] tag.") + print("Primary components (one of these is required):") + primary = [c for c in COMPONENTS if c.primary] + width = max(len(c.tag) for c in primary) + for c in primary: + print(" [%s]%s - %s" % (c.tag, " " * (width - len(c.tag)), c.jira_name)) + while True: + raw = bold_input( + "Enter comma-separated component(s) to insert into the title (e.g. CORE,SQL): " + ) + components = [] + has_primary = False + for token in raw.split(","): + t = token.strip().upper() + if t: + c = Component.find(t) + if c is not None and c.primary: + has_primary = True + components.append(c.tag if c else t) + if not components: + print_error("Component(s) cannot be empty. Please enter at least one.") + continue + if not has_primary: + print_error( + "At least one component must be a primary tag (see list above). " + "Got: %s" % ", ".join(components) + ) + continue + return components def get_current_ref(): @@ -843,28 +1042,65 @@ def main(): pr_events = get_json("%s/issues/%s/events" % (GITHUB_API_BASE, pr_num)) url = pr["url"] + title = pr["title"] - # Warn if the PR is WIP - if "[WIP]" in pr["title"]: - msg = "The PR title has `[WIP]`:\n%s\nContinue?" % pr["title"] - continue_maybe(msg) + # Fail hard on WIP or DO-NOT-MERGE to prevent accidental merges. + if "[WIP]" in title or "[DO-NOT-MERGE]" in title: + fail("Cannot merge a PR with [WIP] or [DO-NOT-MERGE] in the title:\n%s" % title) - # Decide whether to use the modified title or not - modified_title = standardize_jira_ref(pr["title"]).rstrip(".") - if modified_title != pr["title"]: - print("I've re-written the title as follows to match the standard format:") - print("Original: %s" % pr["title"]) - print("Modified: %s" % modified_title) - result = bold_input("Would you like to use the modified title? (y/N): ") - if result.lower() == "y": - title = modified_title - print("Using modified title:") - else: - title = pr["title"] - print("Using original title:") - print(title) - else: - title = pr["title"] + # e.g. 'Revert "[SPARK-56357][BUILD] Upgrade sbt to 1.12.8"' + is_revert_pr = title.startswith('Revert "') and title.endswith('"') + # e.g. 'Reapply "[SPARK-56357][BUILD] Upgrade sbt to 1.12.8"' + is_reapply_pr = title.startswith('Reapply "') and title.endswith('"') + + # Revert and Reapply PRs keep their title verbatim. + if not (is_revert_pr or is_reapply_pr): + # Parse; fail on a malformed title. + try: + parsed = Title.parse(title) + except ValueError as e: + fail("Malformed PR title: %s" % e) + + # Normalize component tags via the registry and track primary. + components = [] + has_primary = False + for tag in parsed.components: + c = Component.find(tag) + if c is not None and c.primary: + has_primary = True + components.append(c.tag if c is not None else tag) + if not has_primary: + new_tags = prompt_for_components() + components = new_tags + components + + # Deduplicate tags in insertion order. + components = list(dict.fromkeys(components)) + + # Move version tags (e.g. [4.X], [4.2]) to the head of components. + versions = [t for t in components if _VERSION_TAG_RE.match(t)] + if versions: + others = [t for t in components if not _VERSION_TAG_RE.match(t)] + components = versions + others + + # Move FOLLOWUP to the last tag. + non_followup = [t for t in components if t != "FOLLOWUP"] + if len(non_followup) < len(components): + components = non_followup + ["FOLLOWUP"] + + # Warn about tags that are neither known components nor version tags. + unknown = [ + t for t in components + if Component.find(t) is None and not _VERSION_TAG_RE.match(t) + ] + if unknown: + print_error( + "Title has unknown tag(s): %s" % ", ".join("[%s]" % t for t in unknown) + ) + + parsed.components = components + title = str(parsed) + if title != pr["title"]: + print("Normalized title: %s" % title) body = pr["body"] if body is None: