From 5dd70b1797867cadfa8bccc4d4ecf770baa32863 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 00:26:02 +0000 Subject: [PATCH 01/11] Fix registry incremental build processing all providers The `--provider` flag was only passed to `extract_metadata.py` but not to `extract_parameters.py` or `extract_connections.py`. This caused incremental builds to scan all 99 providers and 1625 modules instead of just the requested one. --- dev/breeze/src/airflow_breeze/commands/registry_commands.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/breeze/src/airflow_breeze/commands/registry_commands.py b/dev/breeze/src/airflow_breeze/commands/registry_commands.py index 8a7457f124866..f818a16ea01a9 100644 --- a/dev/breeze/src/airflow_breeze/commands/registry_commands.py +++ b/dev/breeze/src/airflow_breeze/commands/registry_commands.py @@ -91,8 +91,8 @@ def extract_data(python: str, provider: str | None): command = ( f"{install_cmd}" f"python dev/registry/extract_metadata.py{provider_flag} && " - "python dev/registry/extract_parameters.py && " - "python dev/registry/extract_connections.py" + f"python dev/registry/extract_parameters.py{provider_flag} && " + f"python dev/registry/extract_connections.py{provider_flag}" ) with ci_group("Extracting registry data"): From b8bc2dd06e04aa2becd5d1fc04a6d050f836efce Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 01:07:40 +0000 Subject: [PATCH 02/11] Fix registry workflow failures due to workspace dependency resolution --- registry/pnpm-workspace.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/registry/pnpm-workspace.yaml b/registry/pnpm-workspace.yaml index d60faacb95e15..d142f35ee783d 100644 --- a/registry/pnpm-workspace.yaml +++ b/registry/pnpm-workspace.yaml @@ -16,6 +16,8 @@ # under the License. --- +packages: + - '.' overrides: liquidjs@<10.25.0: '>=10.25.0' markdown-it@>=13.0.0 <14.1.1: '>=14.1.1' From 305e9a9a977354a7aef785026a9c30c6c75146c3 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 01:41:08 +0000 Subject: [PATCH 03/11] Use ci-image-build workflow for registry CI image caching The registry workflow was building the CI image from scratch every run (~24 min) because it lacked the BuildKit mount cache that ci-image-build.yml provides. Inline `breeze ci-image build` with registry cache doesn't help because Docker layer cache invalidates on every commit when the build context changes. Split into two jobs following the established pattern used by ci-amd-arm.yml and update-constraints-on-push.yml: - `build-ci-image`: calls ci-image-build.yml which handles mount cache restore, ghcr.io login, registry cache, and image stashing - `build-and-publish-registry`: restores the stashed image via prepare_breeze_and_image action, then runs the rest unchanged --- .github/workflows/registry-build.yml | 90 +++++++++++++++------------- 1 file changed, 50 insertions(+), 40 deletions(-) diff --git a/.github/workflows/registry-build.yml b/.github/workflows/registry-build.yml index 15e844faab816..182510f2545ed 100644 --- a/.github/workflows/registry-build.yml +++ b/.github/workflows/registry-build.yml @@ -53,23 +53,15 @@ on: # yamllint disable-line rule:truthy permissions: contents: read + packages: read jobs: - build-and-publish-registry: - timeout-minutes: 30 - name: "Build & Publish Registry" - runs-on: ubuntu-latest - env: - EXISTING_REGISTRY_DIR: /tmp/existing-registry - REGISTRY_DATA_DIR: dev/registry - REGISTRY_PROVIDERS_JSON: providers.json - REGISTRY_MODULES_JSON: modules.json - REGISTRY_SITE_DATA_DIR: registry/src/_data - REGISTRY_SITE_VERSIONS_DIR: registry/src/_data/versions - REGISTRY_SITE_LOGOS_DIR: registry/public/logos - REGISTRY_CACHE_CONTROL: public, max-age=300 + build-ci-image: + name: "Build CI image" + uses: ./.github/workflows/ci-image-build.yml permissions: contents: read + packages: write if: > github.event_name == 'workflow_call' || contains(fromJSON('[ @@ -86,35 +78,49 @@ jobs: "utkarsharma2", "vincbeck" ]'), github.event.sender.login) + with: + runners: '["ubuntu-22.04"]' + platform: "linux/amd64" + push-image: "false" + upload-image-artifact: "true" + upload-mount-cache-artifact: "false" + python-versions: '["3.12"]' + branch: "main" + constraints-branch: "constraints-main" + use-uv: "true" + upgrade-to-newer-dependencies: "false" + docker-cache: "registry" + disable-airflow-repo-cache: "false" + + build-and-publish-registry: + timeout-minutes: 30 + name: "Build & Publish Registry" + needs: [build-ci-image] + runs-on: ubuntu-latest + env: + EXISTING_REGISTRY_DIR: /tmp/existing-registry + REGISTRY_DATA_DIR: dev/registry + REGISTRY_PROVIDERS_JSON: providers.json + REGISTRY_MODULES_JSON: modules.json + REGISTRY_SITE_DATA_DIR: registry/src/_data + REGISTRY_SITE_VERSIONS_DIR: registry/src/_data/versions + REGISTRY_SITE_LOGOS_DIR: registry/public/logos + REGISTRY_CACHE_CONTROL: public, max-age=300 + permissions: + contents: read steps: - name: "Checkout repository" uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - # --- Breeze setup --- - # All three extraction scripts run inside breeze so that - # extract_parameters.py and extract_connections.py can import provider - # classes at runtime. extract_metadata.py also runs in breeze for - # consistency — it writes to dev/registry/ (mounted) so the other two - # scripts can read providers.json / modules.json from there. - - name: "Install Breeze" - uses: ./.github/actions/breeze + - name: "Prepare breeze & CI image" + uses: ./.github/actions/prepare_breeze_and_image with: - python-version: "3.12" - - - name: "Build CI image" - # Fallback to raw docker buildx when breeze cache is stale — same - # pattern as publish-docs-to-s3.yml. - run: > - breeze ci-image build --python 3.12 || - docker buildx build --load --builder default --progress=auto --pull - --build-arg AIRFLOW_EXTRAS=devel-ci --build-arg AIRFLOW_PRE_CACHED_PIP_PACKAGES=false - --build-arg AIRFLOW_USE_UV=true --build-arg BUILD_PROGRESS=auto - --build-arg INSTALL_MYSQL_CLIENT_TYPE=mariadb - --build-arg VERSION_SUFFIX_FOR_PYPI=dev0 - -t ghcr.io/apache/airflow/main/ci/python3.12:latest --target main . - -f Dockerfile.ci --platform linux/amd64 + python: "3.12" + platform: "linux/amd64" + use-uv: "true" + make-mnt-writeable-and-cleanup: "false" - name: "Install AWS CLI v2" run: | @@ -195,13 +201,17 @@ jobs: cp \ "${REGISTRY_DATA_DIR}/${REGISTRY_MODULES_JSON}" \ "${REGISTRY_SITE_DATA_DIR}/${REGISTRY_MODULES_JSON}" - if [ -d "${REGISTRY_DATA_DIR}/output/versions" ]; then - cp -r "${REGISTRY_DATA_DIR}/output/versions/"* "${REGISTRY_SITE_VERSIONS_DIR}/" + VERSIONS_SRC="${REGISTRY_DATA_DIR}/output/versions" + if [ -d "${VERSIONS_SRC}" ] && ls "${VERSIONS_SRC}/"* &>/dev/null; then + cp -r "${VERSIONS_SRC}/"* "${REGISTRY_SITE_VERSIONS_DIR}/" fi - # Copy provider logos extracted from providers/*/docs/integration-logos/ - if [ -d "${REGISTRY_DATA_DIR}/logos" ]; then + # Copy provider logos extracted by extract_metadata.py. + # The directory may exist but be empty (incremental build for + # a provider without logos), so check for files before copying. + LOGOS_SRC="${REGISTRY_DATA_DIR}/logos" + if [ -d "${LOGOS_SRC}" ] && ls "${LOGOS_SRC}/"* &>/dev/null; then mkdir -p "${REGISTRY_SITE_LOGOS_DIR}" - cp -r "${REGISTRY_DATA_DIR}/logos/"* "${REGISTRY_SITE_LOGOS_DIR}/" + cp -r "${LOGOS_SRC}/"* "${REGISTRY_SITE_LOGOS_DIR}/" fi - name: "Setup pnpm" From 339c458a4dde93e5eb4743ae44b08100a2862e3b Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 01:43:48 +0000 Subject: [PATCH 04/11] Fix merge crash when incremental extract skips modules.json extract_parameters.py with --provider intentionally skips writing modules.json (only the targeted provider's parameters are extracted). The merge script assumed modules.json always exists, causing a FileNotFoundError during incremental builds. Handle missing new_modules_path the same way missing existing_modules_path is already handled: treat it as an empty list. --- dev/registry/merge_registry_data.py | 4 ++- .../tests/test_merge_registry_data.py | 35 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/dev/registry/merge_registry_data.py b/dev/registry/merge_registry_data.py index bf8696a8b37ac..2bdbe50da12b4 100644 --- a/dev/registry/merge_registry_data.py +++ b/dev/registry/merge_registry_data.py @@ -56,7 +56,9 @@ def merge( existing_modules: list[dict] = [] if existing_modules_path.exists(): existing_modules = json.loads(existing_modules_path.read_text())["modules"] - new_modules = json.loads(new_modules_path.read_text())["modules"] + new_modules: list[dict] = [] + if new_modules_path.exists(): + new_modules = json.loads(new_modules_path.read_text())["modules"] # IDs being replaced new_ids = {p["id"] for p in new_providers} diff --git a/dev/registry/tests/test_merge_registry_data.py b/dev/registry/tests/test_merge_registry_data.py index 537245569aad1..e3f78fd4b792e 100644 --- a/dev/registry/tests/test_merge_registry_data.py +++ b/dev/registry/tests/test_merge_registry_data.py @@ -241,6 +241,41 @@ def test_missing_existing_providers_file(self, tmp_path, output_dir): assert len(result_providers) == 1 assert result_providers[0]["id"] == "amazon" + def test_missing_new_modules_file(self, tmp_path, output_dir): + """Incremental extract with --provider skips modules.json; merge should keep existing modules.""" + existing_providers = _write_json( + tmp_path / "existing_providers.json", + { + "providers": [ + _provider("amazon", "Amazon", "2024-01-01"), + _provider("google", "Google", "2024-02-01"), + ] + }, + ) + existing_modules = _write_json( + tmp_path / "existing_modules.json", + { + "modules": [ + _module("amazon-s3-op", "amazon"), + _module("google-bq-op", "google"), + ] + }, + ) + new_providers = _write_json( + tmp_path / "new_providers.json", + {"providers": [_provider("amazon", "Amazon", "2025-01-01")]}, + ) + # new_modules file does not exist (--provider mode skips modules.json) + new_modules = tmp_path / "nonexistent_modules.json" + + merge(existing_providers, existing_modules, new_providers, new_modules, output_dir) + + result_modules = json.loads((output_dir / "modules.json").read_text())["modules"] + # Existing modules for non-updated providers are kept + assert any(m["id"] == "google-bq-op" for m in result_modules) + # Existing modules for the updated provider are removed (no new ones to replace them) + assert not any(m["provider_id"] == "amazon" for m in result_modules) + def test_output_directory_created_if_missing(self, tmp_path): output_dir = tmp_path / "does" / "not" / "exist" existing_providers = _write_json( From f78dc02d9e3dac4f92b4cd1b5385990026c8e5f1 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 02:20:24 +0000 Subject: [PATCH 05/11] Fix /mnt not writable when loading stashed CI image The prepare_breeze_and_image action loads the CI image from /mnt, which requires make_mnt_writeable.sh to run first. Each job gets a fresh runner, so the writeable /mnt from the build job doesn't carry over. --- .github/workflows/registry-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/registry-build.yml b/.github/workflows/registry-build.yml index 182510f2545ed..94b2a4bb9d488 100644 --- a/.github/workflows/registry-build.yml +++ b/.github/workflows/registry-build.yml @@ -120,7 +120,7 @@ jobs: python: "3.12" platform: "linux/amd64" use-uv: "true" - make-mnt-writeable-and-cleanup: "false" + make-mnt-writeable-and-cleanup: "true" - name: "Install AWS CLI v2" run: | From 374d89d7428a1fe9e5f22ef61285d797854599c2 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 12:38:11 +0000 Subject: [PATCH 06/11] Regenerate pnpm lockfile for workspace mode Adding `packages: ['.']` to pnpm-workspace.yaml changed how pnpm processes overrides, causing ERR_PNPM_LOCKFILE_CONFIG_MISMATCH with --frozen-lockfile. Regenerate the lockfile with pnpm 9 to match. --- registry/pnpm-lock.yaml | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/registry/pnpm-lock.yaml b/registry/pnpm-lock.yaml index 59787d2b24416..e253dafd90646 100644 --- a/registry/pnpm-lock.yaml +++ b/registry/pnpm-lock.yaml @@ -4,12 +4,6 @@ settings: autoInstallPeers: true excludeLinksFromLockfile: false -overrides: - liquidjs@<10.25.0: '>=10.25.0' - markdown-it@>=13.0.0 <14.1.1: '>=14.1.1' - minimatch@<3.1.3: '>=3.1.3' - minimatch@<3.1.4: '>=3.1.4' - importers: .: @@ -148,9 +142,8 @@ packages: asap@2.0.6: resolution: {integrity: sha512-BSHWgDSAiKs50o2Re8ppvp3seVHXSRM44cdSsT9FfNEUUZLOGWVCsiWaRPWM1Znn+mqZ1OfVZ3z3DWEzSp7hRA==} - balanced-match@4.0.4: - resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==} - engines: {node: 18 || 20 || >=22} + balanced-match@1.0.2: + resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==} bcp-47-match@2.0.3: resolution: {integrity: sha512-JtTezzbAibu8G0R9op9zb3vcWZd9JF6M0xOYGPn0fNCd7wOpRB1mU2mH9T8gaBGbAAyIIVgB2G7xG0GP98zMAQ==} @@ -165,9 +158,8 @@ packages: resolution: {integrity: sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==} engines: {node: '>=8'} - brace-expansion@5.0.4: - resolution: {integrity: sha512-h+DEnpVvxmfVefa4jFbCf5HdH5YMDXRsmKflpf1pILZWRFlTbJpxeU55nJl4Smt5HQaGzg1o6RHFPJaOqnmBDg==} - engines: {node: 18 || 20 || >=22} + brace-expansion@1.1.12: + resolution: {integrity: sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==} braces@3.0.3: resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==} @@ -198,6 +190,9 @@ packages: resolution: {integrity: sha512-QrWXB+ZQSVPmIWIhtEO9H+gwHaMGYiF5ChvoJ+K9ZGHG/sVsa6yiesAD1GC/x46sET00Xlwo1u49RVVVzvcSkw==} engines: {node: '>= 10'} + concat-map@0.0.1: + resolution: {integrity: sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==} + debug@2.6.9: resolution: {integrity: sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==} peerDependencies: @@ -447,9 +442,8 @@ packages: engines: {node: '>=10.0.0'} hasBin: true - minimatch@10.2.4: - resolution: {integrity: sha512-oRjTw/97aTBN0RHbYCdtF1MQfvusSIBQM0IZEgzl6426+8jSC0nF1a/GmnVLpfB9yyr6g6FTqWqiZVbxrtaCIg==} - engines: {node: 18 || 20 || >=22} + minimatch@3.1.5: + resolution: {integrity: sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==} minimist@1.2.8: resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} @@ -798,7 +792,7 @@ snapshots: asap@2.0.6: {} - balanced-match@4.0.4: {} + balanced-match@1.0.2: {} bcp-47-match@2.0.3: {} @@ -815,9 +809,10 @@ snapshots: binary-extensions@2.3.0: {} - brace-expansion@5.0.4: + brace-expansion@1.1.12: dependencies: - balanced-match: 4.0.4 + balanced-match: 1.0.2 + concat-map: 0.0.1 braces@3.0.3: dependencies: @@ -852,6 +847,8 @@ snapshots: commander@7.2.0: {} + concat-map@0.0.1: {} + debug@2.6.9: dependencies: ms: 2.0.0 @@ -954,7 +951,7 @@ snapshots: fs.realpath: 1.0.0 inflight: 1.0.6 inherits: 2.0.4 - minimatch: 10.2.4 + minimatch: 3.1.5 once: 1.4.0 path-is-absolute: 1.0.1 @@ -1057,7 +1054,7 @@ snapshots: array-differ: 1.0.0 array-union: 1.0.2 arrify: 1.0.1 - minimatch: 10.2.4 + minimatch: 3.1.5 mdurl@2.0.0: {} @@ -1069,9 +1066,9 @@ snapshots: mime@3.0.0: {} - minimatch@10.2.4: + minimatch@3.1.5: dependencies: - brace-expansion: 5.0.4 + brace-expansion: 1.1.12 minimist@1.2.8: {} From 014a3175ee32c611d87c3a48b22946e16b02de24 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 13:11:47 +0000 Subject: [PATCH 07/11] Scope prebuild uv resolution to dev/registry project MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prebuild script ran `uv run` without --project, causing uv to resolve the full workspace including samba → krb5 which needs libkrb5-dev (not installed on the CI runner). --- registry/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/registry/package.json b/registry/package.json index af27cb96fd6af..330fad9498f6c 100644 --- a/registry/package.json +++ b/registry/package.json @@ -4,7 +4,7 @@ "description": "Apache Airflow Provider Registry", "scripts": { "dev": "REGISTRY_PATH_PREFIX=/ pnpm build && REGISTRY_PATH_PREFIX=/ eleventy --serve --port=8080", - "prebuild": "uv run python ../dev/registry/export_registry_schemas.py", + "prebuild": "uv run --project ../dev/registry python ../dev/registry/export_registry_schemas.py", "build": "rm -rf _site && eleventy", "postbuild": "cleancss -o _site/css/main.css _site/css/main.css && node scripts/build-pagefind-index.mjs" }, From 35b9324b885bd3082521dcc9a2b80479bd2efb6f Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 17:42:06 +0000 Subject: [PATCH 08/11] Fix incremental builds overwriting provider connection/parameter data on S3 Eleventy pagination templates emit empty fallback JSON for every provider, even when only one provider's data was extracted. A plain `aws s3 sync` uploads those stubs and overwrites real connection/parameter data. Changes: - Exclude per-provider connections.json and parameters.json from the main S3 sync during incremental builds, then selectively upload only the target provider's API files - Filter connections early in extract_connections.py (before the loop) and support space-separated multi-provider IDs - Suppress SCARF_ANALYTICS and DO_NOT_TRACK telemetry in CI - Document the Eleventy pagination limitation in README and AGENTS.md --- .github/workflows/registry-build.yml | 32 +++++++++++++++++++++++++++- dev/registry/extract_connections.py | 18 +++++++++------- registry/AGENTS.md | 5 ++++- registry/README.md | 20 ++++++++++++++--- 4 files changed, 62 insertions(+), 13 deletions(-) diff --git a/.github/workflows/registry-build.yml b/.github/workflows/registry-build.yml index 94b2a4bb9d488..9fc61a9d70c65 100644 --- a/.github/workflows/registry-build.yml +++ b/.github/workflows/registry-build.yml @@ -98,6 +98,8 @@ jobs: needs: [build-ci-image] runs-on: ubuntu-latest env: + SCARF_ANALYTICS: "false" + DO_NOT_TRACK: "1" EXISTING_REGISTRY_DIR: /tmp/existing-registry REGISTRY_DATA_DIR: dev/registry REGISTRY_PROVIDERS_JSON: providers.json @@ -247,10 +249,38 @@ jobs: - name: "Sync registry to S3" env: S3_BUCKET: ${{ steps.destination.outputs.bucket }} + PROVIDER: ${{ inputs.provider }} run: | + # Incremental builds only extract connections/parameters for the + # target provider(s). The Eleventy site build still emits empty + # stub JSON for every other provider. Uploading those stubs would + # overwrite real data on S3, so we exclude per-provider API JSON + # from the main sync and selectively upload only the target + # provider's files afterward. + EXCLUDE_PROVIDER_API=() + if [[ -n "${PROVIDER}" ]]; then + EXCLUDE_PROVIDER_API=( + --exclude "api/providers/*/connections.json" + --exclude "api/providers/*/parameters.json" + --exclude "api/providers/*/*/connections.json" + --exclude "api/providers/*/*/parameters.json" + ) + fi + aws s3 sync registry/_site/ "${S3_BUCKET}" \ --cache-control "${REGISTRY_CACHE_CONTROL}" \ - --exclude "pagefind/*" + --exclude "pagefind/*" \ + "${EXCLUDE_PROVIDER_API[@]}" + + # For incremental builds, sync only the updated provider's API files. + if [[ -n "${PROVIDER}" ]]; then + for pid in ${PROVIDER}; do + aws s3 sync "registry/_site/api/providers/${pid}/" \ + "${S3_BUCKET}api/providers/${pid}/" \ + --cache-control "${REGISTRY_CACHE_CONTROL}" + done + fi + # Pagefind generates content-hashed filenames (e.g. en_181da6f.pf_index). # Each rebuild produces new hashes, so --delete is needed to remove stale # index files. This is separate from the main sync which intentionally diff --git a/dev/registry/extract_connections.py b/dev/registry/extract_connections.py index ce81cc43a2773..ebb269193c37b 100644 --- a/dev/registry/extract_connections.py +++ b/dev/registry/extract_connections.py @@ -162,7 +162,7 @@ def main(): parser.add_argument( "--provider", default=None, - help="Only output connections for this provider ID (e.g. 'amazon').", + help="Only output connections for these provider ID(s) (space-separated, e.g. 'amazon common-io').", ) parser.add_argument( "--providers-json", @@ -212,12 +212,21 @@ def main(): total_with_custom = 0 total_with_ui = 0 + # Parse space-separated provider filter (matches extract_metadata.py behaviour) + provider_filter: set[str] | None = None + if args.provider: + provider_filter = {pid.strip() for pid in args.provider.split() if pid.strip()} + print(f"Filtering to provider(s): {', '.join(sorted(provider_filter))}") + for conn_type, hook_info in sorted(hooks.items()): if hook_info is None or not hook_info.package_name: continue provider_id = package_name_to_provider_id(hook_info.package_name) + if provider_filter and provider_id not in provider_filter: + continue + standard_fields = build_standard_fields(field_behaviours.get(conn_type)) custom_fields = build_custom_fields(form_widgets, conn_type) @@ -244,13 +253,6 @@ def main(): print(f" {total_with_custom} have custom fields") print(f" {total_with_ui} have UI field customisation") - # Filter to single provider if requested - if args.provider: - provider_connections = { - pid: conns for pid, conns in provider_connections.items() if pid == args.provider - } - print(f"Filtering output to provider: {args.provider}") - # Write per-provider files to versions/{pid}/{version}/connections.json for output_dir in OUTPUT_DIRS: if not output_dir.parent.exists(): diff --git a/registry/AGENTS.md b/registry/AGENTS.md index fc7ce31fb0fb6..b6416ec024496 100644 --- a/registry/AGENTS.md +++ b/registry/AGENTS.md @@ -404,7 +404,10 @@ The registry is built in the `apache/airflow` repo and served at `airflow.apache Supports two modes: - **Full build** (no `provider` input): extracts all ~99 providers (~12 min) - **Incremental build** (`provider=amazon`): extracts one provider (~30s), merges - with existing data from S3 via `merge_registry_data.py`, then builds the full site + with existing data from S3 via `merge_registry_data.py`, then builds the full site. + The S3 sync step excludes per-provider `connections.json` and `parameters.json` + for non-target providers to avoid overwriting real data with Eleventy's empty + fallback stubs (Eleventy 3.x `permalink: false` does not work with pagination). 2. **S3 buckets**: `{live|staging}-docs-airflow-apache-org/registry/` (same bucket as docs, different prefix) 3. **Serving**: Apache HTTPD at `airflow.apache.org` rewrites `/registry/*` to CloudFront, which serves from S3 4. **Auto-trigger**: When `publish-docs-to-s3.yml` publishes provider docs, its diff --git a/registry/README.md b/registry/README.md index 2d0d354c59fa1..6b4df5b78eb0a 100644 --- a/registry/README.md +++ b/registry/README.md @@ -327,10 +327,16 @@ it triggers `registry-build.yml` with the provider ID. The incremental flow: metadata and PyPI stats; `extract_parameters.py` discovers modules for only the specified provider. 3. **Merge** — `merge_registry_data.py` replaces the updated provider's entries in - the downloaded JSON while keeping all other providers intact. + the downloaded JSON while keeping all other providers intact. Only global files + (`providers.json`, `modules.json`) are merged — per-version files like + `connections.json` and `parameters.json` are not downloaded from S3. 4. **Build site** — Eleventy builds all pages from the merged data; Pagefind indexes - all records. -5. **S3 sync** — only changed pages are uploaded (S3 sync diffs). + all records. Because per-version data only exists for the target provider, Eleventy + emits empty fallback JSON for other providers' `connections.json` and + `parameters.json` API endpoints (see **Known limitation** below). +5. **S3 sync (selective)** — the main sync excludes per-provider `connections.json` + and `parameters.json` to avoid overwriting real data with empty stubs. A second + sync uploads only the target provider's API files. 6. **Publish versions** — `publish_versions.py` updates `api/providers/{id}/versions.json`. The merge script (`dev/registry/merge_registry_data.py`) handles edge cases: @@ -338,6 +344,14 @@ The merge script (`dev/registry/merge_registry_data.py`) handles edge cases: - First deploy (no existing data on S3): uses the single-provider output as-is. - Missing modules file: treated as empty. +**Known limitation**: Eleventy's pagination templates generate API files for every +provider in `providers.json`, even when per-version data (connections, parameters) only +exists for the target provider. The templates emit empty fallback JSON +(`{"connection_types":[]}`) for providers without data. The S3 sync step works around +this with `--exclude` patterns during incremental builds. A proper template-level fix +(skipping file generation) is tracked as a follow-up — `permalink: false` does not work +with Eleventy 3.x pagination templates. + To run an incremental build locally: ```bash From 5f9b85600f6a8874937b7f444f598ecc689f91ab Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 18:27:20 +0000 Subject: [PATCH 09/11] Exclude all per-provider API files during incremental S3 sync The previous exclude only covered connections.json and parameters.json, but modules.json and versions.json for non-target providers also contain incomplete data (no version info extracted) and would overwrite correct data on S3. Simplify to exclude the entire api/providers/* subtree and selectively upload only the target provider's directory. --- .github/workflows/registry-build.yml | 21 ++++++++------------- registry/AGENTS.md | 6 +++--- registry/README.md | 4 ++-- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/.github/workflows/registry-build.yml b/.github/workflows/registry-build.yml index 9fc61a9d70c65..0a5d7bc954fc3 100644 --- a/.github/workflows/registry-build.yml +++ b/.github/workflows/registry-build.yml @@ -251,20 +251,15 @@ jobs: S3_BUCKET: ${{ steps.destination.outputs.bucket }} PROVIDER: ${{ inputs.provider }} run: | - # Incremental builds only extract connections/parameters for the - # target provider(s). The Eleventy site build still emits empty - # stub JSON for every other provider. Uploading those stubs would - # overwrite real data on S3, so we exclude per-provider API JSON - # from the main sync and selectively upload only the target - # provider's files afterward. + # Incremental builds only extract data for the target provider(s). + # Eleventy still generates API files for every provider, but + # non-target files contain incomplete/empty data (no version info, + # empty connections, etc.). Exclude the entire per-provider API + # subtree from the main sync, then selectively upload only the + # target provider's files. EXCLUDE_PROVIDER_API=() if [[ -n "${PROVIDER}" ]]; then - EXCLUDE_PROVIDER_API=( - --exclude "api/providers/*/connections.json" - --exclude "api/providers/*/parameters.json" - --exclude "api/providers/*/*/connections.json" - --exclude "api/providers/*/*/parameters.json" - ) + EXCLUDE_PROVIDER_API=(--exclude "api/providers/*") fi aws s3 sync registry/_site/ "${S3_BUCKET}" \ @@ -272,7 +267,7 @@ jobs: --exclude "pagefind/*" \ "${EXCLUDE_PROVIDER_API[@]}" - # For incremental builds, sync only the updated provider's API files. + # For incremental builds, sync only the target provider's API files. if [[ -n "${PROVIDER}" ]]; then for pid in ${PROVIDER}; do aws s3 sync "registry/_site/api/providers/${pid}/" \ diff --git a/registry/AGENTS.md b/registry/AGENTS.md index b6416ec024496..99416cb89311c 100644 --- a/registry/AGENTS.md +++ b/registry/AGENTS.md @@ -405,9 +405,9 @@ The registry is built in the `apache/airflow` repo and served at `airflow.apache - **Full build** (no `provider` input): extracts all ~99 providers (~12 min) - **Incremental build** (`provider=amazon`): extracts one provider (~30s), merges with existing data from S3 via `merge_registry_data.py`, then builds the full site. - The S3 sync step excludes per-provider `connections.json` and `parameters.json` - for non-target providers to avoid overwriting real data with Eleventy's empty - fallback stubs (Eleventy 3.x `permalink: false` does not work with pagination). + The S3 sync step excludes the entire `api/providers/` subtree for non-target + providers to avoid overwriting real data with Eleventy's incomplete/empty + stubs (Eleventy 3.x `permalink: false` does not work with pagination). 2. **S3 buckets**: `{live|staging}-docs-airflow-apache-org/registry/` (same bucket as docs, different prefix) 3. **Serving**: Apache HTTPD at `airflow.apache.org` rewrites `/registry/*` to CloudFront, which serves from S3 4. **Auto-trigger**: When `publish-docs-to-s3.yml` publishes provider docs, its diff --git a/registry/README.md b/registry/README.md index 6b4df5b78eb0a..4459c65da00e6 100644 --- a/registry/README.md +++ b/registry/README.md @@ -334,8 +334,8 @@ it triggers `registry-build.yml` with the provider ID. The incremental flow: all records. Because per-version data only exists for the target provider, Eleventy emits empty fallback JSON for other providers' `connections.json` and `parameters.json` API endpoints (see **Known limitation** below). -5. **S3 sync (selective)** — the main sync excludes per-provider `connections.json` - and `parameters.json` to avoid overwriting real data with empty stubs. A second +5. **S3 sync (selective)** — the main sync excludes the entire `api/providers/` + subtree to avoid overwriting real data with incomplete/empty stubs. A second sync uploads only the target provider's API files. 6. **Publish versions** — `publish_versions.py` updates `api/providers/{id}/versions.json`. From a867767322f76134ff7abc120f257527d958077e Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 18:34:59 +0000 Subject: [PATCH 10/11] Also exclude provider HTML pages during incremental S3 sync Non-target provider pages are rebuilt without connection/parameter data (the version-specific extraction files don't exist locally). Without this exclude, the incremental build overwrites complete HTML pages on S3 with versions missing the connection builder section. --- .github/workflows/registry-build.yml | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/.github/workflows/registry-build.yml b/.github/workflows/registry-build.yml index 0a5d7bc954fc3..fcfeb65ab3578 100644 --- a/.github/workflows/registry-build.yml +++ b/.github/workflows/registry-build.yml @@ -252,27 +252,32 @@ jobs: PROVIDER: ${{ inputs.provider }} run: | # Incremental builds only extract data for the target provider(s). - # Eleventy still generates API files for every provider, but - # non-target files contain incomplete/empty data (no version info, - # empty connections, etc.). Exclude the entire per-provider API - # subtree from the main sync, then selectively upload only the - # target provider's files. - EXCLUDE_PROVIDER_API=() + # Eleventy rebuilds all pages, but non-target providers have + # incomplete data (no connections/parameters, wrong version info). + # Exclude both per-provider API JSON and HTML pages from the main + # sync, then selectively upload only the target provider's files. + EXCLUDE_PROVIDERS=() if [[ -n "${PROVIDER}" ]]; then - EXCLUDE_PROVIDER_API=(--exclude "api/providers/*") + EXCLUDE_PROVIDERS=( + --exclude "api/providers/*" + --exclude "providers/*" + ) fi aws s3 sync registry/_site/ "${S3_BUCKET}" \ --cache-control "${REGISTRY_CACHE_CONTROL}" \ --exclude "pagefind/*" \ - "${EXCLUDE_PROVIDER_API[@]}" + "${EXCLUDE_PROVIDERS[@]}" - # For incremental builds, sync only the target provider's API files. + # For incremental builds, sync only the target provider's files. if [[ -n "${PROVIDER}" ]]; then for pid in ${PROVIDER}; do aws s3 sync "registry/_site/api/providers/${pid}/" \ "${S3_BUCKET}api/providers/${pid}/" \ --cache-control "${REGISTRY_CACHE_CONTROL}" + aws s3 sync "registry/_site/providers/${pid}/" \ + "${S3_BUCKET}providers/${pid}/" \ + --cache-control "${REGISTRY_CACHE_CONTROL}" done fi From fce97bc0c56b0055090a8336b56bea904da0a555 Mon Sep 17 00:00:00 2001 From: Kaxil Naik Date: Tue, 17 Mar 2026 18:46:42 +0000 Subject: [PATCH 11/11] Re-include providers/index.html in incremental S3 sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The providers listing page uses merged data (all providers) and must be updated during incremental builds — especially for new providers. AWS CLI --include after --exclude re-includes the specific file. --- .github/workflows/registry-build.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/registry-build.yml b/.github/workflows/registry-build.yml index fcfeb65ab3578..df6e08c895ae5 100644 --- a/.github/workflows/registry-build.yml +++ b/.github/workflows/registry-build.yml @@ -258,9 +258,12 @@ jobs: # sync, then selectively upload only the target provider's files. EXCLUDE_PROVIDERS=() if [[ -n "${PROVIDER}" ]]; then + # Exclude per-provider subtrees but re-include global listing + # pages. AWS CLI processes filters in order — later rules win. EXCLUDE_PROVIDERS=( --exclude "api/providers/*" --exclude "providers/*" + --include "providers/index.html" ) fi