diff --git a/.github/workflows/registry-build.yml b/.github/workflows/registry-build.yml index 15e844faab816..df6e08c895ae5 100644 --- a/.github/workflows/registry-build.yml +++ b/.github/workflows/registry-build.yml @@ -53,23 +53,15 @@ on: # yamllint disable-line rule:truthy permissions: contents: read + packages: read jobs: - build-and-publish-registry: - timeout-minutes: 30 - name: "Build & Publish Registry" - runs-on: ubuntu-latest - env: - EXISTING_REGISTRY_DIR: /tmp/existing-registry - REGISTRY_DATA_DIR: dev/registry - REGISTRY_PROVIDERS_JSON: providers.json - REGISTRY_MODULES_JSON: modules.json - REGISTRY_SITE_DATA_DIR: registry/src/_data - REGISTRY_SITE_VERSIONS_DIR: registry/src/_data/versions - REGISTRY_SITE_LOGOS_DIR: registry/public/logos - REGISTRY_CACHE_CONTROL: public, max-age=300 + build-ci-image: + name: "Build CI image" + uses: ./.github/workflows/ci-image-build.yml permissions: contents: read + packages: write if: > github.event_name == 'workflow_call' || contains(fromJSON('[ @@ -86,35 +78,51 @@ jobs: "utkarsharma2", "vincbeck" ]'), github.event.sender.login) + with: + runners: '["ubuntu-22.04"]' + platform: "linux/amd64" + push-image: "false" + upload-image-artifact: "true" + upload-mount-cache-artifact: "false" + python-versions: '["3.12"]' + branch: "main" + constraints-branch: "constraints-main" + use-uv: "true" + upgrade-to-newer-dependencies: "false" + docker-cache: "registry" + disable-airflow-repo-cache: "false" + + build-and-publish-registry: + timeout-minutes: 30 + name: "Build & Publish Registry" + needs: [build-ci-image] + runs-on: ubuntu-latest + env: + SCARF_ANALYTICS: "false" + DO_NOT_TRACK: "1" + EXISTING_REGISTRY_DIR: /tmp/existing-registry + REGISTRY_DATA_DIR: dev/registry + REGISTRY_PROVIDERS_JSON: providers.json + REGISTRY_MODULES_JSON: modules.json + REGISTRY_SITE_DATA_DIR: registry/src/_data + REGISTRY_SITE_VERSIONS_DIR: registry/src/_data/versions + REGISTRY_SITE_LOGOS_DIR: registry/public/logos + REGISTRY_CACHE_CONTROL: 
public, max-age=300 + permissions: + contents: read steps: - name: "Checkout repository" uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - # --- Breeze setup --- - # All three extraction scripts run inside breeze so that - # extract_parameters.py and extract_connections.py can import provider - # classes at runtime. extract_metadata.py also runs in breeze for - # consistency — it writes to dev/registry/ (mounted) so the other two - # scripts can read providers.json / modules.json from there. - - name: "Install Breeze" - uses: ./.github/actions/breeze + - name: "Prepare breeze & CI image" + uses: ./.github/actions/prepare_breeze_and_image with: - python-version: "3.12" - - - name: "Build CI image" - # Fallback to raw docker buildx when breeze cache is stale — same - # pattern as publish-docs-to-s3.yml. - run: > - breeze ci-image build --python 3.12 || - docker buildx build --load --builder default --progress=auto --pull - --build-arg AIRFLOW_EXTRAS=devel-ci --build-arg AIRFLOW_PRE_CACHED_PIP_PACKAGES=false - --build-arg AIRFLOW_USE_UV=true --build-arg BUILD_PROGRESS=auto - --build-arg INSTALL_MYSQL_CLIENT_TYPE=mariadb - --build-arg VERSION_SUFFIX_FOR_PYPI=dev0 - -t ghcr.io/apache/airflow/main/ci/python3.12:latest --target main . 
- -f Dockerfile.ci --platform linux/amd64 + python: "3.12" + platform: "linux/amd64" + use-uv: "true" + make-mnt-writeable-and-cleanup: "true" - name: "Install AWS CLI v2" run: | @@ -195,13 +203,17 @@ jobs: cp \ "${REGISTRY_DATA_DIR}/${REGISTRY_MODULES_JSON}" \ "${REGISTRY_SITE_DATA_DIR}/${REGISTRY_MODULES_JSON}" - if [ -d "${REGISTRY_DATA_DIR}/output/versions" ]; then - cp -r "${REGISTRY_DATA_DIR}/output/versions/"* "${REGISTRY_SITE_VERSIONS_DIR}/" + VERSIONS_SRC="${REGISTRY_DATA_DIR}/output/versions" + if [ -d "${VERSIONS_SRC}" ] && ls "${VERSIONS_SRC}/"* &>/dev/null; then + cp -r "${VERSIONS_SRC}/"* "${REGISTRY_SITE_VERSIONS_DIR}/" fi - # Copy provider logos extracted from providers/*/docs/integration-logos/ - if [ -d "${REGISTRY_DATA_DIR}/logos" ]; then + # Copy provider logos extracted by extract_metadata.py. + # The directory may exist but be empty (incremental build for + # a provider without logos), so check for files before copying. + LOGOS_SRC="${REGISTRY_DATA_DIR}/logos" + if [ -d "${LOGOS_SRC}" ] && ls "${LOGOS_SRC}/"* &>/dev/null; then mkdir -p "${REGISTRY_SITE_LOGOS_DIR}" - cp -r "${REGISTRY_DATA_DIR}/logos/"* "${REGISTRY_SITE_LOGOS_DIR}/" + cp -r "${LOGOS_SRC}/"* "${REGISTRY_SITE_LOGOS_DIR}/" fi - name: "Setup pnpm" @@ -237,10 +249,41 @@ jobs: - name: "Sync registry to S3" env: S3_BUCKET: ${{ steps.destination.outputs.bucket }} + PROVIDER: ${{ inputs.provider }} run: | + # Incremental builds only extract data for the target provider(s). + # Eleventy rebuilds all pages, but non-target providers have + # incomplete data (no connections/parameters, wrong version info). + # Exclude both per-provider API JSON and HTML pages from the main + # sync, then selectively upload only the target provider's files. + EXCLUDE_PROVIDERS=() + if [[ -n "${PROVIDER}" ]]; then + # Exclude per-provider subtrees but re-include global listing + # pages. AWS CLI processes filters in order — later rules win. 
+ EXCLUDE_PROVIDERS=( + --exclude "api/providers/*" + --exclude "providers/*" + --include "providers/index.html" + ) + fi + aws s3 sync registry/_site/ "${S3_BUCKET}" \ --cache-control "${REGISTRY_CACHE_CONTROL}" \ - --exclude "pagefind/*" + --exclude "pagefind/*" \ + "${EXCLUDE_PROVIDERS[@]}" + + # For incremental builds, sync only the target provider's files. + if [[ -n "${PROVIDER}" ]]; then + for pid in ${PROVIDER}; do + aws s3 sync "registry/_site/api/providers/${pid}/" \ + "${S3_BUCKET}api/providers/${pid}/" \ + --cache-control "${REGISTRY_CACHE_CONTROL}" + aws s3 sync "registry/_site/providers/${pid}/" \ + "${S3_BUCKET}providers/${pid}/" \ + --cache-control "${REGISTRY_CACHE_CONTROL}" + done + fi + # Pagefind generates content-hashed filenames (e.g. en_181da6f.pf_index). # Each rebuild produces new hashes, so --delete is needed to remove stale # index files. This is separate from the main sync which intentionally diff --git a/dev/breeze/src/airflow_breeze/commands/registry_commands.py b/dev/breeze/src/airflow_breeze/commands/registry_commands.py index 8a7457f124866..f818a16ea01a9 100644 --- a/dev/breeze/src/airflow_breeze/commands/registry_commands.py +++ b/dev/breeze/src/airflow_breeze/commands/registry_commands.py @@ -91,8 +91,8 @@ def extract_data(python: str, provider: str | None): command = ( f"{install_cmd}" f"python dev/registry/extract_metadata.py{provider_flag} && " - "python dev/registry/extract_parameters.py && " - "python dev/registry/extract_connections.py" + f"python dev/registry/extract_parameters.py{provider_flag} && " + f"python dev/registry/extract_connections.py{provider_flag}" ) with ci_group("Extracting registry data"): diff --git a/dev/registry/extract_connections.py b/dev/registry/extract_connections.py index ce81cc43a2773..ebb269193c37b 100644 --- a/dev/registry/extract_connections.py +++ b/dev/registry/extract_connections.py @@ -162,7 +162,7 @@ def main(): parser.add_argument( "--provider", default=None, - help="Only output 
connections for this provider ID (e.g. 'amazon').", + help="Only output connections for these provider ID(s) (space-separated, e.g. 'amazon common-io').", ) parser.add_argument( "--providers-json", @@ -212,12 +212,21 @@ def main(): total_with_custom = 0 total_with_ui = 0 + # Parse space-separated provider filter (matches extract_metadata.py behaviour) + provider_filter: set[str] | None = None + if args.provider: + provider_filter = {pid.strip() for pid in args.provider.split() if pid.strip()} + print(f"Filtering to provider(s): {', '.join(sorted(provider_filter))}") + for conn_type, hook_info in sorted(hooks.items()): if hook_info is None or not hook_info.package_name: continue provider_id = package_name_to_provider_id(hook_info.package_name) + if provider_filter and provider_id not in provider_filter: + continue + standard_fields = build_standard_fields(field_behaviours.get(conn_type)) custom_fields = build_custom_fields(form_widgets, conn_type) @@ -244,13 +253,6 @@ def main(): print(f" {total_with_custom} have custom fields") print(f" {total_with_ui} have UI field customisation") - # Filter to single provider if requested - if args.provider: - provider_connections = { - pid: conns for pid, conns in provider_connections.items() if pid == args.provider - } - print(f"Filtering output to provider: {args.provider}") - # Write per-provider files to versions/{pid}/{version}/connections.json for output_dir in OUTPUT_DIRS: if not output_dir.parent.exists(): diff --git a/dev/registry/merge_registry_data.py b/dev/registry/merge_registry_data.py index bf8696a8b37ac..2bdbe50da12b4 100644 --- a/dev/registry/merge_registry_data.py +++ b/dev/registry/merge_registry_data.py @@ -56,7 +56,9 @@ def merge( existing_modules: list[dict] = [] if existing_modules_path.exists(): existing_modules = json.loads(existing_modules_path.read_text())["modules"] - new_modules = json.loads(new_modules_path.read_text())["modules"] + new_modules: list[dict] = [] + if new_modules_path.exists(): + 
new_modules = json.loads(new_modules_path.read_text())["modules"] # IDs being replaced new_ids = {p["id"] for p in new_providers} diff --git a/dev/registry/tests/test_merge_registry_data.py b/dev/registry/tests/test_merge_registry_data.py index 537245569aad1..e3f78fd4b792e 100644 --- a/dev/registry/tests/test_merge_registry_data.py +++ b/dev/registry/tests/test_merge_registry_data.py @@ -241,6 +241,41 @@ def test_missing_existing_providers_file(self, tmp_path, output_dir): assert len(result_providers) == 1 assert result_providers[0]["id"] == "amazon" + def test_missing_new_modules_file(self, tmp_path, output_dir): + """Incremental extract with --provider skips modules.json; merge should keep existing modules.""" + existing_providers = _write_json( + tmp_path / "existing_providers.json", + { + "providers": [ + _provider("amazon", "Amazon", "2024-01-01"), + _provider("google", "Google", "2024-02-01"), + ] + }, + ) + existing_modules = _write_json( + tmp_path / "existing_modules.json", + { + "modules": [ + _module("amazon-s3-op", "amazon"), + _module("google-bq-op", "google"), + ] + }, + ) + new_providers = _write_json( + tmp_path / "new_providers.json", + {"providers": [_provider("amazon", "Amazon", "2025-01-01")]}, + ) + # new_modules file does not exist (--provider mode skips modules.json) + new_modules = tmp_path / "nonexistent_modules.json" + + merge(existing_providers, existing_modules, new_providers, new_modules, output_dir) + + result_modules = json.loads((output_dir / "modules.json").read_text())["modules"] + # Existing modules for non-updated providers are kept + assert any(m["id"] == "google-bq-op" for m in result_modules) + # Existing modules for the updated provider are removed (no new ones to replace them) + assert not any(m["provider_id"] == "amazon" for m in result_modules) + def test_output_directory_created_if_missing(self, tmp_path): output_dir = tmp_path / "does" / "not" / "exist" existing_providers = _write_json( diff --git a/registry/AGENTS.md 
b/registry/AGENTS.md index fc7ce31fb0fb6..99416cb89311c 100644 --- a/registry/AGENTS.md +++ b/registry/AGENTS.md @@ -404,7 +404,10 @@ The registry is built in the `apache/airflow` repo and served at `airflow.apache Supports two modes: - **Full build** (no `provider` input): extracts all ~99 providers (~12 min) - **Incremental build** (`provider=amazon`): extracts one provider (~30s), merges - with existing data from S3 via `merge_registry_data.py`, then builds the full site + with existing data from S3 via `merge_registry_data.py`, then builds the full site. + During incremental builds the main S3 sync excludes the per-provider `api/providers/` and + `providers/` subtrees, then uploads only the target provider's files, so real data is not + overwritten by Eleventy's incomplete/empty stubs (Eleventy 3.x `permalink: false` does not work with pagination). 2. **S3 buckets**: `{live|staging}-docs-airflow-apache-org/registry/` (same bucket as docs, different prefix) 3. **Serving**: Apache HTTPD at `airflow.apache.org` rewrites `/registry/*` to CloudFront, which serves from S3 4. **Auto-trigger**: When `publish-docs-to-s3.yml` publishes provider docs, its diff --git a/registry/README.md b/registry/README.md index 2d0d354c59fa1..4459c65da00e6 100644 --- a/registry/README.md +++ b/registry/README.md @@ -327,10 +327,16 @@ it triggers `registry-build.yml` with the provider ID. The incremental flow: metadata and PyPI stats; `extract_parameters.py` discovers modules for only the specified provider. 3. **Merge** — `merge_registry_data.py` replaces the updated provider's entries in - the downloaded JSON while keeping all other providers intact. + the downloaded JSON while keeping all other providers intact. Only global files + (`providers.json`, `modules.json`) are merged — per-version files like + `connections.json` and `parameters.json` are not downloaded from S3. 4. **Build site** — Eleventy builds all pages from the merged data; Pagefind indexes - all records. + all records. 
Because per-version data only exists for the target provider, Eleventy + emits empty fallback JSON for other providers' `connections.json` and + `parameters.json` API endpoints (see **Known limitation** below). +5. **S3 sync (selective)** — the main sync excludes the per-provider `api/providers/` and + `providers/` subtrees (keeping `providers/index.html`) to avoid overwriting real data with + incomplete/empty stubs. A second sync uploads only the target provider's API files and HTML pages. 6. **Publish versions** — `publish_versions.py` updates `api/providers/{id}/versions.json`. The merge script (`dev/registry/merge_registry_data.py`) handles edge cases: @@ -338,6 +344,14 @@ The merge script (`dev/registry/merge_registry_data.py`) handles edge cases: - First deploy (no existing data on S3): uses the single-provider output as-is. - Missing modules file: treated as empty. +**Known limitation**: Eleventy's pagination templates generate API files for every +provider in `providers.json`, even when per-version data (connections, parameters) only +exists for the target provider. The templates emit empty fallback JSON +(`{"connection_types":[]}`) for providers without data. The S3 sync step works around +this with `--exclude` patterns during incremental builds. A proper template-level fix +(skipping file generation) is tracked as a follow-up — `permalink: false` does not work +with Eleventy 3.x pagination templates. 
+ To run an incremental build locally: ```bash diff --git a/registry/package.json b/registry/package.json index af27cb96fd6af..330fad9498f6c 100644 --- a/registry/package.json +++ b/registry/package.json @@ -4,7 +4,7 @@ "description": "Apache Airflow Provider Registry", "scripts": { "dev": "REGISTRY_PATH_PREFIX=/ pnpm build && REGISTRY_PATH_PREFIX=/ eleventy --serve --port=8080", - "prebuild": "uv run python ../dev/registry/export_registry_schemas.py", + "prebuild": "uv run --project ../dev/registry python ../dev/registry/export_registry_schemas.py", "build": "rm -rf _site && eleventy", "postbuild": "cleancss -o _site/css/main.css _site/css/main.css && node scripts/build-pagefind-index.mjs" }, diff --git a/registry/pnpm-lock.yaml b/registry/pnpm-lock.yaml index 59787d2b24416..e253dafd90646 100644 --- a/registry/pnpm-lock.yaml +++ b/registry/pnpm-lock.yaml @@ -4,12 +4,6 @@ settings: autoInstallPeers: true excludeLinksFromLockfile: false -overrides: - liquidjs@<10.25.0: '>=10.25.0' - markdown-it@>=13.0.0 <14.1.1: '>=14.1.1' - minimatch@<3.1.3: '>=3.1.3' - minimatch@<3.1.4: '>=3.1.4' - importers: .: @@ -148,9 +142,8 @@ packages: asap@2.0.6: resolution: {integrity: sha512-BSHWgDSAiKs50o2Re8ppvp3seVHXSRM44cdSsT9FfNEUUZLOGWVCsiWaRPWM1Znn+mqZ1OfVZ3z3DWEzSp7hRA==} - balanced-match@4.0.4: - resolution: {integrity: sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==} - engines: {node: 18 || 20 || >=22} + balanced-match@1.0.2: + resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==} bcp-47-match@2.0.3: resolution: {integrity: sha512-JtTezzbAibu8G0R9op9zb3vcWZd9JF6M0xOYGPn0fNCd7wOpRB1mU2mH9T8gaBGbAAyIIVgB2G7xG0GP98zMAQ==} @@ -165,9 +158,8 @@ packages: resolution: {integrity: sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==} engines: {node: '>=8'} - brace-expansion@5.0.4: - resolution: {integrity: 
sha512-h+DEnpVvxmfVefa4jFbCf5HdH5YMDXRsmKflpf1pILZWRFlTbJpxeU55nJl4Smt5HQaGzg1o6RHFPJaOqnmBDg==} - engines: {node: 18 || 20 || >=22} + brace-expansion@1.1.12: + resolution: {integrity: sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==} braces@3.0.3: resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==} @@ -198,6 +190,9 @@ packages: resolution: {integrity: sha512-QrWXB+ZQSVPmIWIhtEO9H+gwHaMGYiF5ChvoJ+K9ZGHG/sVsa6yiesAD1GC/x46sET00Xlwo1u49RVVVzvcSkw==} engines: {node: '>= 10'} + concat-map@0.0.1: + resolution: {integrity: sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==} + debug@2.6.9: resolution: {integrity: sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==} peerDependencies: @@ -447,9 +442,8 @@ packages: engines: {node: '>=10.0.0'} hasBin: true - minimatch@10.2.4: - resolution: {integrity: sha512-oRjTw/97aTBN0RHbYCdtF1MQfvusSIBQM0IZEgzl6426+8jSC0nF1a/GmnVLpfB9yyr6g6FTqWqiZVbxrtaCIg==} - engines: {node: 18 || 20 || >=22} + minimatch@3.1.5: + resolution: {integrity: sha512-VgjWUsnnT6n+NUk6eZq77zeFdpW2LWDzP6zFGrCbHXiYNul5Dzqk2HHQ5uFH2DNW5Xbp8+jVzaeNt94ssEEl4w==} minimist@1.2.8: resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==} @@ -798,7 +792,7 @@ snapshots: asap@2.0.6: {} - balanced-match@4.0.4: {} + balanced-match@1.0.2: {} bcp-47-match@2.0.3: {} @@ -815,9 +809,10 @@ snapshots: binary-extensions@2.3.0: {} - brace-expansion@5.0.4: + brace-expansion@1.1.12: dependencies: - balanced-match: 4.0.4 + balanced-match: 1.0.2 + concat-map: 0.0.1 braces@3.0.3: dependencies: @@ -852,6 +847,8 @@ snapshots: commander@7.2.0: {} + concat-map@0.0.1: {} + debug@2.6.9: dependencies: ms: 2.0.0 @@ -954,7 +951,7 @@ snapshots: fs.realpath: 1.0.0 inflight: 1.0.6 inherits: 2.0.4 - minimatch: 10.2.4 + minimatch: 3.1.5 
once: 1.4.0 path-is-absolute: 1.0.1 @@ -1057,7 +1054,7 @@ snapshots: array-differ: 1.0.0 array-union: 1.0.2 arrify: 1.0.1 - minimatch: 10.2.4 + minimatch: 3.1.5 mdurl@2.0.0: {} @@ -1069,9 +1066,9 @@ snapshots: mime@3.0.0: {} - minimatch@10.2.4: + minimatch@3.1.5: dependencies: - brace-expansion: 5.0.4 + brace-expansion: 1.1.12 minimist@1.2.8: {} diff --git a/registry/pnpm-workspace.yaml b/registry/pnpm-workspace.yaml index d60faacb95e15..d142f35ee783d 100644 --- a/registry/pnpm-workspace.yaml +++ b/registry/pnpm-workspace.yaml @@ -16,6 +16,8 @@ # under the License. --- +packages: + - '.' overrides: liquidjs@<10.25.0: '>=10.25.0' markdown-it@>=13.0.0 <14.1.1: '>=14.1.1'