From d8a3511f903c3c2a283d3d2bc38d869543095143 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Sun, 19 Apr 2026 20:37:35 -0700 Subject: [PATCH 01/15] Add TODO.md to gitignore and local massdriver config - Ignore TODO.md (working notes, not catalog content) - Add .claude/massdriver.local.md with production regex pattern and gcp-claude test environment note for the bundle-dev agent Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/massdriver.local.md | 6 ++++++ .gitignore | 1 + 2 files changed, 7 insertions(+) create mode 100644 .claude/massdriver.local.md diff --git a/.claude/massdriver.local.md b/.claude/massdriver.local.md new file mode 100644 index 0000000..5964e7c --- /dev/null +++ b/.claude/massdriver.local.md @@ -0,0 +1,6 @@ +# Massdriver Local Config + +production_pattern: ^.*-(prod|prd|production)(-.*)?$ + +## Test Environments +- gcp-claude — GCP test environment for bundle development (NOT production) diff --git a/.gitignore b/.gitignore index 301fee9..3bb29b8 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,4 @@ _dist/ .claude/settings.local.json node_modules/ *.zip +TODO.md From 6b1eaf59de7c6c9d290e87baeab524a7c4549c08 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Sun, 19 Apr 2026 20:37:46 -0700 Subject: [PATCH 02/15] Add GCP artifact definitions for data platform catalog Seven artifact definitions introducing the GCP data platform artifact surface. Each documents the IAM role-binding pattern for downstream bundles in header comments. 
- gcp-network: region-scoped VPC with primary subnet; optional fields for secondary ranges, private services access, Cloud NAT, and additional subnets so customers can import existing network configs - gcp-landing-zone: combined environment foundation artifact with project_id, network, workload_identity, enabled_apis, and budget - gcp-workload-identity: standalone workload SA artifact (for per- service identity pattern in consumer bundles) - gcp-pubsub-topic: topic + optional DLQ; publisher/subscriber policies - gcp-storage-bucket: bucket with URL/self-link; object_reader, object_writer, admin policies - gcp-bigquery-dataset: dataset with full name + location; reader, writer, admin policies - gcp-cloud-run-service: service with URL, latest revision, runtime SA; invoker policy Co-Authored-By: Claude Opus 4.7 (1M context) --- .../gcp-bigquery-dataset/massdriver.yaml | 78 +++++++ .../gcp-cloud-run-service/massdriver.yaml | 102 ++++++++++ .../gcp-landing-zone/massdriver.yaml | 146 ++++++++++++++ .../gcp-network/massdriver.yaml | 190 ++++++++++++++++++ .../gcp-pubsub-topic/massdriver.yaml | 70 +++++++ .../gcp-storage-bucket/massdriver.yaml | 80 ++++++++ .../gcp-workload-identity/massdriver.yaml | 63 ++++++ 7 files changed, 729 insertions(+) create mode 100644 artifact-definitions/gcp-bigquery-dataset/massdriver.yaml create mode 100644 artifact-definitions/gcp-cloud-run-service/massdriver.yaml create mode 100644 artifact-definitions/gcp-landing-zone/massdriver.yaml create mode 100644 artifact-definitions/gcp-network/massdriver.yaml create mode 100644 artifact-definitions/gcp-pubsub-topic/massdriver.yaml create mode 100644 artifact-definitions/gcp-storage-bucket/massdriver.yaml create mode 100644 artifact-definitions/gcp-workload-identity/massdriver.yaml diff --git a/artifact-definitions/gcp-bigquery-dataset/massdriver.yaml b/artifact-definitions/gcp-bigquery-dataset/massdriver.yaml new file mode 100644 index 0000000..315147d --- /dev/null +++ 
b/artifact-definitions/gcp-bigquery-dataset/massdriver.yaml @@ -0,0 +1,78 @@ +name: gcp-bigquery-dataset +label: GCP BigQuery Dataset +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (examples — adapt to your actual consumer): +# Downstream bundles that need read-only access bind roles/bigquery.dataViewer. +# Downstream bundles that need read+write access bind roles/bigquery.dataEditor. +# Downstream bundles that need full control bind roles/bigquery.dataOwner. +# +# Terraform example — grant data viewer access to a workload service account: +# resource "google_bigquery_dataset_iam_member" "reader" { +# dataset_id = var.bigquery_dataset.dataset_id +# role = "roles/bigquery.dataViewer" +# member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" +# } +# +# Note: BigQuery IAM operates at dataset level by default. For table-level access, +# use google_bigquery_table_iam_member instead. Dataset-level bindings propagate to +# all tables within the dataset; table-level bindings do not propagate up. +# +# The reader / writer / admin bindings above follow this same pattern. They are +# illustrative — the actual IAM member string comes from the consumer bundle's +# service account, not from this artifact. +exports: [] + +schema: + title: GCP BigQuery Dataset + description: A Google Cloud BigQuery dataset. Carries the project ID, dataset ID, + fully-qualified name ({project_id}.{dataset_id}), and location so downstream bundles + can reference the dataset for querying, loading, and exporting without hard-coding + project or dataset identifiers.
+ type: object + required: + - project_id + - dataset_id + - dataset_full_name + - location + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this BigQuery dataset + type: string + examples: + - my-gcp-project-123 + + dataset_id: + title: Dataset ID + description: BigQuery dataset identifier (letters, digits, underscores — no hyphens) + type: string + examples: + - my_analytics_dataset + + dataset_full_name: + title: Dataset Full Name + description: Fully-qualified BigQuery dataset name in {project_id}.{dataset_id} form. + Use this in SQL FROM clauses and bq CLI commands. + type: string + examples: + - my-gcp-project-123.my_analytics_dataset + + location: + title: Location + description: BigQuery location where the dataset is stored (region or multi-region). + Location is immutable after creation. + type: string + examples: + - US + - us-central1 + - EU + + friendly_name: + title: Friendly Name + description: Human-readable display name for the dataset (optional) + type: + - string + - "null" + examples: + - My Analytics Dataset diff --git a/artifact-definitions/gcp-cloud-run-service/massdriver.yaml b/artifact-definitions/gcp-cloud-run-service/massdriver.yaml new file mode 100644 index 0000000..dca5f65 --- /dev/null +++ b/artifact-definitions/gcp-cloud-run-service/massdriver.yaml @@ -0,0 +1,102 @@ +name: gcp-cloud-run-service +label: GCP Cloud Run Service +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern — downstream bundles that need to invoke this service: +# +# The `runtime_service_account_member` field carries the IAM principal string +# ("serviceAccount:{email}") for the service's runtime SA. Use it to grant +# the runtime SA access to the upstream resources this service depends on.
+# +# To allow an external caller (e.g., Pub/Sub push subscription, Cloud Scheduler) +# to invoke this Cloud Run service: +# +# resource "google_cloud_run_v2_service_iam_member" "invoker" { +# project = var.cloud_run_service.project_id +# location = var.cloud_run_service.location +# name = var.cloud_run_service.service_name +# role = "roles/run.invoker" +# member = "{caller_iam_member}" # e.g., serviceAccount:scheduler-sa@project.iam.gserviceaccount.com +# } +# +# Policy examples below are illustrative only — the actual IAM member comes from +# the calling bundle's service account, not from this artifact. +# +# invoker policy: +# role: roles/run.invoker +# member: {caller_iam_member} +# resource: projects/{project_id}/locations/{location}/services/{service_name} +exports: [] + +schema: + title: GCP Cloud Run Service + description: A deployed Google Cloud Run v2 service. Carries the project ID, + service name, HTTPS URL, region, latest ready revision name, and the runtime + service account identity so downstream bundles can invoke the service or grant + it additional permissions without hard-coding project or service identifiers. + type: object + required: + - project_id + - service_name + - service_url + - location + - latest_ready_revision + - runtime_service_account_email + - runtime_service_account_member + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this Cloud Run service + type: string + examples: + - my-gcp-project-123 + + service_name: + title: Service Name + description: Cloud Run service name (short name, not fully-qualified resource path) + type: string + examples: + - my-api-service + + service_url: + title: Service URL + description: HTTPS URL where the service is reachable. For internal ingress + services, this URL is only reachable from within the VPC or via Cloud Load + Balancing. For all-ingress services, this is publicly reachable.
+ type: string + examples: + - https://my-api-service-abc123-uc.a.run.app + + location: + title: Location + description: GCP region where the Cloud Run service is deployed + type: string + examples: + - us-central1 + + latest_ready_revision: + title: Latest Ready Revision + description: Name of the most recent revision that is currently serving traffic. + Use this to pin a specific revision when configuring traffic splits or + rolling back to a known-good state. + type: string + examples: + - my-api-service-00001-abc + + runtime_service_account_email: + title: Runtime Service Account Email + description: "Email address of the GCP service account the Cloud Run service + runs as. Downstream bundles that need to grant this service access to other + resources bind IAM roles to this email using the serviceAccount: prefix." + type: string + examples: + - data-workload@my-gcp-project-123.iam.gserviceaccount.com + + runtime_service_account_member: + title: Runtime Service Account IAM Member + description: "The full IAM principal string for the runtime service account, + in 'serviceAccount:{email}' form. Use this directly as the member argument + in google_*_iam_member resources so callers do not have to construct it manually." + type: string + examples: + - serviceAccount:data-workload@my-gcp-project-123.iam.gserviceaccount.com diff --git a/artifact-definitions/gcp-landing-zone/massdriver.yaml b/artifact-definitions/gcp-landing-zone/massdriver.yaml new file mode 100644 index 0000000..3d9e681 --- /dev/null +++ b/artifact-definitions/gcp-landing-zone/massdriver.yaml @@ -0,0 +1,146 @@ +name: gcp-landing-zone +label: GCP Landing Zone +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# Downstream bundle IAM binding pattern: +# This artifact carries the workload service account email. Downstream bundles +# (BigQuery, GCS, Pub/Sub, Cloud Run, Vertex) bind roles to it on their resources.
+# +# Terraform example — grant BigQuery data editor to the workload SA: +# resource "google_bigquery_dataset_iam_member" "workload" { +# dataset_id = google_bigquery_dataset.main.dataset_id +# role = "roles/bigquery.dataEditor" +# member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" +# } +# +# The network fields allow downstream bundles to place resources in the shared VPC +# without needing a separate network connection: +# subnet_self_link = var.landing_zone.network.primary_subnet.self_link +exports: [] + +schema: + title: GCP Landing Zone + description: Environment foundation artifact — carries the shared VPC network, + workload runtime service account, enabled API list, and billing budget reference. + Downstream bundles connect to this instead of wiring network and identity separately. + type: object + required: + - project_id + - network + - workload_identity + - enabled_apis + - budget + properties: + project_id: + title: Project ID + description: GCP project identifier for this environment + type: string + examples: + - my-gcp-project-123 + + network: + title: Network + description: Shared VPC network for this environment + type: object + required: + - network_name + - network_self_link + - region + - primary_subnet + properties: + network_name: + title: Network Name + type: string + examples: + - data-platform-vpc + network_self_link: + title: Network Self Link + type: string + examples: + - https://www.googleapis.com/compute/v1/projects/my-project/global/networks/my-vpc + region: + title: Region + type: string + examples: + - us-central1 + primary_subnet: + title: Primary Subnet + type: object + required: + - name + - cidr + - self_link + properties: + name: + title: Name + type: string + cidr: + title: CIDR + type: string + self_link: + title: Self Link + type: string + + workload_identity: + title: Workload Identity + description: Runtime service account that environment workloads run as. 
+ Downstream bundles bind IAM roles to service_account_email. + type: object + required: + - service_account_email + - service_account_id + - service_account_name + properties: + service_account_email: + title: Service Account Email + description: Use as IAM member string — serviceAccount:{service_account_email} + type: string + examples: + - data-workload@my-project.iam.gserviceaccount.com + service_account_id: + title: Service Account Unique ID + type: string + examples: + - "123456789012345678901" + service_account_name: + title: Service Account Resource Name + type: string + examples: + - projects/my-project/serviceAccounts/data-workload@my-project.iam.gserviceaccount.com + + enabled_apis: + title: Enabled APIs + description: GCP service APIs enabled in this project by the landing zone + type: array + items: + type: string + examples: + - - compute.googleapis.com + - bigquery.googleapis.com + - run.googleapis.com + + budget: + title: Budget + description: Billing budget reference for this environment. enabled=false when no budget was configured. + type: object + required: + - enabled + properties: + enabled: + title: Budget Enabled + type: boolean + budget_name: + title: Budget Display Name + type: + - string + - "null" + billing_account_id: + title: Billing Account ID + type: + - string + - "null" + amount_usd: + title: Budget Amount (USD) + type: + - number + - "null" diff --git a/artifact-definitions/gcp-network/massdriver.yaml b/artifact-definitions/gcp-network/massdriver.yaml new file mode 100644 index 0000000..537e67f --- /dev/null +++ b/artifact-definitions/gcp-network/massdriver.yaml @@ -0,0 +1,190 @@ +name: gcp-network +label: GCP Network +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (for downstream bundles that need network access): +# When binding IAM roles to a GCP network artifact, use data.network_name or +# data.network_self_link to scope role bindings to the VPC level.
For subnet-scoped +# bindings (e.g., roles/compute.networkUser), bind to data.primary_subnet.self_link or data.additional_subnets[*].self_link. +# Example gcloud command: +# gcloud projects add-iam-policy-binding $PROJECT \ +# --member="serviceAccount:$SA_EMAIL" \ +# --role="roles/compute.networkUser" \ +# --condition="expression=resource.name.startsWith('projects/$PROJECT/regions/$REGION/subnetworks/$SUBNET'),title=subnet-network-user" +exports: [] + +schema: + title: GCP Network + description: Region-scoped GCP VPC network with subnet configuration. Captures the + core network topology including optional fields for customer-managed networks with + secondary ranges (GKE), private services access, Cloud NAT, and additional subnets. + type: object + required: + - project_id + - network_name + - network_self_link + - region + - primary_subnet + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this VPC + type: string + examples: + - my-gcp-project-123 + + network_name: + title: Network Name + description: Short name of the VPC network resource + type: string + examples: + - my-vpc + + network_self_link: + title: Network Self Link + description: Fully-qualified self-link URL of the VPC network + type: string + examples: + - https://www.googleapis.com/compute/v1/projects/my-project/global/networks/my-vpc + + region: + title: Region + description: GCP region where regional resources (subnets, NAT) are provisioned + type: string + examples: + - us-central1 + - us-east1 + - us-west1 + - us-east4 + - europe-west1 + + primary_subnet: + title: Primary Subnet + description: The main subnet for general workload placement + type: object + required: + - name + - cidr + - self_link + properties: + name: + title: Name + description: Short name of the subnetwork resource + type: string + examples: + - my-vpc-subnet-us-central1 + cidr: + title: CIDR + description: Primary IP address range of the subnet + type: string + examples: + - 10.0.0.0/20 + self_link: + title: Self Link + description: Fully-qualified
self-link URL of the subnetwork + type: string + examples: + - https://www.googleapis.com/compute/v1/projects/my-project/regions/us-central1/subnetworks/my-subnet + + # Optional — present when this subnet has GKE secondary ranges + secondary_ranges: + title: Secondary Ranges + description: Named secondary IP ranges on the subnet, typically used for GKE + pods and services + type: object + properties: + pods_cidr: + title: Pods CIDR + description: Secondary range allocated for GKE pod IPs + type: string + examples: + - 10.1.0.0/16 + pods_range_name: + title: Pods Range Name + description: Name of the secondary range used for GKE pods + type: string + services_cidr: + title: Services CIDR + description: Secondary range allocated for GKE service cluster IPs + type: string + examples: + - 10.2.0.0/20 + services_range_name: + title: Services Range Name + description: Name of the secondary range used for GKE services + type: string + + # Optional — present when Private Services Access is configured (Cloud SQL, Memorystore, etc.) 
+ private_services_access: + title: Private Services Access + description: RFC 1918 peering range reserved for Google-managed services such as + Cloud SQL and Memorystore + type: object + required: + - cidr + - peering_name + properties: + cidr: + title: CIDR + description: Address range allocated for Private Services Access peering + type: string + examples: + - 10.100.0.0/16 + peering_name: + title: Peering Name + description: Name of the VPC peering connection to servicenetworking.googleapis.com + type: string + examples: + - servicenetworking-googleapis-com + + # Optional — present when Cloud NAT is configured on this network + cloud_nat: + title: Cloud NAT + description: Cloud NAT configuration providing outbound internet access for + private instances + type: object + required: + - router_name + - nat_name + properties: + router_name: + title: Router Name + description: Name of the Cloud Router that hosts the NAT gateway + type: string + nat_name: + title: NAT Name + description: Name of the Cloud NAT resource + type: string + nat_ips: + title: NAT IPs + description: Static external IP addresses assigned to the NAT gateway, if any + type: array + items: + type: string + + # Optional — additional subnets beyond the primary (e.g. proxy-only, management) + additional_subnets: + title: Additional Subnets + description: Any subnets beyond the primary — proxy-only, management, or + purpose-specific ranges + type: array + items: + type: object + required: + - name + - cidr + - self_link + properties: + name: + title: Name + type: string + cidr: + title: CIDR + type: string + self_link: + title: Self Link + type: string + purpose: + title: Purpose + description: Subnet purpose (e.g. 
PRIVATE, REGIONAL_MANAGED_PROXY) + type: string diff --git a/artifact-definitions/gcp-pubsub-topic/massdriver.yaml b/artifact-definitions/gcp-pubsub-topic/massdriver.yaml new file mode 100644 index 0000000..26a488e --- /dev/null +++ b/artifact-definitions/gcp-pubsub-topic/massdriver.yaml @@ -0,0 +1,70 @@ +name: gcp-pubsub-topic +label: GCP Pub/Sub Topic +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (example — adapt to your actual resource): +# Downstream bundles that need to publish to this topic bind roles/pubsub.publisher. +# Downstream bundles that need to subscribe bind roles/pubsub.subscriber. +# +# Terraform example — grant publisher access to a workload service account: +# resource "google_pubsub_topic_iam_member" "publisher" { +# project = var.pubsub_topic.project_id +# topic = var.pubsub_topic.topic_name +# role = "roles/pubsub.publisher" +# member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" +# } +# +# The publisher / subscriber bindings above follow this same pattern. +# They are illustrative — the actual IAM member string comes from the consumer +# bundle's service account, not from this artifact. +exports: [] + +schema: + title: GCP Pub/Sub Topic + description: A Google Cloud Pub/Sub topic. Carries the fully-qualified topic + ID, short topic name, and project context so downstream bundles can publish + messages or attach subscriptions without hard-coding project or topic IDs. + Optionally includes dead-letter queue (DLQ) topic references.
+ type: object + required: + - project_id + - topic_name + - topic_id + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this Pub/Sub topic + type: string + examples: + - my-gcp-project-123 + + topic_name: + title: Topic Name + description: Short name of the Pub/Sub topic (without project prefix) + type: string + examples: + - my-data-pipeline-events + + topic_id: + title: Topic ID + description: Fully-qualified Pub/Sub topic resource name + type: string + examples: + - projects/my-gcp-project-123/topics/my-data-pipeline-events + + # Optional — only present when the topic was provisioned with a DLQ + dlq_topic_name: + title: DLQ Topic Name + description: Short name of the dead-letter queue topic (present only when DLQ + is enabled on the main topic) + type: string + examples: + - my-data-pipeline-events-dlq + + dlq_topic_id: + title: DLQ Topic ID + description: Fully-qualified resource name of the dead-letter queue topic + (present only when DLQ is enabled on the main topic) + type: string + examples: + - projects/my-gcp-project-123/topics/my-data-pipeline-events-dlq diff --git a/artifact-definitions/gcp-storage-bucket/massdriver.yaml b/artifact-definitions/gcp-storage-bucket/massdriver.yaml new file mode 100644 index 0000000..12225d1 --- /dev/null +++ b/artifact-definitions/gcp-storage-bucket/massdriver.yaml @@ -0,0 +1,80 @@ +name: gcp-storage-bucket +label: GCP Storage Bucket +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (example — adapt to your actual resource): +# Downstream bundles that need to read objects bind roles/storage.objectViewer. +# Downstream bundles that need to read+write objects bind roles/storage.objectUser. +# Downstream bundles that need full admin bind roles/storage.admin. 
+# +# Terraform example — grant object read access to a workload service account: +# resource "google_storage_bucket_iam_member" "reader" { +# bucket = var.storage_bucket.bucket_name +# role = "roles/storage.objectViewer" +# member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" +# } +# +# The object_reader / object_writer / admin bindings above follow this same +# pattern. They are illustrative — the actual IAM member string comes from the +# consumer bundle's service account, not from this artifact. +exports: [] + +schema: + title: GCP Storage Bucket + description: A Google Cloud Storage bucket. Carries the bucket name, canonical + URL (gs:// form), self-link (REST API form), location, and owning project so + downstream bundles can read/write objects without hard-coding bucket or project + identifiers. + type: object + required: + - project_id + - bucket_name + - bucket_url + - bucket_self_link + - location + - storage_class + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this storage bucket + type: string + examples: + - my-gcp-project-123 + + bucket_name: + title: Bucket Name + description: Globally-unique GCS bucket name + type: string + examples: + - my-data-platform-events-abc123 + + bucket_url: + title: Bucket URL + description: Canonical GCS URL for use with gsutil and client libraries (gs:// form) + type: string + examples: + - gs://my-data-platform-events-abc123 + + bucket_self_link: + title: Bucket Self Link + description: GCS REST API resource URL (https://www.googleapis.com/storage/v1/b/{bucket_name} form) + type: string + examples: + - https://www.googleapis.com/storage/v1/b/my-data-platform-events-abc123 + + location: + title: Location + description: GCS location where the bucket is deployed (region, dual-region, or multi-region) + type: string + examples: + - US + - us-central1 + - NAM4 + + storage_class: + title: Storage Class + description: GCS storage class of the bucket + type: string +
examples: + - STANDARD + - COLDLINE diff --git a/artifact-definitions/gcp-workload-identity/massdriver.yaml b/artifact-definitions/gcp-workload-identity/massdriver.yaml new file mode 100644 index 0000000..69c973d --- /dev/null +++ b/artifact-definitions/gcp-workload-identity/massdriver.yaml @@ -0,0 +1,63 @@ +name: gcp-workload-identity +label: GCP Workload Identity +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (for downstream bundles that consume this artifact): +# Grant this service account access to your resource by binding an IAM role. +# Example: grant BigQuery Data Viewer at the project level: +# gcloud projects add-iam-policy-binding $PROJECT_ID \ +# --member="serviceAccount:${data.service_account_email}" \ +# --role="roles/bigquery.dataViewer" +# +# For resource-scoped bindings (preferred — least privilege): +# gcloud bigquery datasets add-iam-policy-binding $DATASET \ +# --member="serviceAccount:${data.service_account_email}" \ +# --role="roles/bigquery.dataEditor" +# +# In Terraform use google_project_iam_member or google_{resource}_iam_member: +# resource "google_project_iam_member" "workload" { +# project = var.workload_identity.project_id +# role = "roles/run.invoker" +# member = "serviceAccount:${var.workload_identity.service_account_email}" +# } +exports: [] + +schema: + title: GCP Workload Identity + description: Runtime service account identity for workloads in this environment. + Downstream bundles (Cloud Run, Vertex Workbench, etc.) bind IAM roles to this + service account to grant it access to their resources (BigQuery, GCS, Pub/Sub, etc.).
+ type: object + required: + - project_id + - service_account_email + - service_account_id + - service_account_name + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this service account + type: string + examples: + - my-gcp-project-123 + + service_account_email: + title: Service Account Email + description: Email address of the service account; use this as the IAM member string + type: string + examples: + - data-platform-sa@my-project.iam.gserviceaccount.com + + service_account_id: + title: Service Account ID + description: Unique numeric ID of the service account + type: string + examples: + - "123456789012345678901" + + service_account_name: + title: Service Account Name + description: Fully-qualified resource name (projects/{project}/serviceAccounts/{email}) + type: string + examples: + - projects/my-project/serviceAccounts/data-platform-sa@my-project.iam.gserviceaccount.com From a40a82af30c2f0ae80759c610460f7b343ca55a6 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Sun, 19 Apr 2026 20:38:16 -0700 Subject: [PATCH 03/15] Add GCP data platform bundles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six bundles forming a cohesive GCP data platform starter catalog. Each has presets covering common environment tiers, compliance hardened with documented Checkov skips where skipping is truly environment-agnostic, and a 2am-runbook-style operator guide. - gcp-network: minimal VPC + regional subnet + deny-all baseline firewall. Hardcoded compensating controls document CKV2_GCP_18. Flow logs and Private Google Access enabled. - gcp-landing-zone: environment foundation combining workload SA, service API enablement, and optional budget with monitoring notifications. Budget fields conditionally displayed via RJSF dependencies/oneOf. Emits combined landing_zone artifact. - gcp-pubsub-topic: topic + optional DLQ (conditional group). Low-volume, Standard, and High-throughput presets. 
Workload SA granted publisher role on the topic. - gcp-storage-bucket: uniform bucket-level access and public access prevention hardcoded. Staging, Durable, and Archive presets with lifecycle rule support. - gcp-bigquery-dataset: Dev, Staging, and Production presets with delete protection on Production. Location and dataset_id marked immutable. - gcp-cloud-run-service: Cloud Run v2 with Internal, Public API, and Worker presets. Optional connections to pubsub-topic, bigquery- dataset, and storage-bucket artifacts — workload SA auto-binds appropriate roles on any connected upstream resource. README and operator guide explain the runtime-template pattern for app teams. Halt-on-failure applied to prod/prd/production md-target tags across all bundles. Immutability markers applied to region/location/dataset fields that cannot be changed in-place. Co-Authored-By: Claude Opus 4.7 (1M context) --- bundles/gcp-bigquery-dataset/README.md | 70 ++++++ bundles/gcp-bigquery-dataset/massdriver.yaml | 139 +++++++++++ bundles/gcp-bigquery-dataset/operator.md | 98 ++++++++ bundles/gcp-bigquery-dataset/src/.checkov.yml | 8 + bundles/gcp-bigquery-dataset/src/artifacts.tf | 13 ++ bundles/gcp-bigquery-dataset/src/main.tf | 84 +++++++ bundles/gcp-bigquery-dataset/src/variables.tf | 79 +++++++ bundles/gcp-cloud-run-service/README.md | 112 +++++++++ bundles/gcp-cloud-run-service/massdriver.yaml | 207 ++++++++++++++++ bundles/gcp-cloud-run-service/operator.md | 125 ++++++++++ .../gcp-cloud-run-service/src/.checkov.yml | 20 ++ .../gcp-cloud-run-service/src/artifacts.tf | 18 ++ bundles/gcp-cloud-run-service/src/iam.tf | 78 +++++++ bundles/gcp-cloud-run-service/src/main.tf | 108 +++++++++ .../gcp-cloud-run-service/src/variables.tf | 135 +++++++++++ bundles/gcp-landing-zone/README.md | 93 ++++++++ bundles/gcp-landing-zone/massdriver.yaml | 209 +++++++++++++++++ bundles/gcp-landing-zone/operator.md | 85 +++++++ bundles/gcp-landing-zone/src/.checkov.yml | 3 + 
bundles/gcp-landing-zone/src/artifacts.tf | 41 ++++ bundles/gcp-landing-zone/src/main.tf | 123 ++++++++++ bundles/gcp-landing-zone/src/variables.tf | 54 +++++ bundles/gcp-network/README.md | 70 ++++++ bundles/gcp-network/massdriver.yaml | 90 +++++++ bundles/gcp-network/operator.md | 84 +++++++ .../gcp-network/src/_massdriver_variables.tf | 47 ++++ bundles/gcp-network/src/artifacts.tf | 15 ++ bundles/gcp-network/src/main.tf | 60 +++++ bundles/gcp-pubsub-topic/README.md | 69 ++++++ bundles/gcp-pubsub-topic/massdriver.yaml | 139 +++++++++++ bundles/gcp-pubsub-topic/operator.md | 90 +++++++ bundles/gcp-pubsub-topic/src/.checkov.yml | 9 + bundles/gcp-pubsub-topic/src/artifacts.tf | 18 ++ bundles/gcp-pubsub-topic/src/main.tf | 80 +++++++ bundles/gcp-pubsub-topic/src/variables.tf | 68 ++++++ bundles/gcp-storage-bucket/README.md | 87 +++++++ bundles/gcp-storage-bucket/massdriver.yaml | 221 ++++++++++++++++++ bundles/gcp-storage-bucket/operator.md | 96 ++++++++ bundles/gcp-storage-bucket/src/.checkov.yml | 27 +++ bundles/gcp-storage-bucket/src/artifacts.tf | 14 ++ bundles/gcp-storage-bucket/src/main.tf | 92 ++++++++ bundles/gcp-storage-bucket/src/variables.tf | 79 +++++++ 42 files changed, 3357 insertions(+) create mode 100644 bundles/gcp-bigquery-dataset/README.md create mode 100644 bundles/gcp-bigquery-dataset/massdriver.yaml create mode 100644 bundles/gcp-bigquery-dataset/operator.md create mode 100644 bundles/gcp-bigquery-dataset/src/.checkov.yml create mode 100644 bundles/gcp-bigquery-dataset/src/artifacts.tf create mode 100644 bundles/gcp-bigquery-dataset/src/main.tf create mode 100644 bundles/gcp-bigquery-dataset/src/variables.tf create mode 100644 bundles/gcp-cloud-run-service/README.md create mode 100644 bundles/gcp-cloud-run-service/massdriver.yaml create mode 100644 bundles/gcp-cloud-run-service/operator.md create mode 100644 bundles/gcp-cloud-run-service/src/.checkov.yml create mode 100644 bundles/gcp-cloud-run-service/src/artifacts.tf create mode 100644 
bundles/gcp-cloud-run-service/src/iam.tf create mode 100644 bundles/gcp-cloud-run-service/src/main.tf create mode 100644 bundles/gcp-cloud-run-service/src/variables.tf create mode 100644 bundles/gcp-landing-zone/README.md create mode 100644 bundles/gcp-landing-zone/massdriver.yaml create mode 100644 bundles/gcp-landing-zone/operator.md create mode 100644 bundles/gcp-landing-zone/src/.checkov.yml create mode 100644 bundles/gcp-landing-zone/src/artifacts.tf create mode 100644 bundles/gcp-landing-zone/src/main.tf create mode 100644 bundles/gcp-landing-zone/src/variables.tf create mode 100644 bundles/gcp-network/README.md create mode 100644 bundles/gcp-network/massdriver.yaml create mode 100644 bundles/gcp-network/operator.md create mode 100644 bundles/gcp-network/src/_massdriver_variables.tf create mode 100644 bundles/gcp-network/src/artifacts.tf create mode 100644 bundles/gcp-network/src/main.tf create mode 100644 bundles/gcp-pubsub-topic/README.md create mode 100644 bundles/gcp-pubsub-topic/massdriver.yaml create mode 100644 bundles/gcp-pubsub-topic/operator.md create mode 100644 bundles/gcp-pubsub-topic/src/.checkov.yml create mode 100644 bundles/gcp-pubsub-topic/src/artifacts.tf create mode 100644 bundles/gcp-pubsub-topic/src/main.tf create mode 100644 bundles/gcp-pubsub-topic/src/variables.tf create mode 100644 bundles/gcp-storage-bucket/README.md create mode 100644 bundles/gcp-storage-bucket/massdriver.yaml create mode 100644 bundles/gcp-storage-bucket/operator.md create mode 100644 bundles/gcp-storage-bucket/src/.checkov.yml create mode 100644 bundles/gcp-storage-bucket/src/artifacts.tf create mode 100644 bundles/gcp-storage-bucket/src/main.tf create mode 100644 bundles/gcp-storage-bucket/src/variables.tf diff --git a/bundles/gcp-bigquery-dataset/README.md b/bundles/gcp-bigquery-dataset/README.md new file mode 100644 index 0000000..8fff308 --- /dev/null +++ b/bundles/gcp-bigquery-dataset/README.md @@ -0,0 +1,70 @@ +# gcp-bigquery-dataset + +Google Cloud 
BigQuery dataset with configurable location, default table expiration, and delete protection. Use this bundle to provision a managed analytics dataset for data platform workloads — Cloud Run pipelines, Vertex Workbench notebooks, Dataflow jobs, and ad-hoc SQL analytics. The landing zone's workload service account is automatically granted `dataEditor` access on the dataset. + +## Purpose + +- Provisions a BigQuery dataset at a chosen location with an immutable dataset ID +- Configures optional default table expiration to control storage cost growth in non-production environments +- Supports delete protection to prevent accidental dataset destruction in production +- Grants `roles/bigquery.dataEditor` to the landing zone's workload service account on the dataset +- Emits a `catalog-demo/gcp-bigquery-dataset` artifact so downstream bundles can reference the dataset without hard-coding project or dataset identifiers + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_bigquery_dataset.main` | BigQuery dataset | Location, expiration, and delete protection set at provision time; Google-managed encryption | +| `google_bigquery_dataset_iam_member.workload_data_editor` | IAM binding | Grants `roles/bigquery.dataEditor` to the landing zone workload SA on the dataset | + +## Artifacts Consumed (Connections) + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` and `workload_identity.service_account_email` for the dataEditor IAM binding | + +## Artifacts Produced + +The bundle publishes a `catalog-demo/gcp-bigquery-dataset` artifact with all fields needed for downstream bundles to query and load data. 
+ +| Field | Type | Description | +|---|---|---| +| `project_id` | string | GCP project ID that owns the dataset | +| `dataset_id` | string | BigQuery dataset identifier (letters, digits, underscores) | +| `dataset_full_name` | string | Fully-qualified name in `<project_id>.<dataset_id>` form — use directly in SQL `FROM` clauses | +| `location` | string | BigQuery location where the dataset is stored | +| `friendly_name` | string or null | Human-readable display name if set; null otherwise | + +Downstream bundles that need read-only access should bind `roles/bigquery.dataViewer` on the dataset using `dataset_id` and `project_id` from this artifact. Bundles requiring full ownership should bind `roles/bigquery.dataOwner`. + +## Compliance + +### Hardcoded security baselines + +BigQuery dataset-level IAM is the access control mechanism — there are no per-object ACLs to configure. All access to this dataset must go through IAM bindings, which this bundle manages via the `workload_data_editor` resource. + +### Checkov skips + +| Check | Reason | +|---|---| +| `CKV_GCP_81` | Requires CMEK on all BigQuery datasets. CMEK is intentionally out of scope for this bundle — all datasets use Google-managed encryption, which is appropriate for the workloads this bundle targets. Checkov fires this check whenever a `default_encryption_configuration` block is absent, making it a false positive here. If CMEK is required for a specific workload, a separate bundle with a KMS connection should be used. | + +### Production gating + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- `bigquery.googleapis.com` must be enabled in the landing zone before deploying this bundle. Add it to `enabled_apis` in the `gcp-landing-zone` package config. +- The `gcp_authentication` credential has `bigquery.admin` or equivalent IAM on the project. 
+- The landing zone's workload SA is granted `roles/bigquery.dataEditor` automatically; read-only or owner-level access for other consumers must be added by the downstream bundle. +- The dataset ID (`dataset_id`) is immutable after creation. Changing it requires destroying and recreating the dataset — all data will be lost unless exported first. + +## Presets + +| Preset | Location | Default Table Expiration | Delete Protection | +|---|---|---|---| +| Dev | US | 30 days | Off | +| Staging | US | 90 days | Off | +| Production | US | None (no expiration) | On | diff --git a/bundles/gcp-bigquery-dataset/massdriver.yaml b/bundles/gcp-bigquery-dataset/massdriver.yaml new file mode 100644 index 0000000..251141d --- /dev/null +++ b/bundles/gcp-bigquery-dataset/massdriver.yaml @@ -0,0 +1,139 @@ +name: gcp-bigquery-dataset +description: Google Cloud BigQuery dataset with configurable location, default table + expiration, and delete protection. Enforces dataset-level IAM access control as + a non-negotiable baseline. Grants the landing zone's workload service account dataEditor + access on the dataset. Emits a gcp-bigquery-dataset artifact for downstream Cloud + Run, Vertex Workbench, and query workloads. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-bigquery-dataset +version: 0.1.0 + +params: + required: + - dataset_id + - location + - delete_protection + examples: + - __name: Dev + dataset_id: dev_dataset + location: US + default_table_expiration_days: 30 + delete_protection: false + - __name: Staging + dataset_id: staging_dataset + location: US + default_table_expiration_days: 90 + delete_protection: false + - __name: Production + dataset_id: prod_dataset + location: US + delete_protection: true + + properties: + dataset_id: + title: Dataset ID + description: BigQuery dataset identifier. Must contain only letters, digits, and + underscores — no hyphens or spaces. Maximum 1024 characters. 
Cannot be changed + after creation without destroying and recreating the dataset. + type: string + $md.immutable: true + pattern: "^[a-zA-Z0-9_]+$" + + friendly_name: + title: Friendly Name + description: Optional human-readable display name shown in the BigQuery UI. Does + not affect queries or API access — the dataset_id is always the identifier. + type: string + + description: + title: Description + description: Optional free-text description of the dataset's purpose, data classification, + or owner team. Stored in BigQuery metadata and visible in the console. + type: string + + location: + title: Location + description: BigQuery location where the dataset and its tables are stored. Multi-regions + (US, EU) provide highest availability. Single regions co-locate data with compute + for lower query latency. Location is immutable — changing it requires destroying + and recreating the dataset (you will lose all data unless exported first). + type: string + $md.immutable: true + default: US + enum: + - US + - EU + - us-central1 + - us-east1 + - us-east4 + - us-west1 + - us-west2 + - europe-west1 + - europe-west2 + - europe-west3 + - europe-west4 + - asia-east1 + - asia-northeast1 + - asia-south1 + - asia-southeast1 + - australia-southeast1 + - southamerica-east1 + + default_table_expiration_days: + title: Default Table Expiration (days) + description: Number of days after creation that new tables in this dataset will + be automatically deleted. Applies only to tables created after this setting + is applied — existing tables are not affected. Set to 0 or omit for no automatic + expiration. Recommended for dev and staging to prevent unbounded cost growth. + type: integer + minimum: 0 + default: 0 + + delete_protection: + title: Enable Delete Protection + description: When enabled, the dataset cannot be destroyed until delete protection + is first disabled (a two-step destroy). Prevents accidental data loss in production. + Strongly recommended for production datasets. 
Disable only immediately before + a planned decommission. + type: boolean + default: false + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + +artifacts: + required: + - bigquery_dataset + properties: + bigquery_dataset: + $ref: catalog-demo/gcp-bigquery-dataset + title: GCP BigQuery Dataset + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - dataset_id + - friendly_name + - description + - location + - default_table_expiration_days + - delete_protection + - "*" + properties: + delete_protection: + ui:widget: checkbox diff --git a/bundles/gcp-bigquery-dataset/operator.md b/bundles/gcp-bigquery-dataset/operator.md new file mode 100644 index 0000000..9095186 --- /dev/null +++ b/bundles/gcp-bigquery-dataset/operator.md @@ -0,0 +1,98 @@ +--- +templating: mustache +--- + +# GCP BigQuery Dataset — Operator Runbook + +## Non-obvious constraints + +**Dataset ID is immutable.** `dataset_id` cannot be changed in-place. To rename: export all tables, destroy the package, reprovision with the new ID, reload from GCS. Treat the dataset ID as permanent. + +**Location is immutable.** Datasets cannot be moved between regions or multi-regions after creation. To change location: export all tables (`bq extract` to GCS), destroy the package, reprovision in the new location, reload. Budget for data transfer costs and downtime. + +**`default_table_expiration_ms` applies to NEW tables only.** Changing this on an existing dataset does not expire or modify existing tables. To set expiration on an existing table, update it directly via `bq update`. 
+ +**Delete protection requires a two-step destroy.** When `delete_protection = true`, the destroy will fail while the dataset still contains tables, because `delete_contents_on_destroy = false` is enforced. To decommission: +1. Set `delete_protection = false` in the package config and deploy. +2. Then run the destroy. + +**Dataset-level IAM propagates to all tables, current and future.** For row-level or table-level isolation, use BigQuery row-level security policies or bind IAM at the table level separately. + +**IAM bindings added outside Terraform are overwritten on the next apply.** For permanent bindings, add a `google_bigquery_dataset_iam_member` resource to the bundle source. + +**Cross-region queries are not supported.** BigQuery cannot join tables in different regions in a single query. Use Storage Transfer Service or BigQuery Data Transfer Service to replicate data first. + +**Deploy fails with "bigquery.googleapis.com has not been used in project."** +Add `bigquery.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. + +## Troubleshooting + +**Quota exceeded on concurrent jobs or daily bytes scanned.** +BigQuery per-project quotas are not manageable via this bundle. Check the BigQuery quota dashboard in the GCP console and request increases if needed. + +**Streaming insert rows not expiring as expected.** +Rows inserted via the streaming API have a delay before table expiration recalculation applies. Batch loads have no such lag. + +**Permission denied on dataset access.** +```bash +bq get-iam-policy {{artifacts.bigquery_dataset.dataset_full_name}} +``` +The workload SA needs `roles/bigquery.dataEditor` for read/write or `roles/bigquery.dataViewer` for read-only. + +**Table schema mismatch or load failure.** +```bash +bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}}.<TABLE>
+``` + +## Day-2 operations + +**Setting expiration on an existing table** (default expiration doesn't backfill): +```bash +# Set expiration 30 days from now (bq takes seconds from now, not an epoch timestamp) +EXPIRY=$((30 * 24 * 60 * 60)) +bq update --expiration=$EXPIRY {{artifacts.bigquery_dataset.dataset_full_name}}.<TABLE> + +# Remove expiration from a table +bq update --expiration=0 {{artifacts.bigquery_dataset.dataset_full_name}}.<TABLE> +``` + +**Exporting all tables before destroying the dataset:** +```bash +for TABLE in $(bq ls --format=csv {{artifacts.bigquery_dataset.dataset_full_name}} | tail -n +2 | cut -d, -f1); do + bq extract \ + --destination_format=NEWLINE_DELIMITED_JSON \ + {{artifacts.bigquery_dataset.dataset_full_name}}.$TABLE \ + gs://<BUCKET>/{{artifacts.bigquery_dataset.dataset_id}}/$TABLE/*.jsonl +done +``` + +**Granting read-only access to another principal** (outside Terraform — overwritten on next apply): +```bash +bq add-iam-policy-binding \ + --member="serviceAccount:<SA_EMAIL>" \ + --role="roles/bigquery.dataViewer" \ + {{artifacts.bigquery_dataset.dataset_full_name}} +``` + +## Useful commands + +```bash +# Show dataset metadata (location, expiration, labels) +bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}} + +# List tables in the dataset +bq ls {{artifacts.bigquery_dataset.dataset_full_name}} + +# Show IAM policy on the dataset +bq get-iam-policy {{artifacts.bigquery_dataset.dataset_full_name}} + +# Show a specific table's schema and metadata +bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}}.<TABLE> + +# Check a table's current expiration time +bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}}.<TABLE>
| jq '.expirationTime' + +# Run an ad-hoc query (billed to project) +bq query --project_id={{artifacts.bigquery_dataset.project_id}} \ + 'SELECT COUNT(*) FROM `{{artifacts.bigquery_dataset.dataset_full_name}}.`' +``` diff --git a/bundles/gcp-bigquery-dataset/src/.checkov.yml b/bundles/gcp-bigquery-dataset/src/.checkov.yml new file mode 100644 index 0000000..ab0bb33 --- /dev/null +++ b/bundles/gcp-bigquery-dataset/src/.checkov.yml @@ -0,0 +1,8 @@ +skip-check: + # CKV_GCP_81: Ensure Big Query Datasets are encrypted with Customer Supplied Encryption Keys (CSEK) + # CMEK is intentionally out of scope for this bundle. All datasets use Google-managed + # encryption, which is appropriate for the workloads this bundle targets. Checkov + # fires this check whenever a default_encryption_configuration block is absent, + # making it a false positive here. If CMEK is required for a specific workload, + # a separate bundle with a KMS connection should be used. + - CKV_GCP_81 diff --git a/bundles/gcp-bigquery-dataset/src/artifacts.tf b/bundles/gcp-bigquery-dataset/src/artifacts.tf new file mode 100644 index 0000000..7ddc0d4 --- /dev/null +++ b/bundles/gcp-bigquery-dataset/src/artifacts.tf @@ -0,0 +1,13 @@ +# BigQuery dataset artifact — matches catalog-demo/gcp-bigquery-dataset schema. + +resource "massdriver_artifact" "bigquery_dataset" { + field = "bigquery_dataset" + name = "GCP BigQuery Dataset ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + dataset_id = google_bigquery_dataset.main.dataset_id + dataset_full_name = "${local.project_id}.${google_bigquery_dataset.main.dataset_id}" + location = google_bigquery_dataset.main.location + friendly_name = google_bigquery_dataset.main.friendly_name != "" ? 
google_bigquery_dataset.main.friendly_name : null + }) +} diff --git a/bundles/gcp-bigquery-dataset/src/main.tf b/bundles/gcp-bigquery-dataset/src/main.tf new file mode 100644 index 0000000..b904792 --- /dev/null +++ b/bundles/gcp-bigquery-dataset/src/main.tf @@ -0,0 +1,84 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + + # Convert days → milliseconds for the BigQuery API. BigQuery requires ms. + # 0 or null input means "no expiration" → pass null to terraform resource. + default_table_expiration_ms = ( + var.default_table_expiration_days != null && var.default_table_expiration_days > 0 + ? var.default_table_expiration_days * 24 * 60 * 60 * 1000 + : null + ) +} + +# ─── BigQuery Dataset ────────────────────────────────────────────────────────── + +resource "google_bigquery_dataset" "main" { + project = local.project_id + dataset_id = var.dataset_id + location = var.location + + friendly_name = var.friendly_name != null ? var.friendly_name : null + description = var.description != null ? var.description : null + + # ── Default table expiration ───────────────────────────────────────────────── + # Only applies to NEW tables created after this setting is applied. Existing + # tables in the dataset are NOT retroactively expired. Set to null for no + # automatic expiration (recommended for production). 
+ default_table_expiration_ms = local.default_table_expiration_ms + + # ── Delete protection ──────────────────────────────────────────────────────── + # When true, Terraform will refuse to destroy this dataset until the flag is + # first set to false and re-applied (a two-step destroy). Prevents accidental + # data loss. Default is false so non-prod environments can be torn down freely. + delete_contents_on_destroy = !var.delete_protection + + # Google-managed encryption is used for all datasets provisioned by this bundle. + # CMEK is intentionally out of scope — see src/.checkov.yml for CKV_GCP_81 skip rationale. + + labels = var.md_metadata.default_tags +} + +# ─── Workload IAM Binding ────────────────────────────────────────────────────── +# Grant the landing zone's workload service account roles/bigquery.dataEditor on +# this dataset. dataEditor allows reading, writing, and deleting table data, as +# well as creating and deleting tables within the dataset — without granting +# dataset-level admin (which would allow dropping the dataset itself). +# +# IAM role binding pattern for this series: +# member = "serviceAccount:" +# role = "roles/bigquery.dataEditor" +# resource = google_bigquery_dataset.main.dataset_id (dataset-level binding) +# +# Note: This is a DATASET-level binding — it propagates to all current and future +# tables in the dataset. For table-level isolation, use google_bigquery_table_iam_member +# instead. For read-only access, bind roles/bigquery.dataViewer. +# +# Downstream bundles that need read-only access should bind roles/bigquery.dataViewer +# on this dataset using the bigquery_dataset artifact's dataset_id and project_id. 
+ +resource "google_bigquery_dataset_iam_member" "workload_data_editor" { + project = local.project_id + dataset_id = google_bigquery_dataset.main.dataset_id + role = "roles/bigquery.dataEditor" + member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" +} diff --git a/bundles/gcp-bigquery-dataset/src/variables.tf b/bundles/gcp-bigquery-dataset/src/variables.tf new file mode 100644 index 0000000..8bbfd57 --- /dev/null +++ b/bundles/gcp-bigquery-dataset/src/variables.tf @@ -0,0 +1,79 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "landing_zone" { + type = object({ + project_id = string + network = object({ + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) + workload_identity = object({ + service_account_email = string + service_account_id = string + service_account_name = string + }) + enabled_apis = list(string) + budget = object({ + enabled = bool + budget_name = optional(string) + billing_account_id = optional(string) + amount_usd = optional(number) + }) + }) +} + +variable "dataset_id" { + type = string +} + +variable "friendly_name" { + type = string + default = null +} + +variable "description" { + type = string + default = null +} + +variable "location" { + type = string + default = "US" +} + +variable "default_table_expiration_days" { + type = number + default = 0 +} + +variable "delete_protection" { + type = bool + default = false +} diff --git a/bundles/gcp-cloud-run-service/README.md 
b/bundles/gcp-cloud-run-service/README.md new file mode 100644 index 0000000..4c33db4 --- /dev/null +++ b/bundles/gcp-cloud-run-service/README.md @@ -0,0 +1,112 @@ +# gcp-cloud-run-service + +Google Cloud Run v2 service with automatic IAM binding for upstream data artifacts. The service runs as the landing zone's shared workload service account, and any optional upstream artifact connection (Pub/Sub topic, BigQuery dataset, GCS bucket) automatically grants the workload SA the minimum-privilege role on that resource — no manual IAM wiring required. + +## Use Cases + +- **Internal APIs and microservices** — low-latency HTTP services behind a load balancer or internal ingress, consuming Pub/Sub and BigQuery without internet exposure. +- **Event-driven workers** — services triggered by Pub/Sub push subscriptions or Cloud Scheduler, reading from GCS and writing to BigQuery. +- **Public APIs** — internet-facing HTTPS services with anonymous or token-authenticated access. +- **Data pipelines** — pull-based workers that read from GCS buckets and publish results to Pub/Sub or BigQuery. + +## Use as a Runtime Template + +This bundle is an example **runtime template** — an opinionated, org-wide standard for how Cloud Run services are provisioned. It encodes your platform's security baseline (workload identity, ingress controls, compliance skips with documented rationale) and auto-wires IAM for common data dependencies. + +The typical workflow for application teams: + +1. **Ops/platform team** publishes this template bundle (or a fork of it) to Massdriver. +2. **Application developer** runs `mass bundle new` pointing at the template to generate a new bundle for their specific application. They customize it with their app's image, connections, environment variables, and any app-specific dependencies. +3. The per-app bundle inherits the org's runtime standards from the template; the developer only changes what's specific to their application. 
+ +This separation keeps the platform baseline consistent across all services while letting application teams move independently. + +For more on the `mass bundle new` workflow and template structure, see the Massdriver documentation. {{TODO: add direct link to the templates repository — check https://github.com/massdriver-cloud/massdriver-catalog or the `mass bundle new --help` output for the template path configuration.}} + +## Resources Created + +| Resource | Description | +|---|---| +| `google_cloud_run_v2_service` | The Cloud Run v2 service running your container | +| `google_cloud_run_v2_service_iam_member` (allUsers) | Created only when `allow_unauthenticated = true` — grants public invoke access | +| `google_pubsub_topic_iam_member` | Created only when Pub/Sub topic is connected — grants `roles/pubsub.publisher` to workload SA | +| `google_bigquery_dataset_iam_member` | Created only when BigQuery dataset is connected — grants `roles/bigquery.dataEditor` to workload SA | +| `google_storage_bucket_iam_member` | Created only when Storage bucket is connected — grants `roles/storage.objectUser` to workload SA | + +## Connections + +### Required + +| Connection | Artifact Type | Purpose | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | GCP credentials used by Terraform to provision resources | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id`, `network.region`, and `workload_identity.service_account_email` | + +### Optional + +These connections are not required. When wired on the canvas, the bundle automatically grants the workload service account the appropriate IAM role on the upstream resource. When absent, no IAM binding is created. 
+ +| Connection | Artifact Type | IAM Role Granted | +|---|---|---| +| `pubsub_topic` | `catalog-demo/gcp-pubsub-topic` | `roles/pubsub.publisher` on the topic | +| `bigquery_dataset` | `catalog-demo/gcp-bigquery-dataset` | `roles/bigquery.dataEditor` on the dataset | +| `storage_bucket` | `catalog-demo/gcp-storage-bucket` | `roles/storage.objectUser` on the bucket | + +## Artifact Produced + +**Artifact type:** `catalog-demo/gcp-cloud-run-service` + +| Field | Type | Description | +|---|---|---| +| `project_id` | string | GCP project that owns the service | +| `service_name` | string | Short service name (used in gcloud commands) | +| `service_url` | string | HTTPS URL of the service (`.run.app` domain) | +| `location` | string | GCP region where the service is deployed | +| `latest_ready_revision` | string | Name of the currently-serving revision | +| `runtime_service_account_email` | string | Email of the SA the service runs as | +| `runtime_service_account_member` | string | IAM principal string (`serviceAccount:`) for downstream bindings | + +The `runtime_service_account_member` field is designed for downstream bundles (Scheduler, Pub/Sub push) that need to grant `roles/run.invoker` to the Cloud Run service's own identity — or for external callers that need to invoke the service. + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `image` | string | `gcr.io/cloudrun/hello` | Container image to deploy. Default is deployable out of the box for testing. | +| `port` | integer | `8080` | Port the container listens on. Must match the process — mismatch causes revision failure. | +| `cpu` | enum | `1` | vCPUs per instance: `1`, `2`, `4`, `8` | +| `memory` | enum | `512Mi` | Memory per instance: `256Mi` through `32Gi` | +| `min_instances` | integer | `0` | Scale-to-zero when 0. Any value above 0 means you pay for idle capacity. | +| `max_instances` | integer | `100` | Cap on autoscaling. 
Reduce to protect downstream systems from traffic spikes. | +| `ingress` | enum | `internal` | Traffic source restriction: `all`, `internal`, `internal-and-cloud-load-balancing` | +| `allow_unauthenticated` | boolean | `false` | Grant `allUsers` `roles/run.invoker` for public anonymous access | + +## Presets + +| Preset | Ingress | Min | Max | CPU | Memory | Unauth | +|---|---|---|---|---|---|---| +| Internal | `internal` | 0 | 10 | 1 | 512Mi | false | +| Public API | `all` | 1 | 100 | 2 | 1Gi | true | +| Worker | `internal` | 1 | 50 | 2 | 2Gi | false | + +## Compliance + +### Hardcoded Controls + +| Control | Value | Rationale | +|---|---|---| +| Runtime identity | Landing zone workload SA | All services run as the org-managed SA — no per-service SA proliferation | +| Resource labels | Massdriver default tags | Enforces cost attribution and environment tagging on all revisions | + +### Skipped Checks + +| Check | Reason | +|---|---| +| `CKV_GCP_102` | Ingress is intentionally configurable. The check fires on any non-internal service without distinguishing IAM controls. Internal-preset services pass this check without the skip; only public-ingress services need it bypassed. | +| `CKV_GCP_103` | Binary Authorization requires a pre-configured attestor policy at the project level. Enabling it per-service without an attestor causes all deployments to fail. Teams requiring binary authorization should enforce it via `google_binary_authorization_policy`. | + +## Assumptions + +- The landing zone's workload SA is the correct runtime identity for this service. If you need a per-service SA, extend this bundle with an additional `google_service_account` resource and wire it into `template.service_account`. +- VPC connector / direct VPC egress is not provisioned by this bundle. Cloud Run uses Google's serverless infrastructure by default. 
If you need to reach VPC-private resources (e.g., Cloud SQL without public IP), add a `google_vpc_access_connector` resource and reference it in the template's `vpc_access` block. +- The default image (`gcr.io/cloudrun/hello`) is the Google-managed hello-world container. Replace it with your application image before a real deployment. diff --git a/bundles/gcp-cloud-run-service/massdriver.yaml b/bundles/gcp-cloud-run-service/massdriver.yaml new file mode 100644 index 0000000..e0eafce --- /dev/null +++ b/bundles/gcp-cloud-run-service/massdriver.yaml @@ -0,0 +1,207 @@ +name: gcp-cloud-run-service +description: Google Cloud Run v2 service with auto-binding IAM for upstream data + artifacts. Provisions the Cloud Run service running as the landing zone's workload + service account, and automatically grants the service account the appropriate IAM + role on any connected upstream artifact (Pub/Sub publisher, BigQuery dataEditor, + GCS objectUser). Emits a gcp-cloud-run-service artifact for downstream event sources + (Scheduler, Pub/Sub push) to use for invoking the service. 
+source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-cloud-run-service +version: 0.1.0 + +params: + required: + - image + - port + - cpu + - memory + - min_instances + - max_instances + - ingress + - allow_unauthenticated + examples: + - __name: Internal + image: gcr.io/cloudrun/hello + port: 8080 + cpu: "1" + memory: 512Mi + min_instances: 0 + max_instances: 10 + ingress: internal + allow_unauthenticated: false + - __name: Public API + image: gcr.io/cloudrun/hello + port: 8080 + cpu: "2" + memory: 1Gi + min_instances: 1 + max_instances: 100 + ingress: all + allow_unauthenticated: true + - __name: Worker + image: gcr.io/cloudrun/hello + port: 8080 + cpu: "2" + memory: 2Gi + min_instances: 1 + max_instances: 50 + ingress: internal + allow_unauthenticated: false + + properties: + image: + title: Container Image + description: Fully-qualified container image reference to deploy. Supports + Docker Hub, Google Artifact Registry, and GCR images. Defaults to the Cloud + Run hello-world image so the bundle is deployable out of the box before your + application image is ready. Pin to a digest (image@sha256:...) rather than + a mutable tag for production deployments. + type: string + default: gcr.io/cloudrun/hello + + port: + title: Container Port + description: TCP port that your container listens on. Cloud Run sends all HTTP/2 + and HTTP/1.1 traffic to this port. Must match what the running process actually + binds — a mismatch causes revision-failed-readiness errors and Cloud Run will + roll back the revision. + type: integer + minimum: 1 + maximum: 65535 + default: 8080 + + cpu: + title: CPU + description: Number of vCPUs allocated to each container instance. Cloud Run + supports 1, 2, 4, and 8 vCPUs. Values above 1 require at least 512Mi memory. + CPU is only allocated while a request is being processed unless min_instances + is greater than 0 (always-on). 
Higher CPU allows more concurrent goroutines + or threads within a single instance before autoscaling triggers. + type: string + default: "1" + enum: + - "1" + - "2" + - "4" + - "8" + + memory: + title: Memory + description: Memory allocated to each container instance. Must be at least + 512Mi when CPU is 2 or higher. Cloud Run enforces CPU-to-memory ratios — + if you increase CPU, you may need to increase memory too to avoid a deploy + error. For workers processing large payloads, start with 2Gi and tune down. + type: string + default: 512Mi + enum: + - 256Mi + - 512Mi + - 1Gi + - 2Gi + - 4Gi + - 8Gi + - 16Gi + - 32Gi + + min_instances: + title: Minimum Instances + description: Minimum number of container instances to keep running at all times. + Set to 0 for scale-to-zero (cost-efficient for low-traffic or batch workloads). + Set to 1 or higher to eliminate cold starts. Any value above 0 disables + scale-to-zero — you are billed for idle capacity continuously. + type: integer + minimum: 0 + default: 0 + + max_instances: + title: Maximum Instances + description: Maximum number of container instances Cloud Run will scale to. + Cloud Run default is 100. Reduce this to cap costs or to protect downstream + databases from connection storms. Increasing beyond 100 requires a quota + request in GCP. + type: integer + minimum: 1 + default: 100 + + ingress: + title: Ingress + description: Controls which traffic sources can reach the service. `internal` + restricts access to VPC networks and Cloud Load Balancing in the same project. + `internal-and-cloud-load-balancing` adds Google Cloud Load Balancing traffic. + `all` allows all internet traffic to reach the service directly via its + .run.app URL. Changing ingress settings triggers a full revision replacement + with a brief cold start. 
+ type: string + default: internal + enum: + - all + - internal + - internal-and-cloud-load-balancing + + allow_unauthenticated: + title: Allow Unauthenticated Requests + description: When true, grants the `roles/run.invoker` IAM role to `allUsers` + on this service, making the .run.app URL publicly accessible without a Bearer + token. Required for the Public API preset when you want anonymous access. + When false, callers must present a valid GCP identity token — use this for + internal APIs and workers. Has no effect on ingress routing — set `ingress` + to `all` separately if you want public network access. + type: boolean + default: false + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + + # Optional upstream artifact connections — omit from canvas to skip IAM binding. + # When connected, the bundle automatically grants the workload service account + # the appropriate role on the upstream resource (see src/iam.tf for binding logic). 
+ pubsub_topic: + $ref: catalog-demo/gcp-pubsub-topic + title: Pub/Sub Topic (optional) + + bigquery_dataset: + $ref: catalog-demo/gcp-bigquery-dataset + title: BigQuery Dataset (optional) + + storage_bucket: + $ref: catalog-demo/gcp-storage-bucket + title: Storage Bucket (optional) + +artifacts: + required: + - cloud_run_service + properties: + cloud_run_service: + $ref: catalog-demo/gcp-cloud-run-service + title: GCP Cloud Run Service + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - image + - port + - cpu + - memory + - min_instances + - max_instances + - ingress + - allow_unauthenticated + - "*" + properties: + allow_unauthenticated: + ui:widget: checkbox diff --git a/bundles/gcp-cloud-run-service/operator.md b/bundles/gcp-cloud-run-service/operator.md new file mode 100644 index 0000000..1403be0 --- /dev/null +++ b/bundles/gcp-cloud-run-service/operator.md @@ -0,0 +1,125 @@ +--- +templating: mustache +--- + +# GCP Cloud Run Service — Operator Runbook + +## Non-obvious constraints + +**New deployments route 100% of traffic to the latest revision immediately.** This bundle exposes no traffic-split parameter, so blue/green or canary rollouts are not managed here. Traffic can still be shifted or split between existing revisions after a deploy with `gcloud run services update-traffic` (see the rollback steps under Day-2 operations). + +**Changing `ingress` triggers a new revision and a cold start.** Even if `min_instances > 0`, an ingress change forces revision replacement. Expect a brief cold start. + +**`min_instances > 0` means continuous billing.** No scale-to-zero. You pay for idle capacity at the full CPU+memory rate at all times. + +**Container port must match what the image listens on.** If the image doesn't listen on the configured port, the revision fails health checks and Cloud Run rolls back. Error in logs: `Container failed to start.
Failed to start and then listen on the port defined by the PORT environment variable.` Check application logs before the platform logs. + +**Image pull from Artifact Registry: the workload SA needs `roles/artifactregistry.reader`.** This bundle does not grant that role. If a revision fails with `image not found` or `permission denied` at startup, check this IAM binding first: +```bash +gcloud artifacts repositories get-iam-policy <repository> \ + --location={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} +``` + +**CPU-to-memory minimums are enforced at the API level.** 2 vCPU requires at least 512Mi; 4 vCPU requires at least 2Gi. A mismatched deploy fails before any revision is created. + +**Connecting or disconnecting canvas wires requires a Massdriver deploy to take effect.** Wiring an artifact on the canvas does not grant IAM access. The Terraform apply must run to create or destroy the IAM binding. + +## Troubleshooting + +**Revision fails to start (startup timeout).** +Default startup probe timeout is 240 seconds. Diagnose: +```bash +gcloud logging read \ + 'resource.type="cloud_run_revision" AND resource.labels.service_name="{{artifacts.cloud_run_service.service_name}}" AND (textPayload:"Container failed" OR textPayload:"failed to start")' \ + --project={{artifacts.cloud_run_service.project_id}} \ + --limit=20 +``` +Check for: missing environment variables, failed DB connections, wrong port. Test locally: `docker run -p 8080:<container-port> <image>` and confirm it starts quickly.
+ +**5xx errors in production.** +```bash +gcloud logging read \ + 'resource.type="cloud_run_revision" AND resource.labels.service_name="{{artifacts.cloud_run_service.service_name}}" AND httpRequest.status>=500' \ + --project={{artifacts.cloud_run_service.project_id}} \ + --limit=50 \ + --format="table(timestamp,httpRequest.status,httpRequest.requestUrl)" +``` + +**IAM binding not applied after connecting a canvas wire.** +Connect the wire on the canvas AND redeploy this package. The binding does not exist until Terraform applies it. + +**Image pull failure.** +Check the workload SA's Artifact Registry permission (see Non-obvious constraints above). Also confirm the image tag or digest exists in the registry. + +## Day-2 operations + +**Rolling back to a prior revision:** +```bash +# 1. List revisions to find the last known-good one +gcloud run revisions list \ + --service={{artifacts.cloud_run_service.service_name}} \ + --region={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} \ + --format="table(name,status.conditions[0].status)" + +# 2. Shift 100% traffic to the prior revision +gcloud run services update-traffic {{artifacts.cloud_run_service.service_name}} \ + --region={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} \ + --to-revisions=<revision-name>=100 +``` +This rollback is manual and temporary. The next Massdriver deploy will override it. Fix the image or config, then redeploy. + +**Pinning to a digest to prevent silent image changes:** +```bash +gcloud container images describe <image>:<tag> \ + --format="value(image_summary.digest)" +# Use the output sha256:... in the image param: <image>@sha256:... +``` + +**Scaling changes:** Update `min_instances` or `max_instances` params and redeploy. In-place safe. + +**Rotating the runtime service account:** This requires a bundle code change (the SA is created by the landing zone).
Changing the connected landing zone artifact and redeploying will update the SA reference. + +## Useful commands + +```bash +# Describe the service (traffic splits, SA, status) +gcloud run services describe {{artifacts.cloud_run_service.service_name}} \ + --region={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} \ + --format="yaml(name,status,spec.template.spec.serviceAccountName,spec.traffic)" + +# List revisions with status +gcloud run revisions list \ + --service={{artifacts.cloud_run_service.service_name}} \ + --region={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} \ + --format="table(name,status.conditions[0].status,metadata.creationTimestamp)" + +# Tail recent application logs +gcloud logging read \ + 'resource.type="cloud_run_revision" AND resource.labels.service_name="{{artifacts.cloud_run_service.service_name}}"' \ + --project={{artifacts.cloud_run_service.project_id}} \ + --limit=100 \ + --format=json | jq '.[] | .textPayload // .jsonPayload' + +# Send a test request (authenticated); avoid /healthz — Cloud Run reserves some paths ending in "z" +curl -H "Authorization: Bearer $(gcloud auth print-identity-token)" \ + {{artifacts.cloud_run_service.service_url}}/ + +# Check IAM on the service +gcloud run services get-iam-policy {{artifacts.cloud_run_service.service_name}} \ + --region={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} + +# Check runtime SA's IAM bindings on a connected Pub/Sub topic +gcloud pubsub topics get-iam-policy <topic-name> \ + --project={{artifacts.cloud_run_service.project_id}} \ + --format="table(bindings.role,bindings.members)" + +# Check runtime SA's IAM bindings on a connected GCS bucket +gcloud storage buckets get-iam-policy gs://<bucket-name> \ + --format="table(bindings.role,bindings.members)" +``` diff --git a/bundles/gcp-cloud-run-service/src/.checkov.yml b/bundles/gcp-cloud-run-service/src/.checkov.yml new file mode 100644 index 0000000..b2157a6 ---
/dev/null +++ b/bundles/gcp-cloud-run-service/src/.checkov.yml @@ -0,0 +1,20 @@ +skip-check: + # CKV_GCP_102: Ensure Cloud Run service is not publicly accessible + # NOTE(review): confirm against the Checkov policy index whether this check evaluates + # ingress at all, or only IAM members (allUsers / allAuthenticatedUsers), before relying + # on the ingress-based rationale below. + # This check flags any Cloud Run service that has ingress=all or is not restricted + # to internal traffic. This bundle intentionally exposes the ingress setting as a + # configurable parameter because Cloud Run services legitimately need to be public + # (Public API preset) or internal (Internal / Worker presets). The allow_unauthenticated + # param further controls IAM-level access. Blanket-skipping is appropriate here because + # the check does not distinguish between the three valid ingress modes — it fires on + # all non-internal services regardless of IAM controls. Operators requiring internal-only + # can set ingress=internal, which makes this check pass without the skip. + - CKV_GCP_102 + + # CKV_GCP_103: Ensure Cloud Run service requires Binary Authorization + # NOTE(review): verify that CKV_GCP_103 is actually the Binary Authorization check; + # the ID-to-policy mapping stated here has not been validated against Checkov's index. + # Binary Authorization enforces a deploy-time policy that container images must be + # attested (signed) before they can run. This is a valid control for strict supply-chain + # environments but requires a separate Binary Authorization policy infrastructure that + # is out of scope for this bundle. Enabling it without a configured attestor causes + # all deployments to fail. Teams that require binary authorization should implement + # it at the project level via google_binary_authorization_policy, not per-service. + - CKV_GCP_103 diff --git a/bundles/gcp-cloud-run-service/src/artifacts.tf b/bundles/gcp-cloud-run-service/src/artifacts.tf new file mode 100644 index 0000000..9841b9c --- /dev/null +++ b/bundles/gcp-cloud-run-service/src/artifacts.tf @@ -0,0 +1,18 @@ +# Cloud Run service artifact — matches catalog-demo/gcp-cloud-run-service schema. +# Emits after the service is fully deployed and the first revision is ready.
+# Downstream bundles (Scheduler, Pub/Sub push subscriptions) consume service_url +# and runtime_service_account_member to configure invocation and IAM. + +resource "massdriver_artifact" "cloud_run_service" { + field = "cloud_run_service" + name = "GCP Cloud Run Service ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + service_name = google_cloud_run_v2_service.main.name + service_url = google_cloud_run_v2_service.main.uri + location = google_cloud_run_v2_service.main.location + latest_ready_revision = google_cloud_run_v2_service.main.latest_ready_revision + runtime_service_account_email = local.workload_sa_email + runtime_service_account_member = local.workload_sa_member + }) +} diff --git a/bundles/gcp-cloud-run-service/src/iam.tf b/bundles/gcp-cloud-run-service/src/iam.tf new file mode 100644 index 0000000..528aa8d --- /dev/null +++ b/bundles/gcp-cloud-run-service/src/iam.tf @@ -0,0 +1,78 @@ +# ─── Upstream Artifact IAM Auto-Binding ─────────────────────────────────────── +# +# This file implements the "auto-binding" pattern for Cloud Run services that +# consume upstream data artifacts. For each optional connection that IS wired on +# the canvas, Terraform grants the workload service account the minimum-privilege +# role required to use that resource. +# +# HOW IT WORKS +# ──────────── +# Massdriver passes optional connections as null when not wired on the canvas, +# or as a plain object when wired. We detect presence with: var.<connection> != null +# Then use `count = var.<connection> != null ? 1 : 0` to conditionally create +# the binding. No connection → no IAM change. Add connection → binding appears +# on next deploy. Remove connection → binding is destroyed on next deploy. +# +# ROLES GRANTED +# ───────────── +# Pub/Sub topic → roles/pubsub.publisher +# Allows the service to publish messages to the topic. Does NOT grant +# subscription creation or management. For subscriber access, use a separate +# binding with roles/pubsub.subscriber.
+# +# BigQuery dataset → roles/bigquery.dataEditor +# Allows reading, writing, and deleting table data, and creating/deleting +# tables within the dataset. Does NOT allow dropping the dataset itself. +# For read-only access, use roles/bigquery.dataViewer instead. +# +# Storage bucket → roles/storage.objectUser +# Allows reading and writing objects (get, list, create, delete). Does NOT +# grant bucket-level admin (lifecycle, IAM, metadata changes). For read-only +# access, use roles/storage.objectViewer instead. +# +# REFERENCE EXAMPLE +# ───────────────── +# This is the canonical artifact-policy-style auto-binding pattern for the +# GCP Data Platform demo series. When building downstream bundles that consume +# multiple optional artifacts, copy this pattern: one conditional count block +# per artifact type, one role per binding, all referencing local.workload_sa_member. + +# ── Pub/Sub Topic ───────────────────────────────────────────────────────────── +# Grant the workload SA publisher access to the connected Pub/Sub topic. +# Binding is topic-scoped — does not grant access to other topics. + +resource "google_pubsub_topic_iam_member" "workload_publisher" { + count = var.pubsub_topic != null ? 1 : 0 + + project = var.pubsub_topic.project_id + topic = var.pubsub_topic.topic_name + role = "roles/pubsub.publisher" + member = local.workload_sa_member +} + +# ── BigQuery Dataset ─────────────────────────────────────────────────────────── +# Grant the workload SA dataEditor on the connected BigQuery dataset. +# Binding is dataset-scoped — propagates to all current and future tables in +# the dataset. For table-level isolation, use google_bigquery_table_iam_member. + +resource "google_bigquery_dataset_iam_member" "workload_data_editor" { + count = var.bigquery_dataset != null ? 
1 : 0 + + project = var.bigquery_dataset.project_id + dataset_id = var.bigquery_dataset.dataset_id + role = "roles/bigquery.dataEditor" + member = local.workload_sa_member +} + +# ── Storage Bucket ───────────────────────────────────────────────────────────── +# Grant the workload SA objectUser on the connected GCS bucket. +# Binding is bucket-scoped — allows read/write of all objects in the bucket. +# For read-only access, use roles/storage.objectViewer. + +resource "google_storage_bucket_iam_member" "workload_object_user" { + count = var.storage_bucket != null ? 1 : 0 + + bucket = var.storage_bucket.bucket_name + role = "roles/storage.objectUser" + member = local.workload_sa_member +} diff --git a/bundles/gcp-cloud-run-service/src/main.tf b/bundles/gcp-cloud-run-service/src/main.tf new file mode 100644 index 0000000..309114f --- /dev/null +++ b/bundles/gcp-cloud-run-service/src/main.tf @@ -0,0 +1,108 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + region = var.landing_zone.network.region + + # The workload SA is defined once in the landing zone; all upstream IAM + # bindings in iam.tf reference this local so the principal is never duplicated. + workload_sa_email = var.landing_zone.workload_identity.service_account_email + workload_sa_member = "serviceAccount:${local.workload_sa_email}" +} + +# ─── Cloud Run v2 Service ────────────────────────────────────────────────────── +# Uses the v2 API (google_cloud_run_v2_service), which is the current GA surface. 
+# The v1 resource (google_cloud_run_service) is deprecated and lacks v2-only +# features such as direct VPC egress and improved traffic management. + +resource "google_cloud_run_v2_service" "main" { + project = local.project_id + name = local.name_prefix + location = local.region + + # ── Ingress ───────────────────────────────────────────────────────────────── + # Controls which traffic sources can reach this service. + # Changing ingress triggers a full revision replacement (cold start expected). + # Maps the param value (compared case-insensitively) onto the v2 API constant; any + # value other than "internal" or "internal-and-cloud-load-balancing" falls through + # to INGRESS_TRAFFIC_ALL. + ingress = upper(var.ingress) == "INTERNAL-AND-CLOUD-LOAD-BALANCING" ? "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER" : ( + upper(var.ingress) == "INTERNAL" ? "INGRESS_TRAFFIC_INTERNAL_ONLY" : "INGRESS_TRAFFIC_ALL" + ) + + template { + # ── Runtime identity ────────────────────────────────────────────────────── + # Run every revision as the landing zone's shared workload service account. + # This is the identity that upstream IAM bindings (iam.tf) grant access to. + # Per-service SAs are out of scope; use a separate landing-zone-style bundle + # if your workload requires a dedicated SA with narrower permissions. + service_account = local.workload_sa_email + + # ── Scaling ─────────────────────────────────────────────────────────────── + # min_instance_count > 0 disables scale-to-zero. You pay for idle capacity. + scaling { + min_instance_count = var.min_instances + max_instance_count = var.max_instances + } + + containers { + image = var.image + + ports { + container_port = var.port + } + + resources { + limits = { + cpu = var.cpu + memory = var.memory + } + } + } + } + + labels = var.md_metadata.default_tags + + lifecycle { + ignore_changes = [ + # Ignore drift on the revision template's labels so label changes made outside + # Terraform are not reverted on the next apply. NOTE(review): this entry does not + # cover traffic splits (the `traffic` block is not listed) — confirm intended scope. + template[0].labels, + ] + } +} + +# ─── Public Invoker IAM ──────────────────────────────────────────────────────── +# Only created when allow_unauthenticated = true.
Grants roles/run.invoker to +# allUsers, making the .run.app URL publicly accessible without a Bearer token. +# When false, callers must present a valid GCP identity token. +# +# Note: This IAM binding is independent of ingress. You can have: +# ingress=all + allow_unauthenticated=false → public network, authenticated +# ingress=all + allow_unauthenticated=true → fully public (anonymous access) +# ingress=internal + allow_unauthenticated=false → VPC-only, authenticated + +resource "google_cloud_run_v2_service_iam_member" "all_users_invoker" { + count = var.allow_unauthenticated ? 1 : 0 + + project = local.project_id + location = local.region + name = google_cloud_run_v2_service.main.name + role = "roles/run.invoker" + member = "allUsers" +} diff --git a/bundles/gcp-cloud-run-service/src/variables.tf b/bundles/gcp-cloud-run-service/src/variables.tf new file mode 100644 index 0000000..34cca79 --- /dev/null +++ b/bundles/gcp-cloud-run-service/src/variables.tf @@ -0,0 +1,135 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "landing_zone" { + type = object({ + project_id = string + network = object({ + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) + workload_identity = object({ + service_account_email = string + service_account_id = string + service_account_name = string + }) + enabled_apis = list(string) + budget = object({ + enabled = bool + budget_name = optional(string) + billing_account_id = optional(string) + amount_usd = optional(number) + }) + }) +} 
+ +# ─── Optional upstream artifact connections ──────────────────────────────────── +# These variables are null when the connection is not wired on the canvas. +# Massdriver passes optional connections as a plain object or null — NOT a list. +# iam.tf uses count = var.<connection> != null ? 1 : 0 to conditionally create IAM +# bindings, and references fields directly (e.g., var.pubsub_topic.topic_name). + +variable "pubsub_topic" { + description = "Optional Pub/Sub topic connection. When provided, the workload SA is granted roles/pubsub.publisher on the topic." + type = object({ + project_id = string + topic_name = string + topic_id = string + dlq_topic_name = optional(string) + dlq_topic_id = optional(string) + }) + default = null +} + +variable "bigquery_dataset" { + description = "Optional BigQuery dataset connection. When provided, the workload SA is granted roles/bigquery.dataEditor on the dataset." + type = object({ + project_id = string + dataset_id = string + dataset_full_name = string + location = string + friendly_name = optional(string) + }) + default = null +} + +variable "storage_bucket" { + description = "Optional GCS bucket connection. When provided, the workload SA is granted roles/storage.objectUser on the bucket."
+ type = object({ + project_id = string + bucket_name = string + bucket_url = string + bucket_self_link = string + location = string + storage_class = string + }) + default = null +} + +# ─── Service params ──────────────────────────────────────────────────────────── + +variable "image" { + type = string + default = "gcr.io/cloudrun/hello" +} + +variable "port" { + type = number + default = 8080 +} + +variable "cpu" { + type = string + default = "1" +} + +variable "memory" { + type = string + default = "512Mi" +} + +variable "min_instances" { + type = number + default = 0 +} + +variable "max_instances" { + type = number + default = 100 +} + +variable "ingress" { + type = string + default = "internal" +} + +variable "allow_unauthenticated" { + type = bool + default = false +} diff --git a/bundles/gcp-landing-zone/README.md b/bundles/gcp-landing-zone/README.md new file mode 100644 index 0000000..c4dbf25 --- /dev/null +++ b/bundles/gcp-landing-zone/README.md @@ -0,0 +1,93 @@ +# gcp-landing-zone + +Environment-foundational construct for a GCP data platform. Deploy this once per environment before any workload bundles. 
It: + +- Enables GCP service APIs required by your data platform stack +- Provisions a **workload runtime service account** that Cloud Run, Vertex Workbench, and other services run as +- Optionally configures a **billing budget** with spend-threshold email alerts +- Folds the input `gcp-network` artifact into its own `landing_zone` output so downstream bundles need only one connection instead of wiring network and identity separately + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_project_service.apis` | API enablement (one per API) | `disable_on_destroy = false` to avoid disrupting other resources | +| `google_service_account.workload` | Workload runtime SA | Created with no project-level roles; downstream bundles bind roles on their own resources | +| `google_billing_budget.environment` | Billing budget | Created only when `budget.enabled = true` | +| `google_monitoring_notification_channel.budget_email` | Email alert channel | Created only when budget is enabled and `notification_emails` is non-empty | + +## Artifacts Consumed (Connections) + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `network` | `catalog-demo/gcp-network` | Network metadata passed through into the `landing_zone` artifact for downstream use | + +## Artifacts Produced + +The bundle publishes a single `catalog-demo/gcp-landing-zone` artifact. Downstream bundles connect to this one artifact to get everything they need. 
+ +| Field | Description | +|---|---| +| `project_id` | GCP project ID | +| `network.network_name` | VPC name (passed through from input) | +| `network.network_self_link` | VPC self-link URI | +| `network.region` | Subnet region | +| `network.primary_subnet.name` | Subnet name | +| `network.primary_subnet.cidr` | Subnet CIDR range | +| `network.primary_subnet.self_link` | Subnet self-link URI | +| `workload_identity.service_account_email` | Runtime SA email — used by downstream bundles to bind IAM roles | +| `workload_identity.service_account_id` | Runtime SA unique ID | +| `workload_identity.service_account_name` | Runtime SA resource name | +| `enabled_apis` | List of APIs that were enabled | +| `budget.enabled` | Whether a budget was configured | +| `budget.budget_name` | Budget display name (null when disabled) | +| `budget.billing_account_id` | Billing account the budget is attached to (null when disabled) | +| `budget.amount_usd` | Monthly budget limit in USD (null when disabled) | + +## Downstream IAM Pattern + +Each downstream bundle reads `landing_zone.workload_identity.service_account_email` and grants the minimum required roles on its own resources. Example for a BigQuery dataset: + +```hcl +resource "google_bigquery_dataset_iam_member" "workload" { + dataset_id = google_bigquery_dataset.main.dataset_id + role = "roles/bigquery.dataEditor" + member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" +} +``` + +The workload SA is intentionally created with no project-level roles. Do not add broad roles here. 
+ +## Compliance + +### Hardcoded security controls + +| Control | Mechanism | Reason | +|---|---|---| +| No broad IAM roles on workload SA | SA created with no bindings | Downstream bundles use least-privilege per-resource bindings | +| APIs not disabled on destroy | `disable_on_destroy = false` | Prevents accidental disruption of other resources that depend on the same APIs | + +### Checkov skips + +| Check | Reason | +|---|---| +| `CKV_GCP_118` | Skipped on `google_project_service` — API enablement resources do not accept IAM policies | + +### Production gating + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- The GCP project already exists — this bundle does not create projects. +- The `gcp_authentication` credential has `iam.serviceAccountAdmin`, `serviceusage.serviceUsageAdmin`, and (if using budgets) `billing.budgets.create` IAM. +- Cloud Billing must be linked to the project before budgets can be created. +- `billingbudgets.googleapis.com` must be in `enabled_apis` when `budget.enabled = true`. + +## Presets + +| Preset | Budget | Notable APIs | +|---|---|---| +| Standard (no budget) | Disabled | compute, iam, resourcemanager, serviceusage, run, bigquery, storage, aiplatform, notebooks, logging, monitoring | +| Standard (with budget) | Enabled — $500/mo, alerts at 50%/90%/100% | All of the above plus billingbudgets | diff --git a/bundles/gcp-landing-zone/massdriver.yaml b/bundles/gcp-landing-zone/massdriver.yaml new file mode 100644 index 0000000..9e1c5da --- /dev/null +++ b/bundles/gcp-landing-zone/massdriver.yaml @@ -0,0 +1,209 @@ +name: gcp-landing-zone +description: Environment-foundational construct for a GCP data platform. 
Enables required + service APIs, provisions the workload runtime service account, configures a billing + budget with threshold alerts, and emits a single landing-zone artifact so downstream + bundles only need one connection. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-landing-zone +version: 0.1.0 + +params: + required: + - service_account_name + - enabled_apis + - budget + examples: + - __name: Standard (no budget) + service_account_name: workload + enabled_apis: + - compute.googleapis.com + - iam.googleapis.com + - cloudresourcemanager.googleapis.com + - serviceusage.googleapis.com + - run.googleapis.com + - bigquery.googleapis.com + - storage.googleapis.com + - aiplatform.googleapis.com + - notebooks.googleapis.com + - logging.googleapis.com + - monitoring.googleapis.com + budget: + enabled: false + - __name: Standard (with budget) + service_account_name: workload + enabled_apis: + - compute.googleapis.com + - iam.googleapis.com + - cloudresourcemanager.googleapis.com + - serviceusage.googleapis.com + - billingbudgets.googleapis.com + - run.googleapis.com + - bigquery.googleapis.com + - storage.googleapis.com + - aiplatform.googleapis.com + - notebooks.googleapis.com + - logging.googleapis.com + - monitoring.googleapis.com + budget: + enabled: true + billing_account_id: "012345-ABCDEF-012345" + amount: 500 + threshold_percentages: + - 50 + - 90 + - 100 + notification_emails: + - platform-alerts@example.com + properties: + service_account_name: + type: string + title: Workload Service Account Name + description: Identity that workloads in this environment will run as. Cannot + be changed after creation — renaming the service account destroys the existing + SA and breaks any downstream IAM bindings referencing the old email. + $md.immutable: true + pattern: "^[a-z][a-z0-9-]{4,28}[a-z0-9]$" + default: workload + + enabled_apis: + title: Enabled APIs + description: GCP service APIs to enable in this project. 
Select from the list. + Add billingbudgets.googleapis.com if you enable a budget below. + type: array + uniqueItems: true + items: + type: string + enum: + - compute.googleapis.com + - iam.googleapis.com + - cloudresourcemanager.googleapis.com + - serviceusage.googleapis.com + - billingbudgets.googleapis.com + - run.googleapis.com + - bigquery.googleapis.com + - storage.googleapis.com + - pubsub.googleapis.com + - aiplatform.googleapis.com + - notebooks.googleapis.com + - logging.googleapis.com + - monitoring.googleapis.com + default: + - compute.googleapis.com + - iam.googleapis.com + - cloudresourcemanager.googleapis.com + - serviceusage.googleapis.com + + budget: + title: Billing Budget + description: Optionally configure a GCP billing budget with spend alerts for this environment. + Enable billingbudgets.googleapis.com in the API list above when using this feature. + type: object + required: + - enabled + properties: + enabled: + type: boolean + title: Enable Budget + description: Create a GCP billing budget with threshold email alerts + default: false + dependencies: + enabled: + oneOf: + - properties: + enabled: + const: true + billing_account_id: + title: Billing Account ID + description: GCP billing account to attach the budget to. Find it in + Cloud Console under Billing > Account management (format XXXXXX-XXXXXX-XXXXXX). + Cloud Billing must be enabled on the account. + type: string + pattern: "^[0-9A-Fa-f]{6}-[0-9A-Za-z]{6}-[0-9A-Za-z]{6}$" + examples: + - 015537-E00AAA-3F7EDD + amount: + title: Budget Amount (USD) + description: Monthly spend limit in US dollars + type: number + minimum: 1 + default: 500 + threshold_percentages: + title: Alert Thresholds (%) + description: Percentage spend thresholds at which email alerts are triggered. + E.g. 50 = 50%, 90 = 90%, 100 = 100%. 
+ type: array + minItems: 1 + maxItems: 5 + default: + - 50 + - 90 + - 100 + items: + type: number + minimum: 1 + maximum: 150 + notification_emails: + title: Notification Emails + description: Email addresses to notify when spend thresholds are crossed (optional) + type: array + items: + type: string + format: email + required: + - billing_account_id + - amount + - threshold_percentages + - properties: + enabled: + const: false + +connections: + required: + - gcp_authentication + - network + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + network: + $ref: catalog-demo/gcp-network + title: GCP Network + +artifacts: + required: + - landing_zone + properties: + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - service_account_name + - enabled_apis + - budget + - "*" + properties: + budget: + ui:order: + - enabled + - billing_account_id + - amount + - threshold_percentages + - notification_emails + - "*" + properties: + enabled: + ui:widget: checkbox + threshold_percentages: + ui:options: + orderable: false + items: + ui:title: "Threshold (%)" diff --git a/bundles/gcp-landing-zone/operator.md b/bundles/gcp-landing-zone/operator.md new file mode 100644 index 0000000..921f722 --- /dev/null +++ b/bundles/gcp-landing-zone/operator.md @@ -0,0 +1,85 @@ +--- +templating: mustache +--- + +# GCP Landing Zone — Operator Runbook + +## Non-obvious constraints + +**Service account name is immutable.** Changing it destroys the existing SA and creates a new one. Any downstream IAM bindings referencing the old SA email break immediately. Treat the workload SA name as permanent after first deploy. 
+ +**Removing an API from `enabled_apis` does not disable it in GCP.** The `disable_on_destroy = false` flag means Terraform removes the state entry but never calls the GCP disable API. The API stays enabled. To actually disable it, run `gcloud services disable SERVICE_NAME --project={{artifacts.landing_zone.project_id}}` (for example `SERVICE_NAME` = `pubsub.googleapis.com`) manually after confirming no resources depend on it. + +**Budget requires Cloud Billing linked to the project.** If deploy fails with a billing budget error, confirm the project has a billing account attached in the GCP console before enabling the budget param. + +**Budget alert emails require a verified notification channel.** The Google Cloud Monitoring email channel must be verified in GCP before alerts deliver. Billing admins on the account always receive alerts regardless of channel configuration. + +**Newly added APIs can take 1–2 minutes to propagate.** If a downstream bundle deploy fails immediately after adding an API here, wait a minute and retry. + +## Troubleshooting + +**Downstream bundle fails with "API has not been used in project X."** +Add the required API to `enabled_apis` in this package, deploy, wait ~60 seconds, then retry the downstream bundle. + +Common APIs for this data platform: +- `pubsub.googleapis.com` — required for gcp-pubsub-topic +- `bigquery.googleapis.com` — required for gcp-bigquery-dataset +- `run.googleapis.com` — required for gcp-cloud-run-service +- `storage.googleapis.com` — required for gcp-storage-bucket +- `billingbudgets.googleapis.com` — required when budget is enabled + +To check which APIs are currently enabled: +```bash +gcloud services list --enabled --project={{artifacts.landing_zone.project_id}} +``` + +**Budget not enabled because billing API is missing.** +```bash +gcloud services list --enabled --project={{artifacts.landing_zone.project_id}} | grep billingbudgets +``` +If nothing returns, add `billingbudgets.googleapis.com` to `enabled_apis` and redeploy before enabling the budget. 
+ +**Workload SA has unexpected project-level IAM bindings.** +The workload SA should have no project-level bindings after deploy — downstream bundles add per-resource bindings. If you see unexpected bindings: +```bash +gcloud projects get-iam-policy {{artifacts.landing_zone.project_id}} \ + --flatten="bindings[].members" \ + --filter="bindings.members:{{artifacts.landing_zone.workload_identity.service_account_email}}" \ + --format="table(bindings.role)" +``` +An empty result is expected and correct. + +**IAM binding changes outside Terraform are not tracked.** +This bundle declares no IAM resources, so bindings added manually (console or gcloud) are neither recorded nor reverted by a Massdriver deploy — they persist as unmanaged drift until someone removes them by hand. Add permanent bindings via the bundle source so they are tracked and reviewable. + +## Day-2 operations + +**Adding APIs after initial deploy:** Update `enabled_apis` in the package config and redeploy. Adding an API adds a new `google_project_service` resource without touching existing ones. + +**Disabling an API:** Remove it from `enabled_apis` and redeploy. Terraform drops the state entry but does NOT call the GCP disable API. Manually disable via `gcloud services disable` if required. + +**Changing budget amount or alert thresholds:** Update params and redeploy. The `google_billing_budget` resource updates in-place. + +**Disabling the budget after it was enabled:** Set `budget.enabled = false` and redeploy. The budget and notification channel are destroyed. Spend is not affected — only alerting is removed. + +**Rotating the deploy credential:** Update the GCP credential in the Massdriver UI under environment credential settings, then redeploy. Terraform state does not hold the credential — it is injected at plan time. 
+ +## Useful commands + +```bash +# List enabled APIs in the project +gcloud services list --enabled --project={{artifacts.landing_zone.project_id}} + +# Check IAM bindings for the workload service account +gcloud projects get-iam-policy {{artifacts.landing_zone.project_id}} \ + --flatten="bindings[].members" \ + --filter="bindings.members:{{artifacts.landing_zone.workload_identity.service_account_email}}" \ + --format="table(bindings.role)" + +# Describe the workload service account +gcloud iam service-accounts describe {{artifacts.landing_zone.workload_identity.service_account_email}} \ + --project={{artifacts.landing_zone.project_id}} + +# List all service accounts in the project +gcloud iam service-accounts list --project={{artifacts.landing_zone.project_id}} +``` diff --git a/bundles/gcp-landing-zone/src/.checkov.yml b/bundles/gcp-landing-zone/src/.checkov.yml new file mode 100644 index 0000000..c43c062 --- /dev/null +++ b/bundles/gcp-landing-zone/src/.checkov.yml @@ -0,0 +1,3 @@ +skip-check: + # CKV_GCP_118: google_project_service — no IAM policy needed on API enablement resources + - CKV_GCP_118 diff --git a/bundles/gcp-landing-zone/src/artifacts.tf b/bundles/gcp-landing-zone/src/artifacts.tf new file mode 100644 index 0000000..f83c4b5 --- /dev/null +++ b/bundles/gcp-landing-zone/src/artifacts.tf @@ -0,0 +1,41 @@ +# Single landing-zone artifact — combines network, workload identity, enabled APIs, +# and budget reference. Downstream bundles connect to this one artifact instead of +# wiring network and identity connections separately. 
+ +resource "massdriver_artifact" "landing_zone" { + field = "landing_zone" + name = "GCP Landing Zone ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + + network = { + network_name = var.network.network_name + network_self_link = var.network.network_self_link + region = var.network.region + primary_subnet = var.network.primary_subnet + } + + workload_identity = { + service_account_email = google_service_account.workload.email + service_account_id = google_service_account.workload.unique_id + service_account_name = google_service_account.workload.name + } + + enabled_apis = var.enabled_apis + + # budget is always present in the artifact for schema conformance. + # When disabled, fields carry null/empty sentinel values so downstream + # bundles can safely check landing_zone.budget.enabled before using them. + budget = var.budget.enabled ? { + enabled = true + budget_name = google_billing_budget.environment[0].display_name + billing_account_id = var.budget.billing_account_id + amount_usd = var.budget.amount + } : { + enabled = false + budget_name = null + billing_account_id = null + amount_usd = null + } + }) +} diff --git a/bundles/gcp-landing-zone/src/main.tf b/bundles/gcp-landing-zone/src/main.tf new file mode 100644 index 0000000..2ced40d --- /dev/null +++ b/bundles/gcp-landing-zone/src/main.tf @@ -0,0 +1,123 @@ +terraform { + required_version = ">= 1.3" # variables.tf uses optional(type, default) object attributes, which need 1.3+ + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +provider "google-beta" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.gcp_authentication.project_id + 
name_prefix = var.md_metadata.name_prefix +} + +# ─── Service APIs ───────────────────────────────────────────────────────────── + +resource "google_project_service" "apis" { + for_each = toset(var.enabled_apis) + + project = local.project_id + service = each.value + + # Do not disable the API on destroy — other resources in the project may depend on it + disable_on_destroy = false +} + +# ─── Workload Service Account ───────────────────────────────────────────────── +# Runtime identity that data platform workloads (Cloud Run, Vertex Workbench, +# etc.) will run as. This is NOT the Terraform deploy credential. +# Downstream bundles read landing_zone.workload_identity.service_account_email +# and bind IAM roles to it on their own resources. + +resource "google_service_account" "workload" { + project = local.project_id + account_id = var.service_account_name + display_name = "Data Platform Workload Identity — ${local.name_prefix}" + description = "Runtime service account for data platform workloads. Managed by Massdriver landing zone ${local.name_prefix}." + + depends_on = [google_project_service.apis] +} + +# ─── Billing Budget ─────────────────────────────────────────────────────────── +# Requires billingbudgets.googleapis.com enabled and billing.budgets.create IAM. +# Only created when var.budget.enabled == true. The billingbudgets.googleapis.com +# API should be included in enabled_apis when budget is enabled. + +data "google_project" "current" { + project_id = local.project_id + + depends_on = [google_project_service.apis] +} + +resource "google_billing_budget" "environment" { + count = var.budget.enabled ? 
1 : 0 + + billing_account = var.budget.billing_account_id + display_name = "Budget — ${local.name_prefix}" + + budget_filter { + projects = ["projects/${data.google_project.current.number}"] + } + + amount { + specified_amount { + currency_code = "USD" + units = tostring(floor(var.budget.amount)) + } + } + + dynamic "threshold_rules" { + for_each = var.budget.threshold_percentages + content { + # threshold_percentages are stored as whole numbers (50, 90, 100) in params + # and converted to fractions (0.5, 0.9, 1.0) for the GCP API + threshold_percent = threshold_rules.value / 100 + spend_basis = "CURRENT_SPEND" + } + } + + all_updates_rule { + monitoring_notification_channels = google_monitoring_notification_channel.budget_email[*].id + disable_default_iam_recipients = false + } + + depends_on = [google_project_service.apis] +} + +# ─── Budget Email Alert via Monitoring Notification Channel ────────────────── +# One email channel per notification_emails entry, provisioned only when budget is enabled (a budget accepts at most 5 channels). +# Emails are optional — GCP will still send to billing admins via disable_default_iam_recipients=false. + +resource "google_monitoring_notification_channel" "budget_email" { + count = var.budget.enabled ? 
length(var.budget.notification_emails) : 0 + + project = local.project_id + display_name = "Budget Alert — ${local.name_prefix}" + type = "email" + + labels = { + email_address = var.budget.notification_emails[count.index] + } + + depends_on = [google_project_service.apis] +} diff --git a/bundles/gcp-landing-zone/src/variables.tf b/bundles/gcp-landing-zone/src/variables.tf new file mode 100644 index 0000000..cc78633 --- /dev/null +++ b/bundles/gcp-landing-zone/src/variables.tf @@ -0,0 +1,54 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "network" { + type = object({ + project_id = string + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) +} + +variable "service_account_name" { + type = string +} + +variable "enabled_apis" { + type = list(string) +} + +variable "budget" { + type = object({ + enabled = bool + billing_account_id = optional(string) + amount = optional(number) + threshold_percentages = optional(list(number)) + notification_emails = optional(list(string), []) + }) +} diff --git a/bundles/gcp-network/README.md b/bundles/gcp-network/README.md new file mode 100644 index 0000000..e007de5 --- /dev/null +++ b/bundles/gcp-network/README.md @@ -0,0 +1,70 @@ +# gcp-network + +Minimal GCP VPC network with a single regional subnet. This is the foundational networking bundle for the GCP data platform stack. Other bundles — including `gcp-landing-zone`, Cloud Run, and Vertex Workbench — consume the `gcp-network` artifact it produces. 
+ +## Purpose + +Creates a production-ready VPC with sensible defaults: + +- VPC created in custom (non-auto) mode so subnets are explicitly managed +- Flow logging enabled on the subnet for visibility into traffic +- Private Google Access enabled on the subnet so workloads reach Google APIs without a NAT gateway +- A deny-all ingress firewall rule at priority 65534 enforces explicit allowlisting — workload bundles add targeted allow rules on top + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_compute_network.vpc` | VPC network | Custom subnet mode, global | +| `google_compute_subnetwork.primary` | Regional subnet | Flow logging on, Private Google Access on | +| `google_compute_firewall.deny_all_ingress` | Firewall rule | Deny all ingress at priority 65534 | + +## Artifacts Consumed (Connections) + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key used for the Google provider | + +## Artifacts Produced + +The bundle publishes a `gcp-network` artifact with the following fields: + +| Field | Description | +|---|---| +| `project_id` | GCP project the VPC belongs to | +| `network_name` | Name of the VPC network resource | +| `network_self_link` | Full self-link URI for the VPC (used by resource references) | +| `region` | Region of the primary subnet | +| `primary_subnet.name` | Subnet resource name | +| `primary_subnet.cidr` | Primary IP range of the subnet | +| `primary_subnet.self_link` | Full self-link URI for the subnet | + +Downstream bundles (e.g., `gcp-landing-zone`) pass this artifact through their own artifact, so further-downstream bundles only need one connection. 
+ +## Compliance + +### Hardcoded security controls + +| Control | Mechanism | Reason | +|---|---|---| +| Deny-all ingress | `google_compute_firewall.deny_all_ingress` at priority 65534 | Satisfies CKV2_GCP_18; forces explicit allowlisting per workload | +| Custom subnet mode | `auto_create_subnetworks = false` | Prevents GCP from auto-creating subnets in every region | +| Private Google Access | `private_ip_google_access = true` | Lets VMs reach Google APIs over internal IPs without egress | +| Flow logging | `log_config` block with 0.5 sampling | Network audit trail; enables traffic troubleshooting | + +### Checkov posture + +There is no `.checkov.yml` skip list for this bundle — all findings are either satisfied by the hardcoded controls above or blocked in production via `halt_on_failure`. + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with remaining high-severity findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- The GCP project already exists — this bundle does not create projects. +- The `gcp_authentication` credential has `compute.admin` or equivalent IAM to create VPC resources and firewall rules. + +## Presets + +| Preset | Region | Network Name | Subnet CIDR | +|---|---|---|---| +| Standard | `us-central1` | `data-platform-vpc` | `10.0.0.0/20` | diff --git a/bundles/gcp-network/massdriver.yaml b/bundles/gcp-network/massdriver.yaml new file mode 100644 index 0000000..2155ba5 --- /dev/null +++ b/bundles/gcp-network/massdriver.yaml @@ -0,0 +1,90 @@ +name: gcp-network +description: Minimal GCP VPC network with a single regional subnet. Produces a + gcp-network artifact consumed by landing-zone, Cloud Run, Vertex Workbench, and + other data-platform bundles. 
+source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-network +version: 0.1.0 + +params: + required: + - region + - network_name + - subnet_cidr + examples: + - __name: Standard + region: us-central1 + network_name: data-platform-vpc + subnet_cidr: "10.0.0.0/20" + properties: + region: + title: Region + description: GCP region to deploy the subnet into (the VPC is global). Cannot + be changed after the subnet is created — changing the region requires destroying + and recreating all resources in this bundle. + type: string + $md.immutable: true + default: us-central1 + enum: + - us-central1 + - us-east1 + - us-east4 + - us-west1 + - us-west2 + - us-west3 + - us-west4 + - europe-west1 + - europe-west2 + - europe-west4 + - asia-east1 + - asia-northeast1 + - asia-southeast1 + + network_name: + title: Network Name + description: Name for the VPC network resource. Cannot be changed after creation — + renaming the network requires destroying and recreating all dependent resources + (subnets, firewall rules). + type: string + $md.immutable: true + default: data-platform-vpc + pattern: ^[a-z][a-z0-9-]{0,61}[a-z0-9]$ + + subnet_cidr: + title: Subnet CIDR + description: Primary IP range for the regional subnet. Cannot be changed after + creation — expanding or changing the range requires subnet recreation. 
+ type: string + $md.immutable: true + default: "10.0.0.0/20" + pattern: >- + ^(?:[0-9]|[0-9]{2}|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[0-9]{2}|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}(?:/(?:[0-9]|1[0-9]|2[0-9]|3[0-2]))$ + +connections: + required: + - gcp_authentication + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + +artifacts: + required: + - network + properties: + network: + $ref: catalog-demo/gcp-network # same artifact type the gcp-landing-zone `network` connection requires + title: GCP Network + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - region + - network_name + - subnet_cidr + - "*" diff --git a/bundles/gcp-network/operator.md b/bundles/gcp-network/operator.md new file mode 100644 index 0000000..fdc758f --- /dev/null +++ b/bundles/gcp-network/operator.md @@ -0,0 +1,84 @@ +--- +templating: mustache +--- + +# GCP Network — Operator Runbook + +## Non-obvious constraints + +**Network name is immutable.** Changing it forces replacement of the entire VPC and all dependent resources (subnets, firewall rules, peerings). Treat it as permanent after first deploy. + +**Subnet CIDR is immutable.** GCP does not support in-place CIDR changes. To change it: destroy the package (all resources in the subnet must be decommissioned first), then reprovision with the new range. Plan a maintenance window. + +**Subnet region is immutable.** The subnet's region cannot be changed after creation. A region change requires destroy and recreate. + +**Deny-all firewall is hardcoded at priority 65534.** This bundle creates a single baseline deny-all ingress rule. No traffic is allowed by default. Workload bundles (Cloud Run, Vertex, etc.) layer their own allow rules at lower priority numbers above it. + +**VPC is global; the subnet is regional.** The VPC itself has no region. Only the subnet is regional. 
Cross-region resources can share the VPC but must use their own regional subnets — extend the Terraform source if additional subnets are needed. + +**Deleting the network fails if anything is still attached.** Terraform will error if VMs, Cloud Run VPC connectors, GKE nodes, or other resources are still using the network. Decommission all dependent packages first. + +## Troubleshooting + +**Subnet resources fail to delete ("resourceInUseByAnotherResource").** +Something is still attached. Find it: +```bash +gcloud compute networks list-associated-resources {{artifacts.network.network_name}} \ + --project={{artifacts.network.project_id}} +``` +Decommission those packages first, then retry destroy. + +**Firewall rules not taking effect.** +Rules are evaluated by priority (lowest number wins). Check the full rule list to find conflicts: +```bash +gcloud compute firewall-rules list \ + --filter="network:{{artifacts.network.network_name}}" \ + --format="table(name,direction,priority,disabled,sourceRanges,allowed[].map().firewall_rule().list():label=ALLOW,denied[].map().firewall_rule().list():label=DENY)" \ + --sort-by=priority +``` + +**API quota or "permission denied" on VPC creation.** +Ensure `compute.googleapis.com` is enabled on the project before deploying this bundle (`gcloud services enable compute.googleapis.com --project=PROJECT_ID`). The landing zone's `enabled_apis` cannot cover the first deploy: `gcp-landing-zone` consumes this bundle's network artifact, so it is provisioned after the network. + +## Day-2 operations + +**Expanding or changing CIDR:** Not supported in-place. Must destroy and recreate. All resources in the subnet must be decommissioned first. + +**Adding subnets:** This bundle provisions one regional subnet. For additional subnets (GKE secondary ranges, separate workload tiers), extend the Terraform source directly. + +**VPC peering:** Use `gcloud compute networks peerings create` or add a `google_compute_network_peering` resource to the bundle source. Ensure CIDR ranges don't overlap between peered VPCs. + +**Querying VPC flow logs:** +Flow logs are stored in Cloud Logging under resource type `gce_subnetwork`. Sampling is 50% at 5-second aggregation intervals. 
+```bash +gcloud logging read \ + 'resource.type="gce_subnetwork" AND resource.labels.subnetwork_name="{{artifacts.network.primary_subnet.name}}"' \ + --project={{artifacts.network.project_id}} \ + --limit=50 \ + --format=json +``` + +## Useful commands + +```bash +# List all firewall rules on this network +gcloud compute firewall-rules list \ + --filter="network:{{artifacts.network.network_name}}" \ + --format="table(name,direction,priority,disabled,sourceRanges,allowed[].map().firewall_rule().list():label=ALLOW,denied[].map().firewall_rule().list():label=DENY)" + +# Describe the primary subnet +gcloud compute networks subnets describe {{artifacts.network.primary_subnet.name}} \ + --region={{artifacts.network.region}} \ + --project={{artifacts.network.project_id}} + +# Describe the VPC +gcloud compute networks describe {{artifacts.network.network_name}} \ + --project={{artifacts.network.project_id}} + +# Tail recent VPC flow logs for this subnet +gcloud logging read \ + 'resource.type="gce_subnetwork" AND resource.labels.subnetwork_name="{{artifacts.network.primary_subnet.name}}"' \ + --project={{artifacts.network.project_id}} \ + --limit=20 \ + --format=json +``` diff --git a/bundles/gcp-network/src/_massdriver_variables.tf b/bundles/gcp-network/src/_massdriver_variables.tf new file mode 100644 index 0000000..f420e06 --- /dev/null +++ b/bundles/gcp-network/src/_massdriver_variables.tf @@ -0,0 +1,47 @@ +// This file is auto-generated by massdriver from your massdriver.yaml file. +// Any changes made directly to this file will be overwritten on the next build. +// To opt a variable out of regeneration, move it to another file (e.g. variables.tf). 
+variable "gcp_authentication" { + type = object({ + auth_provider_x509_cert_url = string + auth_uri = string + client_email = string + client_id = string + client_x509_cert_url = string + private_key = string + private_key_id = string + project_id = string + token_uri = string + type = string + }) +} +variable "md_metadata" { + type = object({ + default_tags = map(string) + deployment = object({ + id = string + }) + name_prefix = string + observability = object({ + alarm_webhook_url = string + }) + package = object({ + created_at = string + deployment_enqueued_at = string + previous_status = string + updated_at = string + }) + target = object({ + contact_email = string + }) + }) +} +variable "network_name" { + type = string +} +variable "region" { + type = string +} +variable "subnet_cidr" { + type = string +} diff --git a/bundles/gcp-network/src/artifacts.tf b/bundles/gcp-network/src/artifacts.tf new file mode 100644 index 0000000..e09d7a6 --- /dev/null +++ b/bundles/gcp-network/src/artifacts.tf @@ -0,0 +1,15 @@ +resource "massdriver_artifact" "network" { + field = "network" + name = "GCP Network ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = var.gcp_authentication.project_id + network_name = google_compute_network.vpc.name + network_self_link = google_compute_network.vpc.self_link + region = var.region + primary_subnet = { + name = google_compute_subnetwork.primary.name + cidr = google_compute_subnetwork.primary.ip_cidr_range + self_link = google_compute_subnetwork.primary.self_link + } + }) +} diff --git a/bundles/gcp-network/src/main.tf b/bundles/gcp-network/src/main.tf new file mode 100644 index 0000000..d24aa64 --- /dev/null +++ b/bundles/gcp-network/src/main.tf @@ -0,0 +1,60 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = 
var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) + region = var.region +} + +locals { + subnet_name = "${var.network_name}-${var.region}" +} + +resource "google_compute_network" "vpc" { + name = var.network_name + auto_create_subnetworks = false + description = "Data platform VPC managed by Massdriver — ${var.md_metadata.name_prefix}" +} + +resource "google_compute_subnetwork" "primary" { + name = local.subnet_name + ip_cidr_range = var.subnet_cidr + region = var.region + network = google_compute_network.vpc.id + private_ip_google_access = true + + log_config { + aggregation_interval = "INTERVAL_5_SEC" + flow_sampling = 0.5 + metadata = "INCLUDE_ALL_METADATA" + } +} + +# Baseline deny-all ingress firewall. Workload bundles add targeted allow rules +# (e.g. allow 443 from load balancer IP ranges). This satisfies CKV2_GCP_18 and +# enforces explicit allowlisting instead of relying on GCP's permissive defaults. +resource "google_compute_firewall" "deny_all_ingress" { + name = "${var.network_name}-deny-all-ingress" + network = google_compute_network.vpc.id + description = "Baseline deny-all ingress. Workload bundles add targeted allow rules." + direction = "INGRESS" + priority = 65534 + + deny { + protocol = "all" + } + + source_ranges = ["0.0.0.0/0"] +} diff --git a/bundles/gcp-pubsub-topic/README.md b/bundles/gcp-pubsub-topic/README.md new file mode 100644 index 0000000..93da2d8 --- /dev/null +++ b/bundles/gcp-pubsub-topic/README.md @@ -0,0 +1,69 @@ +# gcp-pubsub-topic + +Google Cloud Pub/Sub topic with optional dead-letter queue (DLQ). Use this bundle to provision a managed message topic for event-driven workloads — Cloud Run consumers, Dataflow pipelines, BigQuery subscriptions, and similar. The landing zone's workload service account is automatically granted publisher access. 
+ +## Purpose + +- Provisions a Pub/Sub topic with configurable retention +- Optionally provisions a companion DLQ topic for undeliverable messages +- Grants `roles/pubsub.publisher` to the landing zone's workload service account on the main topic +- Emits a `catalog-demo/gcp-pubsub-topic` artifact so downstream bundles can reference the topic without hard-coding names + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_pubsub_topic.main` | Main Pub/Sub topic | Retention and ordering label set at provision time | +| `google_pubsub_topic.dlq` | Dead-letter topic | Created only when `dlq.enabled = true` | +| `google_pubsub_topic_iam_member.workload_publisher` | IAM binding | Grants `roles/pubsub.publisher` to the landing zone workload SA on the main topic | + +## Artifacts Consumed (Connections) + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` and `workload_identity.service_account_email` for the publisher IAM binding | + +## Artifacts Produced + +The bundle publishes a `catalog-demo/gcp-pubsub-topic` artifact. DLQ fields are present only when the DLQ is enabled. + +| Field | Description | Present | +|---|---|---| +| `project_id` | GCP project ID | Always | +| `topic_name` | Main topic resource name | Always | +| `topic_id` | Full topic resource ID | Always | +| `dlq_topic_name` | DLQ topic resource name | Only when `dlq.enabled = true` | +| `dlq_topic_id` | Full DLQ topic resource ID | Only when `dlq.enabled = true` | + +Downstream bundles that need subscriber access should bind `roles/pubsub.subscriber` on the topic or on their own subscription using `topic_name` and `project_id` from this artifact. 
+ +## Compliance + +### Checkov skips + +| Check | Reason | +|---|---| +| `CKV_GCP_83` | CSEK (Customer-Supplied Encryption Keys) skipped across all environments. CSEK requires callers to manage raw AES-256 keys on every API call — GCP itself recommends against this for most workloads. Google-managed encryption (default) satisfies encryption-at-rest requirements. If CMEK via Cloud KMS is required, add a `kms_key_name` param and remove this skip. | + +### Production gating + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- `pubsub.googleapis.com` must be enabled in the landing zone before deploying this bundle. Add it to `enabled_apis` in the `gcp-landing-zone` package config. +- The `gcp_authentication` credential has `pubsub.admin` or equivalent IAM on the project. +- The landing zone's workload SA is granted publisher access automatically; subscriber access for consumers must be added by the downstream bundle. + +## Message Ordering + +Message ordering is enforced at the **publisher SDK level**, not at the topic resource level. The `message_ordering_enabled` parameter writes a label (`message-ordering: enabled|disabled`) on the topic to record operator intent. Publishers must explicitly set `enable_message_ordering = true` and use ordering keys when publishing. Enabling ordering reduces maximum throughput per topic. 
+ +## Presets + +| Preset | Retention | DLQ | Max Delivery Attempts | Use Case | +|---|---|---|---|---| +| Low-volume | 7 days | Off | — | Dev / low-traffic topics where DLQ overhead is unnecessary | +| Standard | 7 days | On | 5 | Most production topics; catches poison-pill messages | +| High-throughput | 1 day | On | 10 | High-volume pipelines where shorter retention reduces storage cost | diff --git a/bundles/gcp-pubsub-topic/massdriver.yaml b/bundles/gcp-pubsub-topic/massdriver.yaml new file mode 100644 index 0000000..1259683 --- /dev/null +++ b/bundles/gcp-pubsub-topic/massdriver.yaml @@ -0,0 +1,139 @@ +name: gcp-pubsub-topic +description: Google Cloud Pub/Sub topic with optional dead-letter queue. Provisions + the main topic, an optional DLQ topic, and grants the landing zone's workload + service account publisher access. Emits a gcp-pubsub-topic artifact for downstream + Cloud Run, Dataflow, and BigQuery bundles to consume. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-pubsub-topic +version: 0.1.0 + +params: + required: + - message_retention_duration + - dlq + - message_ordering_enabled + examples: + - __name: Low-volume + message_retention_duration: 604800 + message_ordering_enabled: false + dlq: + enabled: false + - __name: Standard + message_retention_duration: 604800 + message_ordering_enabled: false + dlq: + enabled: true + max_delivery_attempts: 5 + dlq_retention_duration: 604800 + - __name: High-throughput + message_retention_duration: 86400 + message_ordering_enabled: false + dlq: + enabled: true + max_delivery_attempts: 10 + dlq_retention_duration: 86400 + properties: + message_retention_duration: + title: Message Retention (seconds) + description: How long unacknowledged messages are retained on the topic, in + seconds. Minimum 600 (10 min), maximum 604800 (7 days). 
+ type: integer + minimum: 600 + maximum: 604800 + default: 604800 + + message_ordering_enabled: + title: Enable Message Ordering + description: Records whether publishers are expected to use ordering keys. The + setting is written as a message-ordering label on the topic only; publishers + must still enable ordering in their client and pass ordering keys. + type: boolean + default: false + + dlq: + title: Dead-Letter Queue + description: Configure a dead-letter queue to capture messages that cannot be + delivered after the maximum number of delivery attempts. + type: object + required: + - enabled + properties: + enabled: + title: Enable DLQ + description: Provision a separate dead-letter topic. Consumer bundles attach it + and the delivery-attempt limit to their own subscriptions. + type: boolean + default: true + dependencies: + enabled: + oneOf: + - properties: + enabled: + const: true + max_delivery_attempts: + title: Max Delivery Attempts + description: Number of delivery attempts before a message is forwarded + to the dead-letter topic. Must be between 5 and 100.
+ type: integer + minimum: 600 + maximum: 604800 + default: 604800 + required: + - max_delivery_attempts + - properties: + enabled: + const: false + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + +artifacts: + required: + - pubsub_topic + properties: + pubsub_topic: + $ref: catalog-demo/gcp-pubsub-topic + title: GCP Pub/Sub Topic + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - message_retention_duration + - message_ordering_enabled + - dlq + - "*" + properties: + message_ordering_enabled: + ui:widget: checkbox + dlq: + ui:order: + - enabled + - max_delivery_attempts + - dlq_retention_duration + - "*" + properties: + enabled: + ui:widget: checkbox diff --git a/bundles/gcp-pubsub-topic/operator.md b/bundles/gcp-pubsub-topic/operator.md new file mode 100644 index 0000000..c36d214 --- /dev/null +++ b/bundles/gcp-pubsub-topic/operator.md @@ -0,0 +1,90 @@ +--- +templating: mustache +--- + +# GCP Pub/Sub Topic — Operator Runbook + +## Non-obvious constraints + +**Topic name is immutable.** To rename a topic: decommission this package, recreate it with the new name, and update all consumer subscriptions. Plan a maintenance window. + +**Message retention changes are safe in-place.** Updating `message_retention_duration` applies without disruption. In-flight messages are not affected. + +**Enabling DLQ after-the-fact does not update existing subscriptions.** When you enable the DLQ on an existing topic, Terraform creates the DLQ topic — but existing consumer subscriptions do not automatically gain a dead-letter policy. Consumer bundles must be updated separately to reference the new DLQ topic. 
+ +**Disabling DLQ destroys the DLQ topic.** Any consumer subscriptions that have a dead-letter policy pointing to the old DLQ topic will fail to deliver dead letters after the destroy. Remove dead-letter policies from consumer subscriptions before disabling the DLQ here. + +**Message ordering on the topic is not enforcement.** Setting ordering on the topic is a configuration label. Publishers must also set `enable_message_ordering = true` in their SDK client and pass an ordering key on every publish call. Without ordering keys from publishers, messages are not ordered regardless of the topic setting. + +**`max_delivery_attempts` is enforced at the subscription, not the topic.** This bundle provisions the DLQ topic. The delivery attempt limit lives on the consumer's subscription (managed by the consumer bundle). If messages aren't reaching the DLQ, check the consumer subscription's dead-letter policy first. + +## Troubleshooting + +**Messages not flowing to DLQ.** +Check that the consumer subscription has a dead-letter policy referencing `{{artifacts.pubsub_topic.dlq_topic_name}}`: +```bash +gcloud pubsub subscriptions describe <SUBSCRIPTION_NAME> \ + --project={{artifacts.pubsub_topic.project_id}} \ + --format="yaml(deadLetterPolicy)" +``` +If the field is absent, the consumer bundle is not configured to use the DLQ. + +**Deploy fails with "pubsub.googleapis.com has not been used in project."** +Add `pubsub.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. + +**Publisher permission denied.** +The workload SA needs `roles/pubsub.publisher` on the topic: +```bash +gcloud pubsub topics get-iam-policy {{artifacts.pubsub_topic.topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} +``` + +## Day-2 operations + +**Changing retention duration:** Update param and redeploy. In-place, no disruption.
+ +**Enabling DLQ on an existing topic:** Set `dlq.enabled = true`, configure `max_delivery_attempts`, redeploy. Then update consumer bundles to add dead-letter policies to their subscriptions pointing to `{{artifacts.pubsub_topic.dlq_topic_name}}`. + +**Disabling DLQ:** Remove dead-letter policies from all consumer subscriptions first. Then set `dlq.enabled = false` and redeploy. The DLQ topic is destroyed. + +**Renaming the topic:** Destroy this package, recreate with the new name, update all consumers. No in-place rename is possible. + +**Pulling messages from the DLQ to inspect failures.** +A subscription on the DLQ topic is required (managed by a consumer bundle). If one exists: +```bash +gcloud pubsub subscriptions pull <DLQ_SUBSCRIPTION_NAME> \ + --project={{artifacts.pubsub_topic.project_id}} \ + --limit=10 \ + --auto-ack +``` + +## Useful commands + +```bash +# Describe the main topic +gcloud pubsub topics describe {{artifacts.pubsub_topic.topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} + +# List subscriptions on the main topic +gcloud pubsub topics list-subscriptions {{artifacts.pubsub_topic.topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} + +{{#artifacts.pubsub_topic.dlq_topic_name}} +# Describe the DLQ topic +gcloud pubsub topics describe {{artifacts.pubsub_topic.dlq_topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} + +# List subscriptions on the DLQ topic +gcloud pubsub topics list-subscriptions {{artifacts.pubsub_topic.dlq_topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} +{{/artifacts.pubsub_topic.dlq_topic_name}} + +# Check IAM on the main topic +gcloud pubsub topics get-iam-policy {{artifacts.pubsub_topic.topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} + +# Publish a test message +gcloud pubsub topics publish {{artifacts.pubsub_topic.topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} \ + --message="test" +``` diff --git a/bundles/gcp-pubsub-topic/src/.checkov.yml
b/bundles/gcp-pubsub-topic/src/.checkov.yml new file mode 100644 index 0000000..976ffde --- /dev/null +++ b/bundles/gcp-pubsub-topic/src/.checkov.yml @@ -0,0 +1,9 @@ +skip-check: + # CKV_GCP_83: PubSub Topics encrypted with Customer Supplied Encryption Keys (CSEK) + # CSEK requires the caller to manage raw AES-256 keys and pass them on every API + # call — an operational burden that GCP itself recommends against for most workloads. + # Google-managed encryption (default) and CMEK via Cloud KMS are both acceptable + # alternatives that satisfy encryption-at-rest requirements without CSEK complexity. + # This skip applies to all environments; if CMEK is required, add a kms_key_name + # param and wire it to the google_pubsub_topic resources, then remove this skip. + - CKV_GCP_83 diff --git a/bundles/gcp-pubsub-topic/src/artifacts.tf b/bundles/gcp-pubsub-topic/src/artifacts.tf new file mode 100644 index 0000000..db49d55 --- /dev/null +++ b/bundles/gcp-pubsub-topic/src/artifacts.tf @@ -0,0 +1,18 @@ +# Pub/Sub topic artifact — flat schema matching catalog-demo/gcp-pubsub-topic. +# Includes DLQ fields only when the DLQ is enabled (conditional merge). + +resource "massdriver_artifact" "pubsub_topic" { + field = "pubsub_topic" + name = "GCP Pub/Sub Topic ${var.md_metadata.name_prefix}" + artifact = jsonencode(merge( + { + project_id = local.project_id + topic_name = google_pubsub_topic.main.name + topic_id = google_pubsub_topic.main.id + }, + var.dlq.enabled ? 
{ + dlq_topic_name = google_pubsub_topic.dlq[0].name + dlq_topic_id = google_pubsub_topic.dlq[0].id + } : {} + )) +} diff --git a/bundles/gcp-pubsub-topic/src/main.tf b/bundles/gcp-pubsub-topic/src/main.tf new file mode 100644 index 0000000..81c7062 --- /dev/null +++ b/bundles/gcp-pubsub-topic/src/main.tf @@ -0,0 +1,80 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + + # GCP durations require the "s" suffix (e.g. "604800s"); an omitted DLQ retention falls back to the schema default (604800) so the template never interpolates null + retention_duration = "${var.message_retention_duration}s" + dlq_retention_duration = var.dlq.enabled ? "${coalesce(var.dlq.dlq_retention_duration, 604800)}s" : null + + topic_name = local.name_prefix + dlq_name = "${local.name_prefix}-dlq" +} + +# ─── Main Topic ─────────────────────────────────────────────────────────────── + +resource "google_pubsub_topic" "main" { + project = local.project_id + name = local.topic_name + + message_retention_duration = local.retention_duration + + # Message ordering is set at the publisher client level; the schema_settings + # field is not required. Ordering is enforced per-publisher, not at topic level. + # This label records the operator intent so Cloud Run and other publishers know + # whether to enable ordering keys when publishing. + labels = merge(var.md_metadata.default_tags, { + message-ordering = var.message_ordering_enabled ? "enabled" : "disabled" + }) +} + +# ─── Dead-Letter Queue Topic ────────────────────────────────────────────────── +# Only created when dlq.enabled == true. Pub/Sub requires the DLQ topic to exist +# before the subscription referencing it can be created by consumers.
+ +resource "google_pubsub_topic" "dlq" { + count = var.dlq.enabled ? 1 : 0 + + project = local.project_id + name = local.dlq_name + + message_retention_duration = local.dlq_retention_duration + + labels = var.md_metadata.default_tags +} + +# ─── Workload Publisher IAM ─────────────────────────────────────────────────── +# Grant the landing zone's workload service account roles/pubsub.publisher on +# the main topic. This is the IAM role binding example pattern for this series: +# +# member = "serviceAccount:<workload_sa_email>" +# role = "roles/pubsub.publisher" +# topic = google_pubsub_topic.main.name +# +# Downstream bundles that need subscriber access should bind roles/pubsub.subscriber +# on this topic (or on their subscription) to the service account that reads messages. + +resource "google_pubsub_topic_iam_member" "workload_publisher" { + project = local.project_id + topic = google_pubsub_topic.main.name + role = "roles/pubsub.publisher" + member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" +} diff --git a/bundles/gcp-pubsub-topic/src/variables.tf b/bundles/gcp-pubsub-topic/src/variables.tf new file mode 100644 index 0000000..7d614c6 --- /dev/null +++ b/bundles/gcp-pubsub-topic/src/variables.tf @@ -0,0 +1,68 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "landing_zone" { + type = object({ + project_id = string + network = object({ + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) + workload_identity = object({
service_account_email = string + service_account_id = string + service_account_name = string + }) + enabled_apis = list(string) + budget = object({ + enabled = bool + budget_name = optional(string) + billing_account_id = optional(string) + amount_usd = optional(number) + }) + }) +} + +variable "message_retention_duration" { + type = number + default = 604800 +} + +variable "message_ordering_enabled" { + type = bool + default = false +} + +variable "dlq" { + type = object({ + enabled = bool + max_delivery_attempts = optional(number) + dlq_retention_duration = optional(number) + }) +} diff --git a/bundles/gcp-storage-bucket/README.md b/bundles/gcp-storage-bucket/README.md new file mode 100644 index 0000000..5a43c33 --- /dev/null +++ b/bundles/gcp-storage-bucket/README.md @@ -0,0 +1,87 @@ +# gcp-storage-bucket + +Google Cloud Storage bucket with configurable storage class, optional versioning, and lifecycle rules. Use this bundle to provision a managed object store for data platform workloads — Cloud Run pipelines, BigQuery exports, Vertex Workbench datasets, and similar. The landing zone's workload service account is automatically granted object read/write access. 
+ +## Purpose + +- Provisions a GCS bucket with configurable storage class and location +- Optionally enables versioning for durable datasets and non-current version lifecycle management +- Supports lifecycle rules for automated cost optimization (Delete and SetStorageClass transitions) +- Enforces `uniform_bucket_level_access` and `public_access_prevention = "enforced"` as non-negotiable security baselines +- Grants `roles/storage.objectUser` to the landing zone's workload service account on the bucket +- Emits a `catalog-demo/gcp-storage-bucket` artifact so downstream bundles can reference the bucket without hard-coding names or project IDs + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_storage_bucket.main` | GCS bucket | Storage class, location, versioning, and lifecycle rules set at provision time | +| `google_storage_bucket_iam_member.workload_object_user` | IAM binding | Grants `roles/storage.objectUser` to the landing zone workload SA | + +## Artifacts Consumed (Connections) + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` and `workload_identity.service_account_email` for the objectUser IAM binding | + +## Artifacts Produced + +The bundle publishes a `catalog-demo/gcp-storage-bucket` artifact with all fields needed for downstream bundles to read and write objects. 
+ +| Field | Description | +|---|---| +| `project_id` | GCP project ID that owns the bucket | +| `bucket_name` | Globally-unique GCS bucket name (derived from Massdriver name prefix) | +| `bucket_url` | Canonical `gs://` URL for use with gsutil and client libraries | +| `bucket_self_link` | GCS REST API resource URL (`https://www.googleapis.com/storage/v1/b/`) | +| `location` | GCS location where the bucket was deployed | +| `storage_class` | Active storage class of the bucket | + +Downstream bundles that need additional access (e.g., read-only) should bind `roles/storage.objectViewer` on the bucket using `bucket_name` and `project_id` from this artifact. + +## Compliance + +### Hardcoded security baselines + +Two settings are enforced at the Terraform level and cannot be changed via parameters: + +| Setting | Value | Reason | +|---|---|---| +| `uniform_bucket_level_access` | `true` | Disables legacy object-level ACLs. All access is controlled by IAM only. Prevents split access-control models that are difficult to audit and easy to misconfigure (Checkov CKV_GCP_29). | +| `public_access_prevention` | `"enforced"` | Blocks all public object access regardless of IAM policies or ACLs. Prevents accidental data exposure via `allUsers` or `allAuthenticatedUsers` grants (Checkov CKV_GCP_114). Non-negotiable baseline for all environments in this data platform series. | + +### Checkov skips + +| Check | Reason | +|---|---| +| `CKV_GCP_62` | Bucket access logging requires a separate log-sink GCS bucket. That bucket is not part of this bundle's scope — enabling logging here without a target bucket causes a plan-time error. Operators who need access logs should provision a dedicated log bucket and wire `logging.log_bucket` manually. | +| `CKV_GCP_63` | Checks that a bucket does not log access requests to itself. Because no `logging` block is configured (see CKV_GCP_62), this bucket cannot log to itself. 
Checkov fails this check in the absence of any logging configuration, making the finding a false positive in this context. | +| `CKV_GCP_78` | Retention lock (WORM) makes objects immutable for a fixed duration and cannot be shortened or removed once set. It is not universally appropriate — it prevents deletion of any object, including accidental uploads. Add a `retention_policy` param if your workload requires WORM guarantees. | + +### Production gating + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- `storage.googleapis.com` must be enabled in the landing zone before deploying this bundle. Add it to `enabled_apis` in the `gcp-landing-zone` package config. +- The `gcp_authentication` credential has `storage.admin` or equivalent IAM on the project. +- The landing zone's workload SA is granted `roles/storage.objectUser` automatically; read-only or admin access for other consumers must be added by the downstream bundle. +- Bucket names are derived from the Massdriver `name_prefix` and are globally unique — operators do not pick the raw bucket name. + +## Presets + +| Preset | Storage Class | Location | Versioning | Lifecycle | +|---|---|---|---|---| +| Staging | STANDARD | US | Off | Delete objects after 30 days | +| Durable | STANDARD | US | On | None — retain all versions indefinitely | +| Archive | COLDLINE | US | On | Transition to ARCHIVE class after 365 days | + +## Bucket Naming + +GCS bucket names are globally unique across all GCP projects. This bundle derives the bucket name from the Massdriver `name_prefix`, which incorporates the environment slug and package name. Operators do not choose the raw name — name collisions are avoided by construction. + +## Location Immutability + +A bucket's location cannot be changed after creation. 
Selecting the wrong location requires decommissioning the package and reprovisioning. Choose carefully at deploy time based on where your compute resources run. diff --git a/bundles/gcp-storage-bucket/massdriver.yaml b/bundles/gcp-storage-bucket/massdriver.yaml new file mode 100644 index 0000000..cef9504 --- /dev/null +++ b/bundles/gcp-storage-bucket/massdriver.yaml @@ -0,0 +1,221 @@ +name: gcp-storage-bucket +description: Google Cloud Storage bucket with configurable storage class, optional + versioning, and lifecycle rules. Enforces uniform bucket-level access and public + access prevention as non-negotiable security baselines. Grants the landing zone's + workload service account objectUser access. Emits a gcp-storage-bucket artifact + for downstream Cloud Run, BigQuery, and Vertex Workbench bundles. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-storage-bucket +version: 0.1.0 + +params: + required: + - storage_class + - location + - versioning_enabled + - lifecycle_rules + examples: + - __name: Staging + storage_class: STANDARD + location: US + versioning_enabled: false + lifecycle_rules: + - action: + type: Delete + condition: + age_days: 30 + - __name: Durable + storage_class: STANDARD + location: US + versioning_enabled: true + lifecycle_rules: [] + - __name: Archive + storage_class: COLDLINE + location: US + versioning_enabled: true + lifecycle_rules: + - action: + type: SetStorageClass + storage_class: ARCHIVE + condition: + age_days: 365 + properties: + storage_class: + title: Storage Class + description: Storage class controls cost vs. access latency trade-offs. STANDARD + for frequently accessed data. NEARLINE for monthly access. COLDLINE for + quarterly access. ARCHIVE for annual or less frequent access. + type: string + enum: + - STANDARD + - NEARLINE + - COLDLINE + - ARCHIVE + default: STANDARD + + location: + title: Location + description: GCS location for the bucket.
Multi-regions (US, EU, ASIA) give + highest availability. Dual-regions (NAM4, EUR4, ASIA1) offer low-latency + redundancy. Single regions (e.g. us-central1) co-locate storage with compute. + Location cannot be changed after bucket creation. + type: string + $md.immutable: true + default: US + enum: + - US + - EU + - ASIA + - NAM4 + - EUR4 + - ASIA1 + - us-central1 + - us-east1 + - us-east4 + - us-west1 + - us-west2 + - europe-west1 + - europe-west2 + - europe-west3 + - europe-west4 + - asia-east1 + - asia-northeast1 + - asia-south1 + - asia-southeast1 + - australia-southeast1 + - southamerica-east1 + + versioning_enabled: + title: Enable Versioning + description: When enabled, GCS retains previous versions of objects when they + are overwritten or deleted. Required for lifecycle rules that target ARCHIVED + (non-current) versions and recommended for durable datasets. + type: boolean + default: false + + lifecycle_rules: + title: Lifecycle Rules + description: Lifecycle rules applied to objects in this bucket. GCS evaluates + every rule independently of list order; when several rules match an object, Delete + takes precedence over SetStorageClass. Leave empty for no automated lifecycle management. + type: array + default: [] + items: + type: object + required: + - action + - condition + properties: + action: + title: Action + description: What to do when the condition is met. + type: object + required: + - type + properties: + type: + title: Action Type + description: "Delete: permanently removes matched objects. SetStorageClass: + transitions matched objects to the target storage class." + type: string + enum: + - Delete + - SetStorageClass + dependencies: + type: + oneOf: + - properties: + type: + const: SetStorageClass + storage_class: + title: Target Storage Class + description: Storage class to transition matched objects into. + Only valid when action type is SetStorageClass.
+ type: string + enum: + - NEARLINE + - COLDLINE + - ARCHIVE + required: + - storage_class + - properties: + type: + const: Delete + condition: + title: Condition + description: When to apply the action. + type: object + required: + - age_days + properties: + age_days: + title: Age (days) + description: Apply the action to objects that are at least this many + days old. + type: integer + minimum: 1 + with_state: + title: Object State Filter + description: "Restrict the rule to objects in a particular versioning + state. LIVE: current versions only. ARCHIVED: non-current versions + only. ANY: all versions. Omit to match all states." + type: string + enum: + - LIVE + - ARCHIVED + - ANY + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + +artifacts: + required: + - storage_bucket + properties: + storage_bucket: + $ref: catalog-demo/gcp-storage-bucket + title: GCP Storage Bucket + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - storage_class + - location + - versioning_enabled + - lifecycle_rules + - "*" + properties: + versioning_enabled: + ui:widget: checkbox + lifecycle_rules: + items: + ui:order: + - action + - condition + - "*" + properties: + action: + ui:order: + - type + - storage_class + - "*" + condition: + ui:order: + - age_days + - with_state + - "*" diff --git a/bundles/gcp-storage-bucket/operator.md b/bundles/gcp-storage-bucket/operator.md new file mode 100644 index 0000000..39eef28 --- /dev/null +++ b/bundles/gcp-storage-bucket/operator.md @@ -0,0 +1,96 @@ +--- +templating: mustache +--- + +# GCP Storage Bucket — Operator Runbook + +## Non-obvious constraints + +**Bucket name is globally unique and immutable.** The name is 
derived from the Massdriver name prefix and is set at creation. A rename requires decommissioning and recreating the package with a new name prefix, then migrating all objects. + +**Location is immutable.** Bucket location cannot be changed after creation. To move a bucket: export all objects to a new bucket in the target location, update consumers to point to the new bucket, then decommission this package. Use `gcloud storage cp -r` or a Dataflow job for large datasets. + +**Public access prevention is enforced and cannot be loosened via params.** `public_access_prevention = "enforced"` is hardcoded. Any attempt to grant `allUsers` or `allAuthenticatedUsers` via IAM is rejected by GCP even if the IAM call appears to succeed. Objects are never publicly readable. This is intentional — it cannot be overridden through bundle configuration. + +**Uniform bucket-level access is enabled.** Object-level ACLs are disabled. All access is controlled via bucket-level IAM only. Granting access to specific objects via ACLs is not possible. + +**Turning versioning off does not delete existing non-current versions.** GCS stops creating new versions, but existing non-current versions are retained and continue to incur storage charges. Add a lifecycle rule targeting `with_state: ARCHIVED` to clean them up. + +**Lifecycle rules evaluate once daily, not in real time.** A rule set to delete objects after 30 days may not take effect until the next evaluation window. This is a GCP platform constraint. + +**`Delete` action on a versioned bucket makes the live object non-current; it does not immediately remove storage.** Add a second lifecycle rule targeting `with_state: ARCHIVED` with a shorter `age_days` to purge non-current versions and reclaim storage. + +**Deploy fails with "storage.googleapis.com has not been used in project."** +Add `storage.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry.
+ +## Troubleshooting + +**Permission denied on object read/write.** +Uniform bucket-level access is on — check bucket IAM, not object ACLs: +```bash +gcloud storage buckets get-iam-policy {{artifacts.storage_bucket.bucket_url}} +``` +The workload SA needs `roles/storage.objectUser` to read and write, or `roles/storage.objectViewer` for read-only. + +**Objects not being deleted by lifecycle rules.** +Lifecycle rules evaluate once daily. Allow up to 24 hours for a rule change to take effect. To inspect current lifecycle config: +```bash +gcloud storage buckets describe {{artifacts.storage_bucket.bucket_url}} \ + --format="yaml(lifecycle)" +``` + +**Storage costs unexpectedly high after disabling versioning.** +Old non-current versions are still present. List them: +```bash +gcloud storage ls -a {{artifacts.storage_bucket.bucket_url}} +``` +Add a lifecycle rule with `with_state: ARCHIVED` to purge them. + +## Day-2 operations + +**Changing storage class:** Update `storage_class` param and redeploy. The bucket updates in-place. Existing objects retain their current storage class — only new writes use the new class. Use a lifecycle `SetStorageClass` rule to migrate existing objects. + +**Enabling versioning:** Safe in-place change. Objects written before versioning was enabled have a single version. Objects overwritten or deleted afterward accumulate versions. + +**Disabling versioning:** In-place change, but existing non-current versions are retained. Add a lifecycle rule targeting `with_state: ARCHIVED` to clean up. + +**Granting read-only access to another service account** (outside Terraform — not tracked in state, so it is lost if the bucket is recreated): +```bash +gcloud storage buckets add-iam-policy-binding {{artifacts.storage_bucket.bucket_url}} \ + --member="serviceAccount:<SA_EMAIL>" \ + --role="roles/storage.objectViewer" +``` +For permanent bindings, add a `google_storage_bucket_iam_member` resource to the bundle source.
+ +**Migrating objects to a new bucket location:** +```bash +gcloud storage cp -r {{artifacts.storage_bucket.bucket_url}}/* gs://<NEW_BUCKET_NAME>/ +``` + +## Useful commands + +```bash +# List objects in the bucket +gcloud storage ls {{artifacts.storage_bucket.bucket_url}} + +# List all objects including non-current versions +gcloud storage ls -a {{artifacts.storage_bucket.bucket_url}} + +# Check bucket IAM policy +gcloud storage buckets get-iam-policy {{artifacts.storage_bucket.bucket_url}} + +# Inspect lifecycle rules +gcloud storage buckets describe {{artifacts.storage_bucket.bucket_url}} \ + --format="yaml(lifecycle)" + +# Get a signed URL for a specific object (valid 1 hour) +gcloud storage sign-url {{artifacts.storage_bucket.bucket_url}}/<OBJECT_NAME> \ + --duration=1h \ + --private-key-file=<KEY_FILE> + +# Copy a local file into the bucket +gcloud storage cp ./myfile.txt {{artifacts.storage_bucket.bucket_url}}/myfile.txt + +# Sync a local directory to the bucket +gcloud storage rsync ./local-dir {{artifacts.storage_bucket.bucket_url}}/remote-dir --recursive +``` diff --git a/bundles/gcp-storage-bucket/src/.checkov.yml b/bundles/gcp-storage-bucket/src/.checkov.yml new file mode 100644 index 0000000..731dcc2 --- /dev/null +++ b/bundles/gcp-storage-bucket/src/.checkov.yml @@ -0,0 +1,27 @@ +skip-check: + # CKV_GCP_62: Bucket should log access requests + # Access logging writes log objects to a separate GCS bucket. For this demo data + # platform series the log sink bucket is not provisioned, so enabling logging here + # would cause a plan-time error (the target bucket does not exist). Operators who + # need access logs should provision a dedicated log bucket and set the + # logging.log_bucket field on this resource. This skip is appropriate because the + # check requires an out-of-band dependency that is not part of this bundle's scope.
+ - CKV_GCP_62 + + # CKV_GCP_63: Bucket should not log to itself + # This check verifies a bucket is not configured to log access requests to itself + # (which would cause infinite log growth). Because we have no logging block at all + # (access logging is skipped per CKV_GCP_62 — no log-sink bucket is in scope), + # Checkov incorrectly fails this check. The bucket cannot log to itself if logging + # is not configured. Both CKV_GCP_62 and CKV_GCP_63 require a log-sink bucket as + # an out-of-band dependency not provided by this bundle. + - CKV_GCP_63 + + # CKV_GCP_78: Ensure Cloud storage has lock retention policy enabled + # Retention lock makes a bucket's objects immutable for a specified duration. + # While valuable for compliance/WORM use cases, it is not universally appropriate: + # it prevents deletion of any object (including accidental uploads) and cannot be + # shortened or removed once set. Exposing it as a param is the right approach for + # workloads that need it. This skip applies across all environments in this series; + # add a retention_policy param if your workload requires WORM guarantees. + - CKV_GCP_78 diff --git a/bundles/gcp-storage-bucket/src/artifacts.tf b/bundles/gcp-storage-bucket/src/artifacts.tf new file mode 100644 index 0000000..8eb550c --- /dev/null +++ b/bundles/gcp-storage-bucket/src/artifacts.tf @@ -0,0 +1,14 @@ +# Storage bucket artifact — flat schema matching catalog-demo/gcp-storage-bucket. 
+ +resource "massdriver_artifact" "storage_bucket" { + field = "storage_bucket" + name = "GCP Storage Bucket ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + bucket_name = google_storage_bucket.main.name + bucket_url = "gs://${google_storage_bucket.main.name}" + bucket_self_link = google_storage_bucket.main.self_link + location = google_storage_bucket.main.location + storage_class = google_storage_bucket.main.storage_class + }) +} diff --git a/bundles/gcp-storage-bucket/src/main.tf b/bundles/gcp-storage-bucket/src/main.tf new file mode 100644 index 0000000..3f4e162 --- /dev/null +++ b/bundles/gcp-storage-bucket/src/main.tf @@ -0,0 +1,92 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + + # GCS bucket names must be globally unique. The name_prefix already incorporates + # the Massdriver environment slug, so we use it directly as the bucket name. + bucket_name = local.name_prefix +} + +# ─── GCS Bucket ─────────────────────────────────────────────────────────────── + +resource "google_storage_bucket" "main" { + project = local.project_id + name = local.bucket_name + location = var.location + + storage_class = var.storage_class + + # ── Security baselines — NOT configurable ──────────────────────────────────── + # uniform_bucket_level_access: Disables legacy object-level ACLs and enforces + # IAM-only access control. This is a GCP best practice and a Checkov requirement + # (CKV_GCP_29). Allowing ACLs alongside IAM creates split access control models + # that are difficult to audit and easy to misconfigure. 
+ uniform_bucket_level_access = true + + # public_access_prevention: Set to "enforced" to block all public object access + # regardless of IAM policies or ACLs. This prevents accidental data exposure via + # allUsers/allAuthenticatedUsers grants (CKV_GCP_114). This is a non-negotiable + # baseline for all environments in this data platform series. + public_access_prevention = "enforced" + # ───────────────────────────────────────────────────────────────────────────── + + versioning { + enabled = var.versioning_enabled + } + + dynamic "lifecycle_rule" { + for_each = var.lifecycle_rules + content { + action { + type = lifecycle_rule.value.action.type + storage_class = try(lifecycle_rule.value.action.storage_class, null) + } + condition { + age = lifecycle_rule.value.condition.age_days + with_state = try(lifecycle_rule.value.condition.with_state, null) + } + } + } + + labels = var.md_metadata.default_tags +} + +# ─── Workload IAM Binding ───────────────────────────────────────────────────── +# Grant the landing zone's workload service account roles/storage.objectUser on +# this bucket. objectUser covers creating, reading, listing, updating, and deleting +# objects, but grants no bucket-level admin or IAM-policy operations. This follows +# least privilege — workloads that need object IAM/ACL administration should bind +# objectAdmin explicitly in their own bundle. +# +# IAM role binding pattern for this series: +# member = "serviceAccount:<workload-service-account-email>" +# role = "roles/storage.objectUser" +# bucket = google_storage_bucket.main.name +# +# Downstream bundles that need read-only access should bind roles/storage.objectViewer +# on this bucket using the storage_bucket artifact's bucket_name field.
+ +resource "google_storage_bucket_iam_member" "workload_object_user" { + bucket = google_storage_bucket.main.name + role = "roles/storage.objectUser" + member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" +} diff --git a/bundles/gcp-storage-bucket/src/variables.tf b/bundles/gcp-storage-bucket/src/variables.tf new file mode 100644 index 0000000..fd6c0f5 --- /dev/null +++ b/bundles/gcp-storage-bucket/src/variables.tf @@ -0,0 +1,79 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "landing_zone" { + type = object({ + project_id = string + network = object({ + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) + workload_identity = object({ + service_account_email = string + service_account_id = string + service_account_name = string + }) + enabled_apis = list(string) + budget = object({ + enabled = bool + budget_name = optional(string) + billing_account_id = optional(string) + amount_usd = optional(number) + }) + }) +} + +variable "storage_class" { + type = string + default = "STANDARD" +} + +variable "location" { + type = string + default = "US" +} + +variable "versioning_enabled" { + type = bool + default = false +} + +variable "lifecycle_rules" { + type = list(object({ + action = object({ + type = string + storage_class = optional(string) + }) + condition = object({ + age_days = number + with_state = optional(string) + }) + })) + default = [] +} From 8d49e49c0005921aa94728f5c780b30235d28e49 Mon Sep 17 00:00:00 2001 From: Cory 
O'Daniel Date: Sun, 19 Apr 2026 21:59:44 -0700 Subject: [PATCH 04/15] Add gcp-vertex-workbench bundle (7/7 GCP data platform) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User prompts: - "Build the gcp-vertex-workbench bundle — the final bundle (7 of 7) in the GCP Data Platform demo catalog." Changes: - New artifact definition: catalog-demo/gcp-vertex-workbench (project_id, instance_name, location, proxy_url, instance_service_account_email/member) - New bundle: gcp-vertex-workbench with massdriver.yaml, src/main.tf, src/iam.tf, src/artifacts.tf, src/variables.tf, src/.checkov.yml, README.md, operator.md - Provisions google_workbench_instance (Workbench Instances API v2, not deprecated notebooks API) - Per-instance SA pattern with hardcoded design decision comment - Shielded VM hardcoded: secure_boot=true, vtpm=true, integrity_monitoring=true - No public IP: disable_public_ip=true; JupyterLab access via proxy - Idle shutdown via idle-timeout-seconds metadata key - Optional bigquery_dataset connection grants roles/bigquery.dataViewer (read-only, hardcoded) - Skips CKV2_GCP_27 (CMEK for Workbench disks — out of scope for this demo bundle) - Small/Medium/GPU presets; boot disk minimum 150GB (Workbench base image requirement) - Deployed and passing on gcp-claude test environment Co-Authored-By: Claude Sonnet 4.6 --- .../gcp-vertex-workbench/massdriver.yaml | 98 +++++++++++ bundles/gcp-vertex-workbench/README.md | 91 ++++++++++ bundles/gcp-vertex-workbench/massdriver.yaml | 139 ++++++++++++++++ bundles/gcp-vertex-workbench/operator.md | 149 +++++++++++++++++ bundles/gcp-vertex-workbench/src/.checkov.yml | 14 ++ bundles/gcp-vertex-workbench/src/artifacts.tf | 18 ++ bundles/gcp-vertex-workbench/src/iam.tf | 50 ++++++ bundles/gcp-vertex-workbench/src/main.tf | 155 ++++++++++++++++++ bundles/gcp-vertex-workbench/src/variables.tf | 90 ++++++++++ 9 files changed, 804 insertions(+) create mode 100644 
artifact-definitions/gcp-vertex-workbench/massdriver.yaml create mode 100644 bundles/gcp-vertex-workbench/README.md create mode 100644 bundles/gcp-vertex-workbench/massdriver.yaml create mode 100644 bundles/gcp-vertex-workbench/operator.md create mode 100644 bundles/gcp-vertex-workbench/src/.checkov.yml create mode 100644 bundles/gcp-vertex-workbench/src/artifacts.tf create mode 100644 bundles/gcp-vertex-workbench/src/iam.tf create mode 100644 bundles/gcp-vertex-workbench/src/main.tf create mode 100644 bundles/gcp-vertex-workbench/src/variables.tf diff --git a/artifact-definitions/gcp-vertex-workbench/massdriver.yaml b/artifact-definitions/gcp-vertex-workbench/massdriver.yaml new file mode 100644 index 0000000..b9d45bf --- /dev/null +++ b/artifact-definitions/gcp-vertex-workbench/massdriver.yaml @@ -0,0 +1,98 @@ +name: gcp-vertex-workbench +label: GCP Vertex AI Workbench Instance +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern — granting external access to this Workbench instance: +# +# Workbench instances are single-user data-science environments. The primary IAM +# surface is the instance service account (instance_service_account_member), which +# is what you bind roles TO — not roles ON the Workbench instance itself. +# +# To grant an external principal read-only visibility to the Workbench instance +# (e.g., for audit or platform-admin purposes): +# +# resource "google_workbench_instance_iam_member" "viewer" { +# project = var.vertex_workbench.project_id +# location = var.vertex_workbench.location +# name = var.vertex_workbench.instance_name +# role = "roles/notebooks.viewer" +# member = "user:alice@example.com" +# } +# +# The instance_service_account_member field carries the IAM principal string +# ("serviceAccount:<email>") for the instance SA.
Use it to grant the Workbench +# instance access to downstream resources (e.g., dataset viewer, bucket reader) +# without hard-coding the SA email. +# +# Example — granting a connected dataset access to the instance SA: +# resource "google_bigquery_dataset_iam_member" "reader" { +# dataset_id = var.bigquery_dataset.dataset_id +# role = "roles/bigquery.dataViewer" +# member = var.vertex_workbench.instance_service_account_member +# } +exports: [] + +schema: + title: GCP Vertex AI Workbench Instance + description: A deployed Vertex AI Workbench instance. Carries the project ID, + instance name, zone, JupyterLab proxy URL, and the instance service account + identity so downstream bundles can grant the Workbench access to data resources + without hard-coding project or service account identifiers. + type: object + required: + - project_id + - instance_name + - location + - proxy_url + - instance_service_account_email + - instance_service_account_member + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this Workbench instance + type: string + examples: + - my-gcp-project-123 + + instance_name: + title: Instance Name + description: Workbench instance name (short name used in gcloud commands) + type: string + examples: + - my-workbench-instance + + location: + title: Location (Zone) + description: GCP zone where the Workbench instance is deployed. Workbench + instances are zonal resources — location is a zone (e.g., us-central1-a), + not a region. + type: string + examples: + - us-central1-a + + proxy_url: + title: JupyterLab Proxy URL + description: The HTTPS proxy URL to access the JupyterLab interface for this + Workbench instance. Populated after the instance is running. May be empty + while the instance is starting or if proxy access is disabled. 
+ type: string + examples: + - https://abc123-dot-us-central1.notebooks.googleusercontent.com/ + + instance_service_account_email: + title: Instance Service Account Email + description: Email address of the GCP service account this Workbench instance + runs as. Downstream bundles bind IAM roles to this email to grant the + Workbench access to data resources. + type: string + examples: + - my-workbench@my-gcp-project-123.iam.gserviceaccount.com + + instance_service_account_member: + title: Instance Service Account IAM Member + description: "The full IAM principal string for the instance service account, + in 'serviceAccount:' form. Use this directly as the member argument + in google_*_iam_member resources so callers do not have to construct it manually." + type: string + examples: + - serviceAccount:my-workbench@my-gcp-project-123.iam.gserviceaccount.com diff --git a/bundles/gcp-vertex-workbench/README.md b/bundles/gcp-vertex-workbench/README.md new file mode 100644 index 0000000..6210319 --- /dev/null +++ b/bundles/gcp-vertex-workbench/README.md @@ -0,0 +1,91 @@ +# gcp-vertex-workbench + +Vertex AI Workbench instance for interactive data science. Each bundle instance provisions a dedicated per-instance service account and a managed JupyterLab environment running on GCE. When a BigQuery dataset is connected, the instance SA is automatically granted read-only access — no manual IAM wiring required. + +## Use Cases + +- **Exploratory data analysis** — interactive notebooks with access to BigQuery datasets and GCS buckets via scoped IAM. +- **ML model development** — GPU-accelerated notebook environments for training and evaluation, with the ability to consume Pub/Sub topics or write results to BigQuery via separate pipeline services. 
+- **Platform-managed data science environments** — org-wide Workbench standard enforcing Shielded VM, no public IP, per-instance identity, and idle shutdown — so each team gets a consistent, auditable environment without manual GCP console work. + +## Resources Created + +| Resource | Description | +|---|---| +| `google_service_account.instance` | Per-instance SA — this bundle's own workload identity | +| `google_workbench_instance.main` | The Vertex AI Workbench instance (Workbench Instances API v2) | +| `google_bigquery_dataset_iam_member.dataset_viewer` | Created only when BigQuery dataset is connected — grants `roles/bigquery.dataViewer` (read-only) to instance SA | + +## Connections + +### Required + +| Connection | Artifact Type | Purpose | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | GCP credentials used by Terraform to provision resources | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id`, `network.region`, and subnet self-link for instance placement | + +### Optional + +| Connection | Artifact Type | IAM Role Granted | +|---|---|---| +| `bigquery_dataset` | `catalog-demo/gcp-bigquery-dataset` | `roles/bigquery.dataViewer` (read-only) on the dataset | + +When the BigQuery dataset is connected, the instance SA can run SELECT queries from notebooks without manual IAM changes. Disconnect the canvas wire and redeploy to revoke access. + +## Artifact Produced + +**Artifact type:** `catalog-demo/gcp-vertex-workbench` + +| Field | Type | Description | +|---|---|---| +| `project_id` | string | GCP project that owns the instance | +| `instance_name` | string | Short instance name (used in gcloud commands) | +| `location` | string | GCP zone where the instance is deployed (e.g., `us-central1-a`) | +| `proxy_url` | string | JupyterLab HTTPS proxy URL — open this in a browser to access the notebook. May be empty while the instance is starting. 
| +| `instance_service_account_email` | string | Email of this instance's own SA | +| `instance_service_account_member` | string | IAM principal string (`serviceAccount:`) for downstream bindings | + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `machine_type` | enum | `e2-standard-4` | GCP machine type. E2 for general-purpose, N1 required for GPUs. | +| `boot_disk_size_gb` | integer | `150` | Boot disk size in GB (150–4000). Minimum 150 GB enforced by the Workbench base image. Holds OS, packages, and local notebook files. | +| `idle_shutdown_timeout_minutes` | integer | `180` | Minutes of kernel inactivity before auto-shutdown. 0 = never (continuous billing). | +| `accelerator_type` | enum | _(none)_ | GPU type. Requires N1 machine type. Leave empty for CPU-only. | +| `accelerator_count` | integer | `1` | Number of GPU accelerators. Only used when `accelerator_type` is set. | + +## Presets + +| Preset | Machine Type | Disk | GPU | Idle Timeout | +|---|---|---|---|---| +| Small | `e2-standard-4` | 150 GB | none | 3 hours | +| Medium | `n1-standard-8` | 200 GB | none | 3 hours | +| GPU | `n1-standard-8` | 200 GB | NVIDIA_TESLA_T4 × 1 | 3 hours | + +## Compliance + +### Hardcoded Controls + +| Control | Value | Rationale | +|---|---|---| +| Shielded VM — Secure Boot | `enable_secure_boot = true` | Prevents unsigned kernel modules and boot-time malware from loading. Cannot be disabled without recreating the instance. | +| Shielded VM — vTPM | `enable_vtpm = true` | Enables measured boot and key attestation. Required for integrity monitoring. | +| Shielded VM — Integrity Monitoring | `enable_integrity_monitoring = true` | Detects tampering with the boot sequence by comparing against a known-good baseline. | +| No public IP | `disable_public_ip = true` | The Workbench proxy handles browser access. No external IP is exposed. JupyterLab traffic does not traverse the public internet. 
| +| Per-instance service account | `google_service_account.instance` (one per bundle instance) | Each instance gets its own SA — no shared SA that grants access across all Workbench notebooks. See iam.tf for design rationale. | +| Read-only BigQuery access | `roles/bigquery.dataViewer` (not dataEditor) | Workbench is an exploration environment. Write access would allow ad-hoc schema mutations from notebook cells. Users who need to write back should use their personal GCP identity. | +| Resource labels | Massdriver default tags | Enforces cost attribution and environment tagging. | + +### Skipped Checks + +`CKV2_GCP_27` (Workbench instance disks encrypted with a customer-managed key) is skipped in `src/.checkov.yml`: CMEK for disk encryption is intentionally out of scope for this bundle — Google-managed encryption is used. If CMEK is required, a separate bundle with a KMS key connection should be used. No other skips are needed: as of checkov 3.2.x, the legacy notebook checks (CKV_GCP_89, CKV_GCP_126, CKV_GCP_127) target the deprecated `google_notebooks_instance` resource and do not fire against `google_workbench_instance`. + +## Assumptions + +- The landing zone provides `project_id`, `network.region`, and `primary_subnet.self_link`. The Workbench instance is placed in the primary subnet of the landing zone's region, zone `<region>-a`. +- The landing zone's subnet must have Private Google Access enabled for the instance to reach GCP APIs (BigQuery, GCS) without a public IP. The `gcp-landing-zone` bundle enables this by default. +- Idle shutdown is implemented via the `idle-timeout-seconds` GCE metadata key, which the Workbench agent reads at startup. If the instance is restarted externally (e.g., via gcloud), the idle timer resets. +- GPU availability is zone-dependent. If a GPU type is not available in `<region>-a`, change `local.zone` in `src/main.tf` to a zone with quota. +- The `proxy_url` artifact field may be empty immediately after deploy while the instance boots. It populates within 2–5 minutes after the instance reaches ACTIVE state.
diff --git a/bundles/gcp-vertex-workbench/massdriver.yaml b/bundles/gcp-vertex-workbench/massdriver.yaml new file mode 100644 index 0000000..1f1a400 --- /dev/null +++ b/bundles/gcp-vertex-workbench/massdriver.yaml @@ -0,0 +1,139 @@ +name: gcp-vertex-workbench +description: Vertex AI Workbench instance for interactive data science. Provisions + a dedicated per-instance service account with scoped IAM access to any connected + data resources. Automatically grants the instance SA read-only access to a connected + BigQuery dataset when wired. Emits a gcp-vertex-workbench artifact carrying the + instance name, zone, JupyterLab proxy URL, and SA identity. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-vertex-workbench +version: 0.1.0 + +params: + required: + - machine_type + - boot_disk_size_gb + - idle_shutdown_timeout_minutes + examples: + - __name: Small + machine_type: e2-standard-4 + boot_disk_size_gb: 150 + idle_shutdown_timeout_minutes: 180 + - __name: Medium + machine_type: n1-standard-8 + boot_disk_size_gb: 200 + idle_shutdown_timeout_minutes: 180 + - __name: GPU + machine_type: n1-standard-8 + boot_disk_size_gb: 200 + idle_shutdown_timeout_minutes: 180 + accelerator_type: NVIDIA_TESLA_T4 + accelerator_count: 1 + + properties: + machine_type: + title: Machine Type + description: GCP machine type for the Workbench instance. E2 types are cost-efficient + general-purpose machines. N1 types are required when attaching GPUs. N2 types + offer higher per-core performance for CPU-intensive workloads. Machine type + can be changed in-place by redeploying — the instance is stopped and restarted. 
+ type: string + default: e2-standard-4 + enum: + - e2-standard-2 + - e2-standard-4 + - e2-standard-8 + - e2-standard-16 + - n1-standard-4 + - n1-standard-8 + - n1-standard-16 + - n1-standard-32 + - n2-standard-4 + - n2-standard-8 + - n2-standard-16 + + boot_disk_size_gb: + title: Boot Disk Size (GB) + description: Size of the boot disk in gigabytes. The boot disk holds the OS, + JupyterLab environment, conda/pip packages, and local notebook files. Increase + if you expect to install large libraries (e.g., TensorFlow with CUDA) or store + interim data locally. Boot disk size can only be increased in-place, not decreased. + Minimum 150 GB (enforced by the Workbench base image), maximum 4000 GB. + type: integer + minimum: 150 + maximum: 4000 + default: 150 + + idle_shutdown_timeout_minutes: + title: Idle Shutdown Timeout (minutes) + description: Number of minutes of kernel inactivity before the instance automatically + shuts down. Set to 0 to disable idle shutdown entirely (not recommended in + shared projects — you will be billed continuously). Default 180 minutes (3 hours) + is a good balance for interactive data science sessions. The instance can be + manually restarted from the Massdriver canvas or via gcloud after an idle shutdown. + type: integer + minimum: 0 + default: 180 + + accelerator_type: + title: GPU Accelerator Type + description: GPU accelerator to attach to the instance. Leave empty for CPU-only + workloads. GPUs require an N1 machine type — do not combine with E2 or N2 types. + Of the types listed, only NVIDIA_TESLA_T4 and NVIDIA_TESLA_V100 attach to N1 machines; NVIDIA_TESLA_A100 and NVIDIA_L4 need A2 and G2 machine types, which machine_type does not offer. Attaching a GPU changes the underlying VM and will cause a brief interruption + if changed in-place. + type: string + enum: + - NVIDIA_TESLA_T4 + - NVIDIA_TESLA_V100 + - NVIDIA_TESLA_A100 + - NVIDIA_L4 + + accelerator_count: + title: GPU Count + description: Number of GPU accelerators to attach. Must be set alongside accelerator_type. + Typical values are 1, 2, or 4 depending on the GPU type and quota.
+ type: integer + minimum: 1 + maximum: 8 + default: 1 + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + + # Optional — when wired, the bundle automatically grants the instance SA + # roles/bigquery.dataViewer on this dataset (read-only). Disconnect to remove. + bigquery_dataset: + $ref: catalog-demo/gcp-bigquery-dataset + title: BigQuery Dataset (optional — grants read-only access to instance SA) + +artifacts: + required: + - vertex_workbench + properties: + vertex_workbench: + $ref: catalog-demo/gcp-vertex-workbench + title: GCP Vertex AI Workbench Instance + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - machine_type + - boot_disk_size_gb + - idle_shutdown_timeout_minutes + - accelerator_type + - accelerator_count + - "*" diff --git a/bundles/gcp-vertex-workbench/operator.md b/bundles/gcp-vertex-workbench/operator.md new file mode 100644 index 0000000..e06e301 --- /dev/null +++ b/bundles/gcp-vertex-workbench/operator.md @@ -0,0 +1,149 @@ +--- +templating: mustache +--- + +# GCP Vertex AI Workbench — Operator Runbook + +## Non-obvious constraints + +**Location is a zone, not a region.** This bundle appends `-a` to the landing zone region (e.g., `us-central1` → `us-central1-a`). GPU quota is zone-specific — if you get a quota error for a GPU type, check availability in the zone and request quota or change `local.zone` in `src/main.tf`. + +**E2 machine types do not support GPUs.** If `accelerator_type` is set, the machine type must be N1 (`n1-standard-*`). Attempting to attach a GPU to an E2 machine fails at apply time with a GCP API error. 
+ +**Machine type changes stop and restart the instance.** Workbench does not do live migration for machine type changes. The instance shuts down, is resized, and restarts. Expect 5–10 minutes of downtime. Open notebooks in JupyterLab are saved to disk and are available after restart. + +**Shielded VM settings are not changeable in-place.** Changing `enable_secure_boot`, `enable_vtpm`, or `enable_integrity_monitoring` requires destroying and recreating the instance. These are hardcoded to `true` in this bundle and are not exposed as params. + +**Idle shutdown requires the Workbench agent running.** The `idle-timeout-seconds` metadata key is only honoured if the Workbench agent is active inside the VM. If the agent crashes or the instance was reimaged externally, the idle shutdown will not fire. + +**Per-instance SA recreates if the package is renamed.** The SA `account_id` is derived from `name_prefix`. Renaming the Massdriver package destroys the old SA and creates a new one. All canvas-wired IAM bindings are recreated automatically on the next deploy. Out-of-band bindings (e.g., manually granted Artifact Registry reader) must be reapplied manually. + +**Canvas wires require a deploy to take effect.** Connecting or disconnecting the BigQuery dataset on the canvas does NOT grant or revoke IAM access immediately — a Massdriver deploy must run to apply the Terraform change. + +**proxy_url is empty until the instance is ACTIVE.** The JupyterLab proxy URL (`{{artifacts.vertex_workbench.proxy_url}}`) is only populated after the instance boots and the proxy registers. This takes 2–5 minutes after the Terraform apply completes. 
+ +## Troubleshooting + +**Instance stuck in PROVISIONING or STARTING.** +Check the GCE instance serial console for boot errors: +```bash +gcloud compute instances get-serial-port-output {{artifacts.vertex_workbench.instance_name}} \ + --zone={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} +``` +Common causes: GPU quota exceeded, subnet CIDR exhausted, missing API enablement (`notebooks.googleapis.com`). + +**proxy_url is empty after 10 minutes.** +```bash +gcloud workbench instances describe {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} \ + --format="yaml(state,proxyUri,healthInfo)" +``` +If `state` is ACTIVE but `proxyUri` is empty, the Workbench proxy failed to register. Stop and start the instance: +```bash +gcloud workbench instances stop {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} + +gcloud workbench instances start {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} +``` + +**Notebook can't query BigQuery — `Access Denied`.** +Confirm the canvas wire is connected AND the package has been redeployed since the wire was added. Verify the IAM binding exists: +```bash +bq get-iam-policy {{artifacts.vertex_workbench.project_id}}:${BQ_DATASET_ID} \ + --format=prettyjson | grep -A3 "dataViewer" +``` +The member should be `{{artifacts.vertex_workbench.instance_service_account_member}}`.
+ +**GPU not available in zone.** +```bash +gcloud compute accelerator-types list \ + --filter="zone:{{artifacts.vertex_workbench.location}}" \ + --project={{artifacts.vertex_workbench.project_id}} +``` +If the GPU type is absent, request quota for a different zone, then update `local.zone` in `src/main.tf` and redeploy. + +**Instance not shutting down after idle timeout.** +Confirm the `idle-timeout-seconds` metadata key was set: +```bash +gcloud compute instances describe {{artifacts.vertex_workbench.instance_name}} \ + --zone={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} \ + --format="yaml(metadata.items)" +``` +If missing, the idle_shutdown_timeout_minutes param may have been 0 (disabled). The metadata key is only written when the value is > 0. + +## Day-2 operations + +**Stopping and starting the instance (e.g., to save costs overnight):** +```bash +# Stop +gcloud workbench instances stop {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} + +# Start +gcloud workbench instances start {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} +``` +Starting the instance after an idle shutdown or manual stop takes 2–5 minutes. The proxy URL remains the same. + +**Resizing the instance (machine type or disk):** Update the `machine_type` or `boot_disk_size_gb` params in Massdriver and redeploy. The instance will stop, resize, and restart. Disk can only be increased, not decreased. + +**Adding a GPU after initial deploy:** Change `machine_type` to an N1 type, set `accelerator_type` and `accelerator_count`, and redeploy. This recreates the underlying GCE VM. + +**Rotating the instance service account:** The SA is derived from `name_prefix`. 
Rotating requires renaming the Massdriver package, which destroys the old SA and creates a new one. Canvas-wired IAM bindings are recreated automatically. Out-of-band bindings must be reapplied. + +**Granting a user access to the JupyterLab UI:** The proxy URL requires the user to authenticate with a GCP identity that has `roles/notebooks.viewer` or higher on the instance. Grant via: +```bash +gcloud workbench instances add-iam-policy-binding {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} \ + --role="roles/notebooks.viewer" \ + --member="user:alice@example.com" +``` + +## Useful commands + +```bash +# Describe instance state and proxy URL +gcloud workbench instances describe {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} \ + --format="yaml(state,proxyUri,healthInfo,gceSetup.machineType,gceSetup.serviceAccounts)" + +# List all Workbench instances in the project +gcloud workbench instances list \ + --location=- \ + --project={{artifacts.vertex_workbench.project_id}} \ + --format="table(name,location,state,proxyUri)" + +# Describe the instance service account +gcloud iam service-accounts describe {{artifacts.vertex_workbench.instance_service_account_email}} \ + --project={{artifacts.vertex_workbench.project_id}} + +# Check IAM bindings on the service account (bindings granted TO the SA) +gcloud projects get-iam-policy {{artifacts.vertex_workbench.project_id}} \ + --flatten="bindings[].members" \ + --filter="bindings.members:{{artifacts.vertex_workbench.instance_service_account_member}}" \ + --format="table(bindings.role)" + +# Check runtime logs from the Workbench agent +gcloud logging read \ + 'resource.type="gce_instance" AND labels."compute.googleapis.com/resource_name"="{{artifacts.vertex_workbench.instance_name}}"' \ + 
--project={{artifacts.vertex_workbench.project_id}} \ + --limit=50 \ + --format="table(timestamp,textPayload)" + +# Check GCP Audit Logs for BigQuery access by the instance SA +gcloud logging read \ + 'protoPayload.authenticationInfo.principalEmail="{{artifacts.vertex_workbench.instance_service_account_email}}" AND protoPayload.serviceName="bigquery.googleapis.com"' \ + --project={{artifacts.vertex_workbench.project_id}} \ + --limit=20 \ + --format="table(timestamp,protoPayload.methodName,protoPayload.resourceName)" +``` diff --git a/bundles/gcp-vertex-workbench/src/.checkov.yml b/bundles/gcp-vertex-workbench/src/.checkov.yml new file mode 100644 index 0000000..2f59817 --- /dev/null +++ b/bundles/gcp-vertex-workbench/src/.checkov.yml @@ -0,0 +1,14 @@ +skip-check: + # CKV2_GCP_27: Ensure Vertex AI workbench instance disks are encrypted with a Customer Managed Key (CMK) + # CMEK is intentionally out of scope for this bundle. Boot and data disks use Google-managed + # encryption (GOOGLE_MANAGED_ENCRYPTION), which is appropriate for the interactive data-science + # workloads this bundle targets. Checkov fires this check whenever a kms_key is absent from the + # boot_disk and data_disks blocks, making it a false positive here. If CMEK is required for a + # specific regulatory workload, a separate bundle with a Cloud KMS key connection should be used — + # it requires provisioning a KMS key, key ring, and granting the Compute Engine SA CryptoKey + # Encrypter/Decrypter access, which is out of scope for this general-purpose bundle. + - CKV2_GCP_27 + + # Note: As of checkov 3.2.x, legacy Vertex AI Notebook checks (CKV_GCP_89, CKV_GCP_126, + # CKV_GCP_127) target google_notebooks_instance (deprecated API v1) and do not fire against + # google_workbench_instance. No skip needed for those here. 
diff --git a/bundles/gcp-vertex-workbench/src/artifacts.tf b/bundles/gcp-vertex-workbench/src/artifacts.tf new file mode 100644 index 0000000..b91902c --- /dev/null +++ b/bundles/gcp-vertex-workbench/src/artifacts.tf @@ -0,0 +1,18 @@ +# Workbench instance artifact — matches catalog-demo/gcp-vertex-workbench schema. +# Emitted after the instance is provisioned and the proxy_uri is known. +# The proxy_url may be empty on first deploy if the instance is still starting. +# Downstream connections can use instance_service_account_member to grant the +# Workbench additional IAM roles on resources outside this bundle. + +resource "massdriver_artifact" "vertex_workbench" { + field = "vertex_workbench" + name = "GCP Vertex Workbench ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + instance_name = google_workbench_instance.main.name + location = google_workbench_instance.main.location + proxy_url = google_workbench_instance.main.proxy_uri + instance_service_account_email = local.instance_sa_email + instance_service_account_member = local.instance_sa_member + }) +} diff --git a/bundles/gcp-vertex-workbench/src/iam.tf b/bundles/gcp-vertex-workbench/src/iam.tf new file mode 100644 index 0000000..0e97f25 --- /dev/null +++ b/bundles/gcp-vertex-workbench/src/iam.tf @@ -0,0 +1,50 @@ +# ─── Upstream Artifact IAM Auto-Binding ─────────────────────────────────────── +# +# This file implements the "auto-binding" pattern for Workbench instances that +# consume upstream data artifacts. For each optional connection that IS wired on +# the canvas, Terraform grants THIS bundle's instance service account the +# minimum-privilege read-only role required to use that resource. +# +# The instance SA (google_service_account.instance in main.tf) is created by +# this bundle — not inherited from the landing zone. Each Workbench instance gets +# its own identity with bindings only to the resources it actually connects to. 
+# +# HOW IT WORKS +# ──────────── +# Massdriver passes optional connections as null when not wired on the canvas, +# or as a plain object when wired. We detect presence with: var.<connection> != null +# Then use `count = var.<connection> != null ? 1 : 0` to conditionally create +# the binding. No connection → no IAM change. Add connection → binding appears +# on next deploy. Remove connection → binding is destroyed on next deploy. +# +# ROLES GRANTED +# ───────────── +# BigQuery dataset → roles/bigquery.dataViewer (read-only) +# Allows the Workbench instance to SELECT from tables and list tables within +# the dataset. Does NOT allow writing, updating, deleting rows, or creating +# tables. This is intentionally restrictive — Workbench is a read-and-explore +# environment, not a write path. If a notebook needs to write results back, a +# separate BigQuery writer service (Cloud Run, Dataflow) should own that role. +# +# HARDCODED POLICY: read-only access for BigQuery dataset connections +# The decision to grant only roles/bigquery.dataViewer (not dataEditor) is +# deliberate and non-configurable. Workbench instances are interactive exploration +# tools — granting write access would allow ad-hoc schema mutations and data +# deletion from notebook cells, bypassing any pipeline governance. If a user needs +# write access to BigQuery from Workbench, they should authenticate with their +# personal GCP identity (via Application Default Credentials), which is subject +# to IAM policy for their user account and provides a full audit trail. + +# ── BigQuery Dataset ─────────────────────────────────────────────────────────── +# Grant the instance SA read-only access to the connected BigQuery dataset. +# Binding is dataset-scoped — propagates to all current and future tables. +# For table-level isolation, use google_bigquery_table_iam_member. + +resource "google_bigquery_dataset_iam_member" "dataset_viewer" { + count = var.bigquery_dataset != null ?
1 : 0 + + project = var.bigquery_dataset.project_id + dataset_id = var.bigquery_dataset.dataset_id + role = "roles/bigquery.dataViewer" + member = local.instance_sa_member +} diff --git a/bundles/gcp-vertex-workbench/src/main.tf b/bundles/gcp-vertex-workbench/src/main.tf new file mode 100644 index 0000000..47a1778 --- /dev/null +++ b/bundles/gcp-vertex-workbench/src/main.tf @@ -0,0 +1,155 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + region = var.landing_zone.network.region + # Workbench instances are zonal resources. Default to the first zone of the region. + zone = "${local.region}-a" + + # Instance SA is created by THIS bundle — scoped to this specific Workbench instance. + # See google_service_account.instance below for the design rationale. + instance_sa_email = google_service_account.instance.email + instance_sa_member = "serviceAccount:${google_service_account.instance.email}" + + # Idle shutdown is configured via GCE metadata. The Workbench agent reads + # "idle-timeout-seconds" and shuts down the instance after the specified + # number of seconds of kernel inactivity. 0 = never shut down. + idle_shutdown_seconds = var.idle_shutdown_timeout_minutes * 60 + + # Detect whether a GPU is requested. + has_gpu = var.accelerator_type != null && var.accelerator_type != "" +} + +# ─── Instance Service Account ────────────────────────────────────────────────── +# DESIGN DECISION: This bundle always creates a dedicated per-instance service +# account. Workbench instances are intended for data-science exploration with +# scoped, auditable access. 
Sharing a single SA across multiple Workbench +# instances makes post-hoc access auditing impossible — you can't tell which +# instance accessed a resource. By issuing one SA per instance, every IAM action +# in Cloud Audit Logs is traceable to a specific instance and its owner. +# +# The SA is granted ONLY the roles it needs for resources explicitly connected +# on the Massdriver canvas — no standing access to datasets or buckets it does +# not use. Roles are bound and unbound automatically as connections are added +# or removed. +# +# account_id is name_prefix capped at 28 chars (GCP limit is 30; 2 are reserved for a +# future suffix), minus any trailing "-" left by the cut. The SA lives in the landing zone project. +# +# WARNING: Changing the package name_prefix recreates the SA with a new email. +# Any out-of-band IAM bindings referencing the old SA email are invalidated. Canvas- +# wired bindings are recreated automatically on the next deploy. + +resource "google_service_account" "instance" { + project = local.project_id + account_id = trimsuffix(substr(local.name_prefix, 0, 28), "-") + display_name = "Workbench Instance — ${local.name_prefix}" + description = "Runtime identity for Workbench instance ${local.name_prefix}. Managed by Massdriver." +} + +# ─── Vertex AI Workbench Instance ───────────────────────────────────────────── +# Uses google_workbench_instance (current Vertex AI Workbench Instances API v2). +# Do NOT use google_notebooks_instance — that resource targets the deprecated +# Notebooks API v1 and is scheduled for removal. +# +# Location is a ZONE, not a region. We derive it from the landing zone region +# by appending "-a". Not every region has an "-a" zone (us-east1 and europe-west1 +# start at "-b"), so adjust local.zone above when deploying to one of those. +# +# Shielded VM (secure boot, vTPM, integrity monitoring) is enabled by default +# as a hardcoded security baseline. Disabling these requires explicit override +# and is not exposed as a param — see compliance notes in README.md.
+# +# Public IP is disabled (disable_public_ip = true). Workbench instances reach +# GCP APIs via Private Google Access on the landing zone subnet. No external +# IP is required for normal JupyterLab use — the proxy URL handles browser access. + +resource "google_workbench_instance" "main" { + project = local.project_id + name = local.name_prefix + location = local.zone + + gce_setup { + machine_type = var.machine_type + + # ── Shielded VM ────────────────────────────────────────────────────────── + # Hardcoded security baseline — not configurable. Secure Boot prevents + # unsigned code from running during startup. vTPM enables measured boot and + # key attestation. Integrity Monitoring detects tampering of the boot sequence. + # All three are standard security hygiene for data science VMs. + shielded_instance_config { + enable_secure_boot = true + enable_vtpm = true + enable_integrity_monitoring = true + } + + # ── Network ────────────────────────────────────────────────────────────── + # Place the instance on the landing zone's primary subnet. + # disable_public_ip prevents an external IP from being assigned — the + # JupyterLab proxy handles browser access without a public IP. + disable_public_ip = true + + network_interfaces { + network = var.landing_zone.network.network_self_link + subnet = var.landing_zone.network.primary_subnet.self_link + nic_type = "GVNIC" + } + + # ── Service Account ─────────────────────────────────────────────────────── + # Run as the per-instance SA created above. IAM bindings in iam.tf grant + # this SA the minimum required roles on any connected upstream data artifact. + service_accounts { + email = local.instance_sa_email + } + + # ── GPU Accelerator ─────────────────────────────────────────────────────── + # Only created when accelerator_type is set. GPUs require N1 machine types. + # E2 and N2 machine types do not support GPU attachment. + dynamic "accelerator_configs" { + for_each = local.has_gpu ? 
[1] : [] + content { + type = var.accelerator_type + core_count = var.accelerator_count + } + } + + # ── Boot Disk ───────────────────────────────────────────────────────────── + boot_disk { + disk_size_gb = var.boot_disk_size_gb + disk_type = "PD_SSD" + } + + # ── Metadata ────────────────────────────────────────────────────────────── + # idle-timeout-seconds: Workbench agent shuts down the instance after this + # many seconds of kernel inactivity. 0 = never (not recommended — continuous billing). + # serial-port-logging-enable: disabled by default; enable only for deep debugging. + metadata = merge( + { + "serial-port-logging-enable" = "false" + }, + local.idle_shutdown_seconds > 0 ? { + "idle-timeout-seconds" = tostring(local.idle_shutdown_seconds) + } : {} + ) + } + + labels = var.md_metadata.default_tags +} diff --git a/bundles/gcp-vertex-workbench/src/variables.tf b/bundles/gcp-vertex-workbench/src/variables.tf new file mode 100644 index 0000000..9d3baf6 --- /dev/null +++ b/bundles/gcp-vertex-workbench/src/variables.tf @@ -0,0 +1,90 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "landing_zone" { + type = object({ + project_id = string + network = object({ + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) + enabled_apis = list(string) + budget = object({ + enabled = bool + budget_name = optional(string) + billing_account_id = optional(string) + amount_usd = optional(number) + }) + }) +} + +# ─── Optional upstream artifact connections 
──────────────────────────────────── +# These variables are null when the connection is not wired on the canvas. +# Massdriver passes optional connections as a plain object or null — NOT a list. +# iam.tf uses count = var.<connection> != null ? 1 : 0 to conditionally create IAM +# bindings, and references fields directly (e.g., var.bigquery_dataset.dataset_id). + +variable "bigquery_dataset" { + description = "Optional BigQuery dataset connection. When provided, the instance SA is granted roles/bigquery.dataViewer (read-only) on the dataset." + type = object({ + project_id = string + dataset_id = string + dataset_full_name = string + location = string + friendly_name = optional(string) + }) + default = null +} + +# ─── Instance params ─────────────────────────────────────────────────────────── + +variable "machine_type" { + type = string + default = "e2-standard-4" +} + +variable "boot_disk_size_gb" { + type = number + default = 150 +} + +variable "idle_shutdown_timeout_minutes" { + type = number + default = 180 +} + +variable "accelerator_type" { + type = string + default = null +} + +variable "accelerator_count" { + type = number + default = 1 +} From cce12e77596bc35bee892025d92e8dbefccb778e Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Sun, 19 Apr 2026 22:25:15 -0700 Subject: [PATCH 05/15] Refactor landing zone to standard IAM pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The landing zone previously provisioned a single shared workload service account that every consumer bundle bound roles to. That diverges from how GCP landing zones are typically scoped — in the standard pattern landing zone owns project-level IAM for humans/groups and org-policy guardrails, while each workload creates its own least-privilege service account.
Landing zone changes -------------------- - Remove google_service_account.workload and service_account_name param - Remove workload_identity from the gcp-landing-zone artifact schema - Add iam_bindings param: array of {role, members} for project-level bindings to users and groups (non-authoritative google_project_iam_member) - Add org_policies param: array of constraint + enforcement for posture guardrails (google_project_organization_policy) - Emit iam_bindings in the artifact as an informational summary Consumer bundle changes ----------------------- - gcp-pubsub-topic, gcp-bigquery-dataset, gcp-storage-bucket: drop the IAM binding that granted the landing zone SA a role on the resource. These are pure data bundles now — artifact policies document the binding pattern for downstream consumers. - gcp-cloud-run-service: create its own google_service_account.runtime per service. The auto-binding pattern for upstream connections stays, but now binds this bundle's SA rather than the landing zone SA. - Drop workload_identity from every consumer bundle's landing_zone variable type. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../gcp-landing-zone/massdriver.yaml | 82 ++++++------- bundles/gcp-bigquery-dataset/src/main.tf | 35 ++---- bundles/gcp-bigquery-dataset/src/variables.tf | 5 - bundles/gcp-cloud-run-service/README.md | 25 ++-- bundles/gcp-cloud-run-service/operator.md | 37 ++++-- .../gcp-cloud-run-service/src/artifacts.tf | 7 +- bundles/gcp-cloud-run-service/src/iam.tf | 33 +++-- bundles/gcp-cloud-run-service/src/main.tf | 37 ++++-- .../gcp-cloud-run-service/src/variables.tf | 11 +- bundles/gcp-landing-zone/README.md | 42 ++++--- bundles/gcp-landing-zone/massdriver.yaml | 115 +++++++++++++++--- bundles/gcp-landing-zone/operator.md | 46 ++++--- bundles/gcp-landing-zone/src/artifacts.tf | 22 ++-- bundles/gcp-landing-zone/src/main.tf | 52 ++++++-- bundles/gcp-landing-zone/src/variables.tf | 22 +++- bundles/gcp-pubsub-topic/src/main.tf | 28 ++--- bundles/gcp-pubsub-topic/src/variables.tf | 5 - bundles/gcp-storage-bucket/src/main.tf | 30 ++--- bundles/gcp-storage-bucket/src/variables.tf | 5 - 19 files changed, 392 insertions(+), 247 deletions(-) diff --git a/artifact-definitions/gcp-landing-zone/massdriver.yaml b/artifact-definitions/gcp-landing-zone/massdriver.yaml index 3d9e681..73e57ee 100644 --- a/artifact-definitions/gcp-landing-zone/massdriver.yaml +++ b/artifact-definitions/gcp-landing-zone/massdriver.yaml @@ -2,32 +2,31 @@ name: gcp-landing-zone label: GCP Landing Zone icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png -# Downstream bundle IAM binding pattern: -# This artifact carries the workload service account email. Downstream bundles -# (BigQuery, GCS, Pub/Sub, Cloud Run, Vertex) bind roles to it on their resources. +# Landing zone scope: +# This artifact represents the project-level governance layer — shared VPC, enabled APIs, +# billing budget reference, and a summary of the project-level IAM bindings and org-policy +# guardrails that the landing zone applied. 
# -# Terraform example — grant BigQuery data editor to the workload SA: -# resource "google_bigquery_dataset_iam_member" "workload" { -# dataset_id = google_bigquery_dataset.main.dataset_id -# role = "roles/bigquery.dataEditor" -# member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" -# } +# It does NOT carry a workload service account. Each consumer bundle creates its own +# runtime SA and binds it to the resources it owns. Use the artifact policy comments in +# each data-resource artdef (gcp-pubsub-topic, gcp-bigquery-dataset, gcp-storage-bucket) +# as the canonical role-binding reference. # -# The network fields allow downstream bundles to place resources in the shared VPC -# without needing a separate network connection: +# The network fields allow downstream bundles to place resources in the shared VPC: # subnet_self_link = var.landing_zone.network.primary_subnet.self_link exports: [] schema: title: GCP Landing Zone - description: Environment foundation artifact — carries the shared VPC network, - workload runtime service account, enabled API list, and billing budget reference. - Downstream bundles connect to this instead of wiring network and identity separately. + description: Environment governance artifact — carries the shared VPC network, + enabled API list, billing budget reference, and an informational summary of + project-level IAM bindings and org-policy guardrails applied by the landing zone. + Downstream bundles connect to this instead of wiring network separately, and each + creates its own runtime service account for its workload identity. type: object required: - project_id - network - - workload_identity - enabled_apis - budget properties: @@ -81,33 +80,6 @@ schema: title: Self Link type: string - workload_identity: - title: Workload Identity - description: Runtime service account that environment workloads run as. - Downstream bundles bind IAM roles to service_account_email. 
- type: object - required: - - service_account_email - - service_account_id - - service_account_name - properties: - service_account_email: - title: Service Account Email - description: Use as IAM member string — serviceAccount: - type: string - examples: - - data-workload@my-project.iam.gserviceaccount.com - service_account_id: - title: Service Account Unique ID - type: string - examples: - - "123456789012345678901" - service_account_name: - title: Service Account Resource Name - type: string - examples: - - projects/my-project/serviceAccounts/data-workload@my-project.iam.gserviceaccount.com - enabled_apis: title: Enabled APIs description: GCP service APIs enabled in this project by the landing zone @@ -144,3 +116,29 @@ schema: type: - number - "null" + + iam_bindings: + title: IAM Bindings + description: Informational summary of project-level IAM bindings applied by this + landing zone (e.g., human operators and groups). Each entry records one role + and one member, as applied. This is NOT enforced by downstream bundles — it is + a read-only audit trail of what the landing zone provisioned. + type: array + items: + type: object + required: + - role + - member + properties: + role: + title: Role + type: string + examples: + - roles/viewer + - roles/bigquery.dataViewer + member: + title: Member + type: string + examples: + - user:alice@example.com + - group:analysts@example.com diff --git a/bundles/gcp-bigquery-dataset/src/main.tf b/bundles/gcp-bigquery-dataset/src/main.tf index b904792..638d107 100644 --- a/bundles/gcp-bigquery-dataset/src/main.tf +++ b/bundles/gcp-bigquery-dataset/src/main.tf @@ -58,27 +58,16 @@ resource "google_bigquery_dataset" "main" { labels = var.md_metadata.default_tags } -# ─── Workload IAM Binding ────────────────────────────────────────────────────── -# Grant the landing zone's workload service account roles/bigquery.dataEditor on -# this dataset.
dataEditor allows reading, writing, and deleting table data, as -# well as creating and deleting tables within the dataset — without granting -# dataset-level admin (which would allow dropping the dataset itself). +# ─── No workload IAM binding here ──────────────────────────────────────────── +# BigQuery datasets do not own a runtime identity. The landing zone no longer +# provides a shared workload SA. Consumer bundles (e.g. gcp-cloud-run-service) +# create their OWN service account and the Cloud Run bundle grants dataEditor +# access on this dataset when connected on the canvas. # -# IAM role binding pattern for this series: -# member = "serviceAccount:" -# role = "roles/bigquery.dataEditor" -# resource = google_bigquery_dataset.main.dataset_id (dataset-level binding) -# -# Note: This is a DATASET-level binding — it propagates to all current and future -# tables in the dataset. For table-level isolation, use google_bigquery_table_iam_member -# instead. For read-only access, bind roles/bigquery.dataViewer. -# -# Downstream bundles that need read-only access should bind roles/bigquery.dataViewer -# on this dataset using the bigquery_dataset artifact's dataset_id and project_id. 
- -resource "google_bigquery_dataset_iam_member" "workload_data_editor" { - project = local.project_id - dataset_id = google_bigquery_dataset.main.dataset_id - role = "roles/bigquery.dataEditor" - member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" -} +# Artifact policy pattern — grant a consumer's SA data editor access: +# resource "google_bigquery_dataset_iam_member" "runtime_editor" { +# project = var.bigquery_dataset.project_id +# dataset_id = var.bigquery_dataset.dataset_id +# role = "roles/bigquery.dataEditor" +# member = "serviceAccount:<consumer-sa-email>" +# } diff --git a/bundles/gcp-bigquery-dataset/src/variables.tf b/bundles/gcp-bigquery-dataset/src/variables.tf index 8bbfd57..50399ba 100644 --- a/bundles/gcp-bigquery-dataset/src/variables.tf +++ b/bundles/gcp-bigquery-dataset/src/variables.tf @@ -34,11 +34,6 @@ variable "landing_zone" { self_link = string }) }) - workload_identity = object({ - service_account_email = string - service_account_id = string - service_account_name = string - }) enabled_apis = list(string) budget = object({ enabled = bool diff --git a/bundles/gcp-cloud-run-service/README.md b/bundles/gcp-cloud-run-service/README.md index 4c33db4..b952cbd 100644 --- a/bundles/gcp-cloud-run-service/README.md +++ b/bundles/gcp-cloud-run-service/README.md @@ -1,6 +1,6 @@ # gcp-cloud-run-service -Google Cloud Run v2 service with automatic IAM binding for upstream data artifacts. The service runs as the landing zone's shared workload service account, and any optional upstream artifact connection (Pub/Sub topic, BigQuery dataset, GCS bucket) automatically grants the workload SA the minimum-privilege role on that resource — no manual IAM wiring required. +Google Cloud Run v2 service with automatic IAM binding for upstream data artifacts.
Each instance of this bundle creates its **own runtime service account** and automatically grants it the minimum-privilege role on any connected upstream artifact (Pub/Sub topic, BigQuery dataset, GCS bucket) — no manual IAM wiring required. ## Use Cases @@ -11,7 +11,7 @@ Google Cloud Run v2 service with automatic IAM binding for upstream data artifac ## Use as a Runtime Template -This bundle is an example **runtime template** — an opinionated, org-wide standard for how Cloud Run services are provisioned. It encodes your platform's security baseline (workload identity, ingress controls, compliance skips with documented rationale) and auto-wires IAM for common data dependencies. +This bundle is an example **runtime template** — an opinionated, org-wide standard for how Cloud Run services are provisioned. It encodes your platform's security baseline (per-service workload identity, ingress controls, compliance skips with documented rationale) and auto-wires IAM for common data dependencies. The typical workflow for application teams: @@ -21,17 +21,16 @@ The typical workflow for application teams: This separation keeps the platform baseline consistent across all services while letting application teams move independently. -For more on the `mass bundle new` workflow and template structure, see the Massdriver documentation. 
{{TODO: add direct link to the templates repository — check https://github.com/massdriver-cloud/massdriver-catalog or the `mass bundle new --help` output for the template path configuration.}} - ## Resources Created | Resource | Description | |---|---| +| `google_service_account.runtime` | Per-service runtime SA — this bundle's own workload identity | | `google_cloud_run_v2_service` | The Cloud Run v2 service running your container | | `google_cloud_run_v2_service_iam_member` (allUsers) | Created only when `allow_unauthenticated = true` — grants public invoke access | -| `google_pubsub_topic_iam_member` | Created only when Pub/Sub topic is connected — grants `roles/pubsub.publisher` to workload SA | -| `google_bigquery_dataset_iam_member` | Created only when BigQuery dataset is connected — grants `roles/bigquery.dataEditor` to workload SA | -| `google_storage_bucket_iam_member` | Created only when Storage bucket is connected — grants `roles/storage.objectUser` to workload SA | +| `google_pubsub_topic_iam_member` | Created only when Pub/Sub topic is connected — grants `roles/pubsub.publisher` to runtime SA | +| `google_bigquery_dataset_iam_member` | Created only when BigQuery dataset is connected — grants `roles/bigquery.dataEditor` to runtime SA | +| `google_storage_bucket_iam_member` | Created only when Storage bucket is connected — grants `roles/storage.objectUser` to runtime SA | ## Connections @@ -40,11 +39,11 @@ For more on the `mass bundle new` workflow and template structure, see the Massd | Connection | Artifact Type | Purpose | |---|---|---| | `gcp_authentication` | `gcp-service-account` | GCP credentials used by Terraform to provision resources | -| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id`, `network.region`, and `workload_identity.service_account_email` | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` and `network.region` | ### Optional -These connections are not required. 
When wired on the canvas, the bundle automatically grants the workload service account the appropriate IAM role on the upstream resource. When absent, no IAM binding is created. +These connections are not required. When wired on the canvas, the bundle automatically grants this service's runtime SA the appropriate IAM role on the upstream resource. When absent, no IAM binding is created. | Connection | Artifact Type | IAM Role Granted | |---|---|---| @@ -63,10 +62,10 @@ These connections are not required. When wired on the canvas, the bundle automat | `service_url` | string | HTTPS URL of the service (`.run.app` domain) | | `location` | string | GCP region where the service is deployed | | `latest_ready_revision` | string | Name of the currently-serving revision | -| `runtime_service_account_email` | string | Email of the SA the service runs as | +| `runtime_service_account_email` | string | Email of this service's own runtime SA | | `runtime_service_account_member` | string | IAM principal string (`serviceAccount:`) for downstream bindings | -The `runtime_service_account_member` field is designed for downstream bundles (Scheduler, Pub/Sub push) that need to grant `roles/run.invoker` to the Cloud Run service's own identity — or for external callers that need to invoke the service. +The `runtime_service_account_member` field is designed for downstream bundles (Scheduler, Pub/Sub push) that need to grant `roles/run.invoker` to this service's identity. 
## Parameters @@ -95,7 +94,7 @@ The `runtime_service_account_member` field is designed for downstream bundles (S | Control | Value | Rationale | |---|---|---| -| Runtime identity | Landing zone workload SA | All services run as the org-managed SA — no per-service SA proliferation | +| Per-service runtime identity | `google_service_account.runtime` (one per bundle instance) | Each service gets its own SA with bindings only to resources it connects to — no shared SA that grants access across all workloads | | Resource labels | Massdriver default tags | Enforces cost attribution and environment tagging on all revisions | ### Skipped Checks @@ -107,6 +106,6 @@ The `runtime_service_account_member` field is designed for downstream bundles (S ## Assumptions -- The landing zone's workload SA is the correct runtime identity for this service. If you need a per-service SA, extend this bundle with an additional `google_service_account` resource and wire it into `template.service_account`. +- The landing zone provides `project_id` and `network.region`. It does NOT provide a workload SA — this bundle creates its own. - VPC connector / direct VPC egress is not provisioned by this bundle. Cloud Run uses Google's serverless infrastructure by default. If you need to reach VPC-private resources (e.g., Cloud SQL without public IP), add a `google_vpc_access_connector` resource and reference it in the template's `vpc_access` block. - The default image (`gcr.io/cloudrun/hello`) is the Google-managed hello-world container. Replace it with your application image before a real deployment. 
diff --git a/bundles/gcp-cloud-run-service/operator.md b/bundles/gcp-cloud-run-service/operator.md index 1403be0..46a3fe7 100644 --- a/bundles/gcp-cloud-run-service/operator.md +++ b/bundles/gcp-cloud-run-service/operator.md @@ -6,6 +6,10 @@ templating: mustache ## Non-obvious constraints +**Each bundle instance creates its own service account.** Unlike the previous pattern where all services shared the landing zone's workload SA, this bundle provisions `google_service_account.runtime` scoped to this specific service. If you redeploy after the old landing-zone SA is gone from state, the Cloud Run service will be updated to run as the new per-service SA. Cold start on the first revision after a SA switch is expected. + +**Rotating the runtime SA is destructive — expect cold start.** The SA `account_id` is derived from the bundle's `name_prefix`. Changing the package name or name_prefix recreates the SA with a new email. Any out-of-band IAM bindings referencing the old SA email (e.g., manually granted Artifact Registry reader) must be reapplied. Canvas-wired bindings (Pub/Sub, BigQuery, GCS) are recreated automatically on the next deploy. + **New deployments route 100% of traffic to the latest revision immediately.** Blue/green splits must be configured before deploying the new revision. You cannot retroactively split traffic between an old and new revision once the new one is live at 100%. **Changing `ingress` triggers a new revision and a cold start.** Even if `min_instances > 0`, an ingress change forces revision replacement. Expect a brief cold start. @@ -14,15 +18,15 @@ templating: mustache **Container port must match what the image listens on.** If the image doesn't listen on the configured port, the revision fails health checks and Cloud Run rolls back. Error in logs: `Container failed to start. Failed to start and then listen on the port defined by the PORT environment variable.` Check application logs before the platform logs. 
-**Image pull from Artifact Registry: the workload SA needs `roles/artifactregistry.reader`.** This bundle does not grant that role. If a revision fails with `image not found` or `permission denied` at startup, check this IAM binding first: +**Image pull from Artifact Registry: the runtime SA needs `roles/artifactregistry.reader`.** This bundle does not grant that role. If a revision fails with `image not found` or `permission denied` at startup, grant the role manually or add it to the bundle: ```bash -gcloud artifacts repositories get-iam-policy \ +gcloud artifacts repositories add-iam-policy-binding <repository> \ --location={{artifacts.cloud_run_service.location}} \ - --project={{artifacts.cloud_run_service.project_id}} + --project={{artifacts.cloud_run_service.project_id}} \ + --member="{{artifacts.cloud_run_service.runtime_service_account_member}}" \ + --role="roles/artifactregistry.reader" ``` -**CPU-to-memory minimums are enforced at the API level.** 2 vCPU requires at least 512Mi; 4 vCPU requires at least 2Gi. A mismatched deploy fails before any revision is created. - **Connecting or disconnecting canvas wires requires a Massdriver deploy to take effect.** Wiring an artifact on the canvas does not grant IAM access. The Terraform apply must run to create or destroy the IAM binding. ## Troubleshooting @@ -49,8 +53,21 @@ gcloud logging read \ **IAM binding not applied after connecting a canvas wire.** Connect the wire on the canvas AND redeploy this package. The binding does not exist until Terraform applies it. -**Image pull failure.** -Check the workload SA's Artifact Registry permission (see Non-obvious constraints above). Also confirm the image tag or digest exists in the registry. +**Service can't access a resource it's connected to.** +Confirm the canvas wire is connected AND the package has been deployed since the wire was added.
Check the specific IAM binding: +```bash +# Pub/Sub +gcloud pubsub topics get-iam-policy <TOPIC_NAME> \ + --project={{artifacts.cloud_run_service.project_id}} \ + --format="table(bindings.role,bindings.members)" + +# BigQuery +bq get-iam-policy {{artifacts.cloud_run_service.project_id}}:<DATASET_ID> + +# GCS +gcloud storage buckets get-iam-policy gs://<BUCKET_NAME> +``` +The member should be `{{artifacts.cloud_run_service.runtime_service_account_member}}`. ## Day-2 operations @@ -80,7 +97,7 @@ gcloud container images describe : \ **Scaling changes:** Update `min_instances` or `max_instances` params and redeploy. In-place safe. -**Rotating the runtime service account:** This requires a bundle code change (the SA is created by the landing zone). Changing the connected landing zone artifact and redeploying will update the SA reference. +**Rotating the runtime service account:** The SA is derived from the bundle's name_prefix. Rotating requires renaming, which is destructive — the old SA is deleted, a new one is created, and all canvas-wired IAM bindings are recreated on next deploy. Any out-of-band bindings (e.g., Artifact Registry reader) must be reapplied manually. 
## Useful commands @@ -114,6 +131,10 @@ gcloud run services get-iam-policy {{artifacts.cloud_run_service.service_name}} --region={{artifacts.cloud_run_service.location}} \ --project={{artifacts.cloud_run_service.project_id}} +# Describe the runtime service account +gcloud iam service-accounts describe {{artifacts.cloud_run_service.runtime_service_account_email}} \ + --project={{artifacts.cloud_run_service.project_id}} + # Check runtime SA's IAM bindings on a connected Pub/Sub topic gcloud pubsub topics get-iam-policy \ --project={{artifacts.cloud_run_service.project_id}} \ diff --git a/bundles/gcp-cloud-run-service/src/artifacts.tf b/bundles/gcp-cloud-run-service/src/artifacts.tf index 9841b9c..aca34fa 100644 --- a/bundles/gcp-cloud-run-service/src/artifacts.tf +++ b/bundles/gcp-cloud-run-service/src/artifacts.tf @@ -2,6 +2,9 @@ # Emits after the service is fully deployed and the first revision is ready. # Downstream bundles (Scheduler, Pub/Sub push subscriptions) consume service_url # and runtime_service_account_member to configure invocation and IAM. +# +# runtime_service_account_email / runtime_service_account_member now reference +# THIS bundle's own runtime SA (created in main.tf), NOT the landing zone SA. 
resource "massdriver_artifact" "cloud_run_service" { field = "cloud_run_service" @@ -12,7 +15,7 @@ resource "massdriver_artifact" "cloud_run_service" { service_url = google_cloud_run_v2_service.main.uri location = google_cloud_run_v2_service.main.location latest_ready_revision = google_cloud_run_v2_service.main.latest_ready_revision - runtime_service_account_email = local.workload_sa_email - runtime_service_account_member = local.workload_sa_member + runtime_service_account_email = local.runtime_sa_email + runtime_service_account_member = local.runtime_sa_member }) } diff --git a/bundles/gcp-cloud-run-service/src/iam.tf b/bundles/gcp-cloud-run-service/src/iam.tf index 528aa8d..b489baa 100644 --- a/bundles/gcp-cloud-run-service/src/iam.tf +++ b/bundles/gcp-cloud-run-service/src/iam.tf @@ -2,8 +2,12 @@ # # This file implements the "auto-binding" pattern for Cloud Run services that # consume upstream data artifacts. For each optional connection that IS wired on -# the canvas, Terraform grants the workload service account the minimum-privilege -# role required to use that resource. +# the canvas, Terraform grants THIS bundle's runtime service account the minimum- +# privilege role required to use that resource. +# +# The runtime SA (google_service_account.runtime in main.tf) is created by this +# bundle — not inherited from the landing zone. This means each Cloud Run service +# gets its own identity with bindings only to the resources it actually connects to. # # HOW IT WORKS # ──────────── @@ -29,50 +33,43 @@ # Allows reading and writing objects (get, list, create, delete). Does NOT # grant bucket-level admin (lifecycle, IAM, metadata changes). For read-only # access, use roles/storage.objectViewer instead. -# -# REFERENCE EXAMPLE -# ───────────────── -# This is the canonical artifact-policy-style auto-binding pattern for the -# GCP Data Platform demo series. 
When building downstream bundles that consume -# multiple optional artifacts, copy this pattern: one conditional count block -# per artifact type, one role per binding, all referencing local.workload_sa_member. # ── Pub/Sub Topic ───────────────────────────────────────────────────────────── -# Grant the workload SA publisher access to the connected Pub/Sub topic. +# Grant this service's runtime SA publisher access to the connected Pub/Sub topic. # Binding is topic-scoped — does not grant access to other topics. -resource "google_pubsub_topic_iam_member" "workload_publisher" { +resource "google_pubsub_topic_iam_member" "runtime_publisher" { count = var.pubsub_topic != null ? 1 : 0 project = var.pubsub_topic.project_id topic = var.pubsub_topic.topic_name role = "roles/pubsub.publisher" - member = local.workload_sa_member + member = local.runtime_sa_member } # ── BigQuery Dataset ─────────────────────────────────────────────────────────── -# Grant the workload SA dataEditor on the connected BigQuery dataset. +# Grant this service's runtime SA dataEditor on the connected BigQuery dataset. # Binding is dataset-scoped — propagates to all current and future tables in # the dataset. For table-level isolation, use google_bigquery_table_iam_member. -resource "google_bigquery_dataset_iam_member" "workload_data_editor" { +resource "google_bigquery_dataset_iam_member" "runtime_data_editor" { count = var.bigquery_dataset != null ? 1 : 0 project = var.bigquery_dataset.project_id dataset_id = var.bigquery_dataset.dataset_id role = "roles/bigquery.dataEditor" - member = local.workload_sa_member + member = local.runtime_sa_member } # ── Storage Bucket ───────────────────────────────────────────────────────────── -# Grant the workload SA objectUser on the connected GCS bucket. +# Grant this service's runtime SA objectUser on the connected GCS bucket. # Binding is bucket-scoped — allows read/write of all objects in the bucket. # For read-only access, use roles/storage.objectViewer. 
-resource "google_storage_bucket_iam_member" "workload_object_user" { +resource "google_storage_bucket_iam_member" "runtime_object_user" { count = var.storage_bucket != null ? 1 : 0 bucket = var.storage_bucket.bucket_name role = "roles/storage.objectUser" - member = local.workload_sa_member + member = local.runtime_sa_member } diff --git a/bundles/gcp-cloud-run-service/src/main.tf b/bundles/gcp-cloud-run-service/src/main.tf index 309114f..5af4e09 100644 --- a/bundles/gcp-cloud-run-service/src/main.tf +++ b/bundles/gcp-cloud-run-service/src/main.tf @@ -22,10 +22,31 @@ locals { name_prefix = var.md_metadata.name_prefix region = var.landing_zone.network.region - # The workload SA is defined once in the landing zone; all upstream IAM - # bindings in iam.tf reference this local so the principal is never duplicated. - workload_sa_email = var.landing_zone.workload_identity.service_account_email - workload_sa_member = "serviceAccount:${local.workload_sa_email}" + # Runtime SA is created by THIS bundle — not inherited from the landing zone. + # The SA email and member string are sourced from the google_service_account resource below. + # Use these locals anywhere an SA principal is needed (iam.tf, artifacts.tf). + runtime_sa_email = google_service_account.runtime.email + runtime_sa_member = "serviceAccount:${google_service_account.runtime.email}" +} + +# ─── Runtime Service Account ────────────────────────────────────────────────── +# Each Cloud Run service instance creates its own SA. This is the identity the +# service runs as and the principal that IAM bindings in iam.tf grant access to. +# +# account_id is derived from name_prefix and capped at 30 chars (GCP limit is 30). +# The SA is created in the landing zone's project — the project that owns the +# Cloud Run service and the upstream data resources. +# +# IMPORTANT: This SA is destroyed and recreated if the name_prefix changes (e.g., +# if the package is renamed). 
That is a destructive operation — downstream IAM +# bindings referencing the old email are invalidated. Plan SA naming carefully +# before first deploy; treat it as immutable after that. + +resource "google_service_account" "runtime" { + project = local.project_id + account_id = trim(substr(local.name_prefix, 0, 30), "-") # truncate to GCP's 30-char cap, then strip any hyphen left at the cut point: account IDs must start and end with an alphanumeric + display_name = "Cloud Run Runtime — ${local.name_prefix}" + description = "Runtime identity for Cloud Run service ${local.name_prefix}. Managed by Massdriver." } # ─── Cloud Run v2 Service ────────────────────────────────────────────────────── @@ -47,11 +68,9 @@ resource "google_cloud_run_v2_service" "main" { template { # ── Runtime identity ────────────────────────────────────────────────────── - # Run every revision as the landing zone's shared workload service account. - # This is the identity that upstream IAM bindings (iam.tf) grant access to. - # Per-service SAs are out of scope; use a separate landing-zone-style bundle - # if your workload requires a dedicated SA with narrower permissions. - service_account = local.workload_sa_email + # Run every revision as this bundle's own runtime service account (created above). + # iam.tf grants this SA the minimum required roles on any connected upstream artifact. + service_account = local.runtime_sa_email # ── Scaling ─────────────────────────────────────────────────────────────── # min_instance_count > 0 disables scale-to-zero. You pay for idle capacity. 
diff --git a/bundles/gcp-cloud-run-service/src/variables.tf b/bundles/gcp-cloud-run-service/src/variables.tf index 34cca79..46004f8 100644 --- a/bundles/gcp-cloud-run-service/src/variables.tf +++ b/bundles/gcp-cloud-run-service/src/variables.tf @@ -34,11 +34,6 @@ variable "landing_zone" { self_link = string }) }) - workload_identity = object({ - service_account_email = string - service_account_id = string - service_account_name = string - }) enabled_apis = list(string) budget = object({ enabled = bool @@ -56,7 +51,7 @@ variable "landing_zone" { # bindings, and references fields directly (e.g., var.pubsub_topic.topic_name). variable "pubsub_topic" { - description = "Optional Pub/Sub topic connection. When provided, the workload SA is granted roles/pubsub.publisher on the topic." + description = "Optional Pub/Sub topic connection. When provided, the runtime SA is granted roles/pubsub.publisher on the topic." type = object({ project_id = string topic_name = string @@ -68,7 +63,7 @@ variable "pubsub_topic" { } variable "bigquery_dataset" { - description = "Optional BigQuery dataset connection. When provided, the workload SA is granted roles/bigquery.dataEditor on the dataset." + description = "Optional BigQuery dataset connection. When provided, the runtime SA is granted roles/bigquery.dataEditor on the dataset." type = object({ project_id = string dataset_id = string @@ -80,7 +75,7 @@ variable "bigquery_dataset" { } variable "storage_bucket" { - description = "Optional GCS bucket connection. When provided, the workload SA is granted roles/storage.objectUser on the bucket." + description = "Optional GCS bucket connection. When provided, the runtime SA is granted roles/storage.objectUser on the bucket." 
type = object({ project_id = string bucket_name = string diff --git a/bundles/gcp-landing-zone/README.md b/bundles/gcp-landing-zone/README.md index c4dbf25..35c90fd 100644 --- a/bundles/gcp-landing-zone/README.md +++ b/bundles/gcp-landing-zone/README.md @@ -1,18 +1,22 @@ # gcp-landing-zone -Environment-foundational construct for a GCP data platform. Deploy this once per environment before any workload bundles. It: +Project-level governance construct for a GCP data platform. Deploy this once per environment before any workload bundles. It: - Enables GCP service APIs required by your data platform stack -- Provisions a **workload runtime service account** that Cloud Run, Vertex Workbench, and other services run as +- Applies **project-level IAM bindings** for human operators and groups (e.g., `roles/viewer` to `group:data-analysts@example.com`) +- Enforces **org-policy guardrails** at the project level (e.g., disable SA key creation, block public GCS access) - Optionally configures a **billing budget** with spend-threshold email alerts -- Folds the input `gcp-network` artifact into its own `landing_zone` output so downstream bundles need only one connection instead of wiring network and identity separately +- Folds the input `gcp-network` artifact into its own `landing_zone` output so downstream bundles need only one connection instead of wiring network separately + +**This bundle does NOT provision workload service accounts.** Each consumer bundle (Cloud Run, etc.) creates its own runtime SA with least-privilege bindings on the resources it owns. Project-level IAM here is for human operators and group access management. 
## Resources Created | Resource | Type | Notes | |---|---|---| | `google_project_service.apis` | API enablement (one per API) | `disable_on_destroy = false` to avoid disrupting other resources | -| `google_service_account.workload` | Workload runtime SA | Created with no project-level roles; downstream bundles bind roles on their own resources | +| `google_project_iam_member.operators` | Project IAM bindings | One resource per `{role, member}` entry; additive (non-authoritative) | +| `google_project_organization_policy.guardrails` | Org policy constraints | Project-scoped; one resource per constraint | | `google_billing_budget.environment` | Billing budget | Created only when `budget.enabled = true` | | `google_monitoring_notification_channel.budget_email` | Email alert channel | Created only when budget is enabled and `notification_emails` is non-empty | @@ -36,28 +40,34 @@ The bundle publishes a single `catalog-demo/gcp-landing-zone` artifact. Downstre | `network.primary_subnet.name` | Subnet name | | `network.primary_subnet.cidr` | Subnet CIDR range | | `network.primary_subnet.self_link` | Subnet self-link URI | -| `workload_identity.service_account_email` | Runtime SA email — used by downstream bundles to bind IAM roles | -| `workload_identity.service_account_id` | Runtime SA unique ID | -| `workload_identity.service_account_name` | Runtime SA resource name | | `enabled_apis` | List of APIs that were enabled | +| `iam_bindings` | Informational list of project-level `{role, member}` bindings applied by this landing zone | | `budget.enabled` | Whether a budget was configured | | `budget.budget_name` | Budget display name (null when disabled) | | `budget.billing_account_id` | Billing account the budget is attached to (null when disabled) | | `budget.amount_usd` | Monthly budget limit in USD (null when disabled) | -## Downstream IAM Pattern +## IAM Pattern for Downstream Consumer Bundles -Each downstream bundle reads 
`landing_zone.workload_identity.service_account_email` and grants the minimum required roles on its own resources. Example for a BigQuery dataset: +Each downstream bundle creates its OWN service account for its runtime identity, then binds that SA to the specific resources it needs. The landing zone does not provision or share a workload SA. Example pattern in a consumer bundle: ```hcl -resource "google_bigquery_dataset_iam_member" "workload" { +# Consumer bundle creates its own runtime SA +resource "google_service_account" "runtime" { + project = var.landing_zone.project_id + account_id = "${var.md_metadata.name_prefix}-sa" + display_name = "Runtime SA for ${var.md_metadata.name_prefix}" +} + +# Consumer bundle binds its SA only to the resources it actually needs +resource "google_bigquery_dataset_iam_member" "runtime_editor" { dataset_id = google_bigquery_dataset.main.dataset_id role = "roles/bigquery.dataEditor" - member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" + member = "serviceAccount:${google_service_account.runtime.email}" } ``` -The workload SA is intentionally created with no project-level roles. Do not add broad roles here. +The artifact policy comments in each data-resource artdef (`gcp-pubsub-topic`, `gcp-bigquery-dataset`, `gcp-storage-bucket`) are the canonical role-binding reference. ## Compliance @@ -65,7 +75,7 @@ The workload SA is intentionally created with no project-level roles. 
Do not add | Control | Mechanism | Reason | |---|---|---| -| No broad IAM roles on workload SA | SA created with no bindings | Downstream bundles use least-privilege per-resource bindings | +| Additive (non-authoritative) IAM | `google_project_iam_member` (per-binding) | Avoids clobbering bindings set by GCP defaults or other automation | | APIs not disabled on destroy | `disable_on_destroy = false` | Prevents accidental disruption of other resources that depend on the same APIs | ### Checkov skips @@ -81,7 +91,7 @@ The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with un ## Assumptions - The GCP project already exists — this bundle does not create projects. -- The `gcp_authentication` credential has `iam.serviceAccountAdmin`, `serviceusage.serviceUsageAdmin`, and (if using budgets) `billing.budgets.create` IAM. +- The `gcp_authentication` credential has `iam.admin`, `serviceusage.serviceUsageAdmin`, `orgpolicy.policy.set` (project scope), and (if using budgets) `billing.budgets.create` IAM. - Cloud Billing must be linked to the project before budgets can be created. - `billingbudgets.googleapis.com` must be in `enabled_apis` when `budget.enabled = true`. 
@@ -89,5 +99,5 @@ The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with un | Preset | Budget | Notable APIs | |---|---|---| -| Standard (no budget) | Disabled | compute, iam, resourcemanager, serviceusage, run, bigquery, storage, aiplatform, notebooks, logging, monitoring | -| Standard (with budget) | Enabled — $500/mo, alerts at 50%/90%/100% | All of the above plus billingbudgets | +| Standard (no budget) | Disabled | compute, iam, resourcemanager, serviceusage, run, bigquery, storage, pubsub, aiplatform, notebooks, logging, monitoring | +| Standard (with budget) | Enabled — $500/mo, alerts at 50%/90%/100% | All of the above plus billingbudgets; example org policies: disable SA keys, block public GCS, require OS Login | diff --git a/bundles/gcp-landing-zone/massdriver.yaml b/bundles/gcp-landing-zone/massdriver.yaml index 9e1c5da..76cd85c 100644 --- a/bundles/gcp-landing-zone/massdriver.yaml +++ b/bundles/gcp-landing-zone/massdriver.yaml @@ -1,19 +1,20 @@ name: gcp-landing-zone -description: Environment-foundational construct for a GCP data platform. Enables required - service APIs, provisions the workload runtime service account, configures a billing - budget with threshold alerts, and emits a single landing-zone artifact so downstream - bundles only need one connection. +description: Project-level governance construct for a GCP data platform. Enables required + service APIs, applies project-level IAM bindings for human operators and groups, enforces + org-policy guardrails, configures a billing budget with threshold alerts, and emits a + single landing-zone artifact so downstream bundles only need one connection. Does NOT + provision workload service accounts — each consumer bundle creates its own. 
source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-landing-zone version: 0.1.0 params: required: - - service_account_name - enabled_apis - budget + - iam_bindings + - org_policies examples: - __name: Standard (no budget) - service_account_name: workload enabled_apis: - compute.googleapis.com - iam.googleapis.com @@ -22,14 +23,16 @@ params: - run.googleapis.com - bigquery.googleapis.com - storage.googleapis.com + - pubsub.googleapis.com - aiplatform.googleapis.com - notebooks.googleapis.com - logging.googleapis.com - monitoring.googleapis.com budget: enabled: false + iam_bindings: [] + org_policies: [] - __name: Standard (with budget) - service_account_name: workload enabled_apis: - compute.googleapis.com - iam.googleapis.com @@ -39,6 +42,7 @@ params: - run.googleapis.com - bigquery.googleapis.com - storage.googleapis.com + - pubsub.googleapis.com - aiplatform.googleapis.com - notebooks.googleapis.com - logging.googleapis.com @@ -53,17 +57,15 @@ params: - 100 notification_emails: - platform-alerts@example.com + iam_bindings: [] + org_policies: + - constraint: constraints/iam.disableServiceAccountKeyCreation + enforced: true + - constraint: constraints/storage.publicAccessPrevention + enforced: true + - constraint: constraints/compute.requireOsLogin + enforced: true properties: - service_account_name: - type: string - title: Workload Service Account Name - description: Identity that workloads in this environment will run as. Cannot - be changed after creation — renaming the service account destroys the existing - SA and breaks any downstream IAM bindings referencing the old email. - $md.immutable: true - pattern: "^[a-z][a-z0-9-]{4,28}[a-z0-9]$" - default: workload - enabled_apis: title: Enabled APIs description: GCP service APIs to enable in this project. Select from the list. 
@@ -92,6 +94,67 @@ params: - cloudresourcemanager.googleapis.com - serviceusage.googleapis.com + iam_bindings: + title: Project IAM Bindings + description: Project-level IAM bindings for human operators and groups. Each entry + grants a single member a role on this project. Uses google_project_iam_member + (additive) — will not clobber bindings set outside Terraform. Intended for + humans and groups, not workload service accounts (those are created by consumer bundles). + Empty array is valid; bindings are optional. + type: array + default: [] + items: + type: object + required: + - role + - member + properties: + role: + title: IAM Role + description: GCP role identifier, e.g. roles/viewer or roles/bigquery.dataViewer + type: string + examples: + - roles/viewer + - roles/bigquery.dataViewer + - roles/storage.objectViewer + member: + title: Member + description: IAM member string. Prefix with user:, group:, or serviceAccount:. + type: string + examples: + - user:alice@example.com + - group:data-analysts@example.com + + org_policies: + title: Org Policy Guardrails + description: Project-scoped org-policy constraints to enforce security posture. + Uses google_project_organization_policy applied at the project level (not org-wide). + Each entry enforces or denies a named constraint. Empty array is valid. + Common constraints are listed in the preset above. + type: array + default: [] + items: + type: object + required: + - constraint + - enforced + properties: + constraint: + title: Constraint + description: Org policy constraint name, e.g. constraints/iam.disableServiceAccountKeyCreation + type: string + examples: + - constraints/iam.disableServiceAccountKeyCreation + - constraints/storage.publicAccessPrevention + - constraints/compute.requireOsLogin + - constraints/compute.vmExternalIpAccess + enforced: + title: Enforced + description: When true, the boolean constraint is enforced. For list constraints + (e.g. 
vmExternalIpAccess), use enforced=true to deny all values. + type: boolean + default: true + budget: title: Billing Budget description: Optionally configure a GCP billing budget with spend alerts for this environment. @@ -186,8 +249,9 @@ steps: ui: ui:order: - - service_account_name - enabled_apis + - iam_bindings + - org_policies - budget - "*" properties: @@ -207,3 +271,18 @@ ui: orderable: false items: ui:title: "Threshold (%)" + iam_bindings: + items: + ui:order: + - role + - member + - "*" + org_policies: + items: + ui:order: + - constraint + - enforced + - "*" + properties: + enforced: + ui:widget: checkbox diff --git a/bundles/gcp-landing-zone/operator.md b/bundles/gcp-landing-zone/operator.md index 921f722..0cf3448 100644 --- a/bundles/gcp-landing-zone/operator.md +++ b/bundles/gcp-landing-zone/operator.md @@ -6,7 +6,11 @@ templating: mustache ## Non-obvious constraints -**Service account name is immutable.** Changing it destroys the existing SA and creates a new one. Any downstream IAM bindings referencing the old SA email break immediately. Treat the workload SA name as permanent after first deploy. +**This bundle manages project-level IAM for humans and groups, NOT workload service accounts.** Do not add workload SAs here. Consumer bundles (Cloud Run, etc.) own their own runtime SAs. If you see unexpected workload SAs here, they are from an older version of this bundle that has since been refactored. + +**IAM bindings are additive — they are never removed except when removed from params.** `google_project_iam_member` does not reconcile the full project IAM policy. Removing a binding from params and redeploying will destroy that specific binding resource; all other project-level bindings (from GCP defaults, other automation, or the Console) remain untouched. + +**Org policies are project-scoped, not org-wide.** The `google_project_organization_policy` resource applies constraints at the project level only. 
Org-wide enforcement requires setting the policy at the org node, which is out of scope for this bundle. **Removing an API from `enabled_apis` does not disable it in GCP.** The `disable_on_destroy = false` flag means Terraform removes the state entry but never calls the GCP disable API. The API stays enabled. To actually disable it, run `gcloud services disable --project={{artifacts.landing_zone.project_id}}` manually after confirming no resources depend on it. @@ -39,21 +43,27 @@ gcloud services list --enabled --project={{artifacts.landing_zone.project_id}} | ``` If nothing returns, add `billingbudgets.googleapis.com` to `enabled_apis` and redeploy before enabling the budget. -**Workload SA has unexpected project-level IAM bindings.** -The workload SA should have no project-level bindings after deploy — downstream bundles add per-resource bindings. If you see unexpected bindings: +**Org policy apply fails with "403 PERMISSION_DENIED".** +The deploy credential (`gcp_authentication`) needs `orgpolicy.policy.set` at the project level. Grant it: ```bash -gcloud projects get-iam-policy {{artifacts.landing_zone.project_id}} \ - --flatten="bindings[].members" \ - --filter="bindings.members:{{artifacts.landing_zone.workload_identity.service_account_email}}" \ - --format="table(bindings.role)" +gcloud projects add-iam-policy-binding {{artifacts.landing_zone.project_id}} \ + --member="serviceAccount:<DEPLOY_SA_EMAIL>" \ + --role="roles/orgpolicy.policyAdmin" ``` -An empty result is expected and correct. -**IAM binding changes outside Terraform get overwritten.** -Any bindings added manually (console or gcloud) will be removed on the next Massdriver deploy. Add permanent bindings via the bundle source. +**An IAM binding appears in GCP but is not in params.** +If the binding is a GCP default or was added outside Terraform, it will not be touched by Massdriver. If it needs to be removed, use `gcloud` or the Console — Terraform only manages the specific bindings in `iam_bindings`. 
## Day-2 operations +**Adding a human operator binding:** Add `{role, member}` to `iam_bindings` and redeploy. The new `google_project_iam_member` resource is additive — no existing bindings are touched. + +**Removing a human operator binding:** Remove the entry from `iam_bindings` and redeploy. Only that specific binding resource is destroyed. No other project IAM is affected. + +**Adding an org policy constraint:** Add `{constraint, enforced}` to `org_policies` and redeploy. Each constraint is an independent resource. + +**Removing an org policy constraint:** Remove the entry from `org_policies` and redeploy. The constraint is removed from the project — the org's inherited policy (if any) applies after removal. + **Adding APIs after initial deploy:** Update `enabled_apis` in the package config and redeploy. Adding an API adds a new `google_project_service` resource without touching existing ones. **Disabling an API:** Remove it from `enabled_apis` and redeploy. Terraform drops the state entry but does NOT call the GCP disable API. Manually disable via `gcloud services disable` if required. 
@@ -70,16 +80,18 @@ Any bindings added manually (console or gcloud) will be removed on the next Mass # List enabled APIs in the project gcloud services list --enabled --project={{artifacts.landing_zone.project_id}} -# Check IAM bindings for the workload service account +# Check all project-level IAM bindings gcloud projects get-iam-policy {{artifacts.landing_zone.project_id}} \ - --flatten="bindings[].members" \ - --filter="bindings.members:{{artifacts.landing_zone.workload_identity.service_account_email}}" \ - --format="table(bindings.role)" + --format="table(bindings.role,bindings.members)" + +# List active org policy constraints on the project +gcloud resource-manager org-policies list \ + --project={{artifacts.landing_zone.project_id}} -# Describe the workload service account -gcloud iam service-accounts describe {{artifacts.landing_zone.workload_identity.service_account_email}} \ +# Describe a specific org policy constraint +gcloud resource-manager org-policies describe constraints/iam.disableServiceAccountKeyCreation \ --project={{artifacts.landing_zone.project_id}} -# List all service accounts in the project +# List all service accounts in the project (workload SAs are owned by consumer bundles, not this one) gcloud iam service-accounts list --project={{artifacts.landing_zone.project_id}} ``` diff --git a/bundles/gcp-landing-zone/src/artifacts.tf b/bundles/gcp-landing-zone/src/artifacts.tf index f83c4b5..195dceb 100644 --- a/bundles/gcp-landing-zone/src/artifacts.tf +++ b/bundles/gcp-landing-zone/src/artifacts.tf @@ -1,6 +1,7 @@ -# Single landing-zone artifact — combines network, workload identity, enabled APIs, -# and budget reference. Downstream bundles connect to this one artifact instead of -# wiring network and identity connections separately. +# Single landing-zone artifact — combines network, enabled APIs, budget reference, +# and an informational summary of the IAM bindings applied at project level. 
+# Downstream bundles connect to this one artifact to get project_id, network, and +# the list of enabled APIs. Each consumer bundle creates its own workload SA. resource "massdriver_artifact" "landing_zone" { field = "landing_zone" @@ -15,14 +16,17 @@ resource "massdriver_artifact" "landing_zone" { primary_subnet = var.network.primary_subnet } - workload_identity = { - service_account_email = google_service_account.workload.email - service_account_id = google_service_account.workload.unique_id - service_account_name = google_service_account.workload.name - } - enabled_apis = var.enabled_apis + # iam_bindings carries an informational summary of what project-level IAM was applied. + # Downstream bundles do not consume this — it is an audit trail for operators. + iam_bindings = [ + for binding in var.iam_bindings : { + role = binding.role + member = binding.member + } + ] + # budget is always present in the artifact for schema conformance. # When disabled, fields carry null/empty sentinel values so downstream # bundles can safely check landing_zone.budget.enabled before using them. diff --git a/bundles/gcp-landing-zone/src/main.tf b/bundles/gcp-landing-zone/src/main.tf index 2ced40d..b644149 100644 --- a/bundles/gcp-landing-zone/src/main.tf +++ b/bundles/gcp-landing-zone/src/main.tf @@ -43,17 +43,49 @@ resource "google_project_service" "apis" { disable_on_destroy = false } -# ─── Workload Service Account ───────────────────────────────────────────────── -# Runtime identity that data platform workloads (Cloud Run, Vertex Workbench, -# etc.) will run as. This is NOT the Terraform deploy credential. -# Downstream bundles read landing_zone.workload_identity.service_account_email -# and bind IAM roles to it on their own resources. +# ─── Project IAM Bindings (human operators / groups) ───────────────────────── +# Non-authoritative (google_project_iam_member) — one resource per binding. +# This will NOT remove any bindings set outside of Terraform. 
+# Intended for humans and groups who need project-level access (e.g., viewers, +# billing admins). Workload service accounts are NOT managed here — each consumer +# bundle creates its own runtime SA. + +resource "google_project_iam_member" "operators" { + for_each = { + for binding in var.iam_bindings : + "${binding.role}/${binding.member}" => binding + } -resource "google_service_account" "workload" { - project = local.project_id - account_id = var.service_account_name - display_name = "Data Platform Workload Identity — ${local.name_prefix}" - description = "Runtime service account for data platform workloads. Managed by Massdriver landing zone ${local.name_prefix}." + project = local.project_id + role = each.value.role + member = each.value.member + + depends_on = [google_project_service.apis] +} + +# ─── Org Policy Guardrails (project-scoped) ─────────────────────────────────── +# Applied at the project level — does not affect other projects in the org. +# Boolean constraints: enforce = true/false as configured. +# List constraints (e.g. vmExternalIpAccess) are NOT supported as written: only a boolean_policy block is emitted, so they would need a list_policy block.
+# +# Common useful constraints: +# constraints/iam.disableServiceAccountKeyCreation — prevents user-managed SA keys +# constraints/storage.publicAccessPrevention — blocks public GCS bucket access +# constraints/compute.requireOsLogin — enforces OS Login on all VMs +# constraints/compute.vmExternalIpAccess — list constraint; needs a list_policy block, which this resource does not emit + +resource "google_project_organization_policy" "guardrails" { + for_each = { + for policy in var.org_policies : + policy.constraint => policy + } + + project = local.project_id + constraint = each.value.constraint + + boolean_policy { + enforced = each.value.enforced + } depends_on = [google_project_service.apis] } diff --git a/bundles/gcp-landing-zone/src/variables.tf b/bundles/gcp-landing-zone/src/variables.tf index cc78633..8d0580c 100644 --- a/bundles/gcp-landing-zone/src/variables.tf +++ b/bundles/gcp-landing-zone/src/variables.tf @@ -35,14 +35,28 @@ variable "network" { }) } -variable "service_account_name" { - type = string -} - variable "enabled_apis" { type = list(string) } +variable "iam_bindings" { + description = "Project-level IAM bindings for human operators/groups. Non-authoritative (additive) — will not remove bindings set outside Terraform." + type = list(object({ + role = string + member = string + })) + default = [] +} + +variable "org_policies" { + description = "Project-scoped org-policy constraints. Boolean constraints are set to enforce=true/false. List constraints with enforced=true deny all values."
+ type = list(object({ + constraint = string + enforced = bool + })) + default = [] +} + variable "budget" { type = object({ enabled = bool diff --git a/bundles/gcp-pubsub-topic/src/main.tf b/bundles/gcp-pubsub-topic/src/main.tf index 81c7062..b5007dd 100644 --- a/bundles/gcp-pubsub-topic/src/main.tf +++ b/bundles/gcp-pubsub-topic/src/main.tf @@ -61,20 +61,16 @@ resource "google_pubsub_topic" "dlq" { labels = var.md_metadata.default_tags } -# ─── Workload Publisher IAM ─────────────────────────────────────────────────── -# Grant the landing zone's workload service account roles/pubsub.publisher on -# the main topic. This is the IAM role binding example pattern for this series: +# ─── No workload IAM binding here ──────────────────────────────────────────── +# Pub/Sub topics do not own a runtime identity. The landing zone no longer +# provides a shared workload SA. Consumer bundles (e.g. gcp-cloud-run-service) +# create their OWN service account and the Cloud Run bundle grants publisher/ +# subscriber access on this topic's artifact fields when connected on the canvas. # -# member = "serviceAccount:" -# role = "roles/pubsub.publisher" -# topic = google_pubsub_topic.main.name -# -# Downstream bundles that need subscriber access should bind roles/pubsub.subscriber -# on this topic (or on their subscription) to the service account that reads messages. 
- -resource "google_pubsub_topic_iam_member" "workload_publisher" { - project = local.project_id - topic = google_pubsub_topic.main.name - role = "roles/pubsub.publisher" - member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" -} +# Artifact policy pattern — grant a consumer's SA publisher access: +# resource "google_pubsub_topic_iam_member" "publisher" { +# project = var.pubsub_topic.project_id +# topic = var.pubsub_topic.topic_name +# role = "roles/pubsub.publisher" +# member = "serviceAccount:${google_service_account.runtime.email}" +# } diff --git a/bundles/gcp-pubsub-topic/src/variables.tf b/bundles/gcp-pubsub-topic/src/variables.tf index 7d614c6..2429da1 100644 --- a/bundles/gcp-pubsub-topic/src/variables.tf +++ b/bundles/gcp-pubsub-topic/src/variables.tf @@ -34,11 +34,6 @@ variable "landing_zone" { self_link = string }) }) - workload_identity = object({ - service_account_email = string - service_account_id = string - service_account_name = string - }) enabled_apis = list(string) budget = object({ enabled = bool diff --git a/bundles/gcp-storage-bucket/src/main.tf b/bundles/gcp-storage-bucket/src/main.tf index 3f4e162..f62fa05 100644 --- a/bundles/gcp-storage-bucket/src/main.tf +++ b/bundles/gcp-storage-bucket/src/main.tf @@ -70,23 +70,15 @@ resource "google_storage_bucket" "main" { labels = var.md_metadata.default_tags } -# ─── Workload IAM Binding ───────────────────────────────────────────────────── -# Grant the landing zone's workload service account roles/storage.objectUser on -# this bucket. objectUser covers read and write of objects without granting -# delete or bucket-level admin operations. This follows the principle of least -# privilege — workloads that need to delete objects should bind objectAdmin -# explicitly in their own bundle. +# ─── No workload IAM binding here ──────────────────────────────────────────── +# GCS buckets do not own a runtime identity. The landing zone no longer provides +# a shared workload SA. Consumer bundles (e.g.
gcp-cloud-run-service) create their +# OWN service account and the Cloud Run bundle grants objectUser access on this +# bucket when connected on the canvas. # -# IAM role binding pattern for this series: -# member = "serviceAccount:" -# role = "roles/storage.objectUser" -# bucket = google_storage_bucket.main.name -# -# Downstream bundles that need read-only access should bind roles/storage.objectViewer -# on this bucket using the storage_bucket artifact's bucket_name field. - -resource "google_storage_bucket_iam_member" "workload_object_user" { - bucket = google_storage_bucket.main.name - role = "roles/storage.objectUser" - member = "serviceAccount:${var.landing_zone.workload_identity.service_account_email}" -} +# Artifact policy pattern — grant a consumer's SA object access: +# resource "google_storage_bucket_iam_member" "runtime_object_user" { +# bucket = var.storage_bucket.bucket_name +# role = "roles/storage.objectUser" +# member = "serviceAccount:${google_service_account.runtime.email}" +# } diff --git a/bundles/gcp-storage-bucket/src/variables.tf b/bundles/gcp-storage-bucket/src/variables.tf index fd6c0f5..ab231c4 100644 --- a/bundles/gcp-storage-bucket/src/variables.tf +++ b/bundles/gcp-storage-bucket/src/variables.tf @@ -34,11 +34,6 @@ variable "landing_zone" { self_link = string }) }) - workload_identity = object({ - service_account_email = string - service_account_id = string - service_account_name = string - }) enabled_apis = list(string) budget = object({ enabled = bool From eb0f3a36a542d152abe508e50a651457c6192b56 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Sun, 19 Apr 2026 22:25:34 -0700 Subject: [PATCH 06/15] Rewrite bundle READMEs and operator guides for POC handoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Audit pass on all 7 GCP bundle READMEs and operator guides for consistency and POC readiness.
Correctness fixes ----------------- gcp-pubsub-topic, gcp-storage-bucket, gcp-bigquery-dataset docs had drifted from the post-refactor code — they still referenced phantom IAM binding resources (workload_publisher, workload_object_user, workload_data_editor) and the removed workload_identity field on the landing zone artifact. Removed the stale references and replaced with the correct artifact-policy binding pattern for consumer bundles. Standards applied ----------------- - Removed marketing language (production-ready, robust, elegant, designed for, etc.) - Operator guides use exactly the four-header runbook structure: Non-obvious constraints, Troubleshooting, Day-2 operations, Useful commands. Empty sections omitted rather than force-filled. - Cut all params/deployed-config tables from operator guides — that information is on the Config page in the UI and made the guides feel like config reviews instead of incident runbooks. - Cut "verify the platform did what it said" sections. - Mustache interpolation added where useful for artifact values a developer pastes into a command at 2am: bucket_url, service_url, runtime_service_account_member, proxy_url, etc. - READMEs document hardcoded security controls and Checkov skips with reasons, presets, and assumptions. Short and scannable. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bundles/gcp-bigquery-dataset/README.md | 59 ++++++++------- bundles/gcp-bigquery-dataset/operator.md | 24 +++---- bundles/gcp-cloud-run-service/README.md | 87 ++++++++++------------- bundles/gcp-cloud-run-service/operator.md | 42 ++++------- bundles/gcp-landing-zone/README.md | 42 ++++++----- bundles/gcp-landing-zone/operator.md | 36 +++++----- bundles/gcp-network/README.md | 39 +++++----- bundles/gcp-network/operator.md | 14 ++-- bundles/gcp-pubsub-topic/README.md | 64 +++++++++-------- bundles/gcp-pubsub-topic/operator.md | 15 ++-- bundles/gcp-storage-bucket/README.md | 73 +++++++++---------- bundles/gcp-storage-bucket/operator.md | 35 +++++---- bundles/gcp-vertex-workbench/README.md | 79 +++++++++----------- bundles/gcp-vertex-workbench/operator.md | 34 +++++---- 14 files changed, 308 insertions(+), 335 deletions(-) diff --git a/bundles/gcp-bigquery-dataset/README.md b/bundles/gcp-bigquery-dataset/README.md index 8fff308..46834ed 100644 --- a/bundles/gcp-bigquery-dataset/README.md +++ b/bundles/gcp-bigquery-dataset/README.md @@ -1,32 +1,31 @@ # gcp-bigquery-dataset -Google Cloud BigQuery dataset with configurable location, default table expiration, and delete protection. Use this bundle to provision a managed analytics dataset for data platform workloads — Cloud Run pipelines, Vertex Workbench notebooks, Dataflow jobs, and ad-hoc SQL analytics. The landing zone's workload service account is automatically granted `dataEditor` access on the dataset. +Google Cloud BigQuery dataset with configurable location, default table expiration, and delete protection. Use this bundle to provision a managed analytics dataset for data platform workloads — Cloud Run pipelines, Vertex Workbench notebooks, Dataflow jobs, and ad-hoc SQL analytics. 
-## Purpose +## Use Cases -- Provisions a BigQuery dataset at a chosen location with an immutable dataset ID -- Configures optional default table expiration to control storage cost growth in non-production environments -- Supports delete protection to prevent accidental dataset destruction in production -- Grants `roles/bigquery.dataEditor` to the landing zone's workload service account on the dataset -- Emits a `catalog-demo/gcp-bigquery-dataset` artifact so downstream bundles can reference the dataset without hard-coding project or dataset identifiers +- Centralized analytics dataset consumed by multiple downstream services with scoped IAM +- Dev/staging datasets with automatic table expiration to control storage cost growth +- Production datasets with delete protection to prevent accidental data loss ## Resources Created | Resource | Type | Notes | |---|---|---| | `google_bigquery_dataset.main` | BigQuery dataset | Location, expiration, and delete protection set at provision time; Google-managed encryption | -| `google_bigquery_dataset_iam_member.workload_data_editor` | IAM binding | Grants `roles/bigquery.dataEditor` to the landing zone workload SA on the dataset | -## Artifacts Consumed (Connections) +This bundle does NOT create any IAM bindings. Consumer bundles (e.g., `gcp-cloud-run-service`, `gcp-vertex-workbench`) create their own service accounts and bind the appropriate roles on this dataset when connected on the canvas. 
+ +## Connections | Connection | Artifact Type | How It Is Used | |---|---|---| | `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | -| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` and `workload_identity.service_account_email` for the dataEditor IAM binding | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` for resource placement | -## Artifacts Produced +## Artifact Produced -The bundle publishes a `catalog-demo/gcp-bigquery-dataset` artifact with all fields needed for downstream bundles to query and load data. +**Artifact type:** `catalog-demo/gcp-bigquery-dataset` | Field | Type | Description | |---|---|---| @@ -36,30 +35,42 @@ The bundle publishes a `catalog-demo/gcp-bigquery-dataset` artifact with all fie | `location` | string | BigQuery location where the dataset is stored | | `friendly_name` | string or null | Human-readable display name if set; null otherwise | -Downstream bundles that need read-only access should bind `roles/bigquery.dataViewer` on the dataset using `dataset_id` and `project_id` from this artifact. Bundles requiring full ownership should bind `roles/bigquery.dataOwner`. +Consumer bundles bind IAM roles using `dataset_id` and `project_id` from this artifact. 
Example patterns: + +```hcl +# Read/write access (Cloud Run workers) +resource "google_bigquery_dataset_iam_member" "runtime_editor" { + project = var.bigquery_dataset.project_id + dataset_id = var.bigquery_dataset.dataset_id + role = "roles/bigquery.dataEditor" + member = "serviceAccount:${google_service_account.runtime.email}" +} + +# Read-only access (Vertex Workbench notebooks) +resource "google_bigquery_dataset_iam_member" "dataset_viewer" { + project = var.bigquery_dataset.project_id + dataset_id = var.bigquery_dataset.dataset_id + role = "roles/bigquery.dataViewer" + member = "serviceAccount:${google_service_account.instance.email}" +} +``` ## Compliance -### Hardcoded security baselines - -BigQuery dataset-level IAM is the access control mechanism — there are no per-object ACLs to configure. All access to this dataset must go through IAM bindings, which this bundle manages via the `workload_data_editor` resource. - ### Checkov skips | Check | Reason | |---|---| -| `CKV_GCP_81` | Requires CMEK on all BigQuery datasets. CMEK is intentionally out of scope for this bundle — all datasets use Google-managed encryption, which is appropriate for the workloads this bundle targets. Checkov fires this check whenever a `default_encryption_configuration` block is absent, making it a false positive here. If CMEK is required for a specific workload, a separate bundle with a KMS connection should be used. | - -### Production gating +| `CKV_GCP_81` | Requires CMEK on all BigQuery datasets. CMEK is intentionally out of scope for this bundle — Google-managed encryption is used. Checkov fires this check whenever a `default_encryption_configuration` block is absent. If CMEK is required, use a separate bundle with a KMS key connection. | The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. 
## Assumptions -- `bigquery.googleapis.com` must be enabled in the landing zone before deploying this bundle. Add it to `enabled_apis` in the `gcp-landing-zone` package config. +- `bigquery.googleapis.com` must be enabled in the landing zone before deploying. Add it to `enabled_apis` in the `gcp-landing-zone` package. - The `gcp_authentication` credential has `bigquery.admin` or equivalent IAM on the project. -- The landing zone's workload SA is granted `roles/bigquery.dataEditor` automatically; read-only or owner-level access for other consumers must be added by the downstream bundle. -- The dataset ID (`dataset_id`) is immutable after creation. Changing it requires destroying and recreating the dataset — all data will be lost unless exported first. +- `dataset_id` is immutable after creation. Changing it requires destroying and recreating the dataset — all data is lost unless exported first. +- `default_table_expiration_days` applies only to tables created after the setting is applied. Existing tables are not affected. ## Presets @@ -67,4 +78,4 @@ The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with un |---|---|---|---| | Dev | US | 30 days | Off | | Staging | US | 90 days | Off | -| Production | US | None (no expiration) | On | +| Production | US | None | On | diff --git a/bundles/gcp-bigquery-dataset/operator.md b/bundles/gcp-bigquery-dataset/operator.md index 9095186..5c19b76 100644 --- a/bundles/gcp-bigquery-dataset/operator.md +++ b/bundles/gcp-bigquery-dataset/operator.md @@ -8,36 +8,36 @@ templating: mustache **Dataset ID is immutable.** `dataset_id` cannot be changed in-place. To rename: export all tables, destroy the package, reprovision with the new ID, reload from GCS. Treat the dataset ID as permanent. -**Location is immutable.** Datasets cannot be moved between regions or multi-regions after creation. To change location: export all tables (`bq extract` to GCS), destroy the package, reprovision in the new location, reload. 
Budget for data transfer costs and downtime. +**Location is immutable.** Datasets cannot be moved between regions after creation. To change location: export all tables (`bq extract` to GCS), destroy the package, reprovision in the new location, reload. Budget for data transfer costs and downtime. **`default_table_expiration_ms` applies to NEW tables only.** Changing this on an existing dataset does not expire or modify existing tables. To set expiration on an existing table, update it directly via `bq update`. -**Delete protection requires a two-step destroy.** When `delete_protection = true`, the destroy will fail because `delete_contents_on_destroy = false` is enforced. To decommission: +**Delete protection requires a two-step destroy.** When `delete_protection = true`, the destroy will fail. To decommission: 1. Set `delete_protection = false` in the package config and deploy. 2. Then run the destroy. **Dataset-level IAM propagates to all tables, current and future.** For row-level or table-level isolation, use BigQuery row-level security policies or bind IAM at the table level separately. -**IAM bindings added outside Terraform are overwritten on the next apply.** For permanent bindings, add a `google_bigquery_dataset_iam_member` resource to the bundle source. +**This bundle creates no IAM bindings.** Consumer bundles bind their own service accounts to this dataset. If a service can't query or load data, the IAM binding is missing from the consumer bundle — not from here. **Cross-region queries are not supported.** BigQuery cannot join tables in different regions in a single query. Use Storage Transfer Service or BigQuery Data Transfer Service to replicate data first. -**Deploy fails with "bigquery.googleapis.com has not been used in project."** -Add `bigquery.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. 
- ## Troubleshooting +**Permission denied on dataset access.** +```bash +bq get-iam-policy {{artifacts.bigquery_dataset.dataset_full_name}} +``` +The required member should have `roles/bigquery.dataEditor` for read/write or `roles/bigquery.dataViewer` for read-only. If the binding is absent, redeploy the consumer bundle with the dataset wired on the canvas. + **Quota exceeded on concurrent jobs or daily bytes scanned.** -BigQuery per-project quotas are not manageable via this bundle. Check the BigQuery quota dashboard in the GCP console and request increases if needed. +BigQuery per-project quotas are not manageable through this bundle. Check the BigQuery quota dashboard in the GCP console and request increases if needed. **Streaming insert rows not expiring as expected.** Rows inserted via the streaming API have a delay before table expiration recalculation applies. Batch loads have no such lag. -**Permission denied on dataset access.** -```bash -bq get-iam-policy {{artifacts.bigquery_dataset.dataset_full_name}} -``` -The workload SA needs `roles/bigquery.dataEditor` for read/write or `roles/bigquery.dataViewer` for read-only. +**Deploy fails with "bigquery.googleapis.com has not been used in project."** +Add `bigquery.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. **Table schema mismatch or load failure.** ```bash diff --git a/bundles/gcp-cloud-run-service/README.md b/bundles/gcp-cloud-run-service/README.md index b952cbd..87f8fe5 100644 --- a/bundles/gcp-cloud-run-service/README.md +++ b/bundles/gcp-cloud-run-service/README.md @@ -1,49 +1,48 @@ # gcp-cloud-run-service -Google Cloud Run v2 service with automatic IAM binding for upstream data artifacts. 
Each instance of this bundle creates its **own runtime service account** and automatically grants it the minimum-privilege role on any connected upstream artifact (Pub/Sub topic, BigQuery dataset, GCS bucket) — no manual IAM wiring required. +Google Cloud Run v2 service. Each bundle instance creates its own runtime service account and automatically grants it the minimum-privilege role on any connected upstream artifact (Pub/Sub topic, BigQuery dataset, GCS bucket) — no manual IAM wiring required. ## Use Cases -- **Internal APIs and microservices** — low-latency HTTP services behind a load balancer or internal ingress, consuming Pub/Sub and BigQuery without internet exposure. -- **Event-driven workers** — services triggered by Pub/Sub push subscriptions or Cloud Scheduler, reading from GCS and writing to BigQuery. -- **Public APIs** — internet-facing HTTPS services with anonymous or token-authenticated access. -- **Data pipelines** — pull-based workers that read from GCS buckets and publish results to Pub/Sub or BigQuery. +- Internal APIs and microservices consuming Pub/Sub and BigQuery without internet exposure +- Event-driven workers triggered by Pub/Sub push subscriptions or Cloud Scheduler +- Public HTTPS APIs with anonymous or token-authenticated access +- Data pipeline workers reading from GCS and writing to BigQuery or Pub/Sub ## Use as a Runtime Template -This bundle is an example **runtime template** — an opinionated, org-wide standard for how Cloud Run services are provisioned. It encodes your platform's security baseline (per-service workload identity, ingress controls, compliance skips with documented rationale) and auto-wires IAM for common data dependencies. +This bundle is an example runtime template — an opinionated standard for how Cloud Run services are provisioned. It encodes a security baseline (per-service workload identity, ingress controls, compliance skips with documented rationale) and auto-wires IAM for common data dependencies. 
-The typical workflow for application teams: +Typical workflow: +1. Platform team publishes this template bundle (or a fork) to Massdriver. +2. Application developer runs `mass bundle new` pointing at the template to generate a bundle for their specific service. +3. The developer customizes image, connections, environment variables, and app-specific dependencies. The platform baseline is inherited. -1. **Ops/platform team** publishes this template bundle (or a fork of it) to Massdriver. -2. **Application developer** runs `mass bundle new` pointing at the template to generate a new bundle for their specific application. They customize it with their app's image, connections, environment variables, and any app-specific dependencies. -3. The per-app bundle inherits the org's runtime standards from the template; the developer only changes what's specific to their application. - -This separation keeps the platform baseline consistent across all services while letting application teams move independently. 
+ ## Resources Created -| Resource | Description | -|---|---| -| `google_service_account.runtime` | Per-service runtime SA — this bundle's own workload identity | -| `google_cloud_run_v2_service` | The Cloud Run v2 service running your container | -| `google_cloud_run_v2_service_iam_member` (allUsers) | Created only when `allow_unauthenticated = true` — grants public invoke access | -| `google_pubsub_topic_iam_member` | Created only when Pub/Sub topic is connected — grants `roles/pubsub.publisher` to runtime SA | -| `google_bigquery_dataset_iam_member` | Created only when BigQuery dataset is connected — grants `roles/bigquery.dataEditor` to runtime SA | -| `google_storage_bucket_iam_member` | Created only when Storage bucket is connected — grants `roles/storage.objectUser` to runtime SA | +| Resource | Type | Notes | +|---|---|---| +| `google_service_account.runtime` | Per-service runtime SA | This service's workload identity — one per bundle instance | +| `google_cloud_run_v2_service.main` | Cloud Run v2 service | Runs containers as the runtime SA | +| `google_cloud_run_v2_service_iam_member` (allUsers) | Public invoker IAM | Created only when `allow_unauthenticated = true` | +| `google_pubsub_topic_iam_member` | Pub/Sub publisher IAM | Created only when Pub/Sub topic is connected | +| `google_bigquery_dataset_iam_member` | BigQuery data editor IAM | Created only when BigQuery dataset is connected | +| `google_storage_bucket_iam_member` | GCS object user IAM | Created only when Storage bucket is connected | ## Connections ### Required -| Connection | Artifact Type | Purpose | +| Connection | Artifact Type | How It Is Used | |---|---|---| | `gcp_authentication` | `gcp-service-account` | GCP credentials used by Terraform to provision resources | | `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` and `network.region` | ### Optional -These connections are not required. 
When wired on the canvas, the bundle automatically grants this service's runtime SA the appropriate IAM role on the upstream resource. When absent, no IAM binding is created. +When connected on the canvas, the bundle automatically grants this service's runtime SA the listed IAM role. When absent, no binding is created. | Connection | Artifact Type | IAM Role Granted | |---|---|---| @@ -51,6 +50,8 @@ These connections are not required. When wired on the canvas, the bundle automat | `bigquery_dataset` | `catalog-demo/gcp-bigquery-dataset` | `roles/bigquery.dataEditor` on the dataset | | `storage_bucket` | `catalog-demo/gcp-storage-bucket` | `roles/storage.objectUser` on the bucket | +Connecting or disconnecting a canvas wire does not take effect until a Terraform apply runs. + ## Artifact Produced **Artifact type:** `catalog-demo/gcp-cloud-run-service` @@ -65,47 +66,37 @@ These connections are not required. When wired on the canvas, the bundle automat | `runtime_service_account_email` | string | Email of this service's own runtime SA | | `runtime_service_account_member` | string | IAM principal string (`serviceAccount:`) for downstream bindings | -The `runtime_service_account_member` field is designed for downstream bundles (Scheduler, Pub/Sub push) that need to grant `roles/run.invoker` to this service's identity. - -## Parameters - -| Parameter | Type | Default | Description | -|---|---|---|---| -| `image` | string | `gcr.io/cloudrun/hello` | Container image to deploy. Default is deployable out of the box for testing. | -| `port` | integer | `8080` | Port the container listens on. Must match the process — mismatch causes revision failure. | -| `cpu` | enum | `1` | vCPUs per instance: `1`, `2`, `4`, `8` | -| `memory` | enum | `512Mi` | Memory per instance: `256Mi` through `32Gi` | -| `min_instances` | integer | `0` | Scale-to-zero when 0. Any value above 0 means you pay for idle capacity. | -| `max_instances` | integer | `100` | Cap on autoscaling. 
Reduce to protect downstream systems from traffic spikes. | -| `ingress` | enum | `internal` | Traffic source restriction: `all`, `internal`, `internal-and-cloud-load-balancing` | -| `allow_unauthenticated` | boolean | `false` | Grant `allUsers` `roles/run.invoker` for public anonymous access | - -## Presets - -| Preset | Ingress | Min | Max | CPU | Memory | Unauth | -|---|---|---|---|---|---|---| -| Internal | `internal` | 0 | 10 | 1 | 512Mi | false | -| Public API | `all` | 1 | 100 | 2 | 1Gi | true | -| Worker | `internal` | 1 | 50 | 2 | 2Gi | false | +`runtime_service_account_member` is designed for downstream bundles (Scheduler, Pub/Sub push) that need to grant `roles/run.invoker` to this service's identity. ## Compliance -### Hardcoded Controls +### Hardcoded controls -| Control | Value | Rationale | +| Control | Value | Reason | |---|---|---| | Per-service runtime identity | `google_service_account.runtime` (one per bundle instance) | Each service gets its own SA with bindings only to resources it connects to — no shared SA that grants access across all workloads | | Resource labels | Massdriver default tags | Enforces cost attribution and environment tagging on all revisions | -### Skipped Checks +### Checkov skips | Check | Reason | |---|---| | `CKV_GCP_102` | Ingress is intentionally configurable. The check fires on any non-internal service without distinguishing IAM controls. Internal-preset services pass this check without the skip; only public-ingress services need it bypassed. | | `CKV_GCP_103` | Binary Authorization requires a pre-configured attestor policy at the project level. Enabling it per-service without an attestor causes all deployments to fail. Teams requiring binary authorization should enforce it via `google_binary_authorization_policy`. | +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. 
+ ## Assumptions - The landing zone provides `project_id` and `network.region`. It does NOT provide a workload SA — this bundle creates its own. -- VPC connector / direct VPC egress is not provisioned by this bundle. Cloud Run uses Google's serverless infrastructure by default. If you need to reach VPC-private resources (e.g., Cloud SQL without public IP), add a `google_vpc_access_connector` resource and reference it in the template's `vpc_access` block. +- The runtime SA does not automatically have `roles/artifactregistry.reader`. If your image is in Artifact Registry, grant that role manually or add it to the bundle source. +- VPC connector or direct VPC egress is not provisioned by this bundle. If you need to reach VPC-private resources (e.g., Cloud SQL without public IP), add a `google_vpc_access_connector` resource to the bundle source. - The default image (`gcr.io/cloudrun/hello`) is the Google-managed hello-world container. Replace it with your application image before a real deployment. + +## Presets + +| Preset | Ingress | Min Instances | Max Instances | CPU | Memory | Unauthenticated | +|---|---|---|---|---|---|---| +| Internal | `internal` | 0 | 10 | 1 | 512Mi | false | +| Public API | `all` | 1 | 100 | 2 | 1Gi | true | +| Worker | `internal` | 1 | 50 | 2 | 2Gi | false | diff --git a/bundles/gcp-cloud-run-service/operator.md b/bundles/gcp-cloud-run-service/operator.md index 46a3fe7..339f7ba 100644 --- a/bundles/gcp-cloud-run-service/operator.md +++ b/bundles/gcp-cloud-run-service/operator.md @@ -6,19 +6,17 @@ templating: mustache ## Non-obvious constraints -**Each bundle instance creates its own service account.** Unlike the previous pattern where all services shared the landing zone's workload SA, this bundle provisions `google_service_account.runtime` scoped to this specific service. If you redeploy after the old landing-zone SA is gone from state, the Cloud Run service will be updated to run as the new per-service SA. 
Cold start on the first revision after a SA switch is expected. +**Each bundle instance creates its own service account.** The SA email is derived from the bundle's `name_prefix`. If the package is renamed, the SA is destroyed and recreated. Any out-of-band IAM bindings referencing the old SA email (e.g., manually granted Artifact Registry reader) must be reapplied. Canvas-wired bindings (Pub/Sub, BigQuery, GCS) are recreated automatically on the next deploy. -**Rotating the runtime SA is destructive — expect cold start.** The SA `account_id` is derived from the bundle's `name_prefix`. Changing the package name or name_prefix recreates the SA with a new email. Any out-of-band IAM bindings referencing the old SA email (e.g., manually granted Artifact Registry reader) must be reapplied. Canvas-wired bindings (Pub/Sub, BigQuery, GCS) are recreated automatically on the next deploy. +**New deployments route 100% of traffic to the latest revision immediately.** Blue/green splits must be configured before deploying the new revision. You cannot retroactively split traffic between revisions once the new one is live at 100%. -**New deployments route 100% of traffic to the latest revision immediately.** Blue/green splits must be configured before deploying the new revision. You cannot retroactively split traffic between an old and new revision once the new one is live at 100%. +**Changing `ingress` triggers a new revision and a cold start.** Even if `min_instances > 0`, an ingress change forces revision replacement. -**Changing `ingress` triggers a new revision and a cold start.** Even if `min_instances > 0`, an ingress change forces revision replacement. Expect a brief cold start. +**`min_instances > 0` means continuous billing.** You pay for idle capacity at the full CPU+memory rate at all times. -**`min_instances > 0` means continuous billing.** No scale-to-zero. You pay for idle capacity at the full CPU+memory rate at all times. 
+**Container port must match what the image listens on.** A mismatch causes revision health check failure and Cloud Run rolls back. Error in logs: `Container failed to start. Failed to start and then listen on the port defined by the PORT environment variable.` -**Container port must match what the image listens on.** If the image doesn't listen on the configured port, the revision fails health checks and Cloud Run rolls back. Error in logs: `Container failed to start. Failed to start and then listen on the port defined by the PORT environment variable.` Check application logs before the platform logs. - -**Image pull from Artifact Registry: the runtime SA needs `roles/artifactregistry.reader`.** This bundle does not grant that role. If a revision fails with `image not found` or `permission denied` at startup, grant the role manually or add it to the bundle: +**The runtime SA does not have `roles/artifactregistry.reader` by default.** If a revision fails with `image not found` or `permission denied` at startup, grant the role: ```bash gcloud artifacts repositories add-iam-policy-binding \ --location={{artifacts.cloud_run_service.location}} \ @@ -27,7 +25,7 @@ gcloud artifacts repositories add-iam-policy-binding \ --role="roles/artifactregistry.reader" ``` -**Connecting or disconnecting canvas wires requires a Massdriver deploy to take effect.** Wiring an artifact on the canvas does not grant IAM access. The Terraform apply must run to create or destroy the IAM binding. +**Canvas wire changes require a deploy to take effect.** Connecting or disconnecting a data artifact on the canvas does not grant or revoke IAM access. The Terraform apply must run to create or destroy the binding. ## Troubleshooting @@ -39,7 +37,7 @@ gcloud logging read \ --project={{artifacts.cloud_run_service.project_id}} \ --limit=20 ``` -Check for: missing environment variables, failed DB connections, wrong port. Test locally: `docker run -p 8080: ` and confirm it starts quickly. 
+Check for: missing environment variables, wrong port, failed startup connections. Test locally: `docker run -p 8080:<port> <image>` and confirm it starts quickly. **5xx errors in production.** ```bash @@ -50,10 +48,7 @@ gcloud logging read \ --format="table(timestamp,httpRequest.status,httpRequest.requestUrl)" ``` -**IAM binding not applied after connecting a canvas wire.** -Connect the wire on the canvas AND redeploy this package. The binding does not exist until Terraform applies it. - -**Service can't access a resource it's connected to.** +**Service can't access a connected resource (Pub/Sub, BigQuery, GCS).** Confirm the canvas wire is connected AND the package has been deployed since the wire was added. Check the specific IAM binding: ```bash # Pub/Sub @@ -73,20 +68,20 @@ The member should be `{{artifacts.cloud_run_service.runtime_service_account_memb **Rolling back to a prior revision:** ```bash -# 1. List revisions to find the last known-good one +# List revisions to find the last known-good one gcloud run revisions list \ --service={{artifacts.cloud_run_service.service_name}} \ --region={{artifacts.cloud_run_service.location}} \ --project={{artifacts.cloud_run_service.project_id}} \ --format="table(name,status.conditions[0].status)" -# 2. Shift 100% traffic to the prior revision +# Shift 100% traffic to the prior revision gcloud run services update-traffic {{artifacts.cloud_run_service.service_name}} \ --region={{artifacts.cloud_run_service.location}} \ --project={{artifacts.cloud_run_service.project_id}} \ --to-revisions=<revision-name>=100 ``` -This rollback is manual and temporary. The next Massdriver deploy will override it. Fix the image or config, then redeploy. +This rollback is manual and temporary. The next Massdriver deploy overrides it. Fix the image or config, then redeploy. **Pinning to a digest to prevent silent image changes:** ```bash @@ -95,9 +90,7 @@ gcloud container images describe <image>:<tag> \ # Use the output sha256:... in the image param: <image>@sha256:...
``` -**Scaling changes:** Update `min_instances` or `max_instances` params and redeploy. In-place safe. - -**Rotating the runtime service account:** The SA is derived from the bundle's name_prefix. Rotating requires renaming, which is destructive — the old SA is deleted, a new one is created, and all canvas-wired IAM bindings are recreated on next deploy. Any out-of-band bindings (e.g., Artifact Registry reader) must be reapplied manually. +**Scaling changes:** Update `min_instances` or `max_instances` params and redeploy. In-place, safe. ## Useful commands @@ -134,13 +127,4 @@ gcloud run services get-iam-policy {{artifacts.cloud_run_service.service_name}} # Describe the runtime service account gcloud iam service-accounts describe {{artifacts.cloud_run_service.runtime_service_account_email}} \ --project={{artifacts.cloud_run_service.project_id}} - -# Check runtime SA's IAM bindings on a connected Pub/Sub topic -gcloud pubsub topics get-iam-policy \ - --project={{artifacts.cloud_run_service.project_id}} \ - --format="table(bindings.role,bindings.members)" - -# Check runtime SA's IAM bindings on a connected GCS bucket -gcloud storage buckets get-iam-policy gs:// \ - --format="table(bindings.role,bindings.members)" ``` diff --git a/bundles/gcp-landing-zone/README.md b/bundles/gcp-landing-zone/README.md index 35c90fd..53417fe 100644 --- a/bundles/gcp-landing-zone/README.md +++ b/bundles/gcp-landing-zone/README.md @@ -1,35 +1,37 @@ # gcp-landing-zone -Project-level governance construct for a GCP data platform. Deploy this once per environment before any workload bundles. It: +Project-level governance construct for a GCP data platform. Deploy this once per environment before any workload bundles. 
- Enables GCP service APIs required by your data platform stack -- Applies **project-level IAM bindings** for human operators and groups (e.g., `roles/viewer` to `group:data-analysts@example.com`) -- Enforces **org-policy guardrails** at the project level (e.g., disable SA key creation, block public GCS access) -- Optionally configures a **billing budget** with spend-threshold email alerts -- Folds the input `gcp-network` artifact into its own `landing_zone` output so downstream bundles need only one connection instead of wiring network separately +- Applies project-level IAM bindings for human operators and groups (e.g., `roles/viewer` to `group:data-analysts@example.com`) +- Enforces org-policy guardrails at the project level (e.g., disable SA key creation, block public GCS access) +- Optionally configures a billing budget with spend-threshold email alerts +- Folds the input `gcp-network` artifact into its own `landing_zone` output so downstream bundles need only one connection -**This bundle does NOT provision workload service accounts.** Each consumer bundle (Cloud Run, etc.) creates its own runtime SA with least-privilege bindings on the resources it owns. Project-level IAM here is for human operators and group access management. +**This bundle does NOT provision workload service accounts.** Each consumer bundle (Cloud Run, Vertex Workbench) creates its own runtime SA with least-privilege bindings on the specific resources it uses. Project-level IAM here is for human operators and group access only. 
## Resources Created | Resource | Type | Notes | |---|---|---| -| `google_project_service.apis` | API enablement (one per API) | `disable_on_destroy = false` to avoid disrupting other resources | -| `google_project_iam_member.operators` | Project IAM bindings | One resource per `{role, member}` entry; additive (non-authoritative) | +| `google_project_service.apis` | API enablement (one per API) | `disable_on_destroy = false` — removing an API from params does not disable it in GCP | +| `google_project_iam_member.operators` | Project IAM bindings | One resource per `{role, member}` entry; additive, non-authoritative | | `google_project_organization_policy.guardrails` | Org policy constraints | Project-scoped; one resource per constraint | | `google_billing_budget.environment` | Billing budget | Created only when `budget.enabled = true` | | `google_monitoring_notification_channel.budget_email` | Email alert channel | Created only when budget is enabled and `notification_emails` is non-empty | -## Artifacts Consumed (Connections) +## Connections | Connection | Artifact Type | How It Is Used | |---|---|---| | `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | | `network` | `catalog-demo/gcp-network` | Network metadata passed through into the `landing_zone` artifact for downstream use | -## Artifacts Produced +## Artifact Produced -The bundle publishes a single `catalog-demo/gcp-landing-zone` artifact. Downstream bundles connect to this one artifact to get everything they need. +**Artifact type:** `catalog-demo/gcp-landing-zone` + +Downstream bundles connect to this one artifact to get everything they need. | Field | Description | |---|---| @@ -40,34 +42,32 @@ The bundle publishes a single `catalog-demo/gcp-landing-zone` artifact. 
Downstre | `network.primary_subnet.name` | Subnet name | | `network.primary_subnet.cidr` | Subnet CIDR range | | `network.primary_subnet.self_link` | Subnet self-link URI | -| `enabled_apis` | List of APIs that were enabled | -| `iam_bindings` | Informational list of project-level `{role, member}` bindings applied by this landing zone | +| `enabled_apis` | List of APIs enabled by this landing zone | +| `iam_bindings` | Informational list of project-level `{role, member}` bindings applied — audit trail only, not consumed by downstream bundles | | `budget.enabled` | Whether a budget was configured | | `budget.budget_name` | Budget display name (null when disabled) | | `budget.billing_account_id` | Billing account the budget is attached to (null when disabled) | | `budget.amount_usd` | Monthly budget limit in USD (null when disabled) | -## IAM Pattern for Downstream Consumer Bundles +## IAM Pattern for Consumer Bundles -Each downstream bundle creates its OWN service account for its runtime identity, then binds that SA to the specific resources it needs. The landing zone does not provision or share a workload SA. Example pattern in a consumer bundle: +Each consumer bundle creates its own service account and binds it to the specific resources it uses. The landing zone does not provide or share a workload SA. 
Example: ```hcl -# Consumer bundle creates its own runtime SA resource "google_service_account" "runtime" { project = var.landing_zone.project_id account_id = "${var.md_metadata.name_prefix}-sa" display_name = "Runtime SA for ${var.md_metadata.name_prefix}" } -# Consumer bundle binds its SA only to the resources it actually needs resource "google_bigquery_dataset_iam_member" "runtime_editor" { - dataset_id = google_bigquery_dataset.main.dataset_id + dataset_id = var.bigquery_dataset.dataset_id role = "roles/bigquery.dataEditor" member = "serviceAccount:${google_service_account.runtime.email}" } ``` -The artifact policy comments in each data-resource artdef (`gcp-pubsub-topic`, `gcp-bigquery-dataset`, `gcp-storage-bucket`) are the canonical role-binding reference. +The artifact policy comments in `gcp-pubsub-topic`, `gcp-bigquery-dataset`, and `gcp-storage-bucket` source files are the canonical role-binding reference. ## Compliance @@ -75,7 +75,7 @@ The artifact policy comments in each data-resource artdef (`gcp-pubsub-topic`, ` | Control | Mechanism | Reason | |---|---|---| -| Additive (non-authoritative) IAM | `google_project_iam_member` (per-binding) | Avoids clobbering bindings set by GCP defaults or other automation | +| Additive IAM only | `google_project_iam_member` (per-binding, non-authoritative) | Avoids clobbering bindings set by GCP defaults or other automation | | APIs not disabled on destroy | `disable_on_destroy = false` | Prevents accidental disruption of other resources that depend on the same APIs | ### Checkov skips @@ -84,8 +84,6 @@ The artifact policy comments in each data-resource artdef (`gcp-pubsub-topic`, ` |---|---| | `CKV_GCP_118` | Skipped on `google_project_service` — API enablement resources do not accept IAM policies | -### Production gating - The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. 
## Assumptions diff --git a/bundles/gcp-landing-zone/operator.md b/bundles/gcp-landing-zone/operator.md index 0cf3448..a8bb171 100644 --- a/bundles/gcp-landing-zone/operator.md +++ b/bundles/gcp-landing-zone/operator.md @@ -6,19 +6,19 @@ templating: mustache ## Non-obvious constraints -**This bundle manages project-level IAM for humans and groups, NOT workload service accounts.** Do not add workload SAs here. Consumer bundles (Cloud Run, etc.) own their own runtime SAs. If you see unexpected workload SAs here, they are from an older version of this bundle that has since been refactored. +**This bundle manages project-level IAM for humans and groups, not workload service accounts.** Consumer bundles (Cloud Run, Vertex Workbench) own their own runtime SAs. Do not add workload SAs here. -**IAM bindings are additive — they are never removed except when removed from params.** `google_project_iam_member` does not reconcile the full project IAM policy. Removing a binding from params and redeploying will destroy that specific binding resource; all other project-level bindings (from GCP defaults, other automation, or the Console) remain untouched. +**IAM bindings are additive and only removed when explicitly deleted from params.** `google_project_iam_member` does not reconcile the full project IAM policy. Removing a binding from params and redeploying destroys only that specific binding resource — all other project-level bindings remain untouched. -**Org policies are project-scoped, not org-wide.** The `google_project_organization_policy` resource applies constraints at the project level only. Org-wide enforcement requires setting the policy at the org node, which is out of scope for this bundle. +**Org policies are project-scoped, not org-wide.** `google_project_organization_policy` applies constraints at the project level only. Org-wide enforcement requires setting the policy at the org node, which is out of scope for this bundle. 
-**Removing an API from `enabled_apis` does not disable it in GCP.** The `disable_on_destroy = false` flag means Terraform removes the state entry but never calls the GCP disable API. The API stays enabled. To actually disable it, run `gcloud services disable --project={{artifacts.landing_zone.project_id}}` manually after confirming no resources depend on it. +**Removing an API from `enabled_apis` does not disable it in GCP.** `disable_on_destroy = false` means Terraform removes the state entry but never calls the GCP disable API. The API stays enabled. To actually disable it, run `gcloud services disable` manually after confirming no resources depend on it. **Budget requires Cloud Billing linked to the project.** If deploy fails with a billing budget error, confirm the project has a billing account attached in the GCP console before enabling the budget param. **Budget alert emails require a verified notification channel.** The Google Cloud Monitoring email channel must be verified in GCP before alerts deliver. Billing admins on the account always receive alerts regardless of channel configuration. -**Newly added APIs can take 1–2 minutes to propagate.** If a downstream bundle deploy fails immediately after adding an API here, wait a minute and retry. +**Newly added APIs can take 1–2 minutes to propagate.** If a downstream bundle deploy fails immediately after adding an API here, wait ~60 seconds and retry. ## Troubleshooting @@ -44,7 +44,7 @@ gcloud services list --enabled --project={{artifacts.landing_zone.project_id}} | If nothing returns, add `billingbudgets.googleapis.com` to `enabled_apis` and redeploy before enabling the budget. **Org policy apply fails with "403 PERMISSION_DENIED".** -The deploy credential (`gcp_authentication`) needs `orgpolicy.policy.set` at the project level. 
Grant it: +The deploy credential needs `orgpolicy.policy.set` at the project level: ```bash gcloud projects add-iam-policy-binding {{artifacts.landing_zone.project_id}} \ --member="serviceAccount:" \ @@ -52,27 +52,27 @@ gcloud projects add-iam-policy-binding {{artifacts.landing_zone.project_id}} \ ``` **An IAM binding appears in GCP but is not in params.** -If the binding is a GCP default or was added outside Terraform, it will not be touched by Massdriver. If it needs to be removed, use `gcloud` or the Console — Terraform only manages the specific bindings in `iam_bindings`. +If the binding was added outside Terraform, it will not be touched by Massdriver. To remove it, use `gcloud` or the Console. ## Day-2 operations -**Adding a human operator binding:** Add `{role, member}` to `iam_bindings` and redeploy. The new `google_project_iam_member` resource is additive — no existing bindings are touched. +**Adding a human operator binding:** Add `{role, member}` to `iam_bindings` and redeploy. Additive — no existing bindings are touched. -**Removing a human operator binding:** Remove the entry from `iam_bindings` and redeploy. Only that specific binding resource is destroyed. No other project IAM is affected. +**Removing a human operator binding:** Remove the entry from `iam_bindings` and redeploy. Only that specific binding resource is destroyed. -**Adding an org policy constraint:** Add `{constraint, enforced}` to `org_policies` and redeploy. Each constraint is an independent resource. +**Adding an org policy constraint:** Add `{constraint, enforced}` to `org_policies` and redeploy. -**Removing an org policy constraint:** Remove the entry from `org_policies` and redeploy. The constraint is removed from the project — the org's inherited policy (if any) applies after removal. +**Removing an org policy constraint:** Remove the entry from `org_policies` and redeploy. The org's inherited policy (if any) applies after removal. 
-**Adding APIs after initial deploy:** Update `enabled_apis` in the package config and redeploy. Adding an API adds a new `google_project_service` resource without touching existing ones. +**Adding APIs after initial deploy:** Update `enabled_apis` and redeploy. Existing APIs are not touched. -**Disabling an API:** Remove it from `enabled_apis` and redeploy. Terraform drops the state entry but does NOT call the GCP disable API. Manually disable via `gcloud services disable` if required. +**Disabling an API:** Remove it from `enabled_apis` and redeploy. Terraform drops the state entry but does NOT call the GCP disable API. Manually disable via `gcloud services disable` if needed. **Changing budget amount or alert thresholds:** Update params and redeploy. The `google_billing_budget` resource updates in-place. -**Disabling the budget after it was enabled:** Set `budget.enabled = false` and redeploy. The budget and notification channel are destroyed. Spend is not affected — only alerting is removed. +**Disabling the budget after it was enabled:** Set `budget.enabled = false` and redeploy. The budget and notification channel are destroyed. -**Rotating the deploy credential:** Update the GCP credential in the Massdriver UI under environment credential settings, then redeploy. Terraform state does not hold the credential — it is injected at plan time. +**Rotating the deploy credential:** Update the GCP credential in the Massdriver UI under environment credential settings, then redeploy. 
## Useful commands @@ -92,6 +92,10 @@ gcloud resource-manager org-policies list \ gcloud resource-manager org-policies describe constraints/iam.disableServiceAccountKeyCreation \ --project={{artifacts.landing_zone.project_id}} -# List all service accounts in the project (workload SAs are owned by consumer bundles, not this one) +# List all service accounts in the project (workload SAs are owned by consumer bundles) gcloud iam service-accounts list --project={{artifacts.landing_zone.project_id}} + +# Manually disable an API (only needed if you removed it from enabled_apis and want it actually off) +gcloud services disable <api-name>.googleapis.com \ + --project={{artifacts.landing_zone.project_id}} ``` diff --git a/bundles/gcp-network/README.md b/bundles/gcp-network/README.md index e007de5..a7ae332 100644 --- a/bundles/gcp-network/README.md +++ b/bundles/gcp-network/README.md @@ -1,45 +1,42 @@ # gcp-network -Minimal GCP VPC network with a single regional subnet. This is the foundational networking bundle for the GCP data platform stack. Other bundles — including `gcp-landing-zone`, Cloud Run, and Vertex Workbench — consume the `gcp-network` artifact it produces. +Minimal GCP VPC network with a single regional subnet. Deploy this before `gcp-landing-zone` — the landing zone consumes the `gcp-network` artifact and passes it downstream so other bundles only need one connection.
-## Purpose +## Use Cases -Creates a production-ready VPC with sensible defaults: - -- VPC created in custom (non-auto) mode so subnets are explicitly managed -- Flow logging enabled on the subnet for visibility into traffic -- Private Google Access enabled on the subnet so workloads reach Google APIs without a NAT gateway -- A deny-all ingress firewall rule at priority 65534 enforces explicit allowlisting — workload bundles add targeted allow rules on top +- Foundational networking for a GCP data platform stack +- Single regional subnet with Private Google Access so VMs reach GCP APIs without a NAT gateway +- Baseline deny-all ingress policy; workload bundles layer their own allow rules on top ## Resources Created | Resource | Type | Notes | |---|---|---| -| `google_compute_network.vpc` | VPC network | Custom subnet mode, global | -| `google_compute_subnetwork.primary` | Regional subnet | Flow logging on, Private Google Access on | +| `google_compute_network.vpc` | VPC network | Custom subnet mode; GCP does not auto-create subnets in other regions | +| `google_compute_subnetwork.primary` | Regional subnet | Flow logging on (0.5 sampling), Private Google Access on | | `google_compute_firewall.deny_all_ingress` | Firewall rule | Deny all ingress at priority 65534 | -## Artifacts Consumed (Connections) +## Connections | Connection | Artifact Type | How It Is Used | |---|---|---| -| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key used for the Google provider | +| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | -## Artifacts Produced +## Artifact Produced -The bundle publishes a `gcp-network` artifact with the following fields: +**Artifact type:** `gcp-network` | Field | Description | |---|---| | `project_id` | GCP project the VPC belongs to | | `network_name` | Name of the VPC network resource | -| `network_self_link` | Full self-link 
URI for the VPC (used by resource references) | +| `network_self_link` | Full self-link URI for the VPC | | `region` | Region of the primary subnet | | `primary_subnet.name` | Subnet resource name | | `primary_subnet.cidr` | Primary IP range of the subnet | | `primary_subnet.self_link` | Full self-link URI for the subnet | -Downstream bundles (e.g., `gcp-landing-zone`) pass this artifact through their own artifact, so further-downstream bundles only need one connection. +This artifact is consumed by `gcp-landing-zone`, which passes it through into its own artifact so downstream bundles (Cloud Run, Vertex Workbench) only need to connect to the landing zone. ## Compliance @@ -47,14 +44,12 @@ Downstream bundles (e.g., `gcp-landing-zone`) pass this artifact through their o | Control | Mechanism | Reason | |---|---|---| -| Deny-all ingress | `google_compute_firewall.deny_all_ingress` at priority 65534 | Satisfies CKV2_GCP_18; forces explicit allowlisting per workload | +| Deny-all ingress | `google_compute_firewall.deny_all_ingress` at priority 65534 | Enforces explicit allowlisting per workload (Checkov CKV2_GCP_18) | | Custom subnet mode | `auto_create_subnetworks = false` | Prevents GCP from auto-creating subnets in every region | -| Private Google Access | `private_ip_google_access = true` | Lets VMs reach Google APIs over internal IPs without egress | -| Flow logging | `log_config` block with 0.5 sampling | Network audit trail; enables traffic troubleshooting | - -### Checkov posture +| Private Google Access | `private_ip_google_access = true` | VMs reach GCP APIs over internal IPs without egress or NAT | +| Flow logging | `log_config` block, 0.5 sampling | Network audit trail for traffic troubleshooting | -There is no `.checkov.yml` skip list for this bundle — all findings are either satisfied by the hardcoded controls above or blocked in production via `halt_on_failure`. 
+No Checkov skips — all findings are satisfied by the hardcoded controls above or blocked in production via `halt_on_failure`. The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with remaining high-severity findings when the environment target matches `prod`, `prd`, or `production`. diff --git a/bundles/gcp-network/operator.md b/bundles/gcp-network/operator.md index fdc758f..e5cf85b 100644 --- a/bundles/gcp-network/operator.md +++ b/bundles/gcp-network/operator.md @@ -8,13 +8,13 @@ templating: mustache **Network name is immutable.** Changing it forces replacement of the entire VPC and all dependent resources (subnets, firewall rules, peerings). Treat it as permanent after first deploy. -**Subnet CIDR is immutable.** GCP does not support in-place CIDR changes. To change it: destroy the package (all resources in the subnet must be decommissioned first), then reprovision with the new range. Plan a maintenance window. +**Subnet CIDR is immutable.** GCP does not support in-place CIDR changes. To change it: decommission all resources in the subnet, destroy this package, then reprovision with the new range. **Subnet region is immutable.** The subnet's region cannot be changed after creation. A region change requires destroy and recreate. -**Deny-all firewall is hardcoded at priority 65534.** This bundle creates a single baseline deny-all ingress rule. No traffic is allowed by default. Workload bundles (Cloud Run, Vertex, etc.) layer their own allow rules at lower priority numbers above it. +**Deny-all firewall is hardcoded at priority 65534.** This bundle creates a single baseline deny-all ingress rule. No ingress traffic is allowed by default. Workload bundles (Cloud Run, Vertex, etc.) layer their own allow rules at lower priority numbers. -**VPC is global; the subnet is regional.** The VPC itself has no region. Only the subnet is regional. 
Cross-region resources can share the VPC but must use their own regional subnets — extend the Terraform source if additional subnets are needed. +**VPC is global; the subnet is regional.** Cross-region resources can share the VPC but need their own regional subnets — extend the Terraform source if additional subnets are needed. **Deleting the network fails if anything is still attached.** Terraform will error if VMs, Cloud Run VPC connectors, GKE nodes, or other resources are still using the network. Decommission all dependent packages first. @@ -29,7 +29,7 @@ gcloud compute networks list-associated-resources {{artifacts.network.network_na Decommission those packages first, then retry destroy. **Firewall rules not taking effect.** -Rules are evaluated by priority (lowest number wins). Check the full rule list to find conflicts: +Rules are evaluated by priority (lowest number wins). Check the full rule list for conflicts: ```bash gcloud compute firewall-rules list \ --filter="network:{{artifacts.network.network_name}}" \ @@ -42,11 +42,11 @@ Ensure `compute.googleapis.com` is enabled in the landing zone's `enabled_apis`. ## Day-2 operations -**Expanding or changing CIDR:** Not supported in-place. Must destroy and recreate. All resources in the subnet must be decommissioned first. +**Expanding or changing CIDR:** Not supported in-place. Must destroy and recreate. Decommission all resources in the subnet first. -**Adding subnets:** This bundle provisions one regional subnet. For additional subnets (GKE secondary ranges, separate workload tiers), extend the Terraform source directly. +**Adding subnets:** This bundle provisions one regional subnet. For additional subnets, extend the Terraform source directly. -**VPC peering:** Use `gcloud compute networks peerings create` or add a `google_compute_network_peering` resource to the bundle source. Ensure CIDR ranges don't overlap between peered VPCs. 
+**VPC peering:** Add a `google_compute_network_peering` resource to the bundle source. Ensure CIDR ranges don't overlap between peered VPCs. **Querying VPC flow logs:** Flow logs are stored in Cloud Logging under resource type `gce_subnetwork`. Sampling is 50% at 5-second aggregation intervals. diff --git a/bundles/gcp-pubsub-topic/README.md b/bundles/gcp-pubsub-topic/README.md index 93da2d8..6e6195c 100644 --- a/bundles/gcp-pubsub-topic/README.md +++ b/bundles/gcp-pubsub-topic/README.md @@ -1,13 +1,12 @@ # gcp-pubsub-topic -Google Cloud Pub/Sub topic with optional dead-letter queue (DLQ). Use this bundle to provision a managed message topic for event-driven workloads — Cloud Run consumers, Dataflow pipelines, BigQuery subscriptions, and similar. The landing zone's workload service account is automatically granted publisher access. +Google Cloud Pub/Sub topic with optional dead-letter queue (DLQ). Use this bundle to provision a managed message topic for event-driven workloads — Cloud Run services, Dataflow pipelines, BigQuery subscriptions, and similar. -## Purpose +## Use Cases -- Provisions a Pub/Sub topic with configurable retention -- Optionally provisions a companion DLQ topic for undeliverable messages -- Grants `roles/pubsub.publisher` to the landing zone's workload service account on the main topic -- Emits a `catalog-demo/gcp-pubsub-topic` artifact so downstream bundles can reference the topic without hard-coding names +- Decoupling producers from consumers in event-driven architectures +- Buffering messages for downstream workers that process at their own pace +- Capturing undeliverable messages in a DLQ for retry or inspection ## Resources Created @@ -15,28 +14,42 @@ Google Cloud Pub/Sub topic with optional dead-letter queue (DLQ). 
Use this bundl |---|---|---| | `google_pubsub_topic.main` | Main Pub/Sub topic | Retention and ordering label set at provision time | | `google_pubsub_topic.dlq` | Dead-letter topic | Created only when `dlq.enabled = true` | -| `google_pubsub_topic_iam_member.workload_publisher` | IAM binding | Grants `roles/pubsub.publisher` to the landing zone workload SA on the main topic | -## Artifacts Consumed (Connections) +This bundle does NOT create any IAM bindings. Consumer bundles (e.g., `gcp-cloud-run-service`) create their own service accounts and bind the appropriate roles on this topic when connected on the canvas. + +## Connections | Connection | Artifact Type | How It Is Used | |---|---|---| | `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | -| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` and `workload_identity.service_account_email` for the publisher IAM binding | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` for resource placement | -## Artifacts Produced +## Artifact Produced -The bundle publishes a `catalog-demo/gcp-pubsub-topic` artifact. DLQ fields are present only when the DLQ is enabled. 
+**Artifact type:** `catalog-demo/gcp-pubsub-topic` -| Field | Description | Present | +| Field | Present | Description | |---|---|---| -| `project_id` | GCP project ID | Always | -| `topic_name` | Main topic resource name | Always | -| `topic_id` | Full topic resource ID | Always | -| `dlq_topic_name` | DLQ topic resource name | Only when `dlq.enabled = true` | -| `dlq_topic_id` | Full DLQ topic resource ID | Only when `dlq.enabled = true` | +| `project_id` | Always | GCP project ID | +| `topic_name` | Always | Main topic resource name | +| `topic_id` | Always | Full topic resource ID | +| `dlq_topic_name` | Only when `dlq.enabled = true` | DLQ topic resource name | +| `dlq_topic_id` | Only when `dlq.enabled = true` | Full DLQ topic resource ID | + +Consumer bundles that need to publish or subscribe bind IAM roles using `topic_name` and `project_id` from this artifact. Example pattern in a consumer bundle: + +```hcl +resource "google_pubsub_topic_iam_member" "publisher" { + project = var.pubsub_topic.project_id + topic = var.pubsub_topic.topic_name + role = "roles/pubsub.publisher" + member = "serviceAccount:${google_service_account.runtime.email}" +} +``` + +## Message Ordering -Downstream bundles that need subscriber access should bind `roles/pubsub.subscriber` on the topic or on their own subscription using `topic_name` and `project_id` from this artifact. +Message ordering is enforced at the publisher SDK level, not at the topic resource level. The `message_ordering_enabled` parameter writes a label (`message-ordering: enabled|disabled`) on the topic to record operator intent. Publishers must set `enable_message_ordering = true` and use ordering keys in their SDK client. Enabling ordering reduces maximum throughput. ## Compliance @@ -44,26 +57,19 @@ Downstream bundles that need subscriber access should bind `roles/pubsub.subscri | Check | Reason | |---|---| -| `CKV_GCP_83` | CSEK (Customer-Supplied Encryption Keys) skipped across all environments. 
CSEK requires callers to manage raw AES-256 keys on every API call — GCP itself recommends against this for most workloads. Google-managed encryption (default) satisfies encryption-at-rest requirements. If CMEK via Cloud KMS is required, add a `kms_key_name` param and remove this skip. | - -### Production gating +| `CKV_GCP_83` | CSEK (Customer-Supplied Encryption Keys) skipped across all environments. CSEK requires callers to manage raw AES-256 keys on every API call. Google-managed encryption satisfies encryption-at-rest requirements for the workloads this bundle targets. If CMEK via Cloud KMS is required, use a separate bundle with a KMS connection. | The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. ## Assumptions -- `pubsub.googleapis.com` must be enabled in the landing zone before deploying this bundle. Add it to `enabled_apis` in the `gcp-landing-zone` package config. +- `pubsub.googleapis.com` must be enabled in the landing zone before deploying. Add it to `enabled_apis` in the `gcp-landing-zone` package. - The `gcp_authentication` credential has `pubsub.admin` or equivalent IAM on the project. -- The landing zone's workload SA is granted publisher access automatically; subscriber access for consumers must be added by the downstream bundle. - -## Message Ordering - -Message ordering is enforced at the **publisher SDK level**, not at the topic resource level. The `message_ordering_enabled` parameter writes a label (`message-ordering: enabled|disabled`) on the topic to record operator intent. Publishers must explicitly set `enable_message_ordering = true` and use ordering keys when publishing. Enabling ordering reduces maximum throughput per topic. 
## Presets | Preset | Retention | DLQ | Max Delivery Attempts | Use Case | |---|---|---|---|---| -| Low-volume | 7 days | Off | — | Dev / low-traffic topics where DLQ overhead is unnecessary | -| Standard | 7 days | On | 5 | Most production topics; catches poison-pill messages | +| Low-volume | 7 days | Off | — | Dev or low-traffic topics where DLQ overhead is unnecessary | +| Standard | 7 days | On | 5 | Most production topics; catches undeliverable messages | | High-throughput | 1 day | On | 10 | High-volume pipelines where shorter retention reduces storage cost | diff --git a/bundles/gcp-pubsub-topic/operator.md b/bundles/gcp-pubsub-topic/operator.md index c36d214..e81951c 100644 --- a/bundles/gcp-pubsub-topic/operator.md +++ b/bundles/gcp-pubsub-topic/operator.md @@ -6,18 +6,20 @@ templating: mustache ## Non-obvious constraints -**Topic name is immutable.** To rename a topic: decommission this package, recreate it with the new name, and update all consumer subscriptions. Plan a maintenance window. +**Topic name is immutable.** Renaming requires decommissioning this package, recreating with the new name, and updating all consumer subscriptions. Plan a maintenance window. **Message retention changes are safe in-place.** Updating `message_retention_duration` applies without disruption. In-flight messages are not affected. -**Enabling DLQ after-the-fact does not update existing subscriptions.** When you enable the DLQ on an existing topic, Terraform creates the DLQ topic — but existing consumer subscriptions do not automatically gain a dead-letter policy. Consumer bundles must be updated separately to reference the new DLQ topic. +**Enabling DLQ after-the-fact does not update existing subscriptions.** When you enable the DLQ, Terraform creates the DLQ topic — but existing consumer subscriptions do not automatically gain a dead-letter policy. Consumer bundles must be updated separately to reference the new DLQ topic. 
-**Disabling DLQ destroys the DLQ topic.** Any consumer subscriptions that have a dead-letter policy pointing to the old DLQ topic will fail to deliver dead letters after the destroy. Remove dead-letter policies from consumer subscriptions before disabling the DLQ here. +**Disabling DLQ destroys the DLQ topic.** Consumer subscriptions that have a dead-letter policy pointing to the old DLQ topic will fail to deliver dead letters after the destroy. Remove dead-letter policies from consumer subscriptions before disabling the DLQ here. -**Message ordering on the topic is not enforcement.** Setting ordering on the topic is a configuration label. Publishers must also set `enable_message_ordering = true` in their SDK client and pass an ordering key on every publish call. Without ordering keys from publishers, messages are not ordered regardless of the topic setting. +**Message ordering on the topic is a label, not enforcement.** Publishers must also set `enable_message_ordering = true` in their SDK client and pass an ordering key on every publish call. Without ordering keys from the publisher, messages are not ordered regardless of the topic label. **`max_delivery_attempts` is enforced at the subscription, not the topic.** This bundle provisions the DLQ topic. The delivery attempt limit lives on the consumer's subscription (managed by the consumer bundle). If messages aren't reaching the DLQ, check the consumer subscription's dead-letter policy first. +**This bundle creates no IAM bindings.** Consumer bundles bind their own service accounts to this topic. If a service can't publish, the IAM binding is missing from the consumer bundle — not from here. + ## Troubleshooting **Messages not flowing to DLQ.** @@ -33,17 +35,18 @@ If the field is absent, the consumer bundle is not configured to use the DLQ. Add `pubsub.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. 
**Publisher permission denied.** -The workload SA needs `roles/pubsub.publisher` on the topic: +Check the topic IAM policy — the publisher's SA must have `roles/pubsub.publisher`: ```bash gcloud pubsub topics get-iam-policy {{artifacts.pubsub_topic.topic_name}} \ --project={{artifacts.pubsub_topic.project_id}} ``` +If missing, the consumer bundle needs to be redeployed with the topic wired on the canvas. ## Day-2 operations **Changing retention duration:** Update param and redeploy. In-place, no disruption. -**Enabling DLQ on an existing topic:** Set `dlq.enabled = true`, configure `max_delivery_attempts`, redeploy. Then update consumer bundles to add dead-letter policies to their subscriptions pointing to `{{artifacts.pubsub_topic.dlq_topic_name}}`. +**Enabling DLQ on an existing topic:** Set `dlq.enabled = true`, configure `max_delivery_attempts`, redeploy. Then update consumer bundles to add dead-letter policies pointing to `{{artifacts.pubsub_topic.dlq_topic_name}}`. **Disabling DLQ:** Remove dead-letter policies from all consumer subscriptions first. Then set `dlq.enabled = false` and redeploy. The DLQ topic is destroyed. diff --git a/bundles/gcp-storage-bucket/README.md b/bundles/gcp-storage-bucket/README.md index 5a43c33..cc681a7 100644 --- a/bundles/gcp-storage-bucket/README.md +++ b/bundles/gcp-storage-bucket/README.md @@ -1,87 +1,82 @@ # gcp-storage-bucket -Google Cloud Storage bucket with configurable storage class, optional versioning, and lifecycle rules. Use this bundle to provision a managed object store for data platform workloads — Cloud Run pipelines, BigQuery exports, Vertex Workbench datasets, and similar. The landing zone's workload service account is automatically granted object read/write access. +Google Cloud Storage bucket with configurable storage class, optional versioning, and lifecycle rules. 
Use this bundle to provision a managed object store for data platform workloads — Cloud Run pipelines, BigQuery exports, Vertex Workbench datasets, and similar. -## Purpose +## Use Cases -- Provisions a GCS bucket with configurable storage class and location -- Optionally enables versioning for durable datasets and non-current version lifecycle management -- Supports lifecycle rules for automated cost optimization (Delete and SetStorageClass transitions) -- Enforces `uniform_bucket_level_access` and `public_access_prevention = "enforced"` as non-negotiable security baselines -- Grants `roles/storage.objectUser` to the landing zone's workload service account on the bucket -- Emits a `catalog-demo/gcp-storage-bucket` artifact so downstream bundles can reference the bucket without hard-coding names or project IDs +- Staging area for data ingestion before loading into BigQuery +- Durable dataset storage with versioning and access via scoped service accounts +- Archive tier for cost-optimized long-term retention +- Intermediate storage between pipeline stages ## Resources Created | Resource | Type | Notes | |---|---|---| | `google_storage_bucket.main` | GCS bucket | Storage class, location, versioning, and lifecycle rules set at provision time | -| `google_storage_bucket_iam_member.workload_object_user` | IAM binding | Grants `roles/storage.objectUser` to the landing zone workload SA | -## Artifacts Consumed (Connections) +This bundle does NOT create any IAM bindings. Consumer bundles (e.g., `gcp-cloud-run-service`) create their own service accounts and bind the appropriate roles on this bucket when connected on the canvas. 
+ +## Connections | Connection | Artifact Type | How It Is Used | |---|---|---| | `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | -| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` and `workload_identity.service_account_email` for the objectUser IAM binding | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` for resource placement | -## Artifacts Produced +## Artifact Produced -The bundle publishes a `catalog-demo/gcp-storage-bucket` artifact with all fields needed for downstream bundles to read and write objects. +**Artifact type:** `catalog-demo/gcp-storage-bucket` | Field | Description | |---|---| | `project_id` | GCP project ID that owns the bucket | -| `bucket_name` | Globally-unique GCS bucket name (derived from Massdriver name prefix) | +| `bucket_name` | Globally-unique GCS bucket name (derived from the Massdriver name prefix) | | `bucket_url` | Canonical `gs://` URL for use with gsutil and client libraries | | `bucket_self_link` | GCS REST API resource URL (`https://www.googleapis.com/storage/v1/b/`) | -| `location` | GCS location where the bucket was deployed | +| `location` | GCS location where the bucket is deployed | | `storage_class` | Active storage class of the bucket | -Downstream bundles that need additional access (e.g., read-only) should bind `roles/storage.objectViewer` on the bucket using `bucket_name` and `project_id` from this artifact. +Consumer bundles bind IAM roles on the bucket using `bucket_name` and `project_id` from this artifact. 
Example pattern: -## Compliance +```hcl +resource "google_storage_bucket_iam_member" "runtime_object_user" { + bucket = var.storage_bucket.bucket_name + role = "roles/storage.objectUser" + member = "serviceAccount:${google_service_account.runtime.email}" +} +``` -### Hardcoded security baselines +## Compliance -Two settings are enforced at the Terraform level and cannot be changed via parameters: +### Hardcoded security controls | Setting | Value | Reason | |---|---|---| -| `uniform_bucket_level_access` | `true` | Disables legacy object-level ACLs. All access is controlled by IAM only. Prevents split access-control models that are difficult to audit and easy to misconfigure (Checkov CKV_GCP_29). | -| `public_access_prevention` | `"enforced"` | Blocks all public object access regardless of IAM policies or ACLs. Prevents accidental data exposure via `allUsers` or `allAuthenticatedUsers` grants (Checkov CKV_GCP_114). Non-negotiable baseline for all environments in this data platform series. | +| `uniform_bucket_level_access` | `true` | Disables legacy object-level ACLs. All access is IAM-only, which prevents split access-control models that are difficult to audit (Checkov CKV_GCP_29). | +| `public_access_prevention` | `"enforced"` | Blocks all public object access regardless of IAM or ACLs. Prevents accidental data exposure via `allUsers` or `allAuthenticatedUsers` (Checkov CKV_GCP_114). | ### Checkov skips | Check | Reason | |---|---| -| `CKV_GCP_62` | Bucket access logging requires a separate log-sink GCS bucket. That bucket is not part of this bundle's scope — enabling logging here without a target bucket causes a plan-time error. Operators who need access logs should provision a dedicated log bucket and wire `logging.log_bucket` manually. | -| `CKV_GCP_63` | Checks that a bucket does not log access requests to itself. Because no `logging` block is configured (see CKV_GCP_62), this bucket cannot log to itself. 
Checkov fails this check in the absence of any logging configuration, making the finding a false positive in this context. | -| `CKV_GCP_78` | Retention lock (WORM) makes objects immutable for a fixed duration and cannot be shortened or removed once set. It is not universally appropriate — it prevents deletion of any object, including accidental uploads. Add a `retention_policy` param if your workload requires WORM guarantees. | - -### Production gating +| `CKV_GCP_62` | Bucket access logging requires a separate log-sink bucket not in scope here. Enabling logging without a target bucket causes a plan-time error. Operators who need access logs should provision a dedicated log bucket and wire `logging.log_bucket` manually. | +| `CKV_GCP_63` | Checks that a bucket does not log to itself. Because no `logging` block is configured, this check fires as a false positive. | +| `CKV_GCP_78` | Retention lock (WORM) makes objects immutable for a fixed duration and cannot be removed once set. It is not appropriate for all workloads. Add a `retention_policy` param if your workload requires WORM. | The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. ## Assumptions -- `storage.googleapis.com` must be enabled in the landing zone before deploying this bundle. Add it to `enabled_apis` in the `gcp-landing-zone` package config. +- `storage.googleapis.com` must be enabled in the landing zone before deploying. Add it to `enabled_apis` in the `gcp-landing-zone` package. - The `gcp_authentication` credential has `storage.admin` or equivalent IAM on the project. -- The landing zone's workload SA is granted `roles/storage.objectUser` automatically; read-only or admin access for other consumers must be added by the downstream bundle. 
-- Bucket names are derived from the Massdriver `name_prefix` and are globally unique — operators do not pick the raw bucket name. +- Bucket names are derived from the Massdriver `name_prefix` and are globally unique — operators do not choose the raw bucket name. +- Bucket location cannot be changed after creation. Choosing the wrong location requires decommissioning and reprovisioning with data migration. ## Presets | Preset | Storage Class | Location | Versioning | Lifecycle | |---|---|---|---|---| | Staging | STANDARD | US | Off | Delete objects after 30 days | -| Durable | STANDARD | US | On | None — retain all versions indefinitely | -| Archive | COLDLINE | US | On | Transition to ARCHIVE class after 365 days | - -## Bucket Naming - -GCS bucket names are globally unique across all GCP projects. This bundle derives the bucket name from the Massdriver `name_prefix`, which incorporates the environment slug and package name. Operators do not choose the raw name — name collisions are avoided by construction. - -## Location Immutability - -A bucket's location cannot be changed after creation. Selecting the wrong location requires decommissioning the package and reprovisioning. Choose carefully at deploy time based on where your compute resources run. +| Durable | STANDARD | US | On | None — retain all versions | +| Archive | COLDLINE | US | On | Transition to ARCHIVE after 365 days | diff --git a/bundles/gcp-storage-bucket/operator.md b/bundles/gcp-storage-bucket/operator.md index 39eef28..e9fe990 100644 --- a/bundles/gcp-storage-bucket/operator.md +++ b/bundles/gcp-storage-bucket/operator.md @@ -6,22 +6,21 @@ templating: mustache ## Non-obvious constraints -**Bucket name is globally unique and immutable.** The name is derived from the Massdriver name prefix and is set at creation. A rename requires decommissioning and recreating the package with a new name prefix, then migrating all objects. 
+**Bucket name is globally unique and immutable.** The name is derived from the Massdriver name prefix. Renaming requires decommissioning and recreating the package, then migrating all objects. -**Location is immutable.** Bucket location cannot be changed after creation. To move a bucket: export all objects to a new bucket in the target location, update consumers to point to the new bucket, then decommission this package. Use `gcloud storage cp -r` or a Dataflow job for large datasets. +**Location is immutable.** Bucket location cannot be changed after creation. To move: copy all objects to a new bucket in the target location, update consumers, then decommission this package. Use `gcloud storage cp -r` or a Dataflow job for large datasets. -**Public access prevention is enforced and cannot be loosened via params.** `public_access_prevention = "enforced"` is hardcoded. Any attempt to grant `allUsers` or `allAuthenticatedUsers` via IAM is rejected by GCP even if the IAM call appears to succeed. Objects are never publicly readable. This is intentional — it cannot be overridden through bundle configuration. +**Public access prevention is hardcoded.** `public_access_prevention = "enforced"` cannot be overridden through bundle configuration. Any attempt to grant `allUsers` or `allAuthenticatedUsers` via IAM is rejected by GCP even if the IAM call appears to succeed. -**Uniform bucket-level access is enabled.** Object-level ACLs are disabled. All access is controlled via bucket-level IAM only. Granting access to specific objects via ACLs is not possible. +**Uniform bucket-level access is enabled.** Object-level ACLs are disabled. All access is IAM-only. Granting access to specific objects via ACLs is not possible. -**Turning versioning off does not delete existing non-current versions.** GCS stops creating new versions, but existing non-current versions are retained and continue to incur storage charges. 
Add a lifecycle rule targeting `with_state: ARCHIVED` to clean them up. +**This bundle creates no IAM bindings.** Consumer bundles bind their own service accounts to this bucket. If a service can't read or write objects, the IAM binding is missing from the consumer bundle — not from here. -**Lifecycle rules evaluate once daily, not in real time.** A rule set to delete objects after 30 days may not take effect until the next evaluation window. This is a GCP platform constraint. +**Turning versioning off does not delete existing non-current versions.** GCS stops creating new versions, but existing non-current versions are retained and continue to incur storage charges. Add a lifecycle rule targeting `with_state: ARCHIVED` to clean them up. -**`Delete` action on a versioned bucket sets a delete marker, it does not immediately remove storage.** Add a second lifecycle rule targeting `with_state: ARCHIVED` with a shorter `age_days` to purge non-current versions and reclaim storage. +**Lifecycle rules evaluate once daily, not in real time.** A rule set to delete objects after 30 days may not take effect until the next evaluation window. -**Deploy fails with "storage.googleapis.com has not been used in project."** -Add `storage.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. +**`Delete` action on a versioned bucket sets a delete marker — it does not immediately remove storage.** Add a second lifecycle rule targeting `with_state: ARCHIVED` with a shorter `age_days` to purge non-current versions and reclaim storage. ## Troubleshooting @@ -30,10 +29,10 @@ Uniform bucket-level access is on — check bucket IAM, not object ACLs: ```bash gcloud storage buckets get-iam-policy {{artifacts.storage_bucket.bucket_url}} ``` -The workload SA needs `roles/storage.objectUser` to read and write, or `roles/storage.objectViewer` for read-only. 
+The workload SA needs `roles/storage.objectUser` to read and write, or `roles/storage.objectViewer` for read-only. If the binding is absent, redeploy the consumer bundle with the bucket wired on the canvas. **Objects not being deleted by lifecycle rules.** -Lifecycle rules evaluate once daily. Wait up to 24 hours after a rule change takes effect. To inspect current lifecycle config: +Lifecycle rules evaluate once daily. Wait up to 24 hours after a rule change. Inspect current lifecycle config: ```bash gcloud storage buckets describe {{artifacts.storage_bucket.bucket_url}} \ --format="yaml(lifecycle)" @@ -46,6 +45,9 @@ gcloud storage ls -a {{artifacts.storage_bucket.bucket_url}} ``` Add a lifecycle rule with `with_state: ARCHIVED` to purge them. +**Deploy fails with "storage.googleapis.com has not been used in project."** +Add `storage.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy, wait ~60 seconds, then retry. + ## Day-2 operations **Changing storage class:** Update `storage_class` param and redeploy. The bucket updates in-place. Existing objects retain their current storage class — only new writes use the new class. Use a lifecycle `SetStorageClass` rule to migrate existing objects. @@ -54,15 +56,15 @@ Add a lifecycle rule with `with_state: ARCHIVED` to purge them. **Disabling versioning:** In-place change, but existing non-current versions are retained. Add a lifecycle rule targeting `with_state: ARCHIVED` to clean up. -**Granting read-only access to another service account** (outside Terraform — will be overwritten on next apply): +**Granting access to another service account** (outside Terraform — will be overwritten on next apply): ```bash gcloud storage buckets add-iam-policy-binding {{artifacts.storage_bucket.bucket_url}} \ --member="serviceAccount:<SERVICE_ACCOUNT_EMAIL>" \ --role="roles/storage.objectViewer" ``` -For permanent bindings, add a `google_storage_bucket_iam_member` resource to the bundle source.
+For permanent bindings, add a `google_storage_bucket_iam_member` resource to the consumer bundle source. -**Migrating objects to a new bucket location:** +**Migrating objects to a new bucket:** ```bash gcloud storage cp -r {{artifacts.storage_bucket.bucket_url}}/* gs://<NEW_BUCKET>/ ``` @@ -83,11 +85,6 @@ gcloud storage buckets get-iam-policy {{artifacts.storage_bucket.bucket_url}} gcloud storage buckets describe {{artifacts.storage_bucket.bucket_url}} \ --format="yaml(lifecycle)" -# Get a signed URL for a specific object (valid 1 hour) -gcloud storage sign-url {{artifacts.storage_bucket.bucket_url}}/<OBJECT_PATH> \ - --duration=1h \ - --private-key-file=<KEY_FILE> - # Copy a local file into the bucket gcloud storage cp ./myfile.txt {{artifacts.storage_bucket.bucket_url}}/myfile.txt diff --git a/bundles/gcp-vertex-workbench/README.md b/bundles/gcp-vertex-workbench/README.md index 6210319..b6bb4b0 100644 --- a/bundles/gcp-vertex-workbench/README.md +++ b/bundles/gcp-vertex-workbench/README.md @@ -4,23 +4,23 @@ Vertex AI Workbench instance for interactive data science. Each bundle instance ## Use Cases -- **Exploratory data analysis** — interactive notebooks with access to BigQuery datasets and GCS buckets via scoped IAM. -- **ML model development** — GPU-accelerated notebook environments for training and evaluation, with the ability to consume Pub/Sub topics or write results to BigQuery via separate pipeline services. -- **Platform-managed data science environments** — org-wide Workbench standard enforcing Shielded VM, no public IP, per-instance identity, and idle shutdown — so each team gets a consistent, auditable environment without manual GCP console work.
+- Exploratory data analysis with scoped, auditable IAM access to BigQuery datasets +- ML model development in GPU-accelerated notebook environments +- Platform-managed data science environments enforcing Shielded VM, no public IP, and per-instance identity ## Resources Created -| Resource | Description | -|---|---| -| `google_service_account.instance` | Per-instance SA — this bundle's own workload identity | -| `google_workbench_instance.main` | The Vertex AI Workbench instance (Workbench Instances API v2) | -| `google_bigquery_dataset_iam_member.dataset_viewer` | Created only when BigQuery dataset is connected — grants `roles/bigquery.dataViewer` (read-only) to instance SA | +| Resource | Type | Notes | +|---|---|---| +| `google_service_account.instance` | Per-instance SA | This instance's workload identity — one per bundle instance | +| `google_workbench_instance.main` | Vertex AI Workbench instance | Workbench Instances API v2 (`google_workbench_instance`) | +| `google_bigquery_dataset_iam_member.dataset_viewer` | BigQuery read-only IAM | Created only when BigQuery dataset is connected — grants `roles/bigquery.dataViewer` to instance SA | ## Connections ### Required -| Connection | Artifact Type | Purpose | +| Connection | Artifact Type | How It Is Used | |---|---|---| | `gcp_authentication` | `gcp-service-account` | GCP credentials used by Terraform to provision resources | | `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id`, `network.region`, and subnet self-link for instance placement | @@ -31,7 +31,7 @@ Vertex AI Workbench instance for interactive data science. Each bundle instance |---|---|---| | `bigquery_dataset` | `catalog-demo/gcp-bigquery-dataset` | `roles/bigquery.dataViewer` (read-only) on the dataset | -When the BigQuery dataset is connected, the instance SA can run SELECT queries from notebooks without manual IAM changes. Disconnect the canvas wire and redeploy to revoke access. 
+Connecting or disconnecting the BigQuery dataset on the canvas does not take effect until a Terraform apply runs. ## Artifact Produced @@ -42,50 +42,41 @@ When the BigQuery dataset is connected, the instance SA can run SELECT queries f | `project_id` | string | GCP project that owns the instance | | `instance_name` | string | Short instance name (used in gcloud commands) | | `location` | string | GCP zone where the instance is deployed (e.g., `us-central1-a`) | -| `proxy_url` | string | JupyterLab HTTPS proxy URL — open this in a browser to access the notebook. May be empty while the instance is starting. | +| `proxy_url` | string | JupyterLab HTTPS proxy URL — open in a browser to access the notebook. Empty while the instance is starting. | | `instance_service_account_email` | string | Email of this instance's own SA | | `instance_service_account_member` | string | IAM principal string (`serviceAccount:`) for downstream bindings | -## Parameters - -| Parameter | Type | Default | Description | -|---|---|---|---| -| `machine_type` | enum | `e2-standard-4` | GCP machine type. E2 for general-purpose, N1 required for GPUs. | -| `boot_disk_size_gb` | integer | `150` | Boot disk size in GB (150–4000). Minimum 150 GB enforced by the Workbench base image. Holds OS, packages, and local notebook files. | -| `idle_shutdown_timeout_minutes` | integer | `180` | Minutes of kernel inactivity before auto-shutdown. 0 = never (continuous billing). | -| `accelerator_type` | enum | _(none)_ | GPU type. Requires N1 machine type. Leave empty for CPU-only. | -| `accelerator_count` | integer | `1` | Number of GPU accelerators. Only used when `accelerator_type` is set. 
| - -## Presets - -| Preset | Machine Type | Disk | GPU | Idle Timeout | -|---|---|---|---|---| -| Small | `e2-standard-4` | 150 GB | none | 3 hours | -| Medium | `n1-standard-8` | 200 GB | none | 3 hours | -| GPU | `n1-standard-8` | 200 GB | NVIDIA_TESLA_T4 × 1 | 3 hours | - ## Compliance -### Hardcoded Controls +### Hardcoded controls -| Control | Value | Rationale | +| Control | Value | Reason | |---|---|---| -| Shielded VM — Secure Boot | `enable_secure_boot = true` | Prevents unsigned kernel modules and boot-time malware from loading. Cannot be disabled without recreating the instance. | -| Shielded VM — vTPM | `enable_vtpm = true` | Enables measured boot and key attestation. Required for integrity monitoring. | -| Shielded VM — Integrity Monitoring | `enable_integrity_monitoring = true` | Detects tampering with the boot sequence by comparing against a known-good baseline. | -| No public IP | `disable_public_ip = true` | The Workbench proxy handles browser access. No external IP is exposed. JupyterLab traffic does not traverse the public internet. | -| Per-instance service account | `google_service_account.instance` (one per bundle instance) | Each instance gets its own SA — no shared SA that grants access across all Workbench notebooks. See iam.tf for design rationale. | -| Read-only BigQuery access | `roles/bigquery.dataViewer` (not dataEditor) | Workbench is an exploration environment. Write access would allow ad-hoc schema mutations from notebook cells. Users who need to write back should use their personal GCP identity. | -| Resource labels | Massdriver default tags | Enforces cost attribution and environment tagging. 
| +| Shielded VM — Secure Boot | `enable_secure_boot = true` | Prevents unsigned kernel modules and boot-time malware from loading | +| Shielded VM — vTPM | `enable_vtpm = true` | Enables measured boot and key attestation | +| Shielded VM — Integrity Monitoring | `enable_integrity_monitoring = true` | Detects tampering with the boot sequence | +| No public IP | `disable_public_ip = true` | The Workbench proxy handles browser access; no external IP is exposed | +| Per-instance service account | `google_service_account.instance` (one per bundle instance) | Each instance gets its own SA — no shared SA across Workbench notebooks | +| Read-only BigQuery access | `roles/bigquery.dataViewer` (not dataEditor) | Workbench is an exploration environment. Write access would allow ad-hoc schema mutations from notebook cells. Users who need to write back should use their personal GCP identity or a separate pipeline bundle. | +| Resource labels | Massdriver default tags | Enforces cost attribution and environment tagging | + +### Checkov skips -### Skipped Checks +None. Existing Vertex AI Workbench Checkov checks (CKV_GCP_89, CKV_GCP_126, CKV_GCP_127) target the deprecated `google_notebooks_instance` resource and do not fire against `google_workbench_instance`. CMEK for disk encryption is intentionally out of scope — Google-managed encryption is used. -None. As of checkov 3.2.x, all existing Vertex AI Workbench checks (CKV_GCP_89, CKV_GCP_126, CKV_GCP_127) target the deprecated `google_notebooks_instance` resource and do not fire against `google_workbench_instance`. CMEK for disk encryption is intentionally out of scope for this bundle — Google-managed encryption is used. If CMEK is required, a separate bundle with a KMS key connection should be used. +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. 
## Assumptions -- The landing zone provides `project_id`, `network.region`, and `primary_subnet.self_link`. The Workbench instance is placed in the primary subnet of the landing zone's region, zone `<region>-a`. -- The landing zone's subnet must have Private Google Access enabled for the instance to reach GCP APIs (BigQuery, GCS) without a public IP. The `gcp-landing-zone` bundle enables this by default. -- Idle shutdown is implemented via the `idle-timeout-seconds` GCE metadata key, which the Workbench agent reads at startup. If the instance is restarted externally (e.g., via gcloud), the idle timer resets. +- The landing zone provides `project_id`, `network.region`, and `primary_subnet.self_link`. The instance is placed in the landing zone's subnet, zone `<region>-a`. +- The subnet must have Private Google Access enabled for the instance to reach GCP APIs (BigQuery, GCS) without a public IP. The `gcp-network` bundle enables this by default. - GPU availability is zone-dependent. If a GPU type is not available in `<region>-a`, change `local.zone` in `src/main.tf` to a zone with quota. -- The `proxy_url` artifact field may be empty immediately after deploy while the instance boots. It populates within 2–5 minutes after the instance reaches ACTIVE state. +- The `proxy_url` artifact field may be empty immediately after deploy. It populates within 2–5 minutes after the instance reaches ACTIVE state.
+ +## Presets + +| Preset | Machine Type | Disk | GPU | Idle Timeout | +|---|---|---|---|---| +| Small | `e2-standard-4` | 150 GB | none | 3 hours | +| Medium | `n1-standard-8` | 200 GB | none | 3 hours | +| GPU | `n1-standard-8` | 200 GB | NVIDIA_TESLA_T4 x 1 | 3 hours | diff --git a/bundles/gcp-vertex-workbench/operator.md b/bundles/gcp-vertex-workbench/operator.md index e06e301..2586591 100644 --- a/bundles/gcp-vertex-workbench/operator.md +++ b/bundles/gcp-vertex-workbench/operator.md @@ -6,21 +6,21 @@ templating: mustache ## Non-obvious constraints -**Location is a zone, not a region.** This bundle appends `-a` to the landing zone region (e.g., `us-central1` → `us-central1-a`). GPU quota is zone-specific — if you get a quota error for a GPU type, check availability in the zone and request quota or change `local.zone` in `src/main.tf`. +**Location is a zone, not a region.** This bundle appends `-a` to the landing zone region (e.g., `us-central1` → `us-central1-a`). GPU quota is zone-specific — if you get a quota error, check availability in the zone and request quota or change `local.zone` in `src/main.tf`. -**E2 machine types do not support GPUs.** If `accelerator_type` is set, the machine type must be N1 (`n1-standard-*`). Attempting to attach a GPU to an E2 machine fails at apply time with a GCP API error. +**E2 machine types do not support GPUs.** If `accelerator_type` is set, the machine type must be N1 (`n1-standard-*`). Attempting to attach a GPU to an E2 machine fails at apply time. -**Machine type changes stop and restart the instance.** Workbench does not do live migration for machine type changes. The instance shuts down, is resized, and restarts. Expect 5–10 minutes of downtime. Open notebooks in JupyterLab are saved to disk and are available after restart. +**Machine type changes stop and restart the instance.** The instance shuts down, resizes, and restarts. Expect 5–10 minutes of downtime. 
Open notebooks are saved to disk and are available after restart. -**Shielded VM settings are not changeable in-place.** Changing `enable_secure_boot`, `enable_vtpm`, or `enable_integrity_monitoring` requires destroying and recreating the instance. These are hardcoded to `true` in this bundle and are not exposed as params. +**Shielded VM settings are hardcoded and not changeable in-place.** Changing `enable_secure_boot`, `enable_vtpm`, or `enable_integrity_monitoring` would require destroying and recreating the instance. These are always `true` and are not exposed as params. -**Idle shutdown requires the Workbench agent running.** The `idle-timeout-seconds` metadata key is only honoured if the Workbench agent is active inside the VM. If the agent crashes or the instance was reimaged externally, the idle shutdown will not fire. +**Idle shutdown requires the Workbench agent running.** The `idle-timeout-seconds` metadata key is only honoured if the Workbench agent is active. If the agent crashes or the instance was reimaged externally, idle shutdown will not fire. -**Per-instance SA recreates if the package is renamed.** The SA `account_id` is derived from `name_prefix`. Renaming the Massdriver package destroys the old SA and creates a new one. All canvas-wired IAM bindings are recreated automatically on the next deploy. Out-of-band bindings (e.g., manually granted Artifact Registry reader) must be reapplied manually. +**Per-instance SA recreates if the package is renamed.** The SA `account_id` is derived from `name_prefix`. Renaming destroys the old SA and creates a new one. Canvas-wired IAM bindings are recreated automatically on the next deploy. Out-of-band bindings must be reapplied manually. -**Canvas wires require a deploy to take effect.** Connecting or disconnecting the BigQuery dataset on the canvas does NOT grant or revoke IAM access immediately — a Massdriver deploy must run to apply the Terraform change. 
+**Canvas wires require a deploy to take effect.** Connecting or disconnecting the BigQuery dataset on the canvas does NOT grant or revoke IAM access immediately — a Massdriver deploy must run. -**proxy_url is empty until the instance is ACTIVE.** The JupyterLab proxy URL (`{{artifacts.vertex_workbench.proxy_url}}`) is only populated after the instance boots and the proxy registers. This takes 2–5 minutes after the Terraform apply completes. +**`proxy_url` is empty until the instance is ACTIVE.** `{{artifacts.vertex_workbench.proxy_url}}` is only populated after the instance boots and the proxy registers. This takes 2–5 minutes after the Terraform apply completes. ## Troubleshooting @@ -33,7 +33,7 @@ gcloud compute instances get-serial-port-output {{artifacts.vertex_workbench.ins ``` Common causes: GPU quota exceeded, subnet CIDR exhausted, missing API enablement (`notebooks.googleapis.com`). -**proxy_url is empty after 10 minutes.** +**`proxy_url` is empty after 10 minutes.** ```bash gcloud workbench instances describe {{artifacts.vertex_workbench.instance_name}} \ --location={{artifacts.vertex_workbench.location}} \ @@ -51,10 +51,10 @@ gcloud workbench instances start {{artifacts.vertex_workbench.instance_name}} \ --project={{artifacts.vertex_workbench.project_id}} ``` -**Notebook can't query BigQuery — `Access Denied`.** +**Notebook can't query BigQuery — Access Denied.** Confirm the canvas wire is connected AND the package has been redeployed since the wire was added. Verify the IAM binding exists: ```bash -bq get-iam-policy {{artifacts.vertex_workbench.project_id}}:$(BQ_DATASET_ID) \ +bq get-iam-policy {{artifacts.vertex_workbench.project_id}}:<DATASET_ID> \ --format=prettyjson | grep -A3 "dataViewer" ``` The member should be `{{artifacts.vertex_workbench.instance_service_account_member}}`.
@@ -75,7 +75,7 @@ gcloud compute instances describe {{artifacts.vertex_workbench.instance_name}} \ --project={{artifacts.vertex_workbench.project_id}} \ --format="yaml(metadata.items)" ``` -If missing, the idle_shutdown_timeout_minutes param may have been 0 (disabled). The metadata key is only written when the value is > 0. +If missing, the `idle_shutdown_timeout_minutes` param was 0 (disabled). The metadata key is only written when the value is > 0. ## Day-2 operations @@ -91,15 +91,13 @@ gcloud workbench instances start {{artifacts.vertex_workbench.instance_name}} \ --location={{artifacts.vertex_workbench.location}} \ --project={{artifacts.vertex_workbench.project_id}} ``` -Starting the instance after an idle shutdown or manual stop takes 2–5 minutes. The proxy URL remains the same. +Starting after an idle shutdown or manual stop takes 2–5 minutes. The proxy URL remains the same. -**Resizing the instance (machine type or disk):** Update the `machine_type` or `boot_disk_size_gb` params in Massdriver and redeploy. The instance will stop, resize, and restart. Disk can only be increased, not decreased. +**Resizing the instance:** Update `machine_type` or `boot_disk_size_gb` params and redeploy. The instance stops, resizes, and restarts. Disk size can only be increased, not decreased. **Adding a GPU after initial deploy:** Change `machine_type` to an N1 type, set `accelerator_type` and `accelerator_count`, and redeploy. This recreates the underlying GCE VM. -**Rotating the instance service account:** The SA is derived from `name_prefix`. Rotating requires renaming the Massdriver package, which destroys the old SA and creates a new one. Canvas-wired IAM bindings are recreated automatically. Out-of-band bindings must be reapplied. - -**Granting a user access to the JupyterLab UI:** The proxy URL requires the user to authenticate with a GCP identity that has `roles/notebooks.viewer` or higher on the instance. 
Grant via: +**Granting a user access to the JupyterLab UI:** ```bash gcloud workbench instances add-iam-policy-binding {{artifacts.vertex_workbench.instance_name}} \ --location={{artifacts.vertex_workbench.location}} \ @@ -127,7 +125,7 @@ gcloud workbench instances list \ gcloud iam service-accounts describe {{artifacts.vertex_workbench.instance_service_account_email}} \ --project={{artifacts.vertex_workbench.project_id}} -# Check IAM bindings on the service account (bindings granted TO the SA) +# Check IAM bindings granted to the instance SA gcloud projects get-iam-policy {{artifacts.vertex_workbench.project_id}} \ --flatten="bindings[].members" \ --filter="bindings.members:{{artifacts.vertex_workbench.instance_service_account_member}}" \ From f2a2df4e452d948e595d4e43e4ac67ada24cb84f Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Sun, 19 Apr 2026 22:25:47 -0700 Subject: [PATCH 07/15] Add gcp-cloud-run-service application template MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Application template for scaffolding per-app Cloud Run bundles via mass bundle new. Designed as the runtime-template counterpart to the gcp-cloud-run-service catalog bundle — platform teams publish this template to codify their Cloud Run standards, app developers use it to generate bundles for their specific services. Intentionally lean: - Only image is exposed as a param. Sensible defaults for port, memory, CPU, ingress, and scaling are hardcoded in src/main.tf for developers to lift into params as their application needs them. - Connections loop over user-picked artifact definitions at scaffold time. gcp_authentication and landing_zone are hardcoded as required. - Per-service runtime service account pattern is preserved. - src/iam.tf includes commented-out IAM binding examples for the common upstream data artifacts (Pub/Sub topic, BigQuery dataset, GCS bucket) so developers can uncomment based on what they picked. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- templates/gcp-cloud-run-service/README.md | 24 ++++++++ templates/gcp-cloud-run-service/icon.png | Bin 0 -> 1437 bytes .../gcp-cloud-run-service/massdriver.yaml | 57 ++++++++++++++++++ templates/gcp-cloud-run-service/operator.md | 22 +++++++ .../gcp-cloud-run-service/src/.checkov.yml | 12 ++++ .../gcp-cloud-run-service/src/_providers.tf | 18 ++++++ .../gcp-cloud-run-service/src/artifacts.tf | 13 ++++ templates/gcp-cloud-run-service/src/iam.tf | 25 ++++++++ templates/gcp-cloud-run-service/src/main.tf | 44 ++++++++++++++ 9 files changed, 215 insertions(+) create mode 100644 templates/gcp-cloud-run-service/README.md create mode 100644 templates/gcp-cloud-run-service/icon.png create mode 100644 templates/gcp-cloud-run-service/massdriver.yaml create mode 100644 templates/gcp-cloud-run-service/operator.md create mode 100644 templates/gcp-cloud-run-service/src/.checkov.yml create mode 100644 templates/gcp-cloud-run-service/src/_providers.tf create mode 100644 templates/gcp-cloud-run-service/src/artifacts.tf create mode 100644 templates/gcp-cloud-run-service/src/iam.tf create mode 100644 templates/gcp-cloud-run-service/src/main.tf diff --git a/templates/gcp-cloud-run-service/README.md b/templates/gcp-cloud-run-service/README.md new file mode 100644 index 0000000..5879b59 --- /dev/null +++ b/templates/gcp-cloud-run-service/README.md @@ -0,0 +1,24 @@ +# GCP Cloud Run Service — Application Template + +Scaffold a new application bundle for a Cloud Run service with a per-service runtime identity and pick any upstream data artifacts you want this service to consume. + +## Use with `mass bundle new` + +``` +mass bundle new --template gcp-cloud-run-service +``` + +The CLI will prompt for: +- The bundle's `name` and `description` +- Any connections to add (you'll see a list of artifact definitions published in your Massdriver org — pick the upstream resources this service needs, e.g. 
a `gcp-pubsub-topic`, `gcp-bigquery-dataset`, or `gcp-storage-bucket`) + +## What you get + +- Cloud Run v2 service running as its own per-service service account +- Sensible defaults baked in: 1 vCPU, 512Mi memory, internal ingress, port 8080 +- Artifact output so downstream bundles can discover the service URL and runtime SA +- Example IAM bindings in `src/iam.tf` for common upstream data resources, commented out — uncomment and adapt based on which connections you picked + +## What to customize + +The template is intentionally lean. Only `image` is exposed as a param. Add more params to `massdriver.yaml` as your application needs them (port, memory, environment variables, min/max instances, ingress, etc.). Everything in `src/main.tf` is hardcoded defaults — move anything you want operators to tune into `var.*` and add it to the params block. diff --git a/templates/gcp-cloud-run-service/icon.png b/templates/gcp-cloud-run-service/icon.png new file mode 100644 index 0000000000000000000000000000000000000000..29aefb8e3c0fb30a43642fccadee73b007e73847 GIT binary patch literal 1437 zcmV;O1!DS%P)Wc)Wizmn z`leEtP!=WRYC)J$iFY@@NST^eUbo$y`#c>Fr`1`{8D_TK^27Y#&pYhG@cX~d%R6(1 z3Kc3;sBli9ty3ettxePW+M3&v?XgD^?XmlkZL!;WT4OPtF(bse$SJYDIZdsJwrL+G z<}{yhqJBxvX+EB4Z(h~cHtjBb8N<0iZ(GwWN4(347Q~e$07lwV~1LvvTxcI={o*{nB(e~Rc{R9{BrdZ*;Y zYV!5&YxG6nXS@sO2&Rem%3H!`u0R$ij(B57Z8Mfk=dB6vhHD z4y3_grbn<>MKBkKQZha~3s?eV2qp`3+&tyY*YcPoKBzKAHT)zc5~5VjGjnBT4d$A0 znDL_#6Lt~;F$ks#F&~4hLFIkmD~hv0*Dw!Qfr|oyDu{r>J*d6}=#WexwX*hA`^w}= z_SQO2K6IwMyMjc4c7zR4VMP?Fp*SmnPW-uO;yXv4`qxKD6-jTZTPjm(7Rsn<#8!Fo z;eql1r&nPvMmI+z_Cku0mnC?pDurmrq-95zu?REh0_jZ`yh3Q?V`8JF$rm!2Y<~R+ zGoh9My3^h3fwU??%&VVpZ1JE9B)_eTOL+7`u_JH_lrArL2zj;NbH)893BZ%zLjbA( z`t0QQj>XF=po%f!k&7L95nDA_`4y(|*$06v#YxhS7Rr05$PyM|N-mK7;j)>I{8i%0 zhyCO`5n9NECV@;xc^x5fhWv6z{(%zY$>lmXZsIjn4AqeQq1` zE{&Znz&hXnumyMllaGU>TzS8p59Uvx^XSucEh;QiAA3+VFL#ww;JVZD_6&JBNZ8H~ zxSuy>vynUd<;zD$ok-0Ge!~yp+L8+piqyIC6v%_{Iglclb*}v2NrHUgw?C>{U-|^Z 
zj&bGXw7iQ-H^YpOKk9!Q5h=LBM-u%(kFoF^1UjRIO%MRVWMwz zxLK83a8XnXn(QrO0_M|zd){E>Gxo%%GsK+Vaj-cjW({Edu2NCf;)Ce+I6jOgtOr2C zejidhd#krB*gMulZFN`3*luG$Fc<*?V7wLh;Jb%^0gEv>F&GQ>)anL6#uzKj)|eJ! z#5zGla8@tIiDZm54Zyy$3v|T^;BDa1f%3b1Ms1z9ClwP1Yrq=8l@mb(CxQ|BS-bg- zLLpDk6B`hN$k~5$vJ`ltyf1SZo&)w4$oE!neW)uLQ$;*kCxwSi7;=8i5L!e4qFpT% z;cA30fpI%~s<%G4D;2Xw#0eqWzx&>w#^9eW2Mwo*6%OpZCqZ zJCZS6M?6_C6igrhE%xh5(Ekpayxiy`U-lE(?!^aU?dM19*I8?G{b$`<@m4Pd2rlKy zXB2r`BDol_d8i)h4*|DK9C`JXwIeQjzdAhbr-+Gmi%3rTWrgB`0bkOT6tGW}4T>%Z r<+K5K95_<>t-nHr3Kc3;IM?|bOkrj!cd>0600000NkvXXu0mjfdk?RT literal 0 HcmV?d00001 diff --git a/templates/gcp-cloud-run-service/massdriver.yaml b/templates/gcp-cloud-run-service/massdriver.yaml new file mode 100644 index 0000000..ec79b21 --- /dev/null +++ b/templates/gcp-cloud-run-service/massdriver.yaml @@ -0,0 +1,57 @@ +# Bundle YAML Spec: https://docs.massdriver.cloud/guides/bundle-yaml-spec +# Module Patterns: https://docs.massdriver.cloud/guides/module-patterns +# Bundle Templates: https://docs.massdriver.cloud/guides/bundle-templates + +name: "{{ name }}" +description: "{{ description }}" +source_url: https://github.com/YOUR_ORG/YOUR_REPO/tree/main/bundles/{{ name }} +version: 0.0.0 + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +params: + required: + - image + properties: + image: + type: string + title: Container Image + description: Fully-qualified container image reference. Pin to a digest + (image@sha256:...) for production. 
+ +connections: + required: + - gcp_authentication + - landing_zone + {%- for conn in connections %} + - {{ conn.name }} + {%- endfor %} + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + {%- for conn in connections %} + {{ conn.name }}: + $ref: {{ conn.artifact_definition }} + {%- endfor %} + +artifacts: + required: + - cloud_run_service + properties: + cloud_run_service: + $ref: catalog-demo/gcp-cloud-run-service + title: GCP Cloud Run Service + +ui: + ui:order: + - image + - "*" diff --git a/templates/gcp-cloud-run-service/operator.md b/templates/gcp-cloud-run-service/operator.md new file mode 100644 index 0000000..88b892a --- /dev/null +++ b/templates/gcp-cloud-run-service/operator.md @@ -0,0 +1,22 @@ +# {{ name }} + +{{ description }} + +## Non-obvious constraints + +- Cloud Run revisions are immutable. New config triggers a new revision; traffic defaults to 100% on latest. +- Service account name derives from the bundle's name prefix, capped at 30 characters. Renaming the package destroys and recreates the SA. +- Ingress changes trigger a full revision replacement (cold start on next request). + +## Troubleshooting + +- Revision fails readiness check: the container port in `src/main.tf` must match what the running process binds to. +- Image pull errors: the runtime service account needs `roles/artifactregistry.reader` on the image repo. 
+ +## Useful commands + +``` +gcloud run services describe $SERVICE --region $REGION +gcloud run services logs read $SERVICE --region $REGION --limit 100 +gcloud run services update-traffic $SERVICE --to-revisions $REVISION=100 --region $REGION +``` diff --git a/templates/gcp-cloud-run-service/src/.checkov.yml b/templates/gcp-cloud-run-service/src/.checkov.yml new file mode 100644 index 0000000..33d6926 --- /dev/null +++ b/templates/gcp-cloud-run-service/src/.checkov.yml @@ -0,0 +1,12 @@ +skip-check: + # CKV_GCP_102: Cloud Run services should use private endpoint or VPC connector. + # Ingress is configurable; the default is internal-only. If your service needs + # a VPC connector, add the relevant google_vpc_access_connector resource and + # update the service's vpc_access block. This skip acknowledges the check is + # not applicable to services that use ingress restrictions instead. + - CKV_GCP_102 + # CKV_GCP_103: Cloud Run services should use Binary Authorization. Binary + # Authorization requires separate attestor infrastructure and image signing + # pipelines that are out of scope for a per-service bundle. Enforce at the + # organization level via org policy if needed. 
+ - CKV_GCP_103 diff --git a/templates/gcp-cloud-run-service/src/_providers.tf b/templates/gcp-cloud-run-service/src/_providers.tf new file mode 100644 index 0000000..cad113c --- /dev/null +++ b/templates/gcp-cloud-run-service/src/_providers.tf @@ -0,0 +1,18 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} diff --git a/templates/gcp-cloud-run-service/src/artifacts.tf b/templates/gcp-cloud-run-service/src/artifacts.tf new file mode 100644 index 0000000..d8cc131 --- /dev/null +++ b/templates/gcp-cloud-run-service/src/artifacts.tf @@ -0,0 +1,13 @@ +resource "massdriver_artifact" "cloud_run_service" { + field = "cloud_run_service" + name = "GCP Cloud Run Service ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + service_name = google_cloud_run_v2_service.main.name + service_url = google_cloud_run_v2_service.main.uri + location = google_cloud_run_v2_service.main.location + latest_ready_revision = google_cloud_run_v2_service.main.latest_ready_revision + runtime_service_account_email = local.runtime_sa_email + runtime_service_account_member = local.runtime_sa_member + }) +} diff --git a/templates/gcp-cloud-run-service/src/iam.tf b/templates/gcp-cloud-run-service/src/iam.tf new file mode 100644 index 0000000..74d16d3 --- /dev/null +++ b/templates/gcp-cloud-run-service/src/iam.tf @@ -0,0 +1,25 @@ +# Grant this service's runtime SA the minimum role it needs on each upstream +# resource it consumes. Each connection you picked at scaffold time is available +# as var.<connection_name>.
+# +# Examples — uncomment and adapt based on which connections you selected: +# +# resource "google_pubsub_topic_iam_member" "publisher" { +# project = var.pubsub_topic.project_id +# topic = var.pubsub_topic.topic_name +# role = "roles/pubsub.publisher" +# member = local.runtime_sa_member +# } +# +# resource "google_bigquery_dataset_iam_member" "data_editor" { +# project = var.bigquery_dataset.project_id +# dataset_id = var.bigquery_dataset.dataset_id +# role = "roles/bigquery.dataEditor" +# member = local.runtime_sa_member +# } +# +# resource "google_storage_bucket_iam_member" "object_user" { +# bucket = var.storage_bucket.bucket_name +# role = "roles/storage.objectUser" +# member = local.runtime_sa_member +# } diff --git a/templates/gcp-cloud-run-service/src/main.tf b/templates/gcp-cloud-run-service/src/main.tf new file mode 100644 index 0000000..588b1c3 --- /dev/null +++ b/templates/gcp-cloud-run-service/src/main.tf @@ -0,0 +1,44 @@ +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + region = var.landing_zone.network.region + + runtime_sa_email = google_service_account.runtime.email + runtime_sa_member = "serviceAccount:${google_service_account.runtime.email}" +} + +resource "google_service_account" "runtime" { + project = local.project_id + account_id = trimsuffix(substr(local.name_prefix, 0, 30), "-") # SA IDs are capped at 30 chars and must end in a letter or digit; drop a hyphen left dangling by truncation + display_name = "Cloud Run Runtime — ${local.name_prefix}" + description = "Runtime identity for ${local.name_prefix}. Managed by Massdriver."
+} + +resource "google_cloud_run_v2_service" "main" { + project = local.project_id + name = local.name_prefix + location = local.region + + ingress = "INGRESS_TRAFFIC_INTERNAL_ONLY" + + template { + service_account = local.runtime_sa_email + + containers { + image = var.image + + ports { + container_port = 8080 + } + + resources { + limits = { + cpu = "1" + memory = "512Mi" + } + } + } + } + + labels = var.md_metadata.default_tags +} From b46b1700feaaea30811933207d709ab626ebf567 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Sun, 19 Apr 2026 22:39:32 -0700 Subject: [PATCH 08/15] Fix gcp-vertex-workbench metadata drift causing redeploy failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCP adds auto-managed keys to the Workbench instance metadata after creation (for example enable-jupyterlab4, proxy-mode). Terraform saw those as drift on subsequent plans and tried to prune them, which triggered a gce_setup update. The Workbench API then rejected the apply because any update to gce_setup requires the instance to be stopped first — even when no restricted field (machine_type, shielded config, disable_public_ip, etc.) was actually changing. Add lifecycle.ignore_changes on gce_setup[0].metadata so redeploys are no-op when only the user-managed metadata keys (serial-port-logging- enable, idle-timeout-seconds) are stable. Initial metadata still applies at create time; operators just can't tune metadata via terraform after creation, which matches the Workbench operational model (metadata is set at VM boot and rarely changes).
Co-Authored-By: Claude Opus 4.7 (1M context) --- bundles/gcp-vertex-workbench/src/main.tf | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/bundles/gcp-vertex-workbench/src/main.tf b/bundles/gcp-vertex-workbench/src/main.tf index 47a1778..552691e 100644 --- a/bundles/gcp-vertex-workbench/src/main.tf +++ b/bundles/gcp-vertex-workbench/src/main.tf @@ -152,4 +152,17 @@ resource "google_workbench_instance" "main" { } labels = var.md_metadata.default_tags + + # Google adds auto-managed keys to metadata post-creation (for example + # enable-jupyterlab4, proxy-mode). Terraform sees those as drift and wants to + # prune them, which triggers a gce_setup update. The Workbench API then + # rejects the apply because updates to gce_setup require the instance to be + # stopped first — even when no restricted field (machine_type, shielded + # config, etc.) is actually changing. Ignoring metadata drift keeps + # redeploys no-op when only the user-managed keys are stable. + lifecycle { + ignore_changes = [ + gce_setup[0].metadata, + ] + } } From 83b66e8e92ee4f71d2a6d9ca7df42fb29049de53 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Wed, 22 Apr 2026 11:44:21 -0700 Subject: [PATCH 09/15] Second demo additions: pubsub subscription topology, VPC connector, log sink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Topology change — subscriptions now live on the consumer bundle rather than as their own tile on the canvas. Wiring a pubsub topic into a bigquery-dataset or cloud-run-service implicitly creates the subscription. Matches real-world ownership (the consumer configures its own ack deadline, retry, schema mapping) and halves the number of canvas tiles for a typical pipeline. Versions bumped from 0.1.0 to 0.1.1 across all GCP bundles so dev builds do not collide with the first-demo stable 0.1.0 release. gcp-bigquery-dataset Adds optional pubsub_topic connection. 
When wired, creates a Pub/Sub BigQuery subscription delivering into a configurable table within this dataset. Grants the Pub/Sub service agent dataEditor and metadataViewer on this dataset only. Target table must exist; this bundle does not create tables. gcp-cloud-run-service Adds optional incoming_topic connection. When wired, creates a dedicated push invoker SA (separate from the runtime SA) and a push subscription pointing at the service URL, authenticated via OIDC. Adds optional vpc_connector connection and a vpc_egress enum param (PRIVATE_RANGES_ONLY or ALL_TRAFFIC). The existing pubsub_topic connection for outbound publishing is unchanged. gcp-log-sink New bundle. Project-level Cloud Logging sink with configurable filter and one of two delivery destinations (BigQuery dataset or GCS bucket). Precondition enforces exactly one destination is wired. Writer identity is granted the minimal destination role (dataEditor on BQ, objectCreator on GCS). gcp-vpc-connector artifact definition New. Represents an existing Serverless VPC Access connector for import-only use by Cloud Run and Cloud Functions bundles. templates/gcp-cloud-run-service Updated scaffold examples to include commented-out push subscription and VPC connector blocks alongside the existing upstream IAM examples. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../gcp-log-sink/massdriver.yaml | 68 ++++++++ .../gcp-vpc-connector/massdriver.yaml | 88 +++++++++++ bundles/gcp-bigquery-dataset/README.md | 39 ++++- bundles/gcp-bigquery-dataset/massdriver.yaml | 75 ++++++++- bundles/gcp-bigquery-dataset/operator.md | 39 ++++- bundles/gcp-bigquery-dataset/src/main.tf | 1 + .../gcp-bigquery-dataset/src/subscription.tf | 89 +++++++++++ bundles/gcp-bigquery-dataset/src/variables.tf | 25 +++ bundles/gcp-cloud-run-service/README.md | 31 +++- bundles/gcp-cloud-run-service/massdriver.yaml | 45 +++++- bundles/gcp-cloud-run-service/operator.md | 8 + bundles/gcp-cloud-run-service/src/main.tf | 14 ++ .../src/push_subscription.tf | 106 +++++++++++++ .../gcp-cloud-run-service/src/variables.tf | 36 +++++ bundles/gcp-landing-zone/massdriver.yaml | 2 +- bundles/gcp-log-sink/README.md | 62 ++++++++ bundles/gcp-log-sink/massdriver.yaml | 145 ++++++++++++++++++ bundles/gcp-log-sink/operator.md | 65 ++++++++ bundles/gcp-log-sink/src/artifacts.tf | 13 ++ bundles/gcp-log-sink/src/main.tf | 97 ++++++++++++ bundles/gcp-log-sink/src/variables.tf | 93 +++++++++++ bundles/gcp-network/massdriver.yaml | 2 +- bundles/gcp-pubsub-topic/massdriver.yaml | 2 +- bundles/gcp-storage-bucket/massdriver.yaml | 2 +- bundles/gcp-vertex-workbench/massdriver.yaml | 2 +- templates/gcp-cloud-run-service/README.md | 4 +- templates/gcp-cloud-run-service/src/iam.tf | 3 + templates/gcp-cloud-run-service/src/main.tf | 9 ++ .../src/push_subscription.tf | 43 ++++++ 29 files changed, 1188 insertions(+), 20 deletions(-) create mode 100644 artifact-definitions/gcp-log-sink/massdriver.yaml create mode 100644 artifact-definitions/gcp-vpc-connector/massdriver.yaml create mode 100644 bundles/gcp-bigquery-dataset/src/subscription.tf create mode 100644 bundles/gcp-cloud-run-service/src/push_subscription.tf create mode 100644 bundles/gcp-log-sink/README.md create mode 100644 bundles/gcp-log-sink/massdriver.yaml create mode 100644 
bundles/gcp-log-sink/operator.md create mode 100644 bundles/gcp-log-sink/src/artifacts.tf create mode 100644 bundles/gcp-log-sink/src/main.tf create mode 100644 bundles/gcp-log-sink/src/variables.tf create mode 100644 templates/gcp-cloud-run-service/src/push_subscription.tf diff --git a/artifact-definitions/gcp-log-sink/massdriver.yaml b/artifact-definitions/gcp-log-sink/massdriver.yaml new file mode 100644 index 0000000..78471db --- /dev/null +++ b/artifact-definitions/gcp-log-sink/massdriver.yaml @@ -0,0 +1,68 @@ +name: gcp-log-sink +label: GCP Log Sink +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# Log sinks do not expose a useful IAM surface downstream. The writer_identity +# SA is Google-managed and grants are made ON the destination resource at sink +# creation time. Consumers of this artifact read sink metadata only — no IAM +# binding pattern is needed. +exports: [] + +schema: + title: GCP Log Sink + description: A Google Cloud Logging project sink. Carries the sink name, the + fully-qualified destination string, the Google-managed writer service account + identity, destination type (bigquery or gcs), and the owning project ID. + Downstream bundles can use writer_identity to grant additional access on the + destination resource if needed. + type: object + required: + - project_id + - sink_name + - destination + - writer_identity + - destination_type + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this log sink + type: string + examples: + - my-gcp-project-123 + + sink_name: + title: Sink Name + description: Cloud Logging sink resource name + type: string + examples: + - dataplat-dev-log-sink + + destination: + title: Destination + description: >- + Fully-qualified logging destination URI. For BigQuery: + bigquery.googleapis.com/projects/PROJECT/datasets/DATASET. For GCS: + storage.googleapis.com/BUCKET_NAME. 
+ type: string + examples: + - bigquery.googleapis.com/projects/my-project/datasets/my_dataset + - storage.googleapis.com/my-logs-bucket + + writer_identity: + title: Writer Identity + description: Google-managed writer identity, in IAM member form (prefixed + with serviceAccount:), that Cloud Logging uses to write log entries to the + destination. It must be granted the destination-appropriate role + (bigquery.dataEditor for BigQuery, storage.objectCreator for GCS). The + bundle grants this automatically at creation time. + type: string + examples: + - serviceAccount:p123456789-123456@gcp-sa-logging.iam.gserviceaccount.com + + destination_type: + title: Destination Type + description: Whether logs are routed to BigQuery or GCS + type: string + enum: + - bigquery + - gcs diff --git a/artifact-definitions/gcp-vpc-connector/massdriver.yaml b/artifact-definitions/gcp-vpc-connector/massdriver.yaml new file mode 100644 index 0000000..8721488 --- /dev/null +++ b/artifact-definitions/gcp-vpc-connector/massdriver.yaml @@ -0,0 +1,88 @@ +name: gcp-vpc-connector +label: GCP Serverless VPC Access Connector +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (for bundles that consume this artifact): +# +# VPC Access connectors do not have their own IAM surface — access is governed by +# the VPC and subnet IAM. Consumers reference the connector by its fully-qualified +# name on the Cloud Run / Cloud Functions service. Example Terraform: +# +# resource "google_cloud_run_v2_service" "main" { +# template { +# vpc_access { +# connector = var.vpc_connector.connector_id +# egress = var.vpc_connector.egress_settings +# } +# } +# } +# +# To let a service account use a connector in another project, that SA needs +# `roles/vpcaccess.user` on the connector's project. +exports: [] + +schema: + title: GCP Serverless VPC Access Connector + description: An existing or provisioned Serverless VPC Access connector.
Consumer + bundles (Cloud Run, Cloud Functions) reference connector_id to route egress + traffic through the VPC. + type: object + required: + - project_id + - region + - name + - connector_id + properties: + project_id: + title: Project ID + description: GCP project that owns the connector + type: string + examples: + - my-gcp-project-123 + + region: + title: Region + description: GCP region where the connector is deployed. Must match the region + of the consuming service. + type: string + examples: + - us-central1 + + name: + title: Connector Name + description: Short name of the connector + type: string + examples: + - my-vpc-connector + + connector_id: + title: Connector Resource ID + description: Fully-qualified connector resource name used on the consuming + service's vpc_access.connector field + type: string + examples: + - projects/my-gcp-project-123/locations/us-central1/connectors/my-vpc-connector + + network: + title: VPC Network + description: Name of the VPC network the connector is attached to + type: string + examples: + - default + + ip_cidr_range: + title: IP CIDR Range + description: /28 CIDR range reserved for the connector's internal addresses + type: string + examples: + - 10.8.0.0/28 + + egress_settings: + title: Default Egress Settings + description: Suggested egress setting for services using this connector. + Consumers may override. 
+ type: string + enum: + - ALL_TRAFFIC + - PRIVATE_RANGES_ONLY + default: PRIVATE_RANGES_ONLY diff --git a/bundles/gcp-bigquery-dataset/README.md b/bundles/gcp-bigquery-dataset/README.md index 46834ed..2a324a8 100644 --- a/bundles/gcp-bigquery-dataset/README.md +++ b/bundles/gcp-bigquery-dataset/README.md @@ -13,15 +13,30 @@ Google Cloud BigQuery dataset with configurable location, default table expirati | Resource | Type | Notes | |---|---|---| | `google_bigquery_dataset.main` | BigQuery dataset | Location, expiration, and delete protection set at provision time; Google-managed encryption | +| `google_pubsub_subscription.bigquery` | Pub/Sub subscription | Created only when a Pub/Sub topic is wired. Delivers messages into a table in this dataset. | +| `google_bigquery_dataset_iam_member.pubsub_service_agent_data_editor` | IAM binding | Created only when a Pub/Sub topic is wired. Grants the Pub/Sub service agent `roles/bigquery.dataEditor` on this dataset. | +| `google_bigquery_dataset_iam_member.pubsub_service_agent_metadata_viewer` | IAM binding | Created only when a Pub/Sub topic is wired. Grants the Pub/Sub service agent `roles/bigquery.metadataViewer` on this dataset. | -This bundle does NOT create any IAM bindings. Consumer bundles (e.g., `gcp-cloud-run-service`, `gcp-vertex-workbench`) create their own service accounts and bind the appropriate roles on this dataset when connected on the canvas. +This bundle does NOT create any workload IAM bindings. Consumer bundles (e.g., `gcp-cloud-run-service`, `gcp-vertex-workbench`) create their own service accounts and bind the appropriate roles on this dataset when connected on the canvas. The IAM bindings above are only for the Pub/Sub service agent, and only when a topic is wired. 
## Connections -| Connection | Artifact Type | How It Is Used | -|---|---|---| -| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | -| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` for resource placement | +| Connection | Required | Artifact Type | How It Is Used | +|---|---|---|---| +| `gcp_authentication` | Yes | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `landing_zone` | Yes | `catalog-demo/gcp-landing-zone` | Provides `project_id` for resource placement | +| `pubsub_topic` | No | `catalog-demo/gcp-pubsub-topic` | When wired, creates a Pub/Sub BigQuery subscription that delivers messages into a table in this dataset | + +### Optional: Pub/Sub BigQuery subscription + +When you wire a `gcp-pubsub-topic` bundle to this bundle's `pubsub_topic` connection, the following happens on the next deploy: + +1. The Pub/Sub service agent (`service-PROJECT_NUMBER@gcp-sa-pubsub.iam.gserviceaccount.com`) is granted `roles/bigquery.dataEditor` and `roles/bigquery.metadataViewer` on this dataset. These bindings are dataset-scoped, not project-wide. +2. A Pub/Sub subscription is created on the topic with BigQuery delivery configured to write into the table you specify. + +The IAM bindings are removed when the topic is disconnected and the bundle is redeployed. + +**The target table must already exist.** Pub/Sub does not create BigQuery tables. Create the table in the dataset before wiring the topic connection and deploying. If the table is absent, Terraform will succeed but the subscription will fail to deliver messages. ## Artifact Produced @@ -55,6 +70,18 @@ resource "google_bigquery_dataset_iam_member" "dataset_viewer" { } ``` +### BigQuery subscription parameters + +These params appear in the form under **BigQuery Subscription Settings** and are only used when `pubsub_topic` is wired.
+ +| Parameter | Type | Default | Description | +|---|---|---|---| +| `bigquery_subscription.table_name` | string | — | Name of an existing table in this dataset. Pattern: `^[a-zA-Z0-9_]+$` (letters, digits, and underscores; the form does not enforce the 1024-character maximum). Required when the topic is wired. | +| `bigquery_subscription.use_topic_schema` | boolean | `false` | When true, uses the Pub/Sub topic schema to map message fields to table columns. When false, writes raw bytes to a `data` column. | +| `bigquery_subscription.write_metadata` | boolean | `false` | When true, adds `subscription_name`, `message_id`, `publish_time`, and `attributes` columns to each row. Your table schema must include these columns. | +| `bigquery_subscription.drop_unknown_fields` | boolean | `false` | When true and `use_topic_schema` is enabled, silently drops message fields that are not in the table schema. When false, a message with unknown fields is sent to the dead letter topic (if configured) or dropped. | +| `bigquery_subscription.ack_deadline_seconds` | integer | `60` | Seconds Pub/Sub waits for BigQuery to acknowledge a message before re-delivering. Range 10–600. | + ## Compliance ### Checkov skips @@ -71,6 +98,8 @@ The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with un - The `gcp_authentication` credential has `bigquery.admin` or equivalent IAM on the project. - `dataset_id` is immutable after creation. Changing it requires destroying and recreating the dataset — all data is lost unless exported first. - `default_table_expiration_days` applies only to tables created after the setting is applied. Existing tables are not affected. +- When using the Pub/Sub BigQuery subscription feature, the target table named in `bigquery_subscription.table_name` must already exist in the dataset before deploying. This bundle does not create tables. +- When using the Pub/Sub BigQuery subscription feature, `pubsub.googleapis.com` must be enabled in the project.
## Presets diff --git a/bundles/gcp-bigquery-dataset/massdriver.yaml b/bundles/gcp-bigquery-dataset/massdriver.yaml index 251141d..06c5ab9 100644 --- a/bundles/gcp-bigquery-dataset/massdriver.yaml +++ b/bundles/gcp-bigquery-dataset/massdriver.yaml @@ -5,7 +5,7 @@ description: Google Cloud BigQuery dataset with configurable location, default t access on the dataset. Emits a gcp-bigquery-dataset artifact for downstream Cloud Run, Vertex Workbench, and query workloads. source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-bigquery-dataset -version: 0.1.0 +version: 0.1.1 params: required: @@ -97,6 +97,59 @@ params: type: boolean default: false + bigquery_subscription: + title: BigQuery Subscription Settings + description: Settings for the Pub/Sub BigQuery subscription. Only used when + a Pub/Sub topic is wired to this bundle. The target table must already exist + in this dataset — Pub/Sub does not create it. + type: object + properties: + table_name: + title: Target Table Name + description: Name of an existing table in this dataset where Pub/Sub will + deliver messages. The table must exist before the subscription is created. + Pub/Sub does not create tables automatically. Letters, digits, and underscores + only; maximum 1024 characters. + type: string + pattern: "^[a-zA-Z0-9_]+$" + + use_topic_schema: + title: Use Topic Schema + description: When enabled, BigQuery uses the Pub/Sub topic's schema to + parse messages and map fields to table columns. When disabled, messages + are written as raw bytes in a single `data` column. Requires the topic + to have a schema attached. + type: boolean + default: false + + write_metadata: + title: Write Subscription Metadata + description: When enabled, BigQuery adds extra columns to each row — + `subscription_name`, `message_id`, `publish_time`, and `attributes`. + Useful for auditing and deduplication. The table schema must include + these columns when this is enabled. 
+ type: boolean + default: false + + drop_unknown_fields: + title: Drop Unknown Fields + description: When enabled and use_topic_schema is also enabled, fields + in the message that do not exist in the table schema are silently dropped. + When disabled, unknown fields cause the message to be sent to the dead + letter topic (if configured) or dropped. + type: boolean + default: false + + ack_deadline_seconds: + title: Acknowledgement Deadline (seconds) + description: How long Pub/Sub waits for BigQuery to acknowledge a message + before re-delivering it. Increase if BigQuery write latency exceeds the + default. Range 10–600 seconds. + type: integer + default: 60 + minimum: 10 + maximum: 600 + connections: required: - gcp_authentication @@ -110,6 +163,10 @@ connections: $ref: catalog-demo/gcp-landing-zone title: GCP Landing Zone + pubsub_topic: + $ref: catalog-demo/gcp-pubsub-topic + title: Pub/Sub Topic (optional — wire to enable BigQuery subscription delivery) + artifacts: required: - bigquery_dataset @@ -133,7 +190,23 @@ ui: - location - default_table_expiration_days - delete_protection + - bigquery_subscription - "*" properties: delete_protection: ui:widget: checkbox + bigquery_subscription: + ui:order: + - table_name + - use_topic_schema + - write_metadata + - drop_unknown_fields + - ack_deadline_seconds + - "*" + properties: + use_topic_schema: + ui:widget: checkbox + write_metadata: + ui:widget: checkbox + drop_unknown_fields: + ui:widget: checkbox diff --git a/bundles/gcp-bigquery-dataset/operator.md b/bundles/gcp-bigquery-dataset/operator.md index 5c19b76..8873ed0 100644 --- a/bundles/gcp-bigquery-dataset/operator.md +++ b/bundles/gcp-bigquery-dataset/operator.md @@ -18,10 +18,16 @@ templating: mustache **Dataset-level IAM propagates to all tables, current and future.** For row-level or table-level isolation, use BigQuery row-level security policies or bind IAM at the table level separately. 
-**This bundle creates no IAM bindings.** Consumer bundles bind their own service accounts to this dataset. If a service can't query or load data, the IAM binding is missing from the consumer bundle — not from here. +**Consumer bundles are responsible for their own IAM bindings.** Consumer bundles bind their own service accounts to this dataset. If a service can't query or load data, the IAM binding is missing from the consumer bundle — not from here. The only IAM bindings this bundle creates are the Pub/Sub service agent bindings, and only when a topic is wired. **Cross-region queries are not supported.** BigQuery cannot join tables in different regions in a single query. Use Storage Transfer Service or BigQuery Data Transfer Service to replicate data first. +**BigQuery subscription target table must exist before deploy.** When the `pubsub_topic` connection is wired, Pub/Sub creates the subscription but does NOT create the target table. The table named in `bigquery_subscription.table_name` must already exist in this dataset. If the table is absent, Terraform will succeed but the subscription will fail to deliver messages — they will accumulate and eventually be dropped or sent to a dead letter topic. + +**Schema mismatch routes messages to dead letter or drops them.** When `use_topic_schema = true` and a message contains fields not in the table schema, behavior depends on `drop_unknown_fields`. If `drop_unknown_fields = false` (the default), the message is routed to the dead letter topic if one is configured on the source topic, or dropped. If `drop_unknown_fields = true`, the extra fields are silently discarded and the message is delivered. + +**Pub/Sub IAM bindings are dataset-scoped and removed on disconnect.** When you unwire the `pubsub_topic` connection and redeploy, Terraform removes the two service agent IAM bindings from this dataset. No project-level IAM is modified. Existing data in the table is not affected — only new message delivery stops. 
+ ## Troubleshooting **Permission denied on dataset access.** @@ -44,6 +50,26 @@ Add `bigquery.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` packag bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}}. ``` +**Pub/Sub subscription stuck — messages not appearing in BigQuery.** +First confirm the table exists and the subscription's IAM bindings are in place: +```bash +# Check subscription delivery status and error details +gcloud pubsub subscriptions describe SUBSCRIPTION_NAME --project=PROJECT_ID + +# Confirm IAM bindings on the dataset (look for gcp-sa-pubsub entries) +bq get-iam-policy {{artifacts.bigquery_dataset.dataset_full_name}} +``` +Common causes: table doesn't exist, table schema mismatch with message fields, `use_topic_schema = true` but topic has no schema, or IAM bindings not yet propagated. + +**Pub/Sub subscription creation fails with permission error during deploy.** +The Pub/Sub service agent IAM bindings may not have propagated before the subscription was created. IAM propagation is eventually consistent — wait 30–60 seconds and redeploy. The `depends_on` in this bundle mitigates this but does not eliminate it entirely. + +**Messages delivered but columns are all null.** +If `use_topic_schema = false` (default), messages are written as raw bytes to the `data` column. All other columns will be null unless `write_metadata = true` is set and the table has the corresponding metadata columns. Enable `use_topic_schema` or query the `data` column directly. + +**Deploy fails with "pubsub.googleapis.com has not been used in project."** +Add `pubsub.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry.
+ ## Day-2 operations **Setting expiration on an existing table** (default expiration doesn't backfill): @@ -95,4 +121,15 @@ bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}}.`' + +# List Pub/Sub subscriptions on a topic +gcloud pubsub topics list-subscriptions TOPIC_NAME --project=PROJECT_ID + +# Describe a Pub/Sub subscription (shows delivery config and error state) +gcloud pubsub subscriptions describe SUBSCRIPTION_NAME --project=PROJECT_ID + +# Seek a subscription to a snapshot or timestamp (e.g., replay missed messages) +gcloud pubsub subscriptions seek SUBSCRIPTION_NAME \ + --time=$(date -u +%Y-%m-%dT%H:%M:%SZ -d "1 hour ago") \ + --project=PROJECT_ID ``` diff --git a/bundles/gcp-bigquery-dataset/src/main.tf b/bundles/gcp-bigquery-dataset/src/main.tf index 638d107..4503e24 100644 --- a/bundles/gcp-bigquery-dataset/src/main.tf +++ b/bundles/gcp-bigquery-dataset/src/main.tf @@ -20,6 +20,7 @@ provider "google" { locals { project_id = var.landing_zone.project_id name_prefix = var.md_metadata.name_prefix + dataset_id = var.dataset_id # Convert days → milliseconds for the BigQuery API. BigQuery requires ms. # 0 or null input means "no expiration" → pass null to terraform resource. diff --git a/bundles/gcp-bigquery-dataset/src/subscription.tf b/bundles/gcp-bigquery-dataset/src/subscription.tf new file mode 100644 index 0000000..66c2c2d --- /dev/null +++ b/bundles/gcp-bigquery-dataset/src/subscription.tf @@ -0,0 +1,89 @@ +# ─── BigQuery Subscription (optional) ───────────────────────────────────────── +# +# This file is count-gated on var.pubsub_topic being non-null. +# When a Pub/Sub topic is wired on the canvas, three resources are created: +# +# 1. google_bigquery_dataset_iam_member.pubsub_service_agent_data_editor +# 2. google_bigquery_dataset_iam_member.pubsub_service_agent_metadata_viewer +# 3. google_pubsub_subscription.bigquery +# +# IAM bindings grant the Pub/Sub service agent +# (service-PROJECT_NUMBER@gcp-sa-pubsub.iam.gserviceaccount.com) the minimum +# roles required to write messages into BigQuery.
Bindings are dataset-scoped, +# not project-wide. +# +# IMPORTANT — the target table must exist before deployment. +# Pub/Sub does NOT create BigQuery tables. Create the table in the dataset +# (manually, via a companion bundle, or via Dataform) before wiring this +# connection. Deploying when the table is absent will succeed at the Terraform +# layer but the subscription will fail to deliver and messages will back up. + +# ─── Pub/Sub service agent project number ────────────────────────────────────── +# The Pub/Sub service agent SA is project-number-scoped, so we need the numeric +# project number to construct the identity. +data "google_project" "this" { + project_id = local.project_id +} + +locals { + pubsub_enabled = var.pubsub_topic != null + pubsub_service_account = "serviceAccount:service-${data.google_project.this.number}@gcp-sa-pubsub.iam.gserviceaccount.com" + + # BigQuery subscription table reference format: projectId:datasetId.tableId + # This is the format required by the Pub/Sub API and the Terraform provider. + bq_table_ref = local.pubsub_enabled ? "${local.project_id}:${local.dataset_id}.${var.bigquery_subscription.table_name}" : null + + # Subscription name derived from the bundle name prefix for uniqueness. + subscription_name = "${local.name_prefix}-bq" +} + +# ─── IAM: dataEditor on this dataset ────────────────────────────────────────── +# Required so the Pub/Sub service agent can INSERT rows into the target table. +resource "google_bigquery_dataset_iam_member" "pubsub_service_agent_data_editor" { + count = local.pubsub_enabled ? 1 : 0 + + project = local.project_id + dataset_id = google_bigquery_dataset.main.dataset_id + role = "roles/bigquery.dataEditor" + member = local.pubsub_service_account +} + +# ─── IAM: metadataViewer on this dataset ────────────────────────────────────── +# Required so the Pub/Sub service agent can read table schemas and dataset +# metadata to validate message delivery configuration. 
+resource "google_bigquery_dataset_iam_member" "pubsub_service_agent_metadata_viewer" { + count = local.pubsub_enabled ? 1 : 0 + + project = local.project_id + dataset_id = google_bigquery_dataset.main.dataset_id + role = "roles/bigquery.metadataViewer" + member = local.pubsub_service_account +} + +# ─── Pub/Sub subscription with BigQuery delivery ─────────────────────────────── +resource "google_pubsub_subscription" "bigquery" { + count = local.pubsub_enabled ? 1 : 0 + + project = var.pubsub_topic.project_id + name = local.subscription_name + topic = var.pubsub_topic.topic_id + + ack_deadline_seconds = var.bigquery_subscription.ack_deadline_seconds + + bigquery_config { + table = local.bq_table_ref + use_topic_schema = var.bigquery_subscription.use_topic_schema + write_metadata = var.bigquery_subscription.write_metadata + drop_unknown_fields = var.bigquery_subscription.drop_unknown_fields + } + + labels = var.md_metadata.default_tags + + # The IAM bindings must exist before Pub/Sub validates the subscription's + # ability to write to BigQuery. Without these, the subscription creation will + # fail with a permission error even though the resource itself is valid. + depends_on = [ + google_bigquery_dataset_iam_member.pubsub_service_agent_data_editor, + google_bigquery_dataset_iam_member.pubsub_service_agent_metadata_viewer, + ] +} diff --git a/bundles/gcp-bigquery-dataset/src/variables.tf b/bundles/gcp-bigquery-dataset/src/variables.tf index 50399ba..9181dd6 100644 --- a/bundles/gcp-bigquery-dataset/src/variables.tf +++ b/bundles/gcp-bigquery-dataset/src/variables.tf @@ -72,3 +72,28 @@ variable "delete_protection" { type = bool default = false } + +# Optional — only present when a Pub/Sub topic is wired on the canvas. +variable "pubsub_topic" { + description = "Pub/Sub topic artifact. When wired, a BigQuery subscription is created to deliver messages into this dataset." 
+ type = object({ + project_id = string + topic_name = string + topic_id = string + dlq_topic_id = optional(string) + dlq_topic_name = optional(string) + }) + default = null +} + +variable "bigquery_subscription" { + description = "Settings for the Pub/Sub BigQuery subscription. Consumed only when pubsub_topic is non-null." + type = object({ + table_name = optional(string) + use_topic_schema = optional(bool, false) + write_metadata = optional(bool, false) + drop_unknown_fields = optional(bool, false) + ack_deadline_seconds = optional(number, 60) + }) + default = {} +} diff --git a/bundles/gcp-cloud-run-service/README.md b/bundles/gcp-cloud-run-service/README.md index 87f8fe5..f7484c0 100644 --- a/bundles/gcp-cloud-run-service/README.md +++ b/bundles/gcp-cloud-run-service/README.md @@ -27,9 +27,12 @@ Typical workflow: | `google_service_account.runtime` | Per-service runtime SA | This service's workload identity — one per bundle instance | | `google_cloud_run_v2_service.main` | Cloud Run v2 service | Runs containers as the runtime SA | | `google_cloud_run_v2_service_iam_member` (allUsers) | Public invoker IAM | Created only when `allow_unauthenticated = true` | -| `google_pubsub_topic_iam_member` | Pub/Sub publisher IAM | Created only when Pub/Sub topic is connected | -| `google_bigquery_dataset_iam_member` | BigQuery data editor IAM | Created only when BigQuery dataset is connected | -| `google_storage_bucket_iam_member` | GCS object user IAM | Created only when Storage bucket is connected | +| `google_pubsub_topic_iam_member` | Pub/Sub publisher IAM | Created only when `pubsub_topic` is connected | +| `google_bigquery_dataset_iam_member` | BigQuery data editor IAM | Created only when `bigquery_dataset` is connected | +| `google_storage_bucket_iam_member` | GCS object user IAM | Created only when `storage_bucket` is connected | +| `google_service_account.push_invoker` | Push invoker SA | Created only when `incoming_topic` is connected — used by Pub/Sub for OIDC, 
separate from the runtime SA | +| `google_cloud_run_v2_service_iam_member` (push_invoker) | Push invoker IAM | Created only when `incoming_topic` is connected — grants `roles/run.invoker` to the push invoker SA | +| `google_pubsub_subscription.push` | Pub/Sub push subscription | Created only when `incoming_topic` is connected — delivers messages to this service's URL | ## Connections @@ -42,7 +45,9 @@ Typical workflow: ### Optional -When connected on the canvas, the bundle automatically grants this service's runtime SA the listed IAM role. When absent, no binding is created. +Connecting or disconnecting a canvas wire does not take effect until a Terraform apply runs. + +**Outgoing data connections** — grant this service's runtime SA the listed IAM role on the upstream resource: | Connection | Artifact Type | IAM Role Granted | |---|---|---| @@ -50,7 +55,21 @@ When connected on the canvas, the bundle automatically grants this service's run | `bigquery_dataset` | `catalog-demo/gcp-bigquery-dataset` | `roles/bigquery.dataEditor` on the dataset | | `storage_bucket` | `catalog-demo/gcp-storage-bucket` | `roles/storage.objectUser` on the bucket | -Connecting or disconnecting a canvas wire does not take effect until a Terraform apply runs. +**Incoming message delivery** — creates a Pub/Sub push subscription that calls this service's URL: + +| Connection | Artifact Type | What Gets Created | +|---|---|---| +| `incoming_topic` | `catalog-demo/gcp-pubsub-topic` | Push subscription on the topic + a dedicated `push_invoker` SA granted `roles/run.invoker` on this service | + +The push subscription uses a separate `push_invoker` service account (not the runtime SA) for OIDC authentication. Pub/Sub attaches a signed OIDC token for that SA to every HTTP request. Cloud Run validates the token and the `roles/run.invoker` binding before routing the request to the container. 
The `push_ack_deadline_seconds` param (default 60, max 600) controls how long Pub/Sub waits for a 2xx before redelivering. + +**Private egress** — routes outbound traffic through a VPC for access to private endpoints: + +| Connection | Artifact Type | What Gets Created | +|---|---|---| +| `vpc_connector` | `catalog-demo/gcp-vpc-connector` | Attaches the connector to the Cloud Run service's `vpc_access` block | + +The `vpc_egress` param controls whether only RFC1918 traffic (`PRIVATE_RANGES_ONLY`) or all outbound traffic (`ALL_TRAFFIC`) goes through the connector. Use `ALL_TRAFFIC` when downstream services such as Kafka brokers are on private IPs reachable only through the VPC. The connector must be in the same GCP region as this Cloud Run service. ## Artifact Produced @@ -90,7 +109,7 @@ The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with un - The landing zone provides `project_id` and `network.region`. It does NOT provide a workload SA — this bundle creates its own. - The runtime SA does not automatically have `roles/artifactregistry.reader`. If your image is in Artifact Registry, grant that role manually or add it to the bundle source. -- VPC connector or direct VPC egress is not provisioned by this bundle. If you need to reach VPC-private resources (e.g., Cloud SQL without public IP), add a `google_vpc_access_connector` resource to the bundle source. +- The VPC connector is consumed by this bundle (via the `vpc_connector` optional connection) but not provisioned here. Deploy a VPC connector bundle separately and wire it on the canvas. - The default image (`gcr.io/cloudrun/hello`) is the Google-managed hello-world container. Replace it with your application image before a real deployment. 
## Presets diff --git a/bundles/gcp-cloud-run-service/massdriver.yaml b/bundles/gcp-cloud-run-service/massdriver.yaml index e0eafce..236c1dc 100644 --- a/bundles/gcp-cloud-run-service/massdriver.yaml +++ b/bundles/gcp-cloud-run-service/massdriver.yaml @@ -6,7 +6,7 @@ description: Google Cloud Run v2 service with auto-binding IAM for upstream data GCS objectUser). Emits a gcp-cloud-run-service artifact for downstream event sources (Scheduler, Pub/Sub push) to use for invoking the service. source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-cloud-run-service -version: 0.1.0 +version: 0.1.1 params: required: @@ -148,6 +148,33 @@ params: type: boolean default: false + push_ack_deadline_seconds: + title: Push Subscription Ack Deadline (seconds) + description: Maximum time Pub/Sub waits for the service to acknowledge a push + message before redelivering it. Range is 10–600 seconds. Only used when + `incoming_topic` is connected. Set this to at least as long as your handler's + maximum processing time. If your handler cannot finish within 600 seconds, + acknowledge early and process asynchronously. + type: integer + minimum: 10 + maximum: 600 + default: 60 + + vpc_egress: + title: VPC Egress + description: Controls which traffic is routed through the VPC connector. Only + used when `vpc_connector` is connected. `PRIVATE_RANGES_ONLY` routes RFC1918 + traffic (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16) through the connector; + public destinations still egress directly. `ALL_TRAFFIC` forces all outbound + traffic — including public API calls — through the connector and VPC. Use + `ALL_TRAFFIC` when the downstream endpoint (e.g., a Kafka broker) is on a + private IP behind the connector. 
+ type: string + default: PRIVATE_RANGES_ONLY + enum: + - PRIVATE_RANGES_ONLY + - ALL_TRAFFIC + connections: required: - gcp_authentication @@ -176,6 +203,20 @@ connections: $ref: catalog-demo/gcp-storage-bucket title: Storage Bucket (optional) + # incoming_topic: when wired, this bundle creates a Pub/Sub push subscription + # that delivers messages FROM this topic INTO this Cloud Run service's URL. + # The subscription uses a dedicated push_invoker SA for OIDC authentication — + # separate from the runtime SA that runs the container. + incoming_topic: + $ref: catalog-demo/gcp-pubsub-topic + title: Incoming Pub/Sub Topic (optional — creates push subscription) + + # vpc_connector: when wired, attaches the connector to this Cloud Run service + # for private VPC egress. The connector must be in the same region as the service. + vpc_connector: + $ref: catalog-demo/gcp-vpc-connector + title: VPC Connector (optional — private egress) + artifacts: required: - cloud_run_service @@ -201,6 +242,8 @@ ui: - max_instances - ingress - allow_unauthenticated + - vpc_egress + - push_ack_deadline_seconds - "*" properties: allow_unauthenticated: diff --git a/bundles/gcp-cloud-run-service/operator.md b/bundles/gcp-cloud-run-service/operator.md index 339f7ba..ecbc6d6 100644 --- a/bundles/gcp-cloud-run-service/operator.md +++ b/bundles/gcp-cloud-run-service/operator.md @@ -8,6 +8,14 @@ templating: mustache **Each bundle instance creates its own service account.** The SA email is derived from the bundle's `name_prefix`. If the package is renamed, the SA is destroyed and recreated. Any out-of-band IAM bindings referencing the old SA email (e.g., manually granted Artifact Registry reader) must be reapplied. Canvas-wired bindings (Pub/Sub, BigQuery, GCS) are recreated automatically on the next deploy. 
+**The push subscription uses a SEPARATE service account from the runtime SA.** When `incoming_topic` is connected, this bundle creates two SAs: the runtime SA (which the container runs as and which holds data-access IAM bindings) and a `push_invoker` SA (which Pub/Sub uses exclusively to OIDC-authenticate HTTP push deliveries). Do not confuse them — they have different emails, different roles, and different lifecycles. The push invoker SA's account ID in GCP is the first 28 characters of the bundle's `name_prefix` followed by `-p`. + +**The VPC connector must be in the same region as this Cloud Run service.** The connector region is taken from the `catalog-demo/gcp-vpc-connector` artifact (`connector.region`). If the connector is in a different region than the landing zone's `network.region`, the Cloud Run deploy will fail with a region mismatch error. Deploy the connector bundle in the correct region before wiring. + +**`vpc_egress = PRIVATE_RANGES_ONLY` does NOT route all traffic through the VPC.** Only RFC1918 destinations (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16) are routed through the connector. Public API calls (e.g., Google APIs, external HTTP endpoints) still egress directly to the internet. If your downstream endpoint — such as a Kafka broker — is on a private IP behind the connector, use `ALL_TRAFFIC` to force all egress through the VPC. If using `ALL_TRAFFIC`, ensure the VPC has a Cloud NAT gateway configured, otherwise internet-bound traffic will have no route. + +**Push subscription ack deadline is capped at 600 seconds.** If a handler cannot complete within 600 seconds, it must acknowledge the message early (return HTTP 2xx immediately) and process asynchronously using a background task, Cloud Tasks, or another mechanism. If the service does not return a 2xx before the deadline expires, Pub/Sub redelivers the message, which leads to duplicate processing. + +**New deployments route 100% of traffic to the latest revision immediately.** Blue/green splits must be configured before deploying the new revision.
You cannot retroactively split traffic between revisions once the new one is live at 100%. **Changing `ingress` triggers a new revision and a cold start.** Even if `min_instances > 0`, an ingress change forces revision replacement. diff --git a/bundles/gcp-cloud-run-service/src/main.tf b/bundles/gcp-cloud-run-service/src/main.tf index 5af4e09..4f024c6 100644 --- a/bundles/gcp-cloud-run-service/src/main.tf +++ b/bundles/gcp-cloud-run-service/src/main.tf @@ -93,6 +93,20 @@ resource "google_cloud_run_v2_service" "main" { } } } + + # ── VPC Connector ───────────────────────────────────────────────────────── + # Only configured when vpc_connector is wired on the canvas. + # connector_id is the fully-qualified resource name provided by the + # catalog-demo/gcp-vpc-connector artifact. egress is controlled by the + # vpc_egress param — use ALL_TRAFFIC to force all outbound traffic (including + # public destinations) through the VPC, e.g. for Kafka on a private endpoint. + dynamic "vpc_access" { + for_each = var.vpc_connector != null ? [1] : [] + content { + connector = var.vpc_connector.connector_id + egress = var.vpc_egress + } + } } labels = var.md_metadata.default_tags diff --git a/bundles/gcp-cloud-run-service/src/push_subscription.tf b/bundles/gcp-cloud-run-service/src/push_subscription.tf new file mode 100644 index 0000000..7a12041 --- /dev/null +++ b/bundles/gcp-cloud-run-service/src/push_subscription.tf @@ -0,0 +1,106 @@ +# ─── Pub/Sub Push Subscription ──────────────────────────────────────────────── +# +# This file is active only when `incoming_topic` is wired on the canvas. +# It creates everything needed for Pub/Sub to authenticate and invoke this +# Cloud Run service via HTTP push. +# +# TWO SERVICE ACCOUNT PATTERN +# ─────────────────────────── +# This uses two separate service accounts with distinct purposes: +# +# google_service_account.runtime (created in main.tf) +# ↳ The identity the container RUNS AS. 
It holds IAM bindings for +# Pub/Sub publisher, BigQuery, GCS, etc. — resources the app accesses. +# DO NOT use this SA for the push subscription OIDC token. +# +# google_service_account.push_invoker (created in THIS file) +# ↳ The identity Pub/Sub uses to INVOKE the Cloud Run service via HTTP. +# It is granted only roles/run.invoker on this specific service. +# Pub/Sub attaches an OIDC token for this SA to each push request, +# which Cloud Run validates before passing the request to the container. +# +# Separating these SAs means a compromised push subscription token cannot be +# used to publish messages or access data resources, and the runtime SA cannot +# be used to forge push deliveries from other topics. +# +# FLOW +# ──── +# Pub/Sub publishes a message to incoming_topic +# → Pub/Sub's push delivery thread attaches an OIDC token for push_invoker SA +# → Cloud Run validates the token → roles/run.invoker check passes +# → Request is routed to the container (running as the runtime SA) +# → Container processes the message and returns 2xx to acknowledge + +# ─── Push Invoker Service Account ───────────────────────────────────────────── +# A dedicated SA used exclusively by Pub/Sub to OIDC-authenticate push requests. +# account_id is capped at 30 chars (GCP limit). We use a "-p" suffix to +# distinguish it from the runtime SA that shares the same name_prefix. + +resource "google_service_account" "push_invoker" { + count = var.incoming_topic != null ? 1 : 0 + + project = local.project_id + account_id = "${substr(local.name_prefix, 0, 28)}-p" + display_name = "Pub/Sub Push Invoker — ${local.name_prefix}" + description = "Used by Pub/Sub to invoke Cloud Run service ${local.name_prefix} via OIDC push. Managed by Massdriver." +} + +# ─── Grant push_invoker SA run.invoker on THIS service ──────────────────────── +# Scoped to this specific Cloud Run service — not a project-level binding. +# This is the minimal permission Pub/Sub needs to successfully deliver messages. 
+ +resource "google_cloud_run_v2_service_iam_member" "push_invoker" { + count = var.incoming_topic != null ? 1 : 0 + + project = local.project_id + location = local.region + name = google_cloud_run_v2_service.main.name + role = "roles/run.invoker" + member = "serviceAccount:${google_service_account.push_invoker[0].email}" +} + +# ─── Pub/Sub Push Subscription ──────────────────────────────────────────────── +# Subscribes to incoming_topic and delivers messages to this service's URL. +# +# push_endpoint: the service's root URI (provided by the Cloud Run v2 API). +# Append a path (e.g., /events) in the service code or override push_endpoint +# to a path — Pub/Sub appends nothing by default. +# +# oidc_token: Pub/Sub attaches a signed OIDC token for push_invoker SA on every +# request. Cloud Run validates the token and checks run.invoker before routing. +# audience defaults to the push_endpoint URL, which is the correct value for +# Cloud Run OIDC validation. +# +# ack_deadline_seconds: if the service does not return 2xx within this window, +# Pub/Sub redelivers the message. Max is 600s. Long-running handlers must either +# acknowledge early (return 2xx, then process async) or stay well under the limit. +# +# retry_policy: exponential backoff between redeliveries. 10s minimum and 600s +# maximum are sensible defaults for most event-driven workloads. Tune if your +# downstream has specific rate constraints. + +resource "google_pubsub_subscription" "push" { + count = var.incoming_topic != null ? 
1 : 0 + + project = var.incoming_topic.project_id + name = "${local.name_prefix}-push" + topic = var.incoming_topic.topic_id + + ack_deadline_seconds = var.push_ack_deadline_seconds + + push_config { + push_endpoint = google_cloud_run_v2_service.main.uri + + oidc_token { + service_account_email = google_service_account.push_invoker[0].email + # audience defaults to push_endpoint — correct for Cloud Run OIDC validation + } + } + + retry_policy { + minimum_backoff = "10s" + maximum_backoff = "600s" + } + + labels = var.md_metadata.default_tags +} diff --git a/bundles/gcp-cloud-run-service/src/variables.tf b/bundles/gcp-cloud-run-service/src/variables.tf index 46004f8..eec3a46 100644 --- a/bundles/gcp-cloud-run-service/src/variables.tf +++ b/bundles/gcp-cloud-run-service/src/variables.tf @@ -87,6 +87,32 @@ variable "storage_bucket" { default = null } +variable "incoming_topic" { + description = "Optional Pub/Sub topic connection. When provided, a push subscription is created that delivers messages from this topic to this Cloud Run service's URL. Uses a dedicated push_invoker SA for OIDC authentication." + type = object({ + project_id = string + topic_name = string + topic_id = string + dlq_topic_name = optional(string) + dlq_topic_id = optional(string) + }) + default = null +} + +variable "vpc_connector" { + description = "Optional VPC connector connection. When provided, the Cloud Run service's vpc_access block is configured with the connector for private VPC egress." 
+ type = object({ + project_id = string + region = string + name = string + connector_id = string + network = optional(string) + ip_cidr_range = optional(string) + egress_settings = optional(string) + }) + default = null +} + # ─── Service params ──────────────────────────────────────────────────────────── variable "image" { @@ -128,3 +154,13 @@ variable "allow_unauthenticated" { type = bool default = false } + +variable "push_ack_deadline_seconds" { + type = number + default = 60 +} + +variable "vpc_egress" { + type = string + default = "PRIVATE_RANGES_ONLY" +} diff --git a/bundles/gcp-landing-zone/massdriver.yaml b/bundles/gcp-landing-zone/massdriver.yaml index 76cd85c..f951ec3 100644 --- a/bundles/gcp-landing-zone/massdriver.yaml +++ b/bundles/gcp-landing-zone/massdriver.yaml @@ -5,7 +5,7 @@ description: Project-level governance construct for a GCP data platform. Enables single landing-zone artifact so downstream bundles only need one connection. Does NOT provision workload service accounts — each consumer bundle creates its own. source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-landing-zone -version: 0.1.0 +version: 0.1.1 params: required: diff --git a/bundles/gcp-log-sink/README.md b/bundles/gcp-log-sink/README.md new file mode 100644 index 0000000..7d12d4c --- /dev/null +++ b/bundles/gcp-log-sink/README.md @@ -0,0 +1,62 @@ +# gcp-log-sink + +Routes Cloud Logging entries from a GCP project to either a BigQuery dataset or a GCS bucket. Exactly one destination must be wired — the bundle enforces this with a Terraform precondition. The Google-managed sink writer service account is automatically granted the minimum required IAM role on the chosen destination. + +## Use Cases + +- Persistent audit log storage: pipe `cloudaudit.googleapis.com/activity` to GCS for long-term retention at low cost. +- Log-based analytics: route application or infrastructure logs to BigQuery for SQL queries and dashboards. 
+- Error alerting pipeline: filter `severity >= ERROR` to BigQuery, then query from Vertex Workbench or a BI tool. + +## Resources Created + +| Resource | Description | +|---|---| +| `google_logging_project_sink.main` | Project-scoped Cloud Logging sink with unique writer identity | +| `google_bigquery_dataset_iam_member.sink_writer` | (BigQuery only) Grants sink SA `roles/bigquery.dataEditor` on the dataset | +| `google_storage_bucket_iam_member.sink_writer` | (GCS only) Grants sink SA `roles/storage.objectCreator` on the bucket | + +## Connections + +### Required + +- **GCP Credentials** (`gcp-service-account`) — service account used by Terraform to create and manage the sink. +- **GCP Landing Zone** (`catalog-demo/gcp-landing-zone`) — provides the project ID where the sink is created. + +### Optional Destinations (exactly one must be wired) + +- **BigQuery Dataset** (`catalog-demo/gcp-bigquery-dataset`) — route logs to this dataset. Logs land in tables named after the log type; date-partitioned when `use_partitioned_tables` is enabled. +- **GCS Bucket** (`catalog-demo/gcp-storage-bucket`) — route logs to this bucket. Cloud Logging batches entries hourly into JSON files organized by date and hour. + +If neither or both destinations are wired, `tofu plan` will fail with a clear error message. + +## Artifact Produced + +`catalog-demo/gcp-log-sink` — carries `project_id`, `sink_name`, `destination`, `writer_identity`, and `destination_type`. Downstream bundles rarely need to consume this artifact directly; it is published for observability and chaining. + +## Parameters + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `filter` | string | `""` | Cloud Logging query filter. Empty = all logs. | +| `use_partitioned_tables` | boolean | `true` | BigQuery only — write to date-partitioned tables. | +| `exclusions` | array | `[]` | Per-exclusion drop rules applied after the sink filter. 
| + +### Filter Examples + +``` +severity >= ERROR +resource.type = "cloud_run_revision" +logName = "projects/PROJECT/logs/cloudaudit.googleapis.com%2Factivity" +resource.type = "gce_instance" AND severity >= WARNING +``` + +## Compliance + +Log sinks are low-risk infrastructure. No Checkov skips are expected. `halt_on_failure` is set to block deployments to `prod`, `prd`, and `production` environments on any compliance failure. + +## Assumptions + +- This bundle creates a **project-level** sink. It does NOT capture logs from child projects, folders, or the organization. Folder or org sinks are out of scope. +- `unique_writer_identity = true` is non-negotiable. Sharing the project-level logging SA across sinks would mean IAM grants on one sink's destination affect all other sinks. +- Filter changes take effect immediately but do NOT backfill historical logs. Logs written before the filter change are not re-routed. diff --git a/bundles/gcp-log-sink/massdriver.yaml b/bundles/gcp-log-sink/massdriver.yaml new file mode 100644 index 0000000..63cff99 --- /dev/null +++ b/bundles/gcp-log-sink/massdriver.yaml @@ -0,0 +1,145 @@ +name: gcp-log-sink +description: Google Cloud Logging project-level sink with configurable filter and + destination. Routes log entries matching the filter to a BigQuery dataset or GCS + bucket. Automatically grants the Google-managed sink writer identity the minimum + required IAM role on the chosen destination. Enforces exactly-one destination via + a Terraform precondition. Emits a gcp-log-sink artifact with sink metadata for + downstream reference. 
+source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-log-sink +version: 0.1.1 + +params: + examples: + - __name: Error Logs to BigQuery + filter: "severity >= ERROR" + use_partitioned_tables: true + exclusions: [] + - __name: Audit Logs to GCS + filter: 'logName = "projects/PROJECT/logs/cloudaudit.googleapis.com%2Factivity"' + use_partitioned_tables: false + exclusions: [] + - __name: All Logs (no filter) + filter: "" + use_partitioned_tables: true + exclusions: [] + + properties: + filter: + title: Log Filter + description: >- + Cloud Logging filter query that determines which log entries are routed to + the destination. Leave empty to include all logs (can be expensive). Common + examples: severity >= ERROR / resource.type = "cloud_run_revision" / + logName = "projects/PROJECT/logs/cloudaudit.googleapis.com%2Factivity". + Filter syntax: https://cloud.google.com/logging/docs/view/logging-query-language + type: string + default: "" + + use_partitioned_tables: + title: Use Partitioned Tables (BigQuery only) + description: >- + When the destination is BigQuery, write log entries into date-partitioned + tables instead of a single monolithic table. Partitioning reduces query cost + and improves performance for time-bounded queries. Has no effect when the + destination is GCS — the setting is stored but ignored. + type: boolean + default: true + + exclusions: + title: Exclusions + description: >- + Optional list of log exclusions. Each exclusion drops log entries that match + its filter before they reach the destination. Useful for suppressing high-volume + low-value logs (e.g., health-check requests) from storage costs. Exclusions + are evaluated AFTER the sink filter, so they can only drop entries that would + otherwise be included. + type: array + default: [] + items: + type: object + required: + - name + - filter + properties: + name: + title: Exclusion Name + description: >- + Short identifier for this exclusion rule. 
Must be unique within the sink. + Used in Logging metrics and audit logs to identify which exclusion applied. + type: string + filter: + title: Exclusion Filter + description: >- + Cloud Logging query language filter. Log entries matching this filter + are dropped from the sink. Use the same syntax as the top-level sink + filter field. + type: string + description: + title: Description + description: Optional human-readable explanation of why these logs are excluded. + type: string + disabled: + title: Disabled + description: >- + When true, the exclusion is defined but not active — matching logs are + still routed to the destination. Useful for temporarily suspending an + exclusion without deleting it. + type: boolean + default: false + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + + bigquery_dataset: + $ref: catalog-demo/gcp-bigquery-dataset + title: BigQuery Dataset (optional destination) + + storage_bucket: + $ref: catalog-demo/gcp-storage-bucket + title: GCS Bucket (optional destination) + +artifacts: + required: + - log_sink + properties: + log_sink: + $ref: catalog-demo/gcp-log-sink + title: GCP Log Sink + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - filter + - use_partitioned_tables + - exclusions + - "*" + properties: + use_partitioned_tables: + ui:widget: checkbox + exclusions: + items: + ui:order: + - name + - filter + - description + - disabled + - "*" + properties: + disabled: + ui:widget: checkbox diff --git a/bundles/gcp-log-sink/operator.md b/bundles/gcp-log-sink/operator.md new file mode 100644 index 0000000..9517b24 --- /dev/null +++ b/bundles/gcp-log-sink/operator.md @@ -0,0 +1,65 @@ +# gcp-log-sink — 
Operator Runbook + +## Non-obvious Constraints + +**Project scope only.** This sink captures logs from the project specified in the landing zone connection. Logs from other projects, child folders, or the organization are not captured. Folder-level and org-level sinks require a different Terraform resource (`google_logging_folder_sink` / `google_logging_organization_sink`) and are out of scope for this bundle. + +**`unique_writer_identity` is locked to `true`.** The Google-managed writer SA is unique per sink. If set to `false`, Cloud Logging would use the shared `cloud-logs@system.gserviceaccount.com` SA, which cannot be individually scoped to a single dataset or bucket. Changing this after the sink is created requires destroy and recreate — the writer identity changes. + +**Writer identity is generated at sink creation.** The `writer_identity` SA email is not known before `tofu apply`. It is provisioned by Cloud Logging when the sink resource is created. If the sink is destroyed and recreated (not updated in place), a NEW writer identity SA is generated and all prior IAM bindings on the destination become stale. This bundle re-creates its own IAM binding for the new identity, but any manually added bindings on the destination are not updated and must be reapplied by hand. + +**Filter changes are non-backfilling.** Updating the `filter` takes effect immediately for new log entries. Historical logs already written to the destination are not touched. Entries that were routed before the filter change remain in BigQuery or GCS permanently. + +**BigQuery schema is auto-created and can drift.** Cloud Logging infers table schema from log entry structure. If Google changes the structure of a system log (e.g., adds or renames a field), existing tables are not migrated. Queries relying on specific field paths may break. Use `SELECT *` with caution in production pipelines. + +**GCS batching latency.** Cloud Logging batches log entries hourly before writing to GCS.
The sink is not suitable for near-real-time querying or alerting. Use BigQuery with `use_partitioned_tables = true` for latency-sensitive use cases. + +**Exactly-one destination is a hard constraint.** The Terraform precondition blocks plan if both or neither optional connections are wired. This check fires before any API calls — you will see the error in the deployment log from the `tofu plan` step. + +## Troubleshooting + +**"precondition failed: Connect either a BigQuery dataset or a Storage bucket"** — Exactly one of the two optional connections (`bigquery_dataset`, `storage_bucket`) must be wired on the canvas. Check the canvas wiring and re-deploy. + +**Sink exists but no logs appear in destination** — Verify the filter is correct by testing it in the Logs Explorer (`console.cloud.google.com/logs/query`) against live traffic before applying it to the sink. An overly restrictive filter results in a valid sink that routes nothing. + +**IAM error: "The caller does not have permission on the resource"** — The sink writer identity SA needs time to propagate after creation. If IAM bindings were applied but the sink was just created, wait 60-90 seconds and check again. If the sink was destroyed and recreated, the writer identity changed — check the artifact `writer_identity` field and verify the IAM binding reflects the new SA. + +**BigQuery tables not appearing after deploy** — Cloud Logging creates tables lazily: the first matching log entry triggers table creation. If no logs match the filter, no tables appear. Confirm by checking Logs Explorer for matching entries, then wait up to 5 minutes. + +**GCS files not appearing** — Cloud Logging writes hourly. Wait at least 90 minutes after deploy before concluding there is a problem. Check the Logs Explorer for entries matching the filter first. + +**"ALREADY_EXISTS" error on sink creation** — A sink with the same name (derived from `md_metadata.name_prefix`) already exists in the project. 
This happens if a previous deployment left a sink that Terraform state does not track. Import the existing sink: `tofu import google_logging_project_sink.main projects/PROJECT/sinks/SINK_NAME`. + +## Day-2 Operations + +**Updating the filter** — Change the `filter` param in the package config and deploy. The sink is updated in place. Filter changes are immediate for new log entries. No restart or recreate needed. + +**Adding an exclusion** — Add an entry to the `exclusions` array and deploy. Exclusions are applied after the sink filter. Use the Logs Explorer to validate the exclusion filter matches what you intend before deploying to production. + +**Switching destinations** — Changing from BigQuery to GCS (or vice versa) requires the opposite connection to be wired AND the currently wired connection to be unwired simultaneously. The precondition blocks any state where both or neither are active. Execute the connection change and re-deploy in a single operation. The old IAM binding is removed and a new one is created. The sink name and writer identity do not change. + +**Decommissioning** — Destroying the bundle removes the sink and the IAM binding. Log entries already in the destination (BigQuery tables or GCS objects) are NOT deleted — they remain in the destination resource and accrue storage cost until manually removed. 
+ +## Useful Commands + +```bash +# List sinks in the project +gcloud logging sinks list --project=PROJECT_ID + +# Describe a specific sink +gcloud logging sinks describe SINK_NAME --project=PROJECT_ID + +# Check sink writer identity (useful for manual IAM debugging) +gcloud logging sinks describe SINK_NAME --project=PROJECT_ID --format="value(writerIdentity)" + +# Test a log filter from the CLI (same query language as Logs Explorer; prints up to 10 matching entries) +gcloud logging read 'severity >= ERROR' --project=PROJECT_ID --limit=10 + +# List PROJECT-level IAM bindings held by logging service accounts (the dataset- or bucket-level binding this bundle creates is NOT shown here; inspect the destination's own IAM policy for that) +gcloud projects get-iam-policy PROJECT_ID --flatten="bindings[].members" \ + --format="table(bindings.role,bindings.members)" \ + --filter="bindings.members:gcp-sa-logging" + +# Import an orphaned sink into Terraform state +tofu import google_logging_project_sink.main projects/PROJECT_ID/sinks/SINK_NAME +``` diff --git a/bundles/gcp-log-sink/src/artifacts.tf b/bundles/gcp-log-sink/src/artifacts.tf new file mode 100644 index 0000000..17f3534 --- /dev/null +++ b/bundles/gcp-log-sink/src/artifacts.tf @@ -0,0 +1,13 @@ +# Log sink artifact — matches catalog-demo/gcp-log-sink schema.
+ +resource "massdriver_artifact" "log_sink" { + field = "log_sink" + name = "GCP Log Sink ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + sink_name = google_logging_project_sink.main.name + destination = google_logging_project_sink.main.destination + writer_identity = google_logging_project_sink.main.writer_identity + destination_type = local.destination_type + }) +} diff --git a/bundles/gcp-log-sink/src/main.tf b/bundles/gcp-log-sink/src/main.tf new file mode 100644 index 0000000..a790513 --- /dev/null +++ b/bundles/gcp-log-sink/src/main.tf @@ -0,0 +1,97 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + + # Resolve which destination connection is wired. Exactly one must be non-null. + # The precondition on google_logging_project_sink.main enforces this at plan time. + has_bigquery = var.bigquery_dataset != null + has_gcs = var.storage_bucket != null + + destination = local.has_bigquery ? ( + "bigquery.googleapis.com/projects/${var.bigquery_dataset.project_id}/datasets/${var.bigquery_dataset.dataset_id}" + ) : local.has_gcs ? ( + "storage.googleapis.com/${var.storage_bucket.bucket_name}" + ) : "" + + destination_type = local.has_bigquery ? "bigquery" : "gcs" +} + +# ─── Cloud Logging Project Sink ─────────────────────────────────────────────── + +resource "google_logging_project_sink" "main" { + project = local.project_id + name = local.name_prefix + destination = local.destination + filter = var.filter != "" ? 
var.filter : null + + # unique_writer_identity = true ensures the sink gets its own Google-managed SA + # rather than sharing the project-level logging SA. Required when granting the + # sink's writer access on a specific dataset or bucket (otherwise IAM bindings + # would affect ALL sinks in the project). This is non-negotiable. + unique_writer_identity = true + + dynamic "bigquery_options" { + for_each = local.has_bigquery ? [1] : [] + content { + use_partitioned_tables = var.use_partitioned_tables + } + } + + dynamic "exclusions" { + for_each = var.exclusions + content { + name = exclusions.value.name + filter = exclusions.value.filter + description = try(exclusions.value.description, null) + disabled = try(exclusions.value.disabled, false) + } + } + + lifecycle { + precondition { + condition = (var.bigquery_dataset != null) != (var.storage_bucket != null) + error_message = "Connect either a BigQuery dataset or a Storage bucket as the sink destination, not both and not neither." + } + } +} + +# ─── Sink Writer IAM Binding ────────────────────────────────────────────────── +# Grant the Google-managed sink writer SA the minimum role on the destination. +# writer_identity is not known until the sink is created — Terraform handles the +# dependency automatically via the reference below. + +resource "google_bigquery_dataset_iam_member" "sink_writer" { + count = local.has_bigquery ? 1 : 0 + + project = var.bigquery_dataset.project_id + dataset_id = var.bigquery_dataset.dataset_id + role = "roles/bigquery.dataEditor" + member = google_logging_project_sink.main.writer_identity +} + +resource "google_storage_bucket_iam_member" "sink_writer" { + count = local.has_gcs ? 
1 : 0 + + bucket = var.storage_bucket.bucket_name + role = "roles/storage.objectCreator" + member = google_logging_project_sink.main.writer_identity +} diff --git a/bundles/gcp-log-sink/src/variables.tf b/bundles/gcp-log-sink/src/variables.tf new file mode 100644 index 0000000..0e0766b --- /dev/null +++ b/bundles/gcp-log-sink/src/variables.tf @@ -0,0 +1,93 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "landing_zone" { + type = object({ + project_id = string + network = object({ + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) + enabled_apis = list(string) + budget = object({ + enabled = bool + budget_name = optional(string) + billing_account_id = optional(string) + amount_usd = optional(number) + }) + }) +} + +variable "bigquery_dataset" { + description = "Optional BigQuery dataset destination. Must be wired when the sink routes to BigQuery." + type = object({ + project_id = string + dataset_id = string + dataset_full_name = string + location = string + friendly_name = optional(string) + }) + default = null +} + +variable "storage_bucket" { + description = "Optional GCS bucket destination. Must be wired when the sink routes to GCS." + type = object({ + project_id = string + bucket_name = string + bucket_url = string + bucket_self_link = string + location = string + storage_class = string + }) + default = null +} + +variable "filter" { + description = "Cloud Logging query filter. Empty string means include all logs." 
+ type = string + default = "" +} + +variable "use_partitioned_tables" { + description = "Write to date-partitioned BigQuery tables. Ignored for GCS destinations." + type = bool + default = true +} + +variable "exclusions" { + description = "Log exclusion rules applied after the sink filter." + type = list(object({ + name = string + filter = string + description = optional(string) + disabled = optional(bool, false) + })) + default = [] +} diff --git a/bundles/gcp-network/massdriver.yaml b/bundles/gcp-network/massdriver.yaml index 2155ba5..77579e1 100644 --- a/bundles/gcp-network/massdriver.yaml +++ b/bundles/gcp-network/massdriver.yaml @@ -3,7 +3,7 @@ description: Minimal GCP VPC network with a single regional subnet. Produces a gcp-network artifact consumed by landing-zone, Cloud Run, Vertex Workbench, and other data-platform bundles. source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-network -version: 0.1.0 +version: 0.1.1 params: required: diff --git a/bundles/gcp-pubsub-topic/massdriver.yaml b/bundles/gcp-pubsub-topic/massdriver.yaml index 1259683..bc28cd6 100644 --- a/bundles/gcp-pubsub-topic/massdriver.yaml +++ b/bundles/gcp-pubsub-topic/massdriver.yaml @@ -4,7 +4,7 @@ description: Google Cloud Pub/Sub topic with optional dead-letter queue. Provisi service account publisher access. Emits a gcp-pubsub-topic artifact for downstream Cloud Run, Dataflow, and BigQuery bundles to consume. source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-pubsub-topic -version: 0.1.0 +version: 0.1.1 params: required: diff --git a/bundles/gcp-storage-bucket/massdriver.yaml b/bundles/gcp-storage-bucket/massdriver.yaml index cef9504..7b5d29b 100644 --- a/bundles/gcp-storage-bucket/massdriver.yaml +++ b/bundles/gcp-storage-bucket/massdriver.yaml @@ -5,7 +5,7 @@ description: Google Cloud Storage bucket with configurable storage class, option workload service account objectAdmin access. 
Emits a gcp-storage-bucket artifact for downstream Cloud Run, BigQuery, and Vertex Workbench bundles. source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-storage-bucket -version: 0.1.0 +version: 0.1.1 params: required: diff --git a/bundles/gcp-vertex-workbench/massdriver.yaml b/bundles/gcp-vertex-workbench/massdriver.yaml index 1f1a400..ff05896 100644 --- a/bundles/gcp-vertex-workbench/massdriver.yaml +++ b/bundles/gcp-vertex-workbench/massdriver.yaml @@ -5,7 +5,7 @@ description: Vertex AI Workbench instance for interactive data science. Provisio BigQuery dataset when wired. Emits a gcp-vertex-workbench artifact carrying the instance name, zone, JupyterLab proxy URL, and SA identity. source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-vertex-workbench -version: 0.1.0 +version: 0.1.1 params: required: diff --git a/templates/gcp-cloud-run-service/README.md b/templates/gcp-cloud-run-service/README.md index 5879b59..c08ff69 100644 --- a/templates/gcp-cloud-run-service/README.md +++ b/templates/gcp-cloud-run-service/README.md @@ -17,7 +17,9 @@ The CLI will prompt for: - Cloud Run v2 service running as its own per-service service account - Sensible defaults baked in: 1 vCPU, 512Mi memory, internal ingress, port 8080 - Artifact output so downstream bundles can discover the service URL and runtime SA -- Example IAM bindings in `src/iam.tf` for common upstream data resources, commented out — uncomment and adapt based on which connections you picked +- Example IAM bindings in `src/iam.tf` for common upstream data resources (Pub/Sub publisher, BigQuery writer, GCS object user) — commented out, ready to uncomment based on which connections you picked +- Example push subscription in `src/push_subscription.tf` — uncomment if you want this service to receive messages from a Pub/Sub topic. Uses a dedicated push-invoker SA and OIDC for authenticated delivery. 
+- Example VPC connector wiring in `src/main.tf` — uncomment if you want egress to flow through a Serverless VPC Access connector (required for reaching private endpoints like on-prem Kafka via peered networks). ## What to customize diff --git a/templates/gcp-cloud-run-service/src/iam.tf b/templates/gcp-cloud-run-service/src/iam.tf index 74d16d3..2ca37b2 100644 --- a/templates/gcp-cloud-run-service/src/iam.tf +++ b/templates/gcp-cloud-run-service/src/iam.tf @@ -4,6 +4,7 @@ # # Examples — uncomment and adapt based on which connections you selected: # +# --- Outgoing: publish to a Pub/Sub topic --- # resource "google_pubsub_topic_iam_member" "publisher" { # project = var.pubsub_topic.project_id # topic = var.pubsub_topic.topic_name @@ -11,6 +12,7 @@ # member = local.runtime_sa_member # } # +# --- Outgoing: write to BigQuery --- # resource "google_bigquery_dataset_iam_member" "data_editor" { # project = var.bigquery_dataset.project_id # dataset_id = var.bigquery_dataset.dataset_id @@ -18,6 +20,7 @@ # member = local.runtime_sa_member # } # +# --- Outgoing: read/write GCS objects --- # resource "google_storage_bucket_iam_member" "object_user" { # bucket = var.storage_bucket.bucket_name # role = "roles/storage.objectUser" diff --git a/templates/gcp-cloud-run-service/src/main.tf b/templates/gcp-cloud-run-service/src/main.tf index 588b1c3..4a79fd3 100644 --- a/templates/gcp-cloud-run-service/src/main.tf +++ b/templates/gcp-cloud-run-service/src/main.tf @@ -24,6 +24,15 @@ resource "google_cloud_run_v2_service" "main" { template { service_account = local.runtime_sa_email + # Optional: route egress through a Serverless VPC Access connector. + # Uncomment if you picked a gcp-vpc-connector connection named `vpc_connector` + # at scaffold time. 
+ # + # vpc_access { + # connector = var.vpc_connector.connector_id + # egress = "PRIVATE_RANGES_ONLY" # or ALL_TRAFFIC to force all egress through VPC + # } + containers { image = var.image diff --git a/templates/gcp-cloud-run-service/src/push_subscription.tf b/templates/gcp-cloud-run-service/src/push_subscription.tf new file mode 100644 index 0000000..6b65d4a --- /dev/null +++ b/templates/gcp-cloud-run-service/src/push_subscription.tf @@ -0,0 +1,43 @@ +# Optional: receive messages from a Pub/Sub topic via push subscription. +# +# If you picked a gcp-pubsub-topic connection named `incoming_topic` at scaffold +# time, uncomment the resources below. This creates a push subscription that +# invokes this service's URL using OIDC with a dedicated invoker SA. +# +# The push invoker SA is separate from this service's runtime SA. Pub/Sub uses +# the invoker SA to call the service; the runtime SA is the identity the +# container runs as once the request lands. +# +# resource "google_service_account" "push_invoker" { +# project = local.project_id +# account_id = "${substr(local.name_prefix, 0, 28)}-p" +# display_name = "Push Invoker — ${local.name_prefix}" +# } +# +# resource "google_cloud_run_v2_service_iam_member" "push_invoker" { +# project = local.project_id +# location = google_cloud_run_v2_service.main.location +# name = google_cloud_run_v2_service.main.name +# role = "roles/run.invoker" +# member = "serviceAccount:${google_service_account.push_invoker.email}" +# } +# +# resource "google_pubsub_subscription" "push" { +# project = var.incoming_topic.project_id +# name = "${local.name_prefix}-push" +# topic = var.incoming_topic.topic_id +# +# ack_deadline_seconds = 60 +# +# push_config { +# push_endpoint = google_cloud_run_v2_service.main.uri +# oidc_token { +# service_account_email = google_service_account.push_invoker.email +# } +# } +# +# retry_policy { +# minimum_backoff = "10s" +# maximum_backoff = "600s" +# } +# } From 7eecc5f108ae82efe961d35e576f1a66f98d9bda Mon 
Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Wed, 22 Apr 2026 16:38:39 -0700 Subject: [PATCH 10/15] Move Pub/Sub BigQuery subscription to new gcp-bigquery-table bundle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Pub/Sub → BigQuery subscription lives with the target table, not with the dataset. Tables are the real unit of data — subscriptions route into them, IAM binds to them, schemas live on them. Having the subscription on the dataset bundle meant a dataset could only ever ingest from one topic, and the IAM grants were over-scoped. gcp-bigquery-dataset (cleanup) Removes the pubsub_topic connection, the bigquery_subscription param block, src/subscription.tf, and all subscription-related prose from README and operator.md. The dataset bundle is back to being just a dataset container. gcp-bigquery-table (new) Creates a google_bigquery_table in the connected dataset. When a Pub/Sub topic is wired to the optional pubsub_topic input, also creates a BigQuery subscription delivering into this specific table with table-scoped IAM bindings for the Pub/Sub service agent. Two schema modes: pubsub_default (the five-column Pub/Sub-compatible schema — subscription_name, message_id, publish_time, data, attributes) or custom_schema via a JSON schema param. gcp-bigquery-table artifact definition (new) Fields: project_id, dataset_id, table_id, table_full_name. Header comment documents the IAM role-binding pattern for downstream consumers, matching the convention of the other gcp-* artdefs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../gcp-bigquery-table/massdriver.yaml | 67 +++++++ bundles/gcp-bigquery-dataset/README.md | 31 +-- bundles/gcp-bigquery-dataset/massdriver.yaml | 73 ------- bundles/gcp-bigquery-dataset/operator.md | 39 +--- bundles/gcp-bigquery-dataset/src/variables.tf | 25 --- bundles/gcp-bigquery-table/README.md | 90 +++++++++ bundles/gcp-bigquery-table/massdriver.yaml | 180 ++++++++++++++++++ bundles/gcp-bigquery-table/operator.md | 109 +++++++++++ bundles/gcp-bigquery-table/src/artifacts.tf | 12 ++ bundles/gcp-bigquery-table/src/main.tf | 83 ++++++++ .../src/subscription.tf | 49 +++-- bundles/gcp-bigquery-table/src/variables.tf | 80 ++++++++ 12 files changed, 645 insertions(+), 193 deletions(-) create mode 100644 artifact-definitions/gcp-bigquery-table/massdriver.yaml create mode 100644 bundles/gcp-bigquery-table/README.md create mode 100644 bundles/gcp-bigquery-table/massdriver.yaml create mode 100644 bundles/gcp-bigquery-table/operator.md create mode 100644 bundles/gcp-bigquery-table/src/artifacts.tf create mode 100644 bundles/gcp-bigquery-table/src/main.tf rename bundles/{gcp-bigquery-dataset => gcp-bigquery-table}/src/subscription.tf (57%) create mode 100644 bundles/gcp-bigquery-table/src/variables.tf diff --git a/artifact-definitions/gcp-bigquery-table/massdriver.yaml b/artifact-definitions/gcp-bigquery-table/massdriver.yaml new file mode 100644 index 0000000..6c9b609 --- /dev/null +++ b/artifact-definitions/gcp-bigquery-table/massdriver.yaml @@ -0,0 +1,67 @@ +name: gcp-bigquery-table +label: GCP BigQuery Table +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (examples — adapt to your actual consumer): +# Downstream bundles that need read-only access bind roles/bigquery.dataViewer at the table level. +# Downstream bundles that need read+write access bind roles/bigquery.dataEditor at the table level. 
+# +# Terraform example — grant data viewer access to a workload service account: +# resource "google_bigquery_table_iam_member" "reader" { +# project = var.bigquery_table.project_id +# dataset_id = var.bigquery_table.dataset_id +# table_id = var.bigquery_table.table_id +# role = "roles/bigquery.dataViewer" +# member = "serviceAccount:<service-account-email>" +# } +# +# Note: Table-level IAM bindings do not propagate up to the parent dataset. +# Dataset-level bindings DO propagate down to all tables. Prefer dataset-level +# bindings for broad access and table-level bindings for scoped isolation. +# +# Policy examples below (reader / writer) follow this same pattern. They are +# illustrative — the actual IAM member string comes from the consumer bundle's +# service account, not from this artifact. +exports: [] + +schema: + title: GCP BigQuery Table + description: A Google Cloud BigQuery table. Carries the project ID, dataset ID, + table ID, and fully-qualified table name (<project>.<dataset>.<table>) so + downstream bundles can reference the table for querying, loading, and Pub/Sub + subscription delivery without hard-coding identifiers. + type: object + required: + - project_id + - dataset_id + - table_id + - table_full_name + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this BigQuery table + type: string + examples: + - my-gcp-project-123 + + dataset_id: + title: Dataset ID + description: BigQuery dataset identifier that contains this table (letters, digits, underscores — no hyphens) + type: string + examples: + - my_analytics_dataset + + table_id: + title: Table ID + description: BigQuery table identifier within the dataset (letters, digits, underscores — no hyphens) + type: string + examples: + - messages + + table_full_name: + title: Table Full Name + description: Fully-qualified BigQuery table name in <project>.<dataset>.<table>
form. + Use this in SQL FROM clauses and bq CLI commands. + type: string + examples: + - my-gcp-project-123.my_analytics_dataset.messages diff --git a/bundles/gcp-bigquery-dataset/README.md b/bundles/gcp-bigquery-dataset/README.md index 2a324a8..7e09213 100644 --- a/bundles/gcp-bigquery-dataset/README.md +++ b/bundles/gcp-bigquery-dataset/README.md @@ -13,11 +13,8 @@ Google Cloud BigQuery dataset with configurable location, default table expirati | Resource | Type | Notes | |---|---|---| | `google_bigquery_dataset.main` | BigQuery dataset | Location, expiration, and delete protection set at provision time; Google-managed encryption | -| `google_pubsub_subscription.bigquery` | Pub/Sub subscription | Created only when a Pub/Sub topic is wired. Delivers messages into a table in this dataset. | -| `google_bigquery_dataset_iam_member.pubsub_service_agent_data_editor` | IAM binding | Created only when a Pub/Sub topic is wired. Grants the Pub/Sub service agent `roles/bigquery.dataEditor` on this dataset. | -| `google_bigquery_dataset_iam_member.pubsub_service_agent_metadata_viewer` | IAM binding | Created only when a Pub/Sub topic is wired. Grants the Pub/Sub service agent `roles/bigquery.metadataViewer` on this dataset. | -This bundle does NOT create any workload IAM bindings. Consumer bundles (e.g., `gcp-cloud-run-service`, `gcp-vertex-workbench`) create their own service accounts and bind the appropriate roles on this dataset when connected on the canvas. The IAM bindings above are only for the Pub/Sub service agent, and only when a topic is wired. +This bundle does NOT create tables, subscriptions, or workload IAM bindings. Consumer bundles (e.g., `gcp-bigquery-table`, `gcp-cloud-run-service`, `gcp-vertex-workbench`) create their own resources and bind the appropriate roles on this dataset when connected on the canvas. ## Connections @@ -25,18 +22,6 @@ This bundle does NOT create any workload IAM bindings. 
Consumer bundles (e.g., ` |---|---|---|---| | `gcp_authentication` | Yes | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | | `landing_zone` | Yes | `catalog-demo/gcp-landing-zone` | Provides `project_id` for resource placement | -| `pubsub_topic` | No | `catalog-demo/gcp-pubsub-topic` | When wired, creates a Pub/Sub BigQuery subscription that delivers messages into a table in this dataset | - -### Optional: Pub/Sub BigQuery subscription - -When you wire a `gcp-pubsub-topic` bundle to this bundle's `pubsub_topic` connection, the following happens on the next deploy: - -1. The Pub/Sub service agent (`service-@gcp-sa-pubsub.iam.gserviceaccount.com`) is granted `roles/bigquery.dataEditor` and `roles/bigquery.metadataViewer` on this dataset. These bindings are dataset-scoped, not project-wide. -2. A Pub/Sub subscription is created on the topic with BigQuery delivery configured to write into the table you specify. - -The IAM bindings are removed when the topic is disconnected and the bundle is redeployed. - -**The target table must already exist.** Pub/Sub does not create BigQuery tables. Create the table in the dataset before wiring the topic connection and deploying. If the table is absent, Terraform will succeed but the subscription will fail to deliver messages. ## Artifact Produced @@ -70,18 +55,6 @@ resource "google_bigquery_dataset_iam_member" "dataset_viewer" { } ``` -### BigQuery subscription parameters - -These params appear in the form under **BigQuery Subscription Settings** and are only used when `pubsub_topic` is wired. - -| Parameter | Type | Default | Description | -|---|---|---|---| -| `bigquery_subscription.table_name` | string | — | Name of an existing table in this dataset. Pattern: `^[a-zA-Z0-9_]{1,1024}$`. Required when the topic is wired. | -| `bigquery_subscription.use_topic_schema` | boolean | `false` | When true, uses the Pub/Sub topic schema to map message fields to table columns. 
When false, writes raw bytes to a `data` column. | -| `bigquery_subscription.write_metadata` | boolean | `false` | When true, adds `subscription_name`, `message_id`, `publish_time`, and `attributes` columns to each row. Your table schema must include these columns. | -| `bigquery_subscription.drop_unknown_fields` | boolean | `false` | When true and `use_topic_schema` is enabled, silently drops message fields that are not in the table schema. When false, unknown fields send the message to the dead letter topic or drop it. | -| `bigquery_subscription.ack_deadline_seconds` | integer | `60` | Seconds Pub/Sub waits for BigQuery to acknowledge a message before re-delivering. Range 10–600. | - ## Compliance ### Checkov skips @@ -98,8 +71,6 @@ The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with un - The `gcp_authentication` credential has `bigquery.admin` or equivalent IAM on the project. - `dataset_id` is immutable after creation. Changing it requires destroying and recreating the dataset — all data is lost unless exported first. - `default_table_expiration_days` applies only to tables created after the setting is applied. Existing tables are not affected. -- When using the Pub/Sub BigQuery subscription feature, the target table named in `bigquery_subscription.table_name` must already exist in the dataset before deploying. This bundle does not create tables. -- When using the Pub/Sub BigQuery subscription feature, `pubsub.googleapis.com` must be enabled in the project. ## Presets diff --git a/bundles/gcp-bigquery-dataset/massdriver.yaml b/bundles/gcp-bigquery-dataset/massdriver.yaml index 06c5ab9..cde6803 100644 --- a/bundles/gcp-bigquery-dataset/massdriver.yaml +++ b/bundles/gcp-bigquery-dataset/massdriver.yaml @@ -97,59 +97,6 @@ params: type: boolean default: false - bigquery_subscription: - title: BigQuery Subscription Settings - description: Settings for the Pub/Sub BigQuery subscription. 
Only used when - a Pub/Sub topic is wired to this bundle. The target table must already exist - in this dataset — Pub/Sub does not create it. - type: object - properties: - table_name: - title: Target Table Name - description: Name of an existing table in this dataset where Pub/Sub will - deliver messages. The table must exist before the subscription is created. - Pub/Sub does not create tables automatically. Letters, digits, and underscores - only; maximum 1024 characters. - type: string - pattern: "^[a-zA-Z0-9_]+$" - - use_topic_schema: - title: Use Topic Schema - description: When enabled, BigQuery uses the Pub/Sub topic's schema to - parse messages and map fields to table columns. When disabled, messages - are written as raw bytes in a single `data` column. Requires the topic - to have a schema attached. - type: boolean - default: false - - write_metadata: - title: Write Subscription Metadata - description: When enabled, BigQuery adds extra columns to each row — - `subscription_name`, `message_id`, `publish_time`, and `attributes`. - Useful for auditing and deduplication. The table schema must include - these columns when this is enabled. - type: boolean - default: false - - drop_unknown_fields: - title: Drop Unknown Fields - description: When enabled and use_topic_schema is also enabled, fields - in the message that do not exist in the table schema are silently dropped. - When disabled, unknown fields cause the message to be sent to the dead - letter topic (if configured) or dropped. - type: boolean - default: false - - ack_deadline_seconds: - title: Acknowledgement Deadline (seconds) - description: How long Pub/Sub waits for BigQuery to acknowledge a message - before re-delivering it. Increase if BigQuery write latency exceeds the - default. Range 10–600 seconds. 
- type: integer - default: 60 - minimum: 10 - maximum: 600 - connections: required: - gcp_authentication @@ -163,10 +110,6 @@ connections: $ref: catalog-demo/gcp-landing-zone title: GCP Landing Zone - pubsub_topic: - $ref: catalog-demo/gcp-pubsub-topic - title: Pub/Sub Topic (optional — wire to enable BigQuery subscription delivery) - artifacts: required: - bigquery_dataset @@ -190,23 +133,7 @@ ui: - location - default_table_expiration_days - delete_protection - - bigquery_subscription - "*" properties: delete_protection: ui:widget: checkbox - bigquery_subscription: - ui:order: - - table_name - - use_topic_schema - - write_metadata - - drop_unknown_fields - - ack_deadline_seconds - - "*" - properties: - use_topic_schema: - ui:widget: checkbox - write_metadata: - ui:widget: checkbox - drop_unknown_fields: - ui:widget: checkbox diff --git a/bundles/gcp-bigquery-dataset/operator.md b/bundles/gcp-bigquery-dataset/operator.md index 8873ed0..5497daf 100644 --- a/bundles/gcp-bigquery-dataset/operator.md +++ b/bundles/gcp-bigquery-dataset/operator.md @@ -18,16 +18,10 @@ templating: mustache **Dataset-level IAM propagates to all tables, current and future.** For row-level or table-level isolation, use BigQuery row-level security policies or bind IAM at the table level separately. -**Consumer bundles are responsible for their own IAM bindings.** Consumer bundles bind their own service accounts to this dataset. If a service can't query or load data, the IAM binding is missing from the consumer bundle — not from here. The only IAM bindings this bundle creates are the Pub/Sub service agent bindings, and only when a topic is wired. +**Consumer bundles are responsible for their own IAM bindings.** Consumer bundles bind their own service accounts to this dataset. If a service can't query or load data, the IAM binding is missing from the consumer bundle — not from here. **Cross-region queries are not supported.** BigQuery cannot join tables in different regions in a single query. 
Use Storage Transfer Service or BigQuery Data Transfer Service to replicate data first. -**BigQuery subscription target table must exist before deploy.** When the `pubsub_topic` connection is wired, Pub/Sub creates the subscription but does NOT create the target table. The table named in `bigquery_subscription.table_name` must already exist in this dataset. If the table is absent, Terraform will succeed but the subscription will fail to deliver messages — they will accumulate and eventually be dropped or sent to a dead letter topic. - -**Schema mismatch routes messages to dead letter or drops them.** When `use_topic_schema = true` and a message contains fields not in the table schema, behavior depends on `drop_unknown_fields`. If `drop_unknown_fields = false` (the default), the message is routed to the dead letter topic if one is configured on the source topic, or dropped. If `drop_unknown_fields = true`, the extra fields are silently discarded and the message is delivered. - -**Pub/Sub IAM bindings are dataset-scoped and removed on disconnect.** When you unwire the `pubsub_topic` connection and redeploy, Terraform removes the two service agent IAM bindings from this dataset. No project-level IAM is modified. Existing data in the table is not affected — only new message delivery stops. - ## Troubleshooting **Permission denied on dataset access.** @@ -50,26 +44,6 @@ Add `bigquery.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` packag bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}}. 
``` -**Pub/Sub subscription stuck — messages not appearing in BigQuery.** -First confirm the table exists and the subscription's IAM bindings are in place: -```bash -# Check subscription delivery status and error details -gcloud pubsub subscriptions describe --project= - -# Confirm IAM bindings on the dataset (look for gcp-sa-pubsub entries) -bq get-iam-policy {{artifacts.bigquery_dataset.dataset_full_name}} -``` -Common causes: table doesn't exist, table schema mismatch with message fields, `use_topic_schema = true` but topic has no schema, or IAM bindings not yet propagated. - -**Pub/Sub subscription creation fails with permission error during deploy.** -The Pub/Sub service agent IAM bindings may not have propagated before the subscription was created. IAM propagation is eventually consistent — wait 30–60 seconds and redeploy. The `depends_on` in this bundle mitigates this but does not eliminate it entirely. - -**Messages delivered but columns are all null.** -If `use_topic_schema = false` (default), messages are written as raw bytes to the `data` column. All other columns will be null unless `write_metadata = true` is set and the table has the corresponding metadata columns. Enable `use_topic_schema` or query the `data` column directly. - -**Deploy fails with "pubsub.googleapis.com has not been used in project."** -Add `pubsub.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. 
- ## Day-2 operations **Setting expiration on an existing table** (default expiration doesn't backfill): @@ -121,15 +95,4 @@ bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}}.`' - -# List Pub/Sub subscriptions on a topic -gcloud pubsub topics list-subscriptions --project= - -# Describe a Pub/Sub subscription (shows delivery config and error state) -gcloud pubsub subscriptions describe --project= - -# Seek a subscription to a snapshot or timestamp (e.g., replay missed messages) -gcloud pubsub subscriptions seek \ - --time=$(date -u +%Y-%m-%dT%H:%M:%SZ -d "1 hour ago") \ - --project= ``` diff --git a/bundles/gcp-bigquery-dataset/src/variables.tf b/bundles/gcp-bigquery-dataset/src/variables.tf index 9181dd6..50399ba 100644 --- a/bundles/gcp-bigquery-dataset/src/variables.tf +++ b/bundles/gcp-bigquery-dataset/src/variables.tf @@ -72,28 +72,3 @@ variable "delete_protection" { type = bool default = false } - -# Optional — only present when a Pub/Sub topic is wired on the canvas. -variable "pubsub_topic" { - description = "Pub/Sub topic artifact. When wired, a BigQuery subscription is created to deliver messages into this dataset." - type = object({ - project_id = string - topic_name = string - topic_id = string - dlq_topic_id = optional(string) - dlq_topic_name = optional(string) - }) - default = null -} - -variable "bigquery_subscription" { - description = "Settings for the Pub/Sub BigQuery subscription. Consumed only when pubsub_topic is non-null." 
- type = object({ - table_name = optional(string) - use_topic_schema = optional(bool, false) - write_metadata = optional(bool, false) - drop_unknown_fields = optional(bool, false) - ack_deadline_seconds = optional(number, 60) - }) - default = {} -} diff --git a/bundles/gcp-bigquery-table/README.md b/bundles/gcp-bigquery-table/README.md new file mode 100644 index 0000000..68c5b14 --- /dev/null +++ b/bundles/gcp-bigquery-table/README.md @@ -0,0 +1,90 @@ +# gcp-bigquery-table + +Google Cloud BigQuery table with configurable schema and optional Pub/Sub subscription delivery. Use this bundle to provision a managed table inside an existing BigQuery dataset — with or without a Pub/Sub subscription routing messages into it. + +## Use Cases + +- Pub/Sub-to-BigQuery pipeline: wire a topic to this table and messages land automatically +- Custom-schema analytics table: define your own BigQuery schema JSON and deploy +- Production tables with deletion protection to prevent accidental data loss + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_bigquery_table.main` | BigQuery table | Schema is either Pub/Sub-compatible (5 standard columns) or user-provided JSON | +| `google_bigquery_table_iam_member.pubsub_service_agent_data_editor` | IAM binding | Created only when a Pub/Sub topic is wired. Grants the Pub/Sub service agent `roles/bigquery.dataEditor` on this table. | +| `google_bigquery_table_iam_member.pubsub_service_agent_metadata_viewer` | IAM binding | Created only when a Pub/Sub topic is wired. Grants the Pub/Sub service agent `roles/bigquery.metadataViewer` on this table. | +| `google_pubsub_subscription.bigquery` | Pub/Sub subscription | Created only when a Pub/Sub topic is wired. Delivers messages from the topic into this table. | + +IAM bindings are table-scoped (not dataset-wide) for least privilege. The bindings are removed when the `pubsub_topic` connection is unwired and the bundle is redeployed. 
+ +## Connections + +| Connection | Required | Artifact Type | How It Is Used | +|---|---|---|---| +| `gcp_authentication` | Yes | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `bigquery_dataset` | Yes | `catalog-demo/gcp-bigquery-dataset` | Provides `project_id` and `dataset_id` for table placement | +| `pubsub_topic` | No | `catalog-demo/gcp-pubsub-topic` | When wired, creates a Pub/Sub BigQuery subscription that delivers messages into this table | + +## Schema Modes + +**`pubsub_default`** (recommended when wiring a Pub/Sub topic): Creates the table with five standard Pub/Sub columns: +- `subscription_name STRING` — name of the subscription that delivered the message +- `message_id STRING` — unique message identifier assigned by Pub/Sub +- `publish_time TIMESTAMP` — time the message was published to the topic +- `data STRING` — message payload (base64-decoded when `use_topic_schema = false`) +- `attributes JSON` — key-value attributes attached to the message + +**`custom_schema`**: Provide the full BigQuery schema as a JSON array in the `schema_json` parameter. Each field descriptor requires at minimum `name` and `type`. Example: +```json +[ + {"name": "event_type", "type": "STRING", "mode": "NULLABLE"}, + {"name": "payload", "type": "JSON", "mode": "NULLABLE"}, + {"name": "created_at", "type": "TIMESTAMP", "mode": "NULLABLE"} +] +``` + +## Artifact Produced + +**Artifact type:** `catalog-demo/gcp-bigquery-table` + +| Field | Type | Description | +|---|---|---| +| `project_id` | string | GCP project ID that owns the table | +| `dataset_id` | string | BigQuery dataset containing this table | +| `table_id` | string | BigQuery table identifier | +| `table_full_name` | string | Fully-qualified name in `<project>.<dataset>.<table>
` form — use directly in SQL `FROM` clauses | + +Consumer bundles bind IAM roles using the table fields from this artifact. Example: + +```hcl +resource "google_bigquery_table_iam_member" "reader" { + project = var.bigquery_table.project_id + dataset_id = var.bigquery_table.dataset_id + table_id = var.bigquery_table.table_id + role = "roles/bigquery.dataViewer" + member = "serviceAccount:${google_service_account.runtime.email}" +} +``` + +## Compliance + +No Checkov checks are skipped. All findings in this bundle are resolved in Terraform directly. + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- `bigquery.googleapis.com` must be enabled in the landing zone before deploying. +- The `gcp_authentication` credential has `bigquery.admin` or equivalent IAM on the project. +- `table_id` is immutable after creation. Changing it requires destroying and recreating the table — all data is lost unless exported first. +- When wiring a `pubsub_topic`, `pubsub.googleapis.com` must also be enabled in the project. +- The BigQuery subscription target table (this table) must exist before Pub/Sub can validate the subscription. This bundle creates the table first, so order-of-operations is handled automatically. + +## Presets + +| Preset | Schema Mode | Deletion Protection | +|---|---|---| +| Pub/Sub Default | pubsub_default | On | +| Custom Schema | custom_schema | On | diff --git a/bundles/gcp-bigquery-table/massdriver.yaml b/bundles/gcp-bigquery-table/massdriver.yaml new file mode 100644 index 0000000..cd9a38c --- /dev/null +++ b/bundles/gcp-bigquery-table/massdriver.yaml @@ -0,0 +1,180 @@ +name: gcp-bigquery-table +description: Google Cloud BigQuery table with configurable schema and optional Pub/Sub + subscription delivery. 
When a Pub/Sub topic is wired, creates a BigQuery subscription + that routes messages from the topic into this table. Emits a gcp-bigquery-table + artifact for downstream query and pipeline workloads. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-bigquery-table +version: 0.1.1 + +params: + required: + - table_id + - deletion_protection + - schema_mode + examples: + - __name: Pub/Sub Default + table_id: messages + deletion_protection: true + schema_mode: pubsub_default + bigquery_subscription: + use_topic_schema: false + write_metadata: true + drop_unknown_fields: false + ack_deadline_seconds: 60 + - __name: Custom Schema + table_id: events + deletion_protection: true + schema_mode: custom_schema + schema_json: '[{"name":"event_type","type":"STRING","mode":"NULLABLE"},{"name":"payload","type":"JSON","mode":"NULLABLE"},{"name":"created_at","type":"TIMESTAMP","mode":"NULLABLE"}]' + + properties: + table_id: + title: Table ID + description: BigQuery table identifier. Must contain only letters, digits, and + underscores — no hyphens or spaces. Maximum 1024 characters. Cannot be changed + after creation without destroying and recreating the table. + type: string + $md.immutable: true + pattern: "^[a-zA-Z0-9_]+$" + + description: + title: Description + description: Optional free-text description of the table's purpose, data classification, + or owner team. Stored in BigQuery metadata and visible in the console. + type: string + + deletion_protection: + title: Enable Deletion Protection + description: When enabled, the table cannot be destroyed until deletion protection + is first disabled (a two-step destroy). Prevents accidental data loss in production. + Strongly recommended for production tables. Disable only immediately before + a planned decommission. + type: boolean + default: true + + schema_mode: + title: Schema Mode + description: >- + Controls how the table schema is defined. 
"pubsub_default" creates the table + with a Pub/Sub-compatible schema (subscription_name, message_id, publish_time, + data, attributes) — use this when wiring a Pub/Sub topic. "custom_schema" + lets you provide the full BigQuery schema JSON via the schema_json parameter. + type: string + default: pubsub_default + enum: + - pubsub_default + - custom_schema + + schema_json: + title: Schema JSON + description: >- + BigQuery table schema as a JSON array of field descriptors. Only used when + schema_mode is "custom_schema". Each element must have "name", "type", and + optionally "mode" (NULLABLE, REQUIRED, REPEATED) and "description". + Example: [{"name":"event_type","type":"STRING","mode":"NULLABLE"}, + {"name":"payload","type":"JSON","mode":"NULLABLE"}, + {"name":"created_at","type":"TIMESTAMP","mode":"NULLABLE"}] + Valid types: STRING, BYTES, INTEGER, INT64, FLOAT, FLOAT64, NUMERIC, + BIGNUMERIC, BOOLEAN, BOOL, TIMESTAMP, DATE, TIME, DATETIME, JSON, RECORD, STRUCT. + type: string + + bigquery_subscription: + title: BigQuery Subscription Settings + description: Settings for the Pub/Sub BigQuery subscription. Only used when + a Pub/Sub topic is wired to this bundle. Configure delivery behavior and + acknowledgement timeouts here. + type: object + properties: + use_topic_schema: + title: Use Topic Schema + description: When enabled, BigQuery uses the Pub/Sub topic's schema to + parse messages and map fields to table columns. When disabled, messages + are written as raw bytes. Requires the topic to have a schema attached. + type: boolean + default: false + + write_metadata: + title: Write Subscription Metadata + description: When enabled, BigQuery populates the subscription_name, message_id, + publish_time, and attributes columns in each row. The pubsub_default schema + includes these columns. Recommended to keep enabled. 
+ type: boolean + default: true + + drop_unknown_fields: + title: Drop Unknown Fields + description: When enabled and use_topic_schema is also enabled, fields + in the message that do not exist in the table schema are silently dropped. + When disabled, unknown fields cause the message to be routed to the dead + letter topic (if configured) or dropped entirely. + type: boolean + default: false + + ack_deadline_seconds: + title: Acknowledgement Deadline (seconds) + description: How long Pub/Sub waits for BigQuery to acknowledge a message + before re-delivering it. Increase if BigQuery write latency exceeds the + default. Range 10–600 seconds. + type: integer + default: 60 + minimum: 10 + maximum: 600 + +connections: + required: + - gcp_authentication + - bigquery_dataset + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + bigquery_dataset: + $ref: catalog-demo/gcp-bigquery-dataset + title: BigQuery Dataset + + pubsub_topic: + $ref: catalog-demo/gcp-pubsub-topic + title: Pub/Sub Topic (optional — wire to enable BigQuery subscription delivery) + +artifacts: + required: + - bigquery_table + properties: + bigquery_table: + $ref: catalog-demo/gcp-bigquery-table + title: GCP BigQuery Table + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - table_id + - description + - deletion_protection + - schema_mode + - schema_json + - bigquery_subscription + - "*" + properties: + deletion_protection: + ui:widget: checkbox + bigquery_subscription: + ui:order: + - use_topic_schema + - write_metadata + - drop_unknown_fields + - ack_deadline_seconds + - "*" + properties: + use_topic_schema: + ui:widget: checkbox + write_metadata: + ui:widget: checkbox + drop_unknown_fields: + ui:widget: checkbox diff --git a/bundles/gcp-bigquery-table/operator.md b/bundles/gcp-bigquery-table/operator.md new file mode 
100644 index 0000000..cc227e7 --- /dev/null +++ b/bundles/gcp-bigquery-table/operator.md @@ -0,0 +1,109 @@ +--- +templating: mustache +--- + +# GCP BigQuery Table — Operator Runbook + +## Non-obvious constraints + +**Table ID is immutable.** `table_id` cannot be changed in-place. To rename: export the table data, destroy the package, reprovision with the new ID, reload from GCS. Treat the table ID as permanent. + +**Deletion protection requires a two-step destroy.** When `deletion_protection = true`, the destroy will fail with a "Table cannot be deleted" error. To decommission: +1. Set `deletion_protection = false` in the package config and deploy. +2. Then run the destroy. + +**Schema evolution is limited.** BigQuery supports a narrow set of in-place schema changes: adding new columns at the end of the schema, relaxing a field from REQUIRED to NULLABLE, and a few others. Changing column types, renaming columns, or reordering columns requires dropping and recreating the table — all data is lost unless exported first. Plan your schema carefully before first deploy. + +**Pub/Sub subscription target table must exist before the subscription can deliver messages.** This bundle creates the table before the subscription, so order-of-operations is handled automatically. However, if you destroy and recreate the table independently, redeploy this bundle to recreate the subscription and its IAM bindings. + +**Pub/Sub IAM bindings are table-scoped and removed on disconnect.** When you unwire the `pubsub_topic` connection and redeploy, Terraform removes the two service agent IAM bindings from this table. No dataset-level or project-level IAM is modified. Existing data in the table is not affected — only new message delivery stops. + +**Schema mismatch routes messages to dead letter or drops them.** When `use_topic_schema = true` and a message contains fields not in the table schema, behavior depends on `drop_unknown_fields`. 
If `drop_unknown_fields = false` (the default), the message is routed to the dead letter topic if one is configured on the source topic, or dropped. If `drop_unknown_fields = true`, the extra fields are silently discarded and the message is delivered. + +**IAM propagation is eventually consistent.** The Pub/Sub subscription creation depends on IAM bindings that may not have propagated yet. The `depends_on` in this bundle mitigates timing issues, but if the subscription creation fails during a first deploy, a redeploy will resolve it. + +## Troubleshooting + +**Pub/Sub subscription stuck — messages not appearing in BigQuery.** +```bash +# Check subscription delivery status and error details +gcloud pubsub subscriptions describe PACKAGE_NAME_PREFIX-bq \ + --project={{artifacts.bigquery_table.project_id}} + +# Confirm table exists +bq show --format=prettyjson {{artifacts.bigquery_table.table_full_name}} + +# Confirm IAM bindings (look for gcp-sa-pubsub entries) +bq get-iam-policy {{artifacts.bigquery_table.table_full_name}} +``` +Common causes: table schema mismatch with message fields, `use_topic_schema = true` but topic has no schema, IAM bindings not yet propagated (redeploy to fix), or `pubsub.googleapis.com` not enabled. + +**Pub/Sub subscription creation fails with permission error during deploy.** +IAM propagation is eventually consistent — wait 30–60 seconds and redeploy. The `depends_on` in this bundle mitigates but does not eliminate this race. + +**Messages delivered but columns are all null.** +If `use_topic_schema = false` (default), messages are written as raw bytes to the `data` column. Enable `write_metadata = true` so metadata columns (subscription_name, message_id, publish_time, attributes) are populated. Query the `data` column directly for the message payload.
+ +**Deploy fails with "bigquery.googleapis.com has not been used in project."** +Add `bigquery.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. + +**Deploy fails with "pubsub.googleapis.com has not been used in project."** +Add `pubsub.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. + +**Permission denied on table access.** +```bash +bq get-iam-policy {{artifacts.bigquery_table.table_full_name}} +``` +The required member should have `roles/bigquery.dataEditor` for read/write or `roles/bigquery.dataViewer` for read-only. If the binding is absent, redeploy the consumer bundle with the table wired on the canvas. + +## Day-2 operations + +**Querying the table:** +```bash +bq query --project_id={{artifacts.bigquery_table.project_id}} \ + 'SELECT * FROM `{{artifacts.bigquery_table.table_full_name}}` LIMIT 100' +``` + +**Inspecting table schema and row count:** +```bash +bq show --format=prettyjson {{artifacts.bigquery_table.table_full_name}} +bq query --project_id={{artifacts.bigquery_table.project_id}} \ + 'SELECT COUNT(*) FROM `{{artifacts.bigquery_table.table_full_name}}`' +``` + +**Exporting table data before destroying:** +```bash +bq extract \ + --destination_format=NEWLINE_DELIMITED_JSON \ + {{artifacts.bigquery_table.table_full_name}} \ + gs://YOUR_BUCKET/{{artifacts.bigquery_table.dataset_id}}/{{artifacts.bigquery_table.table_id}}/*.jsonl +``` + +**Replaying missed Pub/Sub messages from a timestamp:** +```bash +gcloud pubsub subscriptions seek PACKAGE_NAME_PREFIX-bq \ + --time=$(date -u +%Y-%m-%dT%H:%M:%SZ -d "1 hour ago") \ + --project={{artifacts.bigquery_table.project_id}} +``` + +## Useful commands + +```bash +# Show table schema and metadata +bq show --format=prettyjson {{artifacts.bigquery_table.table_full_name}} + +# Show IAM policy on the table +bq get-iam-policy {{artifacts.bigquery_table.table_full_name}} + +# List all subscriptions
on the parent topic +gcloud pubsub topics list-subscriptions TOPIC_NAME \ + --project={{artifacts.bigquery_table.project_id}} + +# Describe the BigQuery subscription (delivery config + error state) +gcloud pubsub subscriptions describe PACKAGE_NAME_PREFIX-bq \ + --project={{artifacts.bigquery_table.project_id}} + +# Run an ad-hoc query +bq query --project_id={{artifacts.bigquery_table.project_id}} \ + 'SELECT COUNT(*) FROM `{{artifacts.bigquery_table.table_full_name}}`' +``` diff --git a/bundles/gcp-bigquery-table/src/artifacts.tf b/bundles/gcp-bigquery-table/src/artifacts.tf new file mode 100644 index 0000000..789630e --- /dev/null +++ b/bundles/gcp-bigquery-table/src/artifacts.tf @@ -0,0 +1,12 @@ +# BigQuery table artifact — matches catalog-demo/gcp-bigquery-table schema. + +resource "massdriver_artifact" "bigquery_table" { + field = "bigquery_table" + name = "GCP BigQuery Table ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + dataset_id = local.dataset_id + table_id = google_bigquery_table.main.table_id + table_full_name = "${local.project_id}.${local.dataset_id}.${google_bigquery_table.main.table_id}" + }) +} diff --git a/bundles/gcp-bigquery-table/src/main.tf b/bundles/gcp-bigquery-table/src/main.tf new file mode 100644 index 0000000..f26195f --- /dev/null +++ b/bundles/gcp-bigquery-table/src/main.tf @@ -0,0 +1,83 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.bigquery_dataset.project_id + dataset_id = var.bigquery_dataset.dataset_id + name_prefix = var.md_metadata.name_prefix + + # Subscription name derived from the bundle name prefix for uniqueness.
+ subscription_name = "${local.name_prefix}-bq" + + # Pub/Sub-compatible schema — used when schema_mode = "pubsub_default". + # Includes the five standard columns that BigQuery subscription write_metadata + # populates: subscription_name, message_id, publish_time, data, attributes. + pubsub_default_schema = jsonencode([ + { + name = "subscription_name" + type = "STRING" + mode = "NULLABLE" + }, + { + name = "message_id" + type = "STRING" + mode = "NULLABLE" + }, + { + name = "publish_time" + type = "TIMESTAMP" + mode = "NULLABLE" + }, + { + name = "data" + type = "STRING" + mode = "NULLABLE" + }, + { + name = "attributes" + type = "JSON" + mode = "NULLABLE" + }, + ]) + + # Resolved schema — prefer custom_schema when provided, otherwise use pubsub default. + resolved_schema = ( + var.schema_mode == "custom_schema" && var.schema_json != null + ? var.schema_json + : local.pubsub_default_schema + ) +} + +# ─── BigQuery Table ──────────────────────────────────────────────────────────── + +resource "google_bigquery_table" "main" { + project = local.project_id + dataset_id = local.dataset_id + table_id = var.table_id + + description = var.description != null ? var.description : null + + # When deletion_protection = true, Terraform will refuse to destroy this table + # until the flag is first set to false and re-applied (a two-step destroy). + deletion_protection = var.deletion_protection + + schema = local.resolved_schema + + labels = var.md_metadata.default_tags +} diff --git a/bundles/gcp-bigquery-dataset/src/subscription.tf b/bundles/gcp-bigquery-table/src/subscription.tf similarity index 57% rename from bundles/gcp-bigquery-dataset/src/subscription.tf rename to bundles/gcp-bigquery-table/src/subscription.tf index 66c2c2d..760466f 100644 --- a/bundles/gcp-bigquery-dataset/src/subscription.tf +++ b/bundles/gcp-bigquery-table/src/subscription.tf @@ -3,20 +3,14 @@ # This file is count-gated on var.pubsub_topic being non-null. 
# When a Pub/Sub topic is wired on the canvas, three resources are created: # -# 1. google_bigquery_dataset_iam_member.pubsub_service_agent_data_editor -# 2. google_bigquery_dataset_iam_member.pubsub_service_agent_metadata_viewer +# 1. google_bigquery_table_iam_member.pubsub_service_agent_data_editor +# 2. google_bigquery_table_iam_member.pubsub_service_agent_metadata_viewer # 3. google_pubsub_subscription.bigquery # # IAM bindings grant the Pub/Sub service agent # (service-PROJECT_NUMBER@gcp-sa-pubsub.iam.gserviceaccount.com) the minimum -# roles required to write messages into BigQuery. Bindings are dataset-scoped, -# not project-wide. -# -# IMPORTANT — the target table must exist before deployment. -# Pub/Sub does NOT create BigQuery tables. Create the table in the dataset -# (manually, via a companion bundle, or via Dataform) before wiring this -# connection. Deploying when the table is absent will succeed at the Terraform -# layer but the subscription will fail to deliver and messages will back up. +# roles required to write messages into the BigQuery table. Bindings are +# table-scoped, not dataset- or project-wide. # ─── Pub/Sub service agent project number ────────────────────────────────────── # The Pub/Sub service agent SA is project-number-scoped, so we need the numeric @@ -31,31 +25,31 @@ locals { # BigQuery subscription table reference format: projectId:datasetId.tableId # This is the format required by the Pub/Sub API and the Terraform provider. - bq_table_ref = local.pubsub_enabled ? "${local.project_id}:${local.dataset_id}.${var.bigquery_subscription.table_name}" : null - - # Subscription name derived from the bundle name prefix for uniqueness. - subscription_name = "${local.name_prefix}-bq" + bq_table_ref = local.pubsub_enabled ?
"${local.project_id}:${local.dataset_id}.${var.table_id}" : null } -# ─── IAM: dataEditor on this dataset ────────────────────────────────────────── +# ─── IAM: dataEditor on this table ──────────────────────────────────────────── # Required so the Pub/Sub service agent can INSERT rows into the target table. -resource "google_bigquery_dataset_iam_member" "pubsub_service_agent_data_editor" { +# Scoped to this specific table — not the full dataset — for least privilege. +resource "google_bigquery_table_iam_member" "pubsub_service_agent_data_editor" { count = local.pubsub_enabled ? 1 : 0 project = local.project_id - dataset_id = google_bigquery_dataset.main.dataset_id + dataset_id = local.dataset_id + table_id = google_bigquery_table.main.table_id role = "roles/bigquery.dataEditor" member = local.pubsub_service_account } -# ─── IAM: metadataViewer on this dataset ────────────────────────────────────── -# Required so the Pub/Sub service agent can read table schemas and dataset -# metadata to validate message delivery configuration. -resource "google_bigquery_dataset_iam_member" "pubsub_service_agent_metadata_viewer" { +# ─── IAM: metadataViewer on this table ──────────────────────────────────────── +# Required so the Pub/Sub service agent can read the table schema and validate +# message delivery configuration. Scoped to this specific table. +resource "google_bigquery_table_iam_member" "pubsub_service_agent_metadata_viewer" { count = local.pubsub_enabled ? 1 : 0 project = local.project_id - dataset_id = google_bigquery_dataset.main.dataset_id + dataset_id = local.dataset_id + table_id = google_bigquery_table.main.table_id role = "roles/bigquery.metadataViewer" member = local.pubsub_service_account } @@ -79,11 +73,12 @@ resource "google_pubsub_subscription" "bigquery" { labels = var.md_metadata.default_tags - # The IAM bindings must exist before Pub/Sub validates the subscription's - # ability to write to BigQuery. 
Without these, the subscription creation will - # fail with a permission error even though the resource itself is valid. + # The table and IAM bindings must exist before Pub/Sub validates the subscription. + # Without the IAM bindings, Pub/Sub cannot write to BigQuery. + # IAM propagation is eventually consistent — depends_on mitigates but does not + # eliminate timing issues. If the subscription fails, a redeploy resolves it. depends_on = [ - google_bigquery_dataset_iam_member.pubsub_service_agent_data_editor, - google_bigquery_dataset_iam_member.pubsub_service_agent_metadata_viewer, + google_bigquery_table_iam_member.pubsub_service_agent_data_editor, + google_bigquery_table_iam_member.pubsub_service_agent_metadata_viewer, ] } diff --git a/bundles/gcp-bigquery-table/src/variables.tf b/bundles/gcp-bigquery-table/src/variables.tf new file mode 100644 index 0000000..debc628 --- /dev/null +++ b/bundles/gcp-bigquery-table/src/variables.tf @@ -0,0 +1,80 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "bigquery_dataset" { + type = object({ + project_id = string + dataset_id = string + dataset_full_name = string + location = string + friendly_name = optional(string) + }) +} + +# Optional — only present when a Pub/Sub topic is wired on the canvas. +variable "pubsub_topic" { + description = "Pub/Sub topic artifact. When wired, a BigQuery subscription is created to deliver messages into this table." 
+ type = object({ + project_id = string + topic_name = string + topic_id = string + dlq_topic_id = optional(string) + dlq_topic_name = optional(string) + }) + default = null +} + +variable "table_id" { + type = string +} + +variable "description" { + type = string + default = null +} + +variable "deletion_protection" { + type = bool + default = true +} + +variable "schema_mode" { + type = string + default = "pubsub_default" +} + +variable "schema_json" { + type = string + default = null +} + +variable "bigquery_subscription" { + description = "Settings for the Pub/Sub BigQuery subscription. Consumed only when pubsub_topic is non-null." + type = object({ + use_topic_schema = optional(bool, false) + write_metadata = optional(bool, true) + drop_unknown_fields = optional(bool, false) + ack_deadline_seconds = optional(number, 60) + }) + default = {} +} From 9eba9985c89041ad2b4e281f7047a1d5485900c7 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Wed, 22 Apr 2026 16:38:57 -0700 Subject: [PATCH 11/15] Audit polish on gcp-log-sink and gcp-vpc-connector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcp-log-sink Section header casing fixes in operator.md (Non-obvious constraints, Day-2 operations). Replaced a README Parameters section (duplicated the JSON Schema) with a Presets section describing the three named examples. gcp-vpc-connector artdef Made egress_settings an enum (ALL_TRAFFIC | PRIVATE_RANGES_ONLY) and dropped the default annotation — defaults on artifact definition schema fields are not meaningful. Consumer bundle param carries the default instead. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/massdriver.local.md | 28 +++++++++++++++++-- .../gcp-vpc-connector/massdriver.yaml | 1 - bundles/gcp-log-sink/README.md | 25 ++++++----------- bundles/gcp-log-sink/operator.md | 4 +-- 4 files changed, 35 insertions(+), 23 deletions(-) diff --git a/.claude/massdriver.local.md b/.claude/massdriver.local.md index 5964e7c..afc372b 100644 --- a/.claude/massdriver.local.md +++ b/.claude/massdriver.local.md @@ -1,6 +1,28 @@ # Massdriver Local Config -production_pattern: ^.*-(prod|prd|production)(-.*)?$ +## Environment slug conventions -## Test Environments -- gcp-claude — GCP test environment for bundle development (NOT production) +A Massdriver package slug has the shape `PROJECT-ENVIRONMENT-MANIFEST` where the +`ENVIRONMENT` segment is the ENVIRONMENT NAME and the `MANIFEST` segment is the +manifest name for the bundle on that environment's canvas. The environment +portion is the SECOND segment only — do not include the manifest in pattern +matching. + +## Production pattern + +Only the environment segment (second segment) should be matched against this +pattern, NOT the full slug: + +production_pattern: ^(prod|prd|production)$ + +## Test environments (explicit allow list) + +These environments are TEST / DEVELOPMENT and must always be allowed: + +- `claude` (in `gcp-claude` project — first demo) +- `dataplat-claude` (NOT production; used for the GCP data platform Kafka demo) + +Package slugs on these environments will look like `gcp-claude-*` or +`dataplat-claude-*`. Even if the manifest name contains strings that look +production-adjacent (e.g. `aisquadds`, `logsink`, `landingzone`), these are +NOT production targets — the environment segment is what matters.
diff --git a/artifact-definitions/gcp-vpc-connector/massdriver.yaml b/artifact-definitions/gcp-vpc-connector/massdriver.yaml index 8721488..b07d6c4 100644 --- a/artifact-definitions/gcp-vpc-connector/massdriver.yaml +++ b/artifact-definitions/gcp-vpc-connector/massdriver.yaml @@ -85,4 +85,3 @@ schema: enum: - ALL_TRAFFIC - PRIVATE_RANGES_ONLY - default: PRIVATE_RANGES_ONLY diff --git a/bundles/gcp-log-sink/README.md b/bundles/gcp-log-sink/README.md index 7d12d4c..95445cb 100644 --- a/bundles/gcp-log-sink/README.md +++ b/bundles/gcp-log-sink/README.md @@ -34,27 +34,18 @@ If neither or both destinations are wired, `tofu plan` will fail with a clear er `catalog-demo/gcp-log-sink` — carries `project_id`, `sink_name`, `destination`, `writer_identity`, and `destination_type`. Downstream bundles rarely need to consume this artifact directly; it is published for observability and chaining. -## Parameters - -| Parameter | Type | Default | Description | -|---|---|---|---| -| `filter` | string | `""` | Cloud Logging query filter. Empty = all logs. | -| `use_partitioned_tables` | boolean | `true` | BigQuery only — write to date-partitioned tables. | -| `exclusions` | array | `[]` | Per-exclusion drop rules applied after the sink filter. | - -### Filter Examples - -``` -severity >= ERROR -resource.type = "cloud_run_revision" -logName = "projects/PROJECT/logs/cloudaudit.googleapis.com%2Factivity" -resource.type = "gce_instance" AND severity >= WARNING -``` - ## Compliance Log sinks are low-risk infrastructure. No Checkov skips are expected. `halt_on_failure` is set to block deployments to `prod`, `prd`, and `production` environments on any compliance failure. 
+## Presets + +| Preset | Filter | Partitioned Tables | Notes | +|---|---|---|---| +| Error Logs to BigQuery | `severity >= ERROR` | Yes | Recommended starting point for BigQuery destinations | +| Audit Logs to GCS | `logName = "projects/PROJECT/logs/cloudaudit.googleapis.com%2Factivity"` | No | Update PROJECT to your GCP project ID before deploying | +| All Logs (no filter) | (empty) | Yes | Routes every log entry — can generate significant storage costs | + ## Assumptions - This bundle creates a **project-level** sink. It does NOT capture logs from child projects, folders, or the organization. Folder or org sinks are out of scope. diff --git a/bundles/gcp-log-sink/operator.md b/bundles/gcp-log-sink/operator.md index 9517b24..82af3e2 100644 --- a/bundles/gcp-log-sink/operator.md +++ b/bundles/gcp-log-sink/operator.md @@ -1,6 +1,6 @@ # gcp-log-sink — Operator Runbook -## Non-obvious Constraints +## Non-obvious constraints **Project scope only.** This sink captures logs from the project specified in the landing zone connection. Logs from other projects, child folders, or the organization are not captured. Folder-level and org-level sinks require a different Terraform resource (`google_logging_folder_sink` / `google_logging_organization_sink`) and are out of scope for this bundle. @@ -30,7 +30,7 @@ **"ALREADY_EXISTS" error on sink creation** — A sink with the same name (derived from `md_metadata.name_prefix`) already exists in the project. This happens if a previous deployment left a sink that Terraform state does not track. Import the existing sink: `tofu import google_logging_project_sink.main projects/PROJECT/sinks/SINK_NAME`. -## Day-2 Operations +## Day-2 operations **Updating the filter** — Change the `filter` param in the package config and deploy. The sink is updated in place. Filter changes are immediate for new log entries. No restart or recreate needed. 
From 4be0b595316555458ae22b81d018ef5e271b618f Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Wed, 22 Apr 2026 16:39:29 -0700 Subject: [PATCH 12/15] Add POC guide covering the Kafka-producer demo additions Replaces the prior gist with an in-repo GUIDE.md that covers both the original seven bundles and the additions on this branch: gcp-bigquery-table, gcp-log-sink, gcp-vpc-connector artdef, and the incoming_topic + vpc_connector inputs on gcp-cloud-run-service. Documents the topology decision to keep subscriptions on the consumer bundle (table for BQ sub, cloud-run for push sub) rather than their own canvas tile, and the two-distinct-inputs pattern on cloud-run (incoming_topic for push subscription, pubsub_topic for outbound publisher). Co-Authored-By: Claude Opus 4.7 (1M context) --- GUIDE.md | 290 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 290 insertions(+) create mode 100644 GUIDE.md diff --git a/GUIDE.md b/GUIDE.md new file mode 100644 index 0000000..4210c57 --- /dev/null +++ b/GUIDE.md @@ -0,0 +1,290 @@ +# GCP Data Platform — POC Starter Catalog + +A starter catalog of Massdriver bundles covering the primary components of a GCP data platform, including Pub/Sub → BigQuery and Pub/Sub → Cloud Run push subscription pipelines for event-driven workloads. Intended as a starting point for your POC — use any subset, customize to fit your patterns, bring your own networking. + +## What's in this catalog + +### Bundles + +| Bundle | Role | +| --- | --- | +| `gcp-network` | Minimal regional VPC and subnet. Useful for test environments; for production, import your existing network as an artifact (see below). | +| `gcp-landing-zone` | Environment foundation. Project-level IAM bindings for humans/groups, optional org-policy guardrails, service API enablement, optional budget with notifications. | +| `gcp-pubsub-topic` | Pub/Sub topic with optional DLQ. Low-volume / Standard / High-throughput presets. 
| +| `gcp-storage-bucket` | Cloud Storage bucket with uniform bucket-level access and public-access prevention enforced. Staging / Durable / Archive presets. | +| `gcp-bigquery-dataset` | BigQuery dataset with delete protection on Production. Dev / Staging / Production presets. | +| `gcp-bigquery-table` | BigQuery table. When a Pub/Sub topic is wired, creates a BigQuery subscription that delivers messages into the table. Pub/Sub-compatible default schema or custom schema JSON. | +| `gcp-cloud-run-service` | Cloud Run v2 service. Creates its own runtime service account and auto-binds roles on any connected upstream resources. Supports incoming push subscriptions and VPC connector egress. Internal / Public API / Worker presets. | +| `gcp-vertex-workbench` | Vertex AI Workbench instance. Creates a per-instance service account. When connected to a BigQuery dataset, grants the instance SA read-only access. Small / Medium / GPU presets. | +| `gcp-log-sink` | Project-level Cloud Logging sink with configurable filter. Routes matching log entries to a BigQuery dataset or GCS bucket. Terraform precondition enforces exactly one destination. | + +### Artifact definitions + +Each bundle produces an artifact that downstream bundles consume. Artifact definitions are reusable contracts — if you already have infrastructure you want to represent in Massdriver, you can import it as an artifact and connect it to bundles without re-provisioning. 
+ +| Artifact | Key fields | +| --- | --- | +| `gcp-network` | project_id, network_name, region, primary_subnet, optional secondary ranges / PSA / Cloud NAT / additional subnets | +| `gcp-landing-zone` | project_id, network, enabled_apis, budget (optional), iam_bindings (summary) | +| `gcp-workload-identity` | project_id, service_account_email / id / name | +| `gcp-pubsub-topic` | project_id, topic_name, topic_id, optional DLQ fields | +| `gcp-storage-bucket` | project_id, bucket_name, bucket_url, bucket_self_link, location, storage_class | +| `gcp-bigquery-dataset` | project_id, dataset_id, dataset_full_name, location | +| `gcp-bigquery-table` | project_id, dataset_id, table_id, table_full_name | +| `gcp-cloud-run-service` | project_id, service_name, service_url, location, latest_ready_revision, runtime SA email/member | +| `gcp-vertex-workbench` | project_id, instance_name, location, proxy_url, instance SA email/member | +| `gcp-log-sink` | project_id, sink_name, destination, writer_identity, destination_type | +| `gcp-vpc-connector` | project_id, region, name, connector_id, optional network / ip_cidr_range / egress_settings (import-only) | + +## How the bundles compose + +``` + gcp-network ─► gcp-landing-zone + │ + ┌───────────────┼───────────────────────────┐ + ▼ ▼ ▼ + gcp-pubsub-topic gcp-storage-bucket gcp-bigquery-dataset + │ │ + │ ▼ + │ gcp-bigquery-table + │ (optional topic wired to table → BQ subscription) + │ + ▼ + gcp-cloud-run-service / gcp-vertex-workbench + (incoming topic → push subscription, + outgoing topic → publisher role, + optional vpc-connector → private egress, + creates its own SA) + + ▼ + gcp-log-sink ─► (gcp-bigquery-dataset or gcp-storage-bucket) +``` + +### Topology notes + +- **Subscriptions live on the consumer bundle, not on their own canvas tile.** Wire a topic into a `gcp-bigquery-table` and the table bundle creates a BigQuery subscription internally. 
Wire a topic into a `gcp-cloud-run-service` via the `incoming_topic` input and the service creates a push subscription. This matches real-world ownership (the consumer configures ack deadline, retry, schema mapping) and halves the canvas-tile count for a typical pipeline. +- **Cloud Run services have two distinct Pub/Sub inputs.** `incoming_topic` creates a push subscription that delivers messages into the service URL. `pubsub_topic` (outgoing) grants the service's runtime SA publisher role on that topic. A middleware service can wire both — receive from one topic, publish to another. +- **The landing zone owns project-level IAM and guardrails**, not workload service accounts. Data resources (topic, bucket, dataset, table) produce artifacts with role-scoped policies but don't bind any service account themselves. Runtimes (Cloud Run, Workbench) create their own per-service service accounts and bind roles on connected upstream resources — standard per-workload-SA least-privilege. + +## Getting started + +Before getting started with the catalog, set up your [self-hosted instance](https://docs.massdriver.cloud/platform-operations/self-hosted/install). + +### 1. Clone the catalog + +```bash +git clone git@github.com:massdriver-cloud/massdriver-catalog.git +cd massdriver-catalog +git checkout demo/0422-gcp-data-plat-kafka +``` + +### 2. Configure the Massdriver CLI + +The CLI reads its config from `$HOME/.config/massdriver/config.yaml` (or `$XDG_CONFIG_HOME/massdriver/config.yaml` if `XDG_CONFIG_HOME` is set).
Create it with your organization ID and a Service Account API key: + +```yaml +version: 1 +profiles: + default: + organization_id: YOUR_ORG_ID + api_key: YOUR_SERVICE_ACCOUNT_TOKEN + url: https://api.YOUR_DOMAIN + templates_path: ~/path/to/your/massdriver-catalog/templates +``` + +- **organization_id** — hover over your org logo in the Massdriver UI to find it +- **api_key** — create a Service Account in Settings → Service Accounts and copy its token + +Or use environment variables: `MASSDRIVER_ORGANIZATION_ID`, `MASSDRIVER_API_KEY`. + +Full reference: https://docs.massdriver.cloud/reference/cli/overview#configuration + +### 3. Enable platforms and publish the catalog + +```bash +# In this repo +make ENABLED_PLATFORMS=gcp +make publish-artifact-definitions publish-bundles +``` + +### 4. Upload your GCP credential + +Export a service account key from a project where you want to deploy the POC. Upload it as a Massdriver credential: + +```bash +mass artifact import \ + -f ~/path/to/gcp-sa.json \ + -n "GCP POC" \ + -t {YOUR_ORG_ID}/gcp-service-account +``` + +**Note:** GCP service account keys currently need to be imported via the CLI as shown above. There's an escaping bug in the UI credential form that mangles the newline characters in GCP private keys (GCP is the only provider affected — the keys are multi-line PEM). A fix is in flight. In the meantime, two workarounds: import via CLI, or provision the service account in-platform via a Massdriver bundle and consume the resulting artifact. + +The credential needs permissions to manage the resources you plan to deploy (Compute Admin for network, Project IAM Admin for landing zone, Pub/Sub Admin, Storage Admin, BigQuery Admin, Cloud Run Admin, Workbench Admin, Logging Admin, Service Usage Admin for API enablement, and Serverless VPC Access Admin if you're importing a VPC connector). + +### 5. 
Bring your own network + +**Option A — provision a new network for POC testing:** +Add the `gcp-network` bundle to an environment canvas, connect your GCP credential, pick a region and CIDR, deploy. The bundle provisions a minimal VPC with one subnet, Private Google Access, flow logs, and a baseline deny-all ingress firewall rule. + +**Option B — import your existing network:** +The `gcp-network` artifact is designed to represent a rich existing network (primary + additional subnets, secondary ranges for GKE, Private Services Access, Cloud NAT). You can import your network directly as an artifact instead of provisioning one. + +```bash +mass artifact import \ + -f path/to/my-network.json \ + -n "Prod VPC" \ + -t {YOUR_ORG_ID}/gcp-network +``` + +See `artifact-definitions/gcp-network/massdriver.yaml` for the full schema — every field you might need for an existing production network is already defined, most of them optional. + +### 6. (Optional) Import an existing VPC connector + +If your Kafka producer (or any Cloud Run service in the catalog) needs private egress through a Serverless VPC Access connector, the `gcp-vpc-connector` artifact definition is import-only — no provisioning bundle. Create the connector in GCP however you normally would: + +```bash +gcloud compute networks vpc-access connectors create my-connector \ + --region=us-central1 \ + --network=my-vpc \ + --range=10.8.0.0/28 +``` + +Then import it as an artifact: + +```bash +mass artifact import \ + -f path/to/connector.json \ + -n "Shared VPC Connector" \ + -t {YOUR_ORG_ID}/gcp-vpc-connector +``` + +Wire it into any `gcp-cloud-run-service` on the canvas via the `vpc_connector` input. + +### 7. Build up the environment + +1. **Landing zone** — add `gcp-landing-zone` to the canvas. Connect the network. Configure IAM bindings for your team (e.g., `roles/viewer` → your analysts group), any org policy guardrails you want enforced, and an optional budget. +2. 
**Data resources** — add any of `gcp-pubsub-topic`, `gcp-storage-bucket`, `gcp-bigquery-dataset`, `gcp-bigquery-table`. Each connects to the landing zone for `project_id` context. Tables connect to a dataset (required) and optionally to a topic (creates the BQ subscription). +3. **Runtimes** — add `gcp-cloud-run-service` or `gcp-vertex-workbench`. Connect the landing zone (required) plus any upstream data artifacts. For Cloud Run: wire `incoming_topic` to create a push subscription; wire `pubsub_topic` for outgoing publisher role; wire `vpc_connector` for private egress. +4. **Observability** — add `gcp-log-sink` to route log entries to a BigQuery dataset or GCS bucket. Wire exactly one destination; the Terraform precondition enforces this. + +### 8. Deploy + +From the canvas UI or CLI: + +```bash +mass package deploy <project>-<env>-<manifest> -m "initial deploy" +``` + +## Iterating with development releases + +During a POC you'll almost certainly want to tweak bundles — adjust a default, add a param, change an IAM binding, tighten a compliance rule. Cutting a new semver version for each iteration is slow and clutters the version history. Use **development releases** instead. + +### How it works + +Publishing a bundle with `--development` creates an `X.Y.Z-dev` release tagged with a timestamp (or your local git SHA). It: + +- Doesn't bump the bundle's official version in `massdriver.yaml`. +- Creates a new pointer on each publish, which a package can be pinned to. +- Is only usable when a package is explicitly pinned to the dev release — production packages on `1.2.3` are unaffected. + +This lets you iterate on a bundle, redeploy, and see results in seconds without polluting your version history. When you're happy with the changes, bump the version and publish a real one (`1.3.0`) and re-pin the package. + +### The iteration loop + +```bash +# 1. Edit a bundle (e.g., bundles/gcp-cloud-run-service/src/*.tf) + +# 2.
Publish a development release +cd bundles/gcp-cloud-run-service +mass bundle publish --development + +# 3. In the UI, pin the package to the dev release +# (Package → Settings → Version → select "0.1.1-dev.<timestamp>") + +# 4. Redeploy with a comment describing what you changed +mass package deploy <project>-<env>-<manifest> -m "testing stricter egress rule" + +# 5. Inspect results, adjust, go back to step 1. +``` + +For runtime templates where app developers have scaffolded per-app bundles with `mass bundle new`, the same loop works — publish the app bundle itself as a development release while iterating on its Terraform or params. + +### When to cut a real version + +Once the bundle behaves the way you want, bump `version:` in the bundle's `massdriver.yaml` and publish: + +```bash +mass bundle publish +``` + +Re-pin the package to the new version in the UI. Going forward, production packages track numbered releases; only environments you explicitly move to the dev pointer follow your in-flight changes. + +### Tips + +- Commit your bundle changes to git before publishing a dev release. The dev release records the state at publish time, so you want it to point at something you can check out later. +- Use `-m` on every `mass package deploy` to leave a breadcrumb for yourself (and anyone reviewing the canvas history) about what each iteration was testing. +- Dev releases are per-bundle, so you can iterate on `gcp-cloud-run-service` while leaving `gcp-landing-zone` on a stable numbered release. + +## Customizing for your team + +### Runtime templates + +`gcp-cloud-run-service` is an example of a **runtime template** — an opinionated bundle that codifies your organization's runtime standards (SA identity, compliance controls, upstream IAM conventions). + +The expected pattern in production: +- Platform/ops team forks or customizes the runtime template bundles to enforce their org's standards.
+- Application developers run `mass bundle new` using the template to generate a **per-app** bundle that declares their service's specific connections, env vars, and dependencies. + +Both the template and the per-app bundle are Massdriver bundles, so they get the same canvas, deploy, and compliance treatment. + +A ready-to-use application template for Cloud Run is included at `templates/gcp-cloud-run-service/`. App developers scaffold a new bundle with: + +```bash +mass bundle new --template gcp-cloud-run-service +``` + +The CLI prompts for bundle name and description, then shows a list of artifact definitions published in your org — developers pick which upstream resources their service needs (Pub/Sub topic, BigQuery dataset, GCS bucket, anything else). The resulting bundle is lean — only `image` is exposed as a param by default; developers add more params as their app needs them. `src/iam.tf` includes commented-out IAM binding examples, `src/push_subscription.tf` has an example push-subscription block, and `src/main.tf` has a commented VPC connector block — all ready to uncomment based on what they picked. + +### Compliance strategy + +Every bundle has a Checkov gate. Findings only halt deployment when `md-target` matches `prod|prd|production`. Lower environments surface findings as warnings but still deploy. Adjust the `halt_on_failure` expression in each bundle's `massdriver.yaml` to match your naming conventions. + +### Presets + +Each bundle ships with 2–3 presets mapped to common environment tiers. The presets are just starting points — you can override any param at deploy time or create new presets suited to your stack. + +## Assumptions and prerequisites + +- **Cloud Billing must be enabled** on the GCP billing account for budgets to work. +- The GCP service account credential needs admin-level permissions on the resources it provisions. For production use, narrow it down per-environment. 
+- GCS bucket names are globally unique — deployments derive the name from Massdriver's `name_prefix` so uniqueness is automatic, but this means you can't pick your own name. +- BigQuery dataset `location`, GCS bucket `location`, and BigQuery table `table_id` are immutable after creation — to change, you destroy and recreate (and export/reimport data). +- Vertex Workbench requires a minimum 150 GB boot disk. +- **BigQuery subscriptions require the target table to exist before the subscription can deliver messages.** The `gcp-bigquery-table` bundle creates the table and the subscription atomically, so this is handled when you use it — but if you connect a topic directly to a hand-created table, make sure the table is there first. +- **VPC Access connectors are regional.** A connector must be in the same region as the Cloud Run service using it. If you import a connector as an artifact, the region is part of the artifact payload — wire to a service in the matching region. + +## What's NOT in this catalog (yet) + +Things you may want for a fuller production setup — not included, but straightforward to add: +- VPC Service Controls / service perimeters +- Cloud KMS key ring bundle (for CMEK on the data resources) +- Cloud Scheduler / Cloud Tasks for event-driven triggers +- Cloud SQL / AlloyDB for transactional workloads +- GKE for containerized workloads at scale +- Artifact Registry for container images +- Secret Manager secrets +- Monitoring / alerting workspace with dashboards +- Dataflow / Dataproc for batch or streaming pipelines +- VPC Access connector provisioning bundle (currently import-only) +- AWS-side artifact definitions for S3 / Kafka / IAM-role-based cross-cloud auth + +## Bundle-level docs + +Each bundle has: +- `README.md` — what it does, what it creates, what it produces, compliance posture +- `operator.md` — a 2am runbook. Non-obvious constraints, troubleshooting, day-2 operations, useful commands + +## Support during the POC + +Reach out any time. 
Happy to hop on a call to help troubleshoot, talk through design choices, or recommend patterns based on what you're seeing. From 07fa3e976000816aebef88a25b8994311d51fef7 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Wed, 22 Apr 2026 17:05:06 -0700 Subject: [PATCH 13/15] Stop tracking .claude/massdriver.local.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Local-only hook config — should not have been committed. Removing from the index and adding to .gitignore so future clones don't pick it up and future edits don't get staged. File content stays on disk for local use. Co-Authored-By: Claude Opus 4.7 (1M context) --- .claude/massdriver.local.md | 28 ---------------------------- .gitignore | 1 + 2 files changed, 1 insertion(+), 28 deletions(-) delete mode 100644 .claude/massdriver.local.md diff --git a/.claude/massdriver.local.md b/.claude/massdriver.local.md deleted file mode 100644 index afc372b..0000000 --- a/.claude/massdriver.local.md +++ /dev/null @@ -1,28 +0,0 @@ -# Massdriver Local Config - -## Environment slug conventions - -A Massdriver package slug has the shape `<project>-<env>-<manifest>` where the -`<env>` segment is the ENVIRONMENT NAME and the `<manifest>` segment is the -manifest name for the bundle on that environment's canvas. The environment -portion is the SECOND segment only — do not include the manifest in pattern -matching. - -## Production pattern - -Only the environment segment (second segment) should be matched against this -pattern, NOT the full slug: - -production_pattern: ^(prod|prd|production)$ - -## Test environments (explicit allow list) - -These environments are TEST / DEVELOPMENT and must always be allowed: - -- `claude` (in `gcp-claude` project — first demo) -- `dataplat-claude` (NOT production; used for the GCP data platform Kafka demo) - -Package slugs on these environments will look like `gcp-claude-*` or -`dataplat-claude-*`. Even if the manifest name contains strings that look -production-adjacent (e.g.
`aisquadds`, `logsink`, `landingzone`), these are -NOT production targets — the environment segment is what matters. diff --git a/.gitignore b/.gitignore index 3bb29b8..be21ffe 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ schema-ui.json # Build artifacts _dist/ .claude/settings.local.json +.claude/massdriver.local.md node_modules/ *.zip TODO.md From aa443c57efaad6a58f463674a3b6d9cb79edcd23 Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Wed, 22 Apr 2026 17:05:27 -0700 Subject: [PATCH 14/15] Replace ASCII diagram in GUIDE.md with Mermaid GitHub renders Mermaid natively; ASCII art looked bad and was hard to update. Switch to a flowchart that groups the data layer and runtime layer, distinguishes required wires from optional wires, and covers the new bundles on this branch (bigquery-table, log-sink, vpc-connector). Co-Authored-By: Claude Opus 4.7 (1M context) --- GUIDE.md | 61 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/GUIDE.md b/GUIDE.md index 4210c57..b3fd615 100644 --- a/GUIDE.md +++ b/GUIDE.md @@ -38,27 +38,48 @@ Each bundle produces an artifact that downstream bundles consume. Artifact defin ## How the bundles compose +```mermaid +flowchart TB + net[gcp-network] + lz[gcp-landing-zone] + + subgraph data[Data layer] + topic[gcp-pubsub-topic] + bucket[gcp-storage-bucket] + ds[gcp-bigquery-dataset] + tbl[gcp-bigquery-table] + end + + subgraph runtime[Runtimes] + cr[gcp-cloud-run-service] + wb[gcp-vertex-workbench] + end + + sink[gcp-log-sink] + vpc[(gcp-vpc-connector<br/>imported)] + + net --> lz + lz --> topic + lz --> bucket + lz --> ds + lz --> cr + lz --> wb + lz --> sink + + ds --> tbl + topic -.->|creates BQ subscription| tbl + topic -.->|incoming_topic: push sub| cr + topic -.->|pubsub_topic: publisher role| cr + bucket -.->|objectUser IAM| cr + ds -.->|dataViewer IAM| wb + + vpc -.->|optional private egress| cr + + ds -->|destination| sink + bucket -.->|destination alt| sink ``` - gcp-network ─► gcp-landing-zone - │ - ┌───────────────┼───────────────────────────┐ - ▼ ▼ ▼ - gcp-pubsub-topic gcp-storage-bucket gcp-bigquery-dataset - │ │ - │ ▼ - │ gcp-bigquery-table - │ (optional topic wired to table → BQ subscription) - │ - ▼ - gcp-cloud-run-service / gcp-vertex-workbench - (incoming topic → push subscription, - outgoing topic → publisher role, - optional vpc-connector → private egress, - creates its own SA) - - ▼ - gcp-log-sink ─► (gcp-bigquery-dataset or gcp-storage-bucket) -``` + +Solid arrows are required wires. Dashed arrows are optional — wire them when your use case needs them. The log sink takes exactly one destination: either the BigQuery dataset or the storage bucket. ### Topology notes From 96e38b59e28461d07a723ec191d4f54abbfe018e Mon Sep 17 00:00:00 2001 From: Cory O'Daniel Date: Thu, 23 Apr 2026 13:07:45 -0700 Subject: [PATCH 15/15] Add deletion_protection param to gcp-cloud-run-service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit google provider v6+ defaults deletion_protection=true on google_cloud_run_v2_service, which blocks tofu destroy with "cannot destroy service without setting deletion_protection=false and running terraform apply". Decommissioning a package via Massdriver hit this for every service. Expose as a bundle param, default false. Cloud Run services are stateless compute — destroy does not lose data, so the friction of a forced two-step destroy isn't worth it out of the box. Operators can flip to true in production to require the explicit two-step teardown.
Co-Authored-By: Claude Opus 4.7 (1M context) --- bundles/gcp-cloud-run-service/massdriver.yaml | 10 ++++++++++ bundles/gcp-cloud-run-service/src/main.tf | 8 ++++++++ bundles/gcp-cloud-run-service/src/variables.tf | 5 +++++ 3 files changed, 23 insertions(+) diff --git a/bundles/gcp-cloud-run-service/massdriver.yaml b/bundles/gcp-cloud-run-service/massdriver.yaml index 236c1dc..fb1e74b 100644 --- a/bundles/gcp-cloud-run-service/massdriver.yaml +++ b/bundles/gcp-cloud-run-service/massdriver.yaml @@ -175,6 +175,16 @@ params: - PRIVATE_RANGES_ONLY - ALL_TRAFFIC + deletion_protection: + title: Enable Deletion Protection + description: When enabled, the Cloud Run service cannot be destroyed until + deletion protection is first disabled in a prior apply (two-step destroy). + Default is false — Cloud Run services are stateless compute, so destroying + one does not lose data. Set to true in production to prevent accidental + tear-down. + type: boolean + default: false + connections: required: - gcp_authentication diff --git a/bundles/gcp-cloud-run-service/src/main.tf b/bundles/gcp-cloud-run-service/src/main.tf index 4f024c6..b3f4a3c 100644 --- a/bundles/gcp-cloud-run-service/src/main.tf +++ b/bundles/gcp-cloud-run-service/src/main.tf @@ -59,6 +59,14 @@ resource "google_cloud_run_v2_service" "main" { name = local.name_prefix location = local.region + # ── Deletion protection ───────────────────────────────────────────────────── + # google provider v6+ defaults deletion_protection = true, which blocks tofu + # destroy until a prior apply sets it to false. That's fine for production but + # friction for dev/test. Expose as a param, default false — Cloud Run services + # are stateless compute, destroy does not lose data. Operators can flip to true + # in production to require a two-step destroy. + deletion_protection = var.deletion_protection + # ── Ingress ───────────────────────────────────────────────────────────────── # Controls which traffic sources can reach this service. 
# Changing ingress triggers a full revision replacement (cold start expected). diff --git a/bundles/gcp-cloud-run-service/src/variables.tf b/bundles/gcp-cloud-run-service/src/variables.tf index eec3a46..6544ab4 100644 --- a/bundles/gcp-cloud-run-service/src/variables.tf +++ b/bundles/gcp-cloud-run-service/src/variables.tf @@ -164,3 +164,8 @@ variable "vpc_egress" { type = string default = "PRIVATE_RANGES_ONLY" } + +variable "deletion_protection" { + type = bool + default = false +}