diff --git a/.gitignore b/.gitignore index 301fee9..be21ffe 100644 --- a/.gitignore +++ b/.gitignore @@ -29,5 +29,7 @@ schema-ui.json # Build artifacts _dist/ .claude/settings.local.json +.claude/massdriver.local.md node_modules/ *.zip +TODO.md diff --git a/GUIDE.md b/GUIDE.md new file mode 100644 index 0000000..b3fd615 --- /dev/null +++ b/GUIDE.md @@ -0,0 +1,311 @@ +# GCP Data Platform — POC Starter Catalog + +A starter catalog of Massdriver bundles covering the primary components of a GCP data platform, including Pub/Sub → BigQuery and Pub/Sub → Cloud Run push subscription pipelines for event-driven workloads. Intended as a starting point for your POC — use any subset, customize to fit your patterns, bring your own networking. + +## What's in this catalog + +### Bundles + +| Bundle | Role | +| --- | --- | +| `gcp-network` | Minimal regional VPC and subnet. Useful for test environments; for production, import your existing network as an artifact (see below). | +| `gcp-landing-zone` | Environment foundation. Project-level IAM bindings for humans/groups, optional org-policy guardrails, service API enablement, optional budget with notifications. | +| `gcp-pubsub-topic` | Pub/Sub topic with optional DLQ. Low-volume / Standard / High-throughput presets. | +| `gcp-storage-bucket` | Cloud Storage bucket with uniform bucket-level access and public-access prevention enforced. Staging / Durable / Archive presets. | +| `gcp-bigquery-dataset` | BigQuery dataset with delete protection on Production. Dev / Staging / Production presets. | +| `gcp-bigquery-table` | BigQuery table. When a Pub/Sub topic is wired, creates a BigQuery subscription that delivers messages into the table. Pub/Sub-compatible default schema or custom schema JSON. | +| `gcp-cloud-run-service` | Cloud Run v2 service. Creates its own runtime service account and auto-binds roles on any connected upstream resources. Supports incoming push subscriptions and VPC connector egress. 
Internal / Public API / Worker presets. | +| `gcp-vertex-workbench` | Vertex AI Workbench instance. Creates a per-instance service account. When connected to a BigQuery dataset, grants the instance SA read-only access. Small / Medium / GPU presets. | +| `gcp-log-sink` | Project-level Cloud Logging sink with configurable filter. Routes matching log entries to a BigQuery dataset or GCS bucket. Terraform precondition enforces exactly one destination. | + +### Artifact definitions + +Each bundle produces an artifact that downstream bundles consume. Artifact definitions are reusable contracts — if you already have infrastructure you want to represent in Massdriver, you can import it as an artifact and connect it to bundles without re-provisioning. + +| Artifact | Key fields | +| --- | --- | +| `gcp-network` | project_id, network_name, region, primary_subnet, optional secondary ranges / PSA / Cloud NAT / additional subnets | +| `gcp-landing-zone` | project_id, network, enabled_apis, budget (optional), iam_bindings (summary) | +| `gcp-workload-identity` | project_id, service_account_email / id / name | +| `gcp-pubsub-topic` | project_id, topic_name, topic_id, optional DLQ fields | +| `gcp-storage-bucket` | project_id, bucket_name, bucket_url, bucket_self_link, location, storage_class | +| `gcp-bigquery-dataset` | project_id, dataset_id, dataset_full_name, location | +| `gcp-bigquery-table` | project_id, dataset_id, table_id, table_full_name | +| `gcp-cloud-run-service` | project_id, service_name, service_url, location, latest_ready_revision, runtime SA email/member | +| `gcp-vertex-workbench` | project_id, instance_name, location, proxy_url, instance SA email/member | +| `gcp-log-sink` | project_id, sink_name, destination, writer_identity, destination_type | +| `gcp-vpc-connector` | project_id, region, name, connector_id, optional network / ip_cidr_range / egress_settings (import-only) | + +## How the bundles compose + +```mermaid +flowchart TB + net[gcp-network] + 
lz[gcp-landing-zone] + + subgraph data[Data layer] + topic[gcp-pubsub-topic] + bucket[gcp-storage-bucket] + ds[gcp-bigquery-dataset] + tbl[gcp-bigquery-table] + end + + subgraph runtime[Runtimes] + cr[gcp-cloud-run-service] + wb[gcp-vertex-workbench] + end + + sink[gcp-log-sink] + vpc[(gcp-vpc-connector<br/>
imported)] + + net --> lz + lz --> topic + lz --> bucket + lz --> ds + lz --> cr + lz --> wb + lz --> sink + + ds --> tbl + topic -.->|creates BQ subscription| tbl + topic -.->|incoming_topic: push sub| cr + topic -.->|pubsub_topic: publisher role| cr + bucket -.->|objectUser IAM| cr + ds -.->|dataViewer IAM| wb + + vpc -.->|optional private egress| cr + + ds -->|destination| sink + bucket -.->|destination alt| sink +``` + +Solid arrows are required wires. Dashed arrows are optional — wire them when your use case needs them. + +### Topology notes + +- **Subscriptions live on the consumer bundle, not on their own canvas tile.** Wire a topic into a `gcp-bigquery-table` and the table bundle creates a BigQuery subscription internally. Wire a topic into a `gcp-cloud-run-service` via the `incoming_topic` input and the service creates a push subscription. This matches real-world ownership (the consumer configures ack deadline, retry, schema mapping) and halves the canvas-tile count for a typical pipeline. +- **Cloud Run services have two distinct Pub/Sub inputs.** `incoming_topic` creates a push subscription that delivers messages into the service URL. `pubsub_topic` (outgoing) grants the service's runtime SA publisher role on that topic. A middleware service can wire both — receive from one topic, publish to another. +- **The landing zone owns project-level IAM and guardrails**, not workload service accounts. Data resources (topic, bucket, dataset, table) produce artifacts with role-scoped policies but don't bind any service account themselves. Runtimes (Cloud Run, Workbench) create their own per-service service accounts and bind roles on connected upstream resources — standard per-workload-SA least-privilege. + +## Getting started + +Before getting started with the catalog, set up your [self-hosted instance.](https://docs.massdriver.cloud/platform-operations/self-hosted/install) + +### 1. 
Clone the catalog + +```bash +git clone git@github.com:massdriver-cloud/massdriver-catalog.git +cd massdriver-catalog +git checkout demo/0422-gcp-data-plat-kafka +``` + +### 2. Configure the Massdriver CLI + +The CLI reads its config from `$HOME/.config/massdriver/config.yaml` (or `$XDG_CONFIG_HOME/massdriver/config.yaml` if `XDG_CONFIG_HOME` is set). Create it with your organization ID and a Service Account API key: + +```yaml +version: 1 +profiles: + default: + organization_id: YOUR_ORG_ID + api_key: YOUR_SERVICE_ACCOUNT_TOKEN + url: https://api.YOUR_DOMAIN + templates_path: ~/path/to/your/massdriver-catalog/templates +``` + +- **organization_id** — hover over your org logo in the Massdriver UI to find it +- **api_key** — create a Service Account in Settings → Service Accounts and copy its token + +Or use environment variables: `MASSDRIVER_ORGANIZATION_ID`, `MASSDRIVER_API_KEY`. + +Full reference: https://docs.massdriver.cloud/reference/cli/overview#configuration + +### 3. Enable platforms and publish the catalog + +```bash +# In this repo +make ENABLED_PLATFORMS=gcp +make publish-artifact-definitions publish-bundles +``` + +### 4. Upload your GCP credential + +Export a service account key from a project where you want to deploy the POC. Upload it as a Massdriver credential: + +```bash +mass artifact import \ + -f ~/path/to/gcp-sa.json \ + -n "GCP POC" \ + -t {YOUR_ORG_ID}/gcp-service-account +``` + +**Note:** GCP service account keys currently need to be imported via the CLI as shown above. There's an escaping bug in the UI credential form that mangles the newline characters in GCP private keys (GCP is the only provider affected — the keys are multi-line PEM). A fix is in flight. In the meantime, two workarounds: import via CLI, or provision the service account in-platform via a Massdriver bundle and consume the resulting artifact. 
+ +The credential needs permissions to manage the resources you plan to deploy (Compute Admin for network, Project IAM Admin for landing zone, Pub/Sub Admin, Storage Admin, BigQuery Admin, Cloud Run Admin, Workbench Admin, Logging Admin, Service Usage Admin for API enablement, and Serverless VPC Access Admin if you're importing a VPC connector). + +### 5. Bring your own network + +**Option A — provision a new network for POC testing:** +Add the `gcp-network` bundle to an environment canvas, connect your GCP credential, pick a region and CIDR, deploy. The bundle provisions a minimal VPC with one subnet, Private Google Access, flow logs, and a baseline deny-all ingress firewall rule. + +**Option B — import your existing network:** +The `gcp-network` artifact is designed to represent a rich existing network (primary + additional subnets, secondary ranges for GKE, Private Services Access, Cloud NAT). You can import your network directly as an artifact instead of provisioning one. + +```bash +mass artifact import \ + -f path/to/my-network.json \ + -n "Prod VPC" \ + -t {YOUR_ORG_ID}/gcp-network +``` + +See `artifact-definitions/gcp-network/massdriver.yaml` for the full schema — every field you might need for an existing production network is already defined, most of them optional. + +### 6. (Optional) Import an existing VPC connector + +If your Kafka producer (or any Cloud Run service in the catalog) needs private egress through a Serverless VPC Access connector, the `gcp-vpc-connector` artifact definition is import-only — no provisioning bundle. 
Create the connector in GCP however you normally would: + +```bash +gcloud compute networks vpc-access connectors create my-connector \ + --region=us-central1 \ + --network=my-vpc \ + --range=10.8.0.0/28 +``` + +Then import it as an artifact: + +```bash +mass artifact import \ + -f path/to/connector.json \ + -n "Shared VPC Connector" \ + -t {YOUR_ORG_ID}/gcp-vpc-connector +``` + +Wire it into any `gcp-cloud-run-service` on the canvas via the `vpc_connector` input. + +### 7. Build up the environment + +1. **Landing zone** — add `gcp-landing-zone` to the canvas. Connect the network. Configure IAM bindings for your team (e.g., `roles/viewer` → your analysts group), any org policy guardrails you want enforced, and an optional budget. +2. **Data resources** — add any of `gcp-pubsub-topic`, `gcp-storage-bucket`, `gcp-bigquery-dataset`, `gcp-bigquery-table`. Each connects to the landing zone for `project_id` context. Tables connect to a dataset (required) and optionally to a topic (creates the BQ subscription). +3. **Runtimes** — add `gcp-cloud-run-service` or `gcp-vertex-workbench`. Connect the landing zone (required) plus any upstream data artifacts. For Cloud Run: wire `incoming_topic` to create a push subscription; wire `pubsub_topic` for outgoing publisher role; wire `vpc_connector` for private egress. +4. **Observability** — add `gcp-log-sink` to route log entries to a BigQuery dataset or GCS bucket. Wire exactly one destination; the Terraform precondition enforces this. + +### 8. Deploy + +From the canvas UI or CLI: + +```bash +mass package deploy <project>-<env>-<manifest> -m "initial deploy" +``` + +## Iterating with development releases + +During a POC you'll almost certainly want to tweak bundles — adjust a default, add a param, change an IAM binding, tighten a compliance rule. Cutting a new semver version for each iteration is slow and clutters the version history. Use **development releases** instead. 
+ +### How it works + +Publishing a bundle with `--development` creates an `X.Y.Z-dev` release tagged with a timestamp (or your local git SHA). It: + +- Doesn't bump the bundle's official version in `massdriver.yaml`. +- Produces a new pointer on every dev publish that a package can be pinned to. +- Is only usable when a package is explicitly pinned to the dev release — production packages on `1.2.3` are unaffected. + +This lets you iterate on a bundle, redeploy, and see results in seconds without polluting your version history. When you're happy with the changes, bump the version and publish a real one (`1.3.0`) and re-pin the package. + +### The iteration loop + +```bash +# 1. Edit a bundle (e.g., bundles/gcp-cloud-run-service/src/*.tf) + +# 2. Publish a development release +cd bundles/gcp-cloud-run-service +mass bundle publish --development + +# 3. In the UI, pin the package to the dev release +# (Package → Settings → Version → select "0.1.1-dev.<timestamp>") + +# 4. Redeploy with a comment describing what you changed +mass package deploy <project>-<env>-<manifest> -m "testing stricter egress rule" + +# 5. Inspect results, adjust, go back to step 1. +``` + +For runtime templates where app developers have scaffolded per-app bundles with `mass bundle new`, the same loop works — publish the app bundle itself as a development release while iterating on its Terraform or params. + +### When to cut a real version + +Once the bundle behaves the way you want, bump `version:` in the bundle's `massdriver.yaml` and publish: + +```bash +mass bundle publish +``` + +Re-pin the package to the new version in the UI. Going forward, production packages track numbered releases; only environments you explicitly move to the dev pointer follow your in-flight changes. + +### Tips + +- Commit your bundle changes to git before publishing a dev release. The dev release records the state at publish time, so you want it to point at something you can check out later. 
+- Use `-m` on every `mass package deploy` to leave a breadcrumb for yourself (and anyone reviewing the canvas history) about what each iteration was testing. +- Dev releases are per-bundle, so you can iterate on `gcp-cloud-run-service` while leaving `gcp-landing-zone` on a stable numbered release. + +## Customizing for your team + +### Runtime templates + +`gcp-cloud-run-service` is an example of a **runtime template** — an opinionated bundle that codifies your organization's runtime standards (SA identity, compliance controls, upstream IAM conventions). + +The expected pattern in production: +- Platform/ops team forks or customizes the runtime template bundles to enforce their org's standards. +- Application developers run `mass bundle new` using the template to generate a **per-app** bundle that declares their service's specific connections, env vars, and dependencies. + +Both the template and the per-app bundle are Massdriver bundles, so they get the same canvas, deploy, and compliance treatment. + +A ready-to-use application template for Cloud Run is included at `templates/gcp-cloud-run-service/`. App developers scaffold a new bundle with: + +```bash +mass bundle new --template gcp-cloud-run-service +``` + +The CLI prompts for bundle name and description, then shows a list of artifact definitions published in your org — developers pick which upstream resources their service needs (Pub/Sub topic, BigQuery dataset, GCS bucket, anything else). The resulting bundle is lean — only `image` is exposed as a param by default; developers add more params as their app needs them. `src/iam.tf` includes commented-out IAM binding examples, `src/push_subscription.tf` has an example push-subscription block, and `src/main.tf` has a commented VPC connector block — all ready to uncomment based on what they picked. + +### Compliance strategy + +Every bundle has a Checkov gate. Findings only halt deployment when `md-target` matches `prod|prd|production`. 
Lower environments surface findings as warnings but still deploy. Adjust the `halt_on_failure` expression in each bundle's `massdriver.yaml` to match your naming conventions. + +### Presets + +Each bundle ships with 2–3 presets mapped to common environment tiers. The presets are just starting points — you can override any param at deploy time or create new presets suited to your stack. + +## Assumptions and prerequisites + +- **Cloud Billing must be enabled** on the GCP billing account for budgets to work. +- The GCP service account credential needs admin-level permissions on the resources it provisions. For production use, narrow it down per-environment. +- GCS bucket names are globally unique — deployments derive the name from Massdriver's `name_prefix` so uniqueness is automatic, but this means you can't pick your own name. +- BigQuery dataset `location`, GCS bucket `location`, and BigQuery table `table_id` are immutable after creation — to change, you destroy and recreate (and export/reimport data). +- Vertex Workbench requires a minimum 150 GB boot disk. +- **BigQuery subscriptions require the target table to exist before the subscription can deliver messages.** The `gcp-bigquery-table` bundle creates the table and the subscription atomically, so this is handled when you use it — but if you connect a topic directly to a hand-created table, make sure the table is there first. +- **VPC Access connectors are regional.** A connector must be in the same region as the Cloud Run service using it. If you import a connector as an artifact, the region is part of the artifact payload — wire to a service in the matching region. 
+ +## What's NOT in this catalog (yet) + +Things you may want for a fuller production setup — not included, but straightforward to add: +- VPC Service Controls / service perimeters +- Cloud KMS key ring bundle (for CMEK on the data resources) +- Cloud Scheduler / Cloud Tasks for event-driven triggers +- Cloud SQL / AlloyDB for transactional workloads +- GKE for containerized workloads at scale +- Artifact Registry for container images +- Secret Manager secrets +- Monitoring / alerting workspace with dashboards +- Dataflow / Dataproc for batch or streaming pipelines +- VPC Access connector provisioning bundle (currently import-only) +- AWS-side artifact definitions for S3 / Kafka / IAM-role-based cross-cloud auth + +## Bundle-level docs + +Each bundle has: +- `README.md` — what it does, what it creates, what it produces, compliance posture +- `operator.md` — a 2am runbook. Non-obvious constraints, troubleshooting, day-2 operations, useful commands + +## Support during the POC + +Reach out any time. Happy to hop on a call to help troubleshoot, talk through design choices, or recommend patterns based on what you're seeing. diff --git a/artifact-definitions/gcp-bigquery-dataset/massdriver.yaml b/artifact-definitions/gcp-bigquery-dataset/massdriver.yaml new file mode 100644 index 0000000..315147d --- /dev/null +++ b/artifact-definitions/gcp-bigquery-dataset/massdriver.yaml @@ -0,0 +1,78 @@ +name: gcp-bigquery-dataset +label: GCP BigQuery Dataset +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (examples — adapt to your actual consumer): +# Downstream bundles that need read-only access bind roles/bigquery.dataViewer. +# Downstream bundles that need read+write access bind roles/bigquery.dataEditor. +# Downstream bundles that need full control bind roles/bigquery.dataOwner. 
+# +# Terraform example — grant data viewer access to a workload service account: +# resource "google_bigquery_dataset_iam_member" "reader" { +# dataset_id = var.bigquery_dataset.dataset_id +# role = "roles/bigquery.dataViewer" +# member = "serviceAccount:<consumer-sa-email>" +# } +# +# Note: BigQuery IAM operates at dataset level by default. For table-level access, +# use google_bigquery_table_iam_member instead. Dataset-level bindings propagate to +# all tables within the dataset; table-level bindings do not propagate up. +# +# Policy examples below (reader / writer / admin) follow this same pattern. They are +# illustrative — the actual IAM member string comes from the consumer bundle's +# service account, not from this artifact. +exports: [] + +schema: + title: GCP BigQuery Dataset + description: A Google Cloud BigQuery dataset. Carries the project ID, dataset ID, + fully-qualified name (<project>.<dataset>), and location so downstream bundles + can reference the dataset for querying, loading, and exporting without hard-coding + project or dataset identifiers. + type: object + required: + - project_id + - dataset_id + - dataset_full_name + - location + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this BigQuery dataset + type: string + examples: + - my-gcp-project-123 + + dataset_id: + title: Dataset ID + description: BigQuery dataset identifier (letters, digits, underscores — no hyphens) + type: string + examples: + - my_analytics_dataset + + dataset_full_name: + title: Dataset Full Name + description: Fully-qualified BigQuery dataset name in <project>.<dataset> form. + Use this in SQL FROM clauses and bq CLI commands. + type: string + examples: + - my-gcp-project-123.my_analytics_dataset + + location: + title: Location + description: BigQuery location where the dataset is stored (region or multi-region). + Location is immutable after creation. 
+ type: string + examples: + - US + - us-central1 + - EU + + friendly_name: + title: Friendly Name + description: Human-readable display name for the dataset (optional) + type: + - string + - "null" + examples: + - My Analytics Dataset diff --git a/artifact-definitions/gcp-bigquery-table/massdriver.yaml b/artifact-definitions/gcp-bigquery-table/massdriver.yaml new file mode 100644 index 0000000..6c9b609 --- /dev/null +++ b/artifact-definitions/gcp-bigquery-table/massdriver.yaml @@ -0,0 +1,67 @@ +name: gcp-bigquery-table +label: GCP BigQuery Table +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (examples — adapt to your actual consumer): +# Downstream bundles that need read-only access bind roles/bigquery.dataViewer at the table level. +# Downstream bundles that need read+write access bind roles/bigquery.dataEditor at the table level. +# +# Terraform example — grant data viewer access to a workload service account: +# resource "google_bigquery_table_iam_member" "reader" { +# project = var.bigquery_table.project_id +# dataset_id = var.bigquery_table.dataset_id +# table_id = var.bigquery_table.table_id +# role = "roles/bigquery.dataViewer" +# member = "serviceAccount:<consumer-sa-email>" +# } +# +# Note: Table-level IAM bindings do not propagate up to the parent dataset. +# Dataset-level bindings DO propagate down to all tables. Prefer dataset-level +# bindings for broad access and table-level bindings for scoped isolation. +# +# Policy examples below (reader / writer) follow this same pattern. They are +# illustrative — the actual IAM member string comes from the consumer bundle's +# service account, not from this artifact. +exports: [] + +schema: + title: GCP BigQuery Table + description: A Google Cloud BigQuery table. Carries the project ID, dataset ID, + table ID, and fully-qualified table name (<project>.<dataset>.<table>) 
so + downstream bundles can reference the table for querying, loading, and Pub/Sub + subscription delivery without hard-coding identifiers. + type: object + required: + - project_id + - dataset_id + - table_id + - table_full_name + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this BigQuery table + type: string + examples: + - my-gcp-project-123 + + dataset_id: + title: Dataset ID + description: BigQuery dataset identifier that contains this table (letters, digits, underscores — no hyphens) + type: string + examples: + - my_analytics_dataset + + table_id: + title: Table ID + description: BigQuery table identifier within the dataset (letters, digits, underscores — no hyphens) + type: string + examples: + - messages + + table_full_name: + title: Table Full Name + description: Fully-qualified BigQuery table name in <project>.<dataset>.<table>
form. + Use this in SQL FROM clauses and bq CLI commands. + type: string + examples: + - my-gcp-project-123.my_analytics_dataset.messages diff --git a/artifact-definitions/gcp-cloud-run-service/massdriver.yaml b/artifact-definitions/gcp-cloud-run-service/massdriver.yaml new file mode 100644 index 0000000..dca5f65 --- /dev/null +++ b/artifact-definitions/gcp-cloud-run-service/massdriver.yaml @@ -0,0 +1,102 @@ +name: gcp-cloud-run-service +label: GCP Cloud Run Service +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern — downstream bundles that need to invoke this service: +# +# The `runtime_service_account_member` field carries the IAM principal string +# ("serviceAccount:<email>") for the service's runtime SA. Use it to grant +# the service's runtime SA access to other resources the service depends on. +# +# To allow an external caller (e.g., Pub/Sub push subscription, Cloud Scheduler) +# to invoke this Cloud Run service: +# +# resource "google_cloud_run_v2_service_iam_member" "invoker" { +# project = var.cloud_run_service.project_id +# location = var.cloud_run_service.location +# name = var.cloud_run_service.service_name +# role = "roles/run.invoker" +# member = "<caller-iam-member>" # e.g., serviceAccount:scheduler-sa@project.iam.gserviceaccount.com +# } +# +# Policy examples below are illustrative only — the actual IAM member comes from +# the calling bundle's service account, not from this artifact. +# +# invoker policy: +# role: roles/run.invoker +# member: <caller-iam-member> +# resource: projects/<project>/locations/<location>/services/<service> +exports: [] + +schema: + title: GCP Cloud Run Service + description: A deployed Google Cloud Run v2 service. Carries the project ID, + service name, HTTPS URL, region, latest ready revision name, and the runtime + service account identity so downstream bundles can invoke the service or grant + it additional permissions without hard-coding project or service identifiers. 
+ type: object + required: + - project_id + - service_name + - service_url + - location + - latest_ready_revision + - runtime_service_account_email + - runtime_service_account_member + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this Cloud Run service + type: string + examples: + - my-gcp-project-123 + + service_name: + title: Service Name + description: Cloud Run service name (short name, not fully-qualified resource path) + type: string + examples: + - my-api-service + + service_url: + title: Service URL + description: HTTPS URL where the service is reachable. For internal ingress + services, this URL is only reachable from within the VPC or via Cloud Load + Balancing. For all-ingress services, this is publicly reachable. + type: string + examples: + - https://my-api-service-abc123-uc.a.run.app + + location: + title: Location + description: GCP region where the Cloud Run service is deployed + type: string + examples: + - us-central1 + + latest_ready_revision: + title: Latest Ready Revision + description: Name of the most recent revision that is currently serving traffic. + Use this to pin a specific revision when configuring traffic splits or + rolling back to a known-good state. + type: string + examples: + - my-api-service-00001-abc + + runtime_service_account_email: + title: Runtime Service Account Email + description: "Email address of the GCP service account the Cloud Run service + runs as. Downstream bundles that need to grant this service access to other + resources bind IAM roles to this email using the serviceAccount: prefix." + type: string + examples: + - data-workload@my-gcp-project-123.iam.gserviceaccount.com + + runtime_service_account_member: + title: Runtime Service Account IAM Member + description: "The full IAM principal string for the runtime service account, + in 'serviceAccount:<email>' form. 
Use this directly as the member argument + in google_*_iam_member resources so callers do not have to construct it manually." + type: string + examples: + - serviceAccount:data-workload@my-gcp-project-123.iam.gserviceaccount.com diff --git a/artifact-definitions/gcp-landing-zone/massdriver.yaml b/artifact-definitions/gcp-landing-zone/massdriver.yaml new file mode 100644 index 0000000..73e57ee --- /dev/null +++ b/artifact-definitions/gcp-landing-zone/massdriver.yaml @@ -0,0 +1,144 @@ +name: gcp-landing-zone +label: GCP Landing Zone +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# Landing zone scope: +# This artifact represents the project-level governance layer — shared VPC, enabled APIs, +# billing budget reference, and a summary of the project-level IAM bindings and org-policy +# guardrails that the landing zone applied. +# +# It does NOT carry a workload service account. Each consumer bundle creates its own +# runtime SA and binds it to the resources it owns. Use the artifact policy comments in +# each data-resource artdef (gcp-pubsub-topic, gcp-bigquery-dataset, gcp-storage-bucket) +# as the canonical role-binding reference. +# +# The network fields allow downstream bundles to place resources in the shared VPC: +# subnet_self_link = var.landing_zone.network.primary_subnet.self_link +exports: [] + +schema: + title: GCP Landing Zone + description: Environment governance artifact — carries the shared VPC network, + enabled API list, billing budget reference, and an informational summary of + project-level IAM bindings and org-policy guardrails applied by the landing zone. + Downstream bundles connect to this instead of wiring network separately, and each + creates its own runtime service account for its workload identity. 
+ type: object + required: + - project_id + - network + - enabled_apis + - budget + properties: + project_id: + title: Project ID + description: GCP project identifier for this environment + type: string + examples: + - my-gcp-project-123 + + network: + title: Network + description: Shared VPC network for this environment + type: object + required: + - network_name + - network_self_link + - region + - primary_subnet + properties: + network_name: + title: Network Name + type: string + examples: + - data-platform-vpc + network_self_link: + title: Network Self Link + type: string + examples: + - https://www.googleapis.com/compute/v1/projects/my-project/global/networks/my-vpc + region: + title: Region + type: string + examples: + - us-central1 + primary_subnet: + title: Primary Subnet + type: object + required: + - name + - cidr + - self_link + properties: + name: + title: Name + type: string + cidr: + title: CIDR + type: string + self_link: + title: Self Link + type: string + + enabled_apis: + title: Enabled APIs + description: GCP service APIs enabled in this project by the landing zone + type: array + items: + type: string + examples: + - - compute.googleapis.com + - bigquery.googleapis.com + - run.googleapis.com + + budget: + title: Budget + description: Billing budget reference for this environment. enabled=false when no budget was configured. + type: object + required: + - enabled + properties: + enabled: + title: Budget Enabled + type: boolean + budget_name: + title: Budget Display Name + type: + - string + - "null" + billing_account_id: + title: Billing Account ID + type: + - string + - "null" + amount_usd: + title: Budget Amount (USD) + type: + - number + - "null" + + iam_bindings: + title: IAM Bindings + description: Informational summary of project-level IAM bindings applied by this + landing zone (e.g., human operators and groups). Each entry records the role + and member list as applied. 
This is NOT enforced by downstream bundles — it is + a read-only audit trail of what the landing zone provisioned. + type: array + items: + type: object + required: + - role + - member + properties: + role: + title: Role + type: string + examples: + - roles/viewer + - roles/bigquery.dataViewer + member: + title: Member + type: string + examples: + - user:alice@example.com + - group:analysts@example.com diff --git a/artifact-definitions/gcp-log-sink/massdriver.yaml b/artifact-definitions/gcp-log-sink/massdriver.yaml new file mode 100644 index 0000000..78471db --- /dev/null +++ b/artifact-definitions/gcp-log-sink/massdriver.yaml @@ -0,0 +1,68 @@ +name: gcp-log-sink +label: GCP Log Sink +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# Log sinks do not expose a useful IAM surface downstream. The writer_identity +# SA is Google-managed and grants are made ON the destination resource at sink +# creation time. Consumers of this artifact read sink metadata only — no IAM +# binding pattern is needed. +exports: [] + +schema: + title: GCP Log Sink + description: A Google Cloud Logging project sink. Carries the sink name, the + fully-qualified destination string, the Google-managed writer service account + identity, destination type (bigquery or gcs), and the owning project ID. + Downstream bundles can use writer_identity to grant additional access on the + destination resource if needed. + type: object + required: + - project_id + - sink_name + - destination + - writer_identity + - destination_type + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this log sink + type: string + examples: + - my-gcp-project-123 + + sink_name: + title: Sink Name + description: Cloud Logging sink resource name + type: string + examples: + - dataplat-dev-log-sink + + destination: + title: Destination + description: >- + Fully-qualified logging destination URI. 
For BigQuery: + bigquery.googleapis.com/projects/PROJECT/datasets/DATASET. For GCS: + storage.googleapis.com/BUCKET_NAME. + type: string + examples: + - bigquery.googleapis.com/projects/my-project/datasets/my_dataset + - storage.googleapis.com/my-logs-bucket + + writer_identity: + title: Writer Identity + description: IAM member string (including the serviceAccount prefix) of the Google-managed service account that Cloud Logging uses to + write log entries to the destination. This SA must be granted the + destination-appropriate role (bigquery.dataEditor for BigQuery, + storage.objectCreator for GCS). The bundle grants this automatically at + creation time. + type: string + examples: + - serviceAccount:p123456789-123456@gcp-sa-logging.iam.gserviceaccount.com + + destination_type: + title: Destination Type + description: Whether logs are routed to BigQuery or GCS + type: string + enum: + - bigquery + - gcs diff --git a/artifact-definitions/gcp-network/massdriver.yaml b/artifact-definitions/gcp-network/massdriver.yaml new file mode 100644 index 0000000..537e67f --- /dev/null +++ b/artifact-definitions/gcp-network/massdriver.yaml @@ -0,0 +1,190 @@ +name: gcp-network +label: GCP Network +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (for downstream bundles that need network access): +# When binding IAM roles to a GCP network artifact, use data.network_name or +# data.network_self_link to scope role bindings to the VPC level. For subnet-scoped +# bindings (e.g., roles/compute.networkUser), bind to data.primary_subnet.self_link or data.additional_subnets[*].self_link. +# Example gcloud command: +# gcloud projects add-iam-policy-binding $PROJECT \ +# --member="serviceAccount:$SA_EMAIL" \ +# --role="roles/compute.networkUser" \ +# --condition="expression=resource.name.startsWith('projects/$PROJECT/regions/$REGION/subnetworks/$SUBNET')" +exports: [] + +schema: + title: GCP Network + description: Region-scoped GCP VPC network with subnet configuration.
Captures the + core network topology including optional fields for customer-managed networks with + secondary ranges (GKE), private services access, Cloud NAT, and additional subnets. + type: object + required: + - project_id + - network_name + - network_self_link + - region + - primary_subnet + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this VPC + type: string + examples: + - my-gcp-project-123 + + network_name: + title: Network Name + description: Short name of the VPC network resource + type: string + examples: + - my-vpc + + network_self_link: + title: Network Self Link + description: Fully-qualified self-link URL of the VPC network + type: string + examples: + - https://www.googleapis.com/compute/v1/projects/my-project/global/networks/my-vpc + + region: + title: Region + description: GCP region where regional resources (subnets, NAT) are provisioned + type: string + examples: + - us-central1 + - us-east1 + - us-west1 + - us-east4 + - europe-west1 + + primary_subnet: + title: Primary Subnet + description: The main subnet for general workload placement + type: object + required: + - name + - cidr + - self_link + properties: + name: + title: Name + description: Short name of the subnetwork resource + type: string + examples: + - my-vpc-subnet-us-central1 + cidr: + title: CIDR + description: Primary IP address range of the subnet + type: string + examples: + - 10.0.0.0/20 + self_link: + title: Self Link + description: Fully-qualified self-link URL of the subnetwork + type: string + examples: + - https://www.googleapis.com/compute/v1/projects/my-project/regions/us-central1/subnetworks/my-subnet + + # Optional — present when this subnet has GKE secondary ranges + secondary_ranges: + title: Secondary Ranges + description: Named secondary IP ranges on the subnet, typically used for GKE + pods and services + type: object + properties: + pods_cidr: + title: Pods CIDR + description: Secondary range allocated for GKE pod 
IPs + type: string + examples: + - 10.1.0.0/16 + pods_range_name: + title: Pods Range Name + description: Name of the secondary range used for GKE pods + type: string + services_cidr: + title: Services CIDR + description: Secondary range allocated for GKE service cluster IPs + type: string + examples: + - 10.2.0.0/20 + services_range_name: + title: Services Range Name + description: Name of the secondary range used for GKE services + type: string + + # Optional — present when Private Services Access is configured (Cloud SQL, Memorystore, etc.) + private_services_access: + title: Private Services Access + description: RFC 1918 peering range reserved for Google-managed services such as + Cloud SQL and Memorystore + type: object + required: + - cidr + - peering_name + properties: + cidr: + title: CIDR + description: Address range allocated for Private Services Access peering + type: string + examples: + - 10.100.0.0/16 + peering_name: + title: Peering Name + description: Name of the VPC peering connection to servicenetworking.googleapis.com + type: string + examples: + - servicenetworking-googleapis-com + + # Optional — present when Cloud NAT is configured on this network + cloud_nat: + title: Cloud NAT + description: Cloud NAT configuration providing outbound internet access for + private instances + type: object + required: + - router_name + - nat_name + properties: + router_name: + title: Router Name + description: Name of the Cloud Router that hosts the NAT gateway + type: string + nat_name: + title: NAT Name + description: Name of the Cloud NAT resource + type: string + nat_ips: + title: NAT IPs + description: Static external IP addresses assigned to the NAT gateway, if any + type: array + items: + type: string + + # Optional — additional subnets beyond the primary (e.g. 
proxy-only, management) + additional_subnets: + title: Additional Subnets + description: Any subnets beyond the primary — proxy-only, management, or + purpose-specific ranges + type: array + items: + type: object + required: + - name + - cidr + - self_link + properties: + name: + title: Name + type: string + cidr: + title: CIDR + type: string + self_link: + title: Self Link + type: string + purpose: + title: Purpose + description: Subnet purpose (e.g. PRIVATE, REGIONAL_MANAGED_PROXY) + type: string diff --git a/artifact-definitions/gcp-pubsub-topic/massdriver.yaml b/artifact-definitions/gcp-pubsub-topic/massdriver.yaml new file mode 100644 index 0000000..26a488e --- /dev/null +++ b/artifact-definitions/gcp-pubsub-topic/massdriver.yaml @@ -0,0 +1,70 @@ +name: gcp-pubsub-topic +label: GCP Pub/Sub Topic +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (example — adapt to your actual resource): +# Downstream bundles that need to publish to this topic bind roles/pubsub.publisher. +# Downstream bundles that need to subscribe bind roles/pubsub.subscriber. +# +# Terraform example — grant publisher access to a workload service account: +# resource "google_pubsub_topic_iam_member" "publisher" { +# project = var.pubsub_topic.project_id +# topic = var.pubsub_topic.topic_name +# role = "roles/pubsub.publisher" +# member = "serviceAccount:${google_service_account.runtime.email}" +# } +# +# Publisher and subscriber bindings both follow this same pattern. +# The example is illustrative — the actual IAM member string comes from the consumer +# bundle's own service account, not from this artifact. +exports: [] + +schema: + title: GCP Pub/Sub Topic + description: A Google Cloud Pub/Sub topic. Carries the fully-qualified topic + ID, short topic name, and project context so downstream bundles can publish + messages or attach subscriptions without hard-coding project or topic IDs.
+ Optionally includes dead-letter queue (DLQ) topic references. + type: object + required: + - project_id + - topic_name + - topic_id + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this Pub/Sub topic + type: string + examples: + - my-gcp-project-123 + + topic_name: + title: Topic Name + description: Short name of the Pub/Sub topic (without project prefix) + type: string + examples: + - my-data-pipeline-events + + topic_id: + title: Topic ID + description: Fully-qualified Pub/Sub topic resource name + type: string + examples: + - projects/my-gcp-project-123/topics/my-data-pipeline-events + + # Optional — only present when the topic was provisioned with a DLQ + dlq_topic_name: + title: DLQ Topic Name + description: Short name of the dead-letter queue topic (present only when DLQ + is enabled on the main topic) + type: string + examples: + - my-data-pipeline-events-dlq + + dlq_topic_id: + title: DLQ Topic ID + description: Fully-qualified resource name of the dead-letter queue topic + (present only when DLQ is enabled on the main topic) + type: string + examples: + - projects/my-gcp-project-123/topics/my-data-pipeline-events-dlq diff --git a/artifact-definitions/gcp-storage-bucket/massdriver.yaml b/artifact-definitions/gcp-storage-bucket/massdriver.yaml new file mode 100644 index 0000000..12225d1 --- /dev/null +++ b/artifact-definitions/gcp-storage-bucket/massdriver.yaml @@ -0,0 +1,80 @@ +name: gcp-storage-bucket +label: GCP Storage Bucket +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (example — adapt to your actual resource): +# Downstream bundles that need to read objects bind roles/storage.objectViewer. +# Downstream bundles that need to read+write objects bind roles/storage.objectUser. +# Downstream bundles that need full admin bind roles/storage.admin. 
+# +# Terraform example — grant object read access to a workload service account: +# resource "google_storage_bucket_iam_member" "reader" { +# bucket = var.storage_bucket.bucket_name +# role = "roles/storage.objectViewer" +# member = "serviceAccount:${google_service_account.runtime.email}" +# } +# +# The object reader / object writer / admin bindings all follow this same +# pattern. The example is illustrative — the actual IAM member string comes from the +# consumer bundle's own service account, not from this artifact. +exports: [] + +schema: + title: GCP Storage Bucket + description: A Google Cloud Storage bucket. Carries the bucket name, canonical + URL (gs:// form), self-link (REST API form), location, storage class, and owning project so + downstream bundles can read/write objects without hard-coding bucket or project + identifiers. + type: object + required: + - project_id + - bucket_name + - bucket_url + - bucket_self_link + - location + - storage_class + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this storage bucket + type: string + examples: + - my-gcp-project-123 + + bucket_name: + title: Bucket Name + description: Globally-unique GCS bucket name + type: string + examples: + - my-data-platform-events-abc123 + + bucket_url: + title: Bucket URL + description: Canonical GCS URL for use with gsutil and client libraries (gs:// form) + type: string + examples: + - gs://my-data-platform-events-abc123 + + bucket_self_link: + title: Bucket Self Link + description: GCS REST API resource URL (https://www.googleapis.com/storage/v1/b/ form) + type: string + examples: + - https://www.googleapis.com/storage/v1/b/my-data-platform-events-abc123 + + location: + title: Location + description: GCS location where the bucket is deployed (region, dual-region, or multi-region) + type: string + examples: + - US + - us-central1 + - NAM4 + + storage_class: + title: Storage Class + description: GCS storage class of the bucket + type: string +
examples: + - STANDARD + - COLDLINE diff --git a/artifact-definitions/gcp-vertex-workbench/massdriver.yaml b/artifact-definitions/gcp-vertex-workbench/massdriver.yaml new file mode 100644 index 0000000..b9d45bf --- /dev/null +++ b/artifact-definitions/gcp-vertex-workbench/massdriver.yaml @@ -0,0 +1,98 @@ +name: gcp-vertex-workbench +label: GCP Vertex AI Workbench Instance +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern — granting external access to this Workbench instance: +# +# Workbench instances are single-user data-science environments. The primary IAM +# surface is the instance service account (instance_service_account_member), which +# is what you bind roles TO — not roles ON the Workbench instance itself. +# +# To grant an external principal read-only visibility to the Workbench instance +# (e.g., for audit or platform-admin purposes): +# +# resource "google_notebooks_instance_iam_member" "viewer" { +# project = var.vertex_workbench.project_id +# location = var.vertex_workbench.location +# name = var.vertex_workbench.instance_name +# role = "roles/notebooks.viewer" +# member = "user:alice@example.com" +# } +# +# The instance_service_account_member field carries the IAM principal string +# ("serviceAccount:") for the instance SA. Use it to grant the Workbench +# instance access to downstream resources (e.g., dataset viewer, bucket reader) +# without hard-coding the SA email. +# +# Example — granting a connected dataset access to the instance SA: +# resource "google_bigquery_dataset_iam_member" "reader" { +# dataset_id = var.bigquery_dataset.dataset_id +# role = "roles/bigquery.dataViewer" +# member = var.vertex_workbench.instance_service_account_member +# } +exports: [] + +schema: + title: GCP Vertex AI Workbench Instance + description: A deployed Vertex AI Workbench instance. 
Carries the project ID, + instance name, zone, JupyterLab proxy URL, and the instance service account + identity so downstream bundles can grant the Workbench access to data resources + without hard-coding project or service account identifiers. + type: object + required: + - project_id + - instance_name + - location + - proxy_url + - instance_service_account_email + - instance_service_account_member + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this Workbench instance + type: string + examples: + - my-gcp-project-123 + + instance_name: + title: Instance Name + description: Workbench instance name (short name used in gcloud commands) + type: string + examples: + - my-workbench-instance + + location: + title: Location (Zone) + description: GCP zone where the Workbench instance is deployed. Workbench + instances are zonal resources — location is a zone (e.g., us-central1-a), + not a region. + type: string + examples: + - us-central1-a + + proxy_url: + title: JupyterLab Proxy URL + description: The HTTPS proxy URL to access the JupyterLab interface for this + Workbench instance. Populated after the instance is running. May be empty + while the instance is starting or if proxy access is disabled. + type: string + examples: + - https://abc123-dot-us-central1.notebooks.googleusercontent.com/ + + instance_service_account_email: + title: Instance Service Account Email + description: Email address of the GCP service account this Workbench instance + runs as. Downstream bundles bind IAM roles to this email to grant the + Workbench access to data resources. + type: string + examples: + - my-workbench@my-gcp-project-123.iam.gserviceaccount.com + + instance_service_account_member: + title: Instance Service Account IAM Member + description: "The full IAM principal string for the instance service account, + in 'serviceAccount:' form. 
Use this directly as the member argument + in google_*_iam_member resources so callers do not have to construct it manually." + type: string + examples: + - serviceAccount:my-workbench@my-gcp-project-123.iam.gserviceaccount.com diff --git a/artifact-definitions/gcp-vpc-connector/massdriver.yaml b/artifact-definitions/gcp-vpc-connector/massdriver.yaml new file mode 100644 index 0000000..b07d6c4 --- /dev/null +++ b/artifact-definitions/gcp-vpc-connector/massdriver.yaml @@ -0,0 +1,87 @@ +name: gcp-vpc-connector +label: GCP Serverless VPC Access Connector +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (for bundles that consume this artifact): +# +# VPC Access connectors do not have their own IAM surface — access is governed by +# the VPC and subnet IAM. Consumers reference the connector by its fully-qualified +# name on the Cloud Run / Cloud Functions service. Example Terraform: +# +# resource "google_cloud_run_v2_service" "main" { +# template { +# vpc_access { +# connector = var.vpc_connector.connector_id +# egress = var.vpc_connector.egress_settings +# } +# } +# } +# +# To let a service account use a connector in another project, that SA needs +# `roles/vpcaccess.user` on the connector's project. +exports: [] + +schema: + title: GCP Serverless VPC Access Connector + description: An existing or provisioned Serverless VPC Access connector. Consumer + bundles (Cloud Run, Cloud Functions) reference connector_id to route egress + traffic through the VPC. + type: object + required: + - project_id + - region + - name + - connector_id + properties: + project_id: + title: Project ID + description: GCP project that owns the connector + type: string + examples: + - my-gcp-project-123 + + region: + title: Region + description: GCP region where the connector is deployed. Must match the region + of the consuming service. 
+ type: string + examples: + - us-central1 + + name: + title: Connector Name + description: Short name of the connector + type: string + examples: + - my-vpc-connector + + connector_id: + title: Connector Resource ID + description: Fully-qualified connector resource name used on the consuming + service's vpc_access.connector field + type: string + examples: + - projects/my-gcp-project-123/locations/us-central1/connectors/my-vpc-connector + + network: + title: VPC Network + description: Name of the VPC network the connector is attached to + type: string + examples: + - default + + ip_cidr_range: + title: IP CIDR Range + description: /28 CIDR range reserved for the connector's internal addresses + type: string + examples: + - 10.8.0.0/28 + + egress_settings: + title: Default Egress Settings + description: Suggested egress setting for services using this connector. + Consumers may override. + type: string + enum: + - ALL_TRAFFIC + - PRIVATE_RANGES_ONLY diff --git a/artifact-definitions/gcp-workload-identity/massdriver.yaml b/artifact-definitions/gcp-workload-identity/massdriver.yaml new file mode 100644 index 0000000..69c973d --- /dev/null +++ b/artifact-definitions/gcp-workload-identity/massdriver.yaml @@ -0,0 +1,63 @@ +name: gcp-workload-identity +label: GCP Workload Identity +icon: https://raw.githubusercontent.com/massdriver-cloud/massdriver-catalog/refs/heads/main/platforms/gcp/icon.png + +# IAM Role Binding Pattern (for downstream bundles that consume this artifact): +# Grant this service account access to your resource by binding an IAM role. 
+# Example: grant BigQuery Data Viewer at the project level: +# gcloud projects add-iam-policy-binding $PROJECT_ID \ +# --member="serviceAccount:${data.service_account_email}" \ +# --role="roles/bigquery.dataViewer" +# +# For resource-scoped bindings (preferred — least privilege; illustrative syntax, verify against the current CLI reference): +# gcloud bigquery datasets add-iam-policy-binding $DATASET \ +# --member="serviceAccount:${data.service_account_email}" \ +# --role="roles/bigquery.dataEditor" +# +# In Terraform use google_project_iam_member or a resource-scoped google_*_iam_member: +# resource "google_project_iam_member" "workload" { +# project = var.workload_identity.project_id +# role = "roles/run.invoker" +# member = "serviceAccount:${var.workload_identity.service_account_email}" +# } +exports: [] + +schema: + title: GCP Workload Identity + description: Runtime service account identity for workloads in this environment. + Downstream bundles (Cloud Run, Vertex Workbench, etc.) bind IAM roles to this + service account to grant it access to their resources (BigQuery, GCS, Pub/Sub, etc.).
+ type: object + required: + - project_id + - service_account_email + - service_account_id + - service_account_name + properties: + project_id: + title: Project ID + description: GCP project identifier that owns this service account + type: string + examples: + - my-gcp-project-123 + + service_account_email: + title: Service Account Email + description: Email address of the service account; prefix it with 'serviceAccount:' to form the IAM member string + type: string + examples: + - data-platform-sa@my-project.iam.gserviceaccount.com + + service_account_id: + title: Service Account ID + description: Unique numeric ID of the service account + type: string + examples: + - "123456789012345678901" + + service_account_name: + title: Service Account Name + description: Fully-qualified resource name (projects/{project}/serviceAccounts/{email}) + type: string + examples: + - projects/my-project/serviceAccounts/data-platform-sa@my-project.iam.gserviceaccount.com diff --git a/bundles/gcp-bigquery-dataset/README.md b/bundles/gcp-bigquery-dataset/README.md new file mode 100644 index 0000000..7e09213 --- /dev/null +++ b/bundles/gcp-bigquery-dataset/README.md @@ -0,0 +1,81 @@ +# gcp-bigquery-dataset + +Google Cloud BigQuery dataset with configurable location, default table expiration, and delete protection. Use this bundle to provision a managed analytics dataset for data platform workloads — Cloud Run pipelines, Vertex Workbench notebooks, Dataflow jobs, and ad-hoc SQL analytics.
+ +## Use Cases + +- Centralized analytics dataset consumed by multiple downstream services with scoped IAM +- Dev/staging datasets with automatic table expiration to control storage cost growth +- Production datasets with delete protection to prevent accidental data loss + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_bigquery_dataset.main` | BigQuery dataset | Location, expiration, and delete protection set at provision time; Google-managed encryption | + +This bundle does NOT create tables, subscriptions, or workload IAM bindings. Consumer bundles (e.g., `gcp-bigquery-table`, `gcp-cloud-run-service`, `gcp-vertex-workbench`) create their own resources and bind the appropriate roles on this dataset when connected on the canvas. + +## Connections + +| Connection | Required | Artifact Type | How It Is Used | +|---|---|---|---| +| `gcp_authentication` | Yes | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `landing_zone` | Yes | `catalog-demo/gcp-landing-zone` | Provides `project_id` for resource placement | + +## Artifact Produced + +**Artifact type:** `catalog-demo/gcp-bigquery-dataset` + +| Field | Type | Description | +|---|---|---| +| `project_id` | string | GCP project ID that owns the dataset | +| `dataset_id` | string | BigQuery dataset identifier (letters, digits, underscores) | +| `dataset_full_name` | string | Fully-qualified name in `.` form — use directly in SQL `FROM` clauses | +| `location` | string | BigQuery location where the dataset is stored | +| `friendly_name` | string or null | Human-readable display name if set; null otherwise | + +Consumer bundles bind IAM roles using `dataset_id` and `project_id` from this artifact. 
Example patterns: + +```hcl +# Read/write access (Cloud Run workers) +resource "google_bigquery_dataset_iam_member" "runtime_editor" { + project = var.bigquery_dataset.project_id + dataset_id = var.bigquery_dataset.dataset_id + role = "roles/bigquery.dataEditor" + member = "serviceAccount:${google_service_account.runtime.email}" +} + +# Read-only access (Vertex Workbench notebooks) +resource "google_bigquery_dataset_iam_member" "dataset_viewer" { + project = var.bigquery_dataset.project_id + dataset_id = var.bigquery_dataset.dataset_id + role = "roles/bigquery.dataViewer" + member = "serviceAccount:${google_service_account.instance.email}" +} +``` + +## Compliance + +### Checkov skips + +| Check | Reason | +|---|---| +| `CKV_GCP_81` | Requires CMEK on all BigQuery datasets. CMEK is intentionally out of scope for this bundle — Google-managed encryption is used. Checkov fires this check whenever a `default_encryption_configuration` block is absent. If CMEK is required, use a separate bundle with a KMS key connection. | + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- `bigquery.googleapis.com` must be enabled in the landing zone before deploying. Add it to `enabled_apis` in the `gcp-landing-zone` package. +- The `gcp_authentication` credential has `bigquery.admin` or equivalent IAM on the project. +- `dataset_id` is immutable after creation. Changing it requires destroying and recreating the dataset — all data is lost unless exported first. +- `default_table_expiration_days` applies only to tables created after the setting is applied. Existing tables are not affected. 
+ +## Presets + +| Preset | Location | Default Table Expiration | Delete Protection | +|---|---|---|---| +| Dev | US | 30 days | Off | +| Staging | US | 90 days | Off | +| Production | US | None | On | diff --git a/bundles/gcp-bigquery-dataset/massdriver.yaml b/bundles/gcp-bigquery-dataset/massdriver.yaml new file mode 100644 index 0000000..cde6803 --- /dev/null +++ b/bundles/gcp-bigquery-dataset/massdriver.yaml @@ -0,0 +1,139 @@ +name: gcp-bigquery-dataset +description: Google Cloud BigQuery dataset with configurable location, default table + expiration, and delete protection. Enforces dataset-level IAM access control as + a non-negotiable baseline. Does not create workload IAM bindings — consumer bundles bind + their own service accounts on the dataset. Emits a gcp-bigquery-dataset artifact for downstream Cloud + Run, Vertex Workbench, and query workloads. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-bigquery-dataset +version: 0.1.1 + +params: + required: + - dataset_id + - location + - delete_protection + examples: + - __name: Dev + dataset_id: dev_dataset + location: US + default_table_expiration_days: 30 + delete_protection: false + - __name: Staging + dataset_id: staging_dataset + location: US + default_table_expiration_days: 90 + delete_protection: false + - __name: Production + dataset_id: prod_dataset + location: US + delete_protection: true + + properties: + dataset_id: + title: Dataset ID + description: BigQuery dataset identifier. Must contain only letters, digits, and + underscores — no hyphens or spaces. Maximum 1024 characters. Cannot be changed + after creation without destroying and recreating the dataset. + type: string + $md.immutable: true + pattern: "^[a-zA-Z0-9_]+$" + + friendly_name: + title: Friendly Name + description: Optional human-readable display name shown in the BigQuery UI. Does + not affect queries or API access — the dataset_id is always the identifier.
+ type: string + + description: + title: Description + description: Optional free-text description of the dataset's purpose, data classification, + or owner team. Stored in BigQuery metadata and visible in the console. + type: string + + location: + title: Location + description: BigQuery location where the dataset and its tables are stored. Multi-regions + (US, EU) provide highest availability. Single regions co-locate data with compute + for lower query latency. Location is immutable — changing it requires destroying + and recreating the dataset (you will lose all data unless exported first). + type: string + $md.immutable: true + default: US + enum: + - US + - EU + - us-central1 + - us-east1 + - us-east4 + - us-west1 + - us-west2 + - europe-west1 + - europe-west2 + - europe-west3 + - europe-west4 + - asia-east1 + - asia-northeast1 + - asia-south1 + - asia-southeast1 + - australia-southeast1 + - southamerica-east1 + + default_table_expiration_days: + title: Default Table Expiration (days) + description: Number of days after creation that new tables in this dataset will + be automatically deleted. Applies only to tables created after this setting + is applied — existing tables are not affected. Set to 0 or omit for no automatic + expiration. Recommended for dev and staging to prevent unbounded cost growth. + type: integer + minimum: 0 + default: 0 + + delete_protection: + title: Enable Delete Protection + description: When enabled, the dataset cannot be destroyed until delete protection + is first disabled (a two-step destroy). Prevents accidental data loss in production. + Strongly recommended for production datasets. Disable only immediately before + a planned decommission. 
+ type: boolean + default: false + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + +artifacts: + required: + - bigquery_dataset + properties: + bigquery_dataset: + $ref: catalog-demo/gcp-bigquery-dataset + title: GCP BigQuery Dataset + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - dataset_id + - friendly_name + - description + - location + - default_table_expiration_days + - delete_protection + - "*" + properties: + delete_protection: + ui:widget: checkbox diff --git a/bundles/gcp-bigquery-dataset/operator.md b/bundles/gcp-bigquery-dataset/operator.md new file mode 100644 index 0000000..5497daf --- /dev/null +++ b/bundles/gcp-bigquery-dataset/operator.md @@ -0,0 +1,98 @@ +--- +templating: mustache +--- + +# GCP BigQuery Dataset — Operator Runbook + +## Non-obvious constraints + +**Dataset ID is immutable.** `dataset_id` cannot be changed in-place. To rename: export all tables, destroy the package, reprovision with the new ID, reload from GCS. Treat the dataset ID as permanent. + +**Location is immutable.** Datasets cannot be moved between regions after creation. To change location: export all tables (`bq extract` to GCS), destroy the package, reprovision in the new location, reload. Budget for data transfer costs and downtime. + +**`default_table_expiration_ms` applies to NEW tables only.** Changing this on an existing dataset does not expire or modify existing tables. To set expiration on an existing table, update it directly via `bq update`. + +**Delete protection requires a two-step destroy.** When `delete_protection = true`, the destroy will fail. To decommission: +1. 
Set `delete_protection = false` in the package config and deploy. +2. Then run the destroy. + +**Dataset-level IAM propagates to all tables, current and future.** For row-level or table-level isolation, use BigQuery row-level security policies or bind IAM at the table level separately. + +**Consumer bundles are responsible for their own IAM bindings.** Consumer bundles bind their own service accounts to this dataset. If a service can't query or load data, the IAM binding is missing from the consumer bundle — not from here. + +**Cross-region queries are not supported.** BigQuery cannot join tables in different regions in a single query. Use Storage Transfer Service or BigQuery Data Transfer Service to replicate data first. + +## Troubleshooting + +**Permission denied on dataset access.** +```bash +bq get-iam-policy {{artifacts.bigquery_dataset.dataset_full_name}} +``` +The required member should have `roles/bigquery.dataEditor` for read/write or `roles/bigquery.dataViewer` for read-only. If the binding is absent, redeploy the consumer bundle with the dataset wired on the canvas. + +**Quota exceeded on concurrent jobs or daily bytes scanned.** +BigQuery per-project quotas are not manageable through this bundle. Check the BigQuery quota dashboard in the GCP console and request increases if needed. + +**Streaming insert rows not expiring as expected.** +Rows inserted via the streaming API have a delay before table expiration recalculation applies. Batch loads have no such lag. + +**Deploy fails with "bigquery.googleapis.com has not been used in project."** +Add `bigquery.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. + +**Table schema mismatch or load failure.** +```bash +bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}}. 
+``` + +## Day-2 operations + +**Setting expiration on an existing table** (default expiration doesn't backfill): +```bash +# Set expiration 30 days from now (--expiration is a lifetime in seconds from now, not an epoch timestamp) +EXPIRY=$((30 * 24 * 60 * 60)) +bq update --expiration=$EXPIRY {{artifacts.bigquery_dataset.dataset_full_name}}.TABLE_ID + +# Remove expiration from a table +bq update --expiration=0 {{artifacts.bigquery_dataset.dataset_full_name}}.TABLE_ID +``` + +**Exporting all tables before destroying the dataset:** +```bash +for TABLE in $(bq ls --format=csv {{artifacts.bigquery_dataset.dataset_full_name}} | tail -n +2 | cut -d, -f1); do + bq extract \ + --destination_format=NEWLINE_DELIMITED_JSON \ + {{artifacts.bigquery_dataset.dataset_full_name}}.$TABLE \ + gs://BUCKET_NAME/{{artifacts.bigquery_dataset.dataset_id}}/$TABLE/*.jsonl +done +``` + +**Granting read-only access to another principal** (outside Terraform — overwritten on next apply): +```bash +bq add-iam-policy-binding \ + --member="serviceAccount:SERVICE_ACCOUNT_EMAIL" \ + --role="roles/bigquery.dataViewer" \ + {{artifacts.bigquery_dataset.dataset_full_name}} +``` + +## Useful commands + +```bash +# Show dataset metadata (location, expiration, labels) +bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}} + +# List tables in the dataset +bq ls {{artifacts.bigquery_dataset.dataset_full_name}} + +# Show IAM policy on the dataset +bq get-iam-policy {{artifacts.bigquery_dataset.dataset_full_name}} + +# Show a specific table's schema and metadata +bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}}.TABLE_ID + +# Check a table's current expiration time +bq show --format=prettyjson {{artifacts.bigquery_dataset.dataset_full_name}}.TABLE_ID
| jq '.expirationTime'
+
+# Run an ad-hoc query (billed to project)
+bq query --project_id={{artifacts.bigquery_dataset.project_id}} \
+  'SELECT COUNT(*) FROM `{{artifacts.bigquery_dataset.dataset_full_name}}.TABLE_ID`'
+```
diff --git a/bundles/gcp-bigquery-dataset/src/.checkov.yml b/bundles/gcp-bigquery-dataset/src/.checkov.yml
new file mode 100644
index 0000000..ab0bb33
--- /dev/null
+++ b/bundles/gcp-bigquery-dataset/src/.checkov.yml
@@ -0,0 +1,8 @@
+skip-check:
+  # CKV_GCP_81: Ensure Big Query Datasets are encrypted with Customer Supplied Encryption Keys (CSEK)
+  # CMEK is intentionally out of scope for this bundle. All datasets use Google-managed
+  # encryption, which is appropriate for the workloads this bundle targets. Checkov
+  # fires this check whenever a default_encryption_configuration block is absent,
+  # making it a false positive here. If CMEK is required for a specific workload,
+  # a separate bundle with a KMS connection should be used.
+  - CKV_GCP_81
diff --git a/bundles/gcp-bigquery-dataset/src/artifacts.tf b/bundles/gcp-bigquery-dataset/src/artifacts.tf
new file mode 100644
index 0000000..7ddc0d4
--- /dev/null
+++ b/bundles/gcp-bigquery-dataset/src/artifacts.tf
@@ -0,0 +1,13 @@
+# BigQuery dataset artifact — matches catalog-demo/gcp-bigquery-dataset schema.
+
+resource "massdriver_artifact" "bigquery_dataset" {
+  field = "bigquery_dataset"
+  name  = "GCP BigQuery Dataset ${local.name_prefix}"
+  artifact = jsonencode({
+    project_id        = local.project_id
+    dataset_id        = google_bigquery_dataset.main.dataset_id
+    dataset_full_name = "${local.project_id}.${google_bigquery_dataset.main.dataset_id}"
+    location          = google_bigquery_dataset.main.location
+    friendly_name     = google_bigquery_dataset.main.friendly_name != "" ? google_bigquery_dataset.main.friendly_name : null
+  })
+}
diff --git a/bundles/gcp-bigquery-dataset/src/main.tf b/bundles/gcp-bigquery-dataset/src/main.tf
new file mode 100644
index 0000000..4503e24
--- /dev/null
+++ b/bundles/gcp-bigquery-dataset/src/main.tf
@@ -0,0 +1,74 @@
+terraform {
+  required_version = ">= 1.3"
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = "~> 6.0"
+    }
+    massdriver = {
+      source  = "massdriver-cloud/massdriver"
+      version = "~> 1.3"
+    }
+  }
+}
+
+provider "google" {
+  project     = var.gcp_authentication.project_id
+  credentials = jsonencode(var.gcp_authentication)
+}
+
+locals {
+  project_id  = var.landing_zone.project_id
+  name_prefix = var.md_metadata.name_prefix
+  dataset_id  = var.dataset_id
+
+  # Convert days → milliseconds for the BigQuery API. BigQuery requires ms.
+  # 0 or null input means "no expiration" → pass null to terraform resource.
+  default_table_expiration_ms = (
+    var.default_table_expiration_days != null && var.default_table_expiration_days > 0
+    ? var.default_table_expiration_days * 24 * 60 * 60 * 1000
+    : null
+  )
+}
+
+# ─── BigQuery Dataset ──────────────────────────────────────────────────────────
+
+resource "google_bigquery_dataset" "main" {
+  project    = local.project_id
+  dataset_id = local.dataset_id
+  location   = var.location
+
+  friendly_name = var.friendly_name
+  description   = var.description
+
+  # ── Default table expiration ─────────────────────────────────────────────────
+  # Only applies to NEW tables created after this setting is applied. Existing
+  # tables in the dataset are NOT retroactively expired. Set to null for no
+  # automatic expiration (recommended for production).
+  default_table_expiration_ms = local.default_table_expiration_ms
+
+  # ── Delete protection ────────────────────────────────────────────────────────
+  # When delete_protection is true, destroying a dataset that still contains tables
+  # fails (an empty dataset is still removed). To decommission, set the flag to false
+  # and re-apply first. Default is false so non-prod environments tear down freely.
+  delete_contents_on_destroy = !var.delete_protection
+
+  # Google-managed encryption is used for all datasets provisioned by this bundle.
+  # CMEK is intentionally out of scope — see src/.checkov.yml for CKV_GCP_81 skip rationale.
+
+  labels = var.md_metadata.default_tags
+}
+
+# ─── No workload IAM binding here ────────────────────────────────────────────
+# BigQuery datasets do not own a runtime identity. The landing zone no longer
+# provides a shared workload SA. Consumer bundles (e.g. gcp-cloud-run-service)
+# create their OWN service account and the Cloud Run bundle grants dataEditor
+# access on this dataset when connected on the canvas.
+#
+# Artifact policy pattern — grant a consumer's SA data editor access:
+# resource "google_bigquery_dataset_iam_member" "runtime_editor" {
+#   project    = var.bigquery_dataset.project_id
+#   dataset_id = var.bigquery_dataset.dataset_id
+#   role       = "roles/bigquery.dataEditor"
+#   member     = "serviceAccount:RUNTIME_SERVICE_ACCOUNT_EMAIL"
+# }
diff --git a/bundles/gcp-bigquery-dataset/src/variables.tf b/bundles/gcp-bigquery-dataset/src/variables.tf
new file mode 100644
index 0000000..50399ba
--- /dev/null
+++ b/bundles/gcp-bigquery-dataset/src/variables.tf
@@ -0,0 +1,74 @@
+variable "md_metadata" {
+  type = object({
+    name_prefix  = string
+    default_tags = optional(map(string), {})
+  })
+}
+
+variable "gcp_authentication" {
+  type = object({
+    type                        = string
+    project_id                  = string
+    private_key_id              = string
+    private_key                 = string
+    client_email                = string
+    client_id                   = string
+    auth_uri                    = string
+    token_uri                   = string
+    auth_provider_x509_cert_url = string
+    client_x509_cert_url        = string
+  })
+  sensitive = true
+}
+
+variable "landing_zone" {
+  type = object({
+    project_id = string
+    network = object({
+      network_name      = string
+      network_self_link = string
+      region            = string
+      primary_subnet = object({
+        name      = string
+        cidr      = string
+        self_link = string
+      })
+    })
+    enabled_apis = list(string)
+    budget = object({
+      enabled            = bool
+      budget_name        = optional(string)
+      billing_account_id = optional(string)
+      amount_usd         = optional(number)
+    })
+  })
+}
+
+variable "dataset_id" {
+  type = string
+}
+
+variable "friendly_name" {
+  type    = string
+  default = null
+}
+
+variable "description" {
+  type    = string
+  default = null
+}
+
+variable "location" {
+  type    = string
+  default = "US"
+}
+
+variable "default_table_expiration_days" {
+  type    = number
+  default = 0
+}
+
+variable "delete_protection" {
+  type    = bool
+  default = false
+}
diff --git a/bundles/gcp-bigquery-table/README.md b/bundles/gcp-bigquery-table/README.md
new file mode 100644
index 0000000..68c5b14
--- /dev/null
+++ b/bundles/gcp-bigquery-table/README.md
@@
-0,0 +1,90 @@ +# gcp-bigquery-table + +Google Cloud BigQuery table with configurable schema and optional Pub/Sub subscription delivery. Use this bundle to provision a managed table inside an existing BigQuery dataset — with or without a Pub/Sub subscription routing messages into it. + +## Use Cases + +- Pub/Sub-to-BigQuery pipeline: wire a topic to this table and messages land automatically +- Custom-schema analytics table: define your own BigQuery schema JSON and deploy +- Production tables with deletion protection to prevent accidental data loss + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_bigquery_table.main` | BigQuery table | Schema is either Pub/Sub-compatible (5 standard columns) or user-provided JSON | +| `google_bigquery_table_iam_member.pubsub_service_agent_data_editor` | IAM binding | Created only when a Pub/Sub topic is wired. Grants the Pub/Sub service agent `roles/bigquery.dataEditor` on this table. | +| `google_bigquery_table_iam_member.pubsub_service_agent_metadata_viewer` | IAM binding | Created only when a Pub/Sub topic is wired. Grants the Pub/Sub service agent `roles/bigquery.metadataViewer` on this table. | +| `google_pubsub_subscription.bigquery` | Pub/Sub subscription | Created only when a Pub/Sub topic is wired. Delivers messages from the topic into this table. | + +IAM bindings are table-scoped (not dataset-wide) for least privilege. The bindings are removed when the `pubsub_topic` connection is unwired and the bundle is redeployed. 
+ +## Connections + +| Connection | Required | Artifact Type | How It Is Used | +|---|---|---|---| +| `gcp_authentication` | Yes | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `bigquery_dataset` | Yes | `catalog-demo/gcp-bigquery-dataset` | Provides `project_id` and `dataset_id` for table placement | +| `pubsub_topic` | No | `catalog-demo/gcp-pubsub-topic` | When wired, creates a Pub/Sub BigQuery subscription that delivers messages into this table | + +## Schema Modes + +**`pubsub_default`** (recommended when wiring a Pub/Sub topic): Creates the table with five standard Pub/Sub columns: +- `subscription_name STRING` — name of the subscription that delivered the message +- `message_id STRING` — unique message identifier assigned by Pub/Sub +- `publish_time TIMESTAMP` — time the message was published to the topic +- `data STRING` — message payload (base64-decoded when `use_topic_schema = false`) +- `attributes JSON` — key-value attributes attached to the message + +**`custom_schema`**: Provide the full BigQuery schema as a JSON array in the `schema_json` parameter. Each field descriptor requires at minimum `name` and `type`. Example: +```json +[ + {"name": "event_type", "type": "STRING", "mode": "NULLABLE"}, + {"name": "payload", "type": "JSON", "mode": "NULLABLE"}, + {"name": "created_at", "type": "TIMESTAMP", "mode": "NULLABLE"} +] +``` + +## Artifact Produced + +**Artifact type:** `catalog-demo/gcp-bigquery-table` + +| Field | Type | Description | +|---|---|---| +| `project_id` | string | GCP project ID that owns the table | +| `dataset_id` | string | BigQuery dataset containing this table | +| `table_id` | string | BigQuery table identifier | +| `table_full_name` | string | Fully-qualified name in `PROJECT_ID.DATASET_ID.TABLE_ID
` form — use directly in SQL `FROM` clauses | + +Consumer bundles bind IAM roles using the table fields from this artifact. Example: + +```hcl +resource "google_bigquery_table_iam_member" "reader" { + project = var.bigquery_table.project_id + dataset_id = var.bigquery_table.dataset_id + table_id = var.bigquery_table.table_id + role = "roles/bigquery.dataViewer" + member = "serviceAccount:${google_service_account.runtime.email}" +} +``` + +## Compliance + +No Checkov checks are skipped. All findings in this bundle are resolved in Terraform directly. + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- `bigquery.googleapis.com` must be enabled in the landing zone before deploying. +- The `gcp_authentication` credential has `bigquery.admin` or equivalent IAM on the project. +- `table_id` is immutable after creation. Changing it requires destroying and recreating the table — all data is lost unless exported first. +- When wiring a `pubsub_topic`, `pubsub.googleapis.com` must also be enabled in the project. +- The BigQuery subscription target table (this table) must exist before Pub/Sub can validate the subscription. This bundle creates the table first, so order-of-operations is handled automatically. + +## Presets + +| Preset | Schema Mode | Deletion Protection | +|---|---|---| +| Pub/Sub Default | pubsub_default | On | +| Custom Schema | custom_schema | On | diff --git a/bundles/gcp-bigquery-table/massdriver.yaml b/bundles/gcp-bigquery-table/massdriver.yaml new file mode 100644 index 0000000..cd9a38c --- /dev/null +++ b/bundles/gcp-bigquery-table/massdriver.yaml @@ -0,0 +1,180 @@ +name: gcp-bigquery-table +description: Google Cloud BigQuery table with configurable schema and optional Pub/Sub + subscription delivery. 
When a Pub/Sub topic is wired, creates a BigQuery subscription + that routes messages from the topic into this table. Emits a gcp-bigquery-table + artifact for downstream query and pipeline workloads. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-bigquery-table +version: 0.1.1 + +params: + required: + - table_id + - deletion_protection + - schema_mode + examples: + - __name: Pub/Sub Default + table_id: messages + deletion_protection: true + schema_mode: pubsub_default + bigquery_subscription: + use_topic_schema: false + write_metadata: true + drop_unknown_fields: false + ack_deadline_seconds: 60 + - __name: Custom Schema + table_id: events + deletion_protection: true + schema_mode: custom_schema + schema_json: '[{"name":"event_type","type":"STRING","mode":"NULLABLE"},{"name":"payload","type":"JSON","mode":"NULLABLE"},{"name":"created_at","type":"TIMESTAMP","mode":"NULLABLE"}]' + + properties: + table_id: + title: Table ID + description: BigQuery table identifier. Must contain only letters, digits, and + underscores — no hyphens or spaces. Maximum 1024 characters. Cannot be changed + after creation without destroying and recreating the table. + type: string + $md.immutable: true + pattern: "^[a-zA-Z0-9_]+$" + + description: + title: Description + description: Optional free-text description of the table's purpose, data classification, + or owner team. Stored in BigQuery metadata and visible in the console. + type: string + + deletion_protection: + title: Enable Deletion Protection + description: When enabled, the table cannot be destroyed until deletion protection + is first disabled (a two-step destroy). Prevents accidental data loss in production. + Strongly recommended for production tables. Disable only immediately before + a planned decommission. + type: boolean + default: true + + schema_mode: + title: Schema Mode + description: >- + Controls how the table schema is defined. 
"pubsub_default" creates the table + with a Pub/Sub-compatible schema (subscription_name, message_id, publish_time, + data, attributes) — use this when wiring a Pub/Sub topic. "custom_schema" + lets you provide the full BigQuery schema JSON via the schema_json parameter. + type: string + default: pubsub_default + enum: + - pubsub_default + - custom_schema + + schema_json: + title: Schema JSON + description: >- + BigQuery table schema as a JSON array of field descriptors. Only used when + schema_mode is "custom_schema". Each element must have "name", "type", and + optionally "mode" (NULLABLE, REQUIRED, REPEATED) and "description". + Example: [{"name":"event_type","type":"STRING","mode":"NULLABLE"}, + {"name":"payload","type":"JSON","mode":"NULLABLE"}, + {"name":"created_at","type":"TIMESTAMP","mode":"NULLABLE"}] + Valid types: STRING, BYTES, INTEGER, INT64, FLOAT, FLOAT64, NUMERIC, + BIGNUMERIC, BOOLEAN, BOOL, TIMESTAMP, DATE, TIME, DATETIME, JSON, RECORD, STRUCT. + type: string + + bigquery_subscription: + title: BigQuery Subscription Settings + description: Settings for the Pub/Sub BigQuery subscription. Only used when + a Pub/Sub topic is wired to this bundle. Configure delivery behavior and + acknowledgement timeouts here. + type: object + properties: + use_topic_schema: + title: Use Topic Schema + description: When enabled, BigQuery uses the Pub/Sub topic's schema to + parse messages and map fields to table columns. When disabled, messages + are written as raw bytes. Requires the topic to have a schema attached. + type: boolean + default: false + + write_metadata: + title: Write Subscription Metadata + description: When enabled, BigQuery populates the subscription_name, message_id, + publish_time, and attributes columns in each row. The pubsub_default schema + includes these columns. Recommended to keep enabled. 
+ type: boolean + default: true + + drop_unknown_fields: + title: Drop Unknown Fields + description: When enabled and use_topic_schema is also enabled, fields + in the message that do not exist in the table schema are silently dropped. + When disabled, a message with unknown fields is not written to the table + and stays in the subscription backlog, where Pub/Sub keeps retrying it. + type: boolean + default: false + + ack_deadline_seconds: + title: Acknowledgement Deadline (seconds) + description: How long Pub/Sub waits for BigQuery to acknowledge a message + before re-delivering it. Increase if BigQuery write latency exceeds the + default. Range 10–600 seconds. + type: integer + default: 60 + minimum: 10 + maximum: 600 + +connections: + required: + - gcp_authentication + - bigquery_dataset + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + bigquery_dataset: + $ref: catalog-demo/gcp-bigquery-dataset + title: BigQuery Dataset + + pubsub_topic: + $ref: catalog-demo/gcp-pubsub-topic + title: Pub/Sub Topic (optional — wire to enable BigQuery subscription delivery) + +artifacts: + required: + - bigquery_table + properties: + bigquery_table: + $ref: catalog-demo/gcp-bigquery-table + title: GCP BigQuery Table + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - table_id + - description + - deletion_protection + - schema_mode + - schema_json + - bigquery_subscription + - "*" + properties: + deletion_protection: + ui:widget: checkbox + bigquery_subscription: + ui:order: + - use_topic_schema + - write_metadata + - drop_unknown_fields + - ack_deadline_seconds + - "*" + properties: + use_topic_schema: + ui:widget: checkbox + write_metadata: + ui:widget: checkbox + drop_unknown_fields: + ui:widget: checkbox diff --git a/bundles/gcp-bigquery-table/operator.md b/bundles/gcp-bigquery-table/operator.md new file mode
100644 index 0000000..cc227e7 --- /dev/null +++ b/bundles/gcp-bigquery-table/operator.md @@ -0,0 +1,109 @@ +--- +templating: mustache +--- + +# GCP BigQuery Table — Operator Runbook + +## Non-obvious constraints + +**Table ID is immutable.** `table_id` cannot be changed in-place. To rename: export the table data, destroy the package, reprovision with the new ID, reload from GCS. Treat the table ID as permanent. + +**Deletion protection requires a two-step destroy.** When `deletion_protection = true`, the destroy will fail with a "Table cannot be deleted" error. To decommission: +1. Set `deletion_protection = false` in the package config and deploy. +2. Then run the destroy. + +**Schema evolution is limited.** BigQuery supports a narrow set of in-place schema changes: adding new columns at the end of the schema, relaxing a field from REQUIRED to NULLABLE, and a few others. Changing column types, renaming columns, or reordering columns requires dropping and recreating the table — all data is lost unless exported first. Plan your schema carefully before first deploy. + +**Pub/Sub subscription target table must exist before the subscription can deliver messages.** This bundle creates the table before the subscription, so order-of-operations is handled automatically. However, if you destroy and recreate the table independently, redeploy this bundle to recreate the subscription and its IAM bindings. + +**Pub/Sub IAM bindings are table-scoped and removed on disconnect.** When you unwire the `pubsub_topic` connection and redeploy, Terraform removes the two service agent IAM bindings from this table. No dataset-level or project-level IAM is modified. Existing data in the table is not affected — only new message delivery stops. + +**Schema mismatch routes messages to dead letter or drops them.** When `use_topic_schema = true` and a message contains fields not in the table schema, behavior depends on `drop_unknown_fields`. 
If `drop_unknown_fields = false` (the default), the message is not written to BigQuery and stays in the subscription backlog, where Pub/Sub keeps retrying it (this bundle does not configure a dead-letter policy on the subscription). If `drop_unknown_fields = true`, the extra fields are silently discarded and the message is delivered. + +**IAM propagation is eventually consistent.** The Pub/Sub subscription creation depends on IAM bindings that may not have propagated yet. The `depends_on` in this bundle mitigates timing issues, but if the subscription creation fails during a first deploy, a redeploy will resolve it. + +## Troubleshooting + +**Pub/Sub subscription stuck — messages not appearing in BigQuery.** +```bash +# Check subscription delivery status and error details (the subscription is named PACKAGE_NAME_PREFIX-bq, not TABLE_ID-bq) +gcloud pubsub subscriptions describe PACKAGE_NAME_PREFIX-bq \ + --project={{artifacts.bigquery_table.project_id}} + +# Confirm table exists +bq show --format=prettyjson {{artifacts.bigquery_table.table_full_name}} + +# Confirm IAM bindings (look for gcp-sa-pubsub entries) +bq get-iam-policy {{artifacts.bigquery_table.table_full_name}} +``` +Common causes: table schema mismatch with message fields, `use_topic_schema = true` but topic has no schema, IAM bindings not yet propagated (redeploy to fix), or `pubsub.googleapis.com` not enabled. + +**Pub/Sub subscription creation fails with permission error during deploy.** +IAM propagation is eventually consistent — wait 30–60 seconds and redeploy. The `depends_on` in this bundle mitigates but does not eliminate this race. + +**Messages delivered but columns are all null.** +If `use_topic_schema = false` (default), messages are written as raw bytes to the `data` column. Enable `write_metadata = true` so metadata columns (subscription_name, message_id, publish_time, attributes) are populated. Query the `data` column directly for the message payload. 
+ +**Deploy fails with "bigquery.googleapis.com has not been used in project."** +Add `bigquery.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. + +**Deploy fails with "pubsub.googleapis.com has not been used in project."** +Add `pubsub.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. + +**Permission denied on table access.** +```bash +bq get-iam-policy {{artifacts.bigquery_table.table_full_name}} +``` +The required member should have `roles/bigquery.dataEditor` for read/write or `roles/bigquery.dataViewer` for read-only. If the binding is absent, redeploy the consumer bundle with the table wired on the canvas. + +## Day-2 operations + +**Querying the table:** +```bash +bq query --project_id={{artifacts.bigquery_table.project_id}} \ + 'SELECT * FROM `{{artifacts.bigquery_table.table_full_name}}` LIMIT 100' +``` + +**Inspecting table schema and row count:** +```bash +bq show --format=prettyjson {{artifacts.bigquery_table.table_full_name}} +bq query --project_id={{artifacts.bigquery_table.project_id}} \ + 'SELECT COUNT(*) FROM `{{artifacts.bigquery_table.table_full_name}}`' +``` + +**Exporting table data before destroying:** +```bash +bq extract \ + --destination_format=NEWLINE_DELIMITED_JSON \ + {{artifacts.bigquery_table.table_full_name}} \ + gs://BUCKET_NAME/{{artifacts.bigquery_table.dataset_id}}/{{artifacts.bigquery_table.table_id}}/*.jsonl +``` + +**Replaying missed Pub/Sub messages from a timestamp:** +```bash +gcloud pubsub subscriptions seek SUBSCRIPTION_NAME \ + --time=$(date -u +%Y-%m-%dT%H:%M:%SZ -d "1 hour ago") \ + --project={{artifacts.bigquery_table.project_id}} +``` + +## Useful commands + +```bash +# Show table schema and metadata +bq show --format=prettyjson {{artifacts.bigquery_table.table_full_name}} + +# Show IAM policy on the table +bq get-iam-policy {{artifacts.bigquery_table.table_full_name}} + +# List all subscriptions 
on the parent topic
+gcloud pubsub topics list-subscriptions TOPIC_NAME \
+  --project={{artifacts.bigquery_table.project_id}}
+
+# Describe the BigQuery subscription (delivery config + error state)
+gcloud pubsub subscriptions describe SUBSCRIPTION_NAME \
+  --project={{artifacts.bigquery_table.project_id}}
+
+# Run an ad-hoc query
+bq query --project_id={{artifacts.bigquery_table.project_id}} \
+  'SELECT COUNT(*) FROM `{{artifacts.bigquery_table.table_full_name}}`'
+```
diff --git a/bundles/gcp-bigquery-table/src/artifacts.tf b/bundles/gcp-bigquery-table/src/artifacts.tf
new file mode 100644
index 0000000..789630e
--- /dev/null
+++ b/bundles/gcp-bigquery-table/src/artifacts.tf
@@ -0,0 +1,12 @@
+# BigQuery table artifact — matches catalog-demo/gcp-bigquery-table schema.
+
+resource "massdriver_artifact" "bigquery_table" {
+  field = "bigquery_table"
+  name  = "GCP BigQuery Table ${var.md_metadata.name_prefix}"
+  artifact = jsonencode({
+    project_id      = local.project_id
+    dataset_id      = local.dataset_id
+    table_id        = google_bigquery_table.main.table_id
+    table_full_name = "${local.project_id}.${local.dataset_id}.${google_bigquery_table.main.table_id}"
+  })
+}
diff --git a/bundles/gcp-bigquery-table/src/main.tf b/bundles/gcp-bigquery-table/src/main.tf
new file mode 100644
index 0000000..f26195f
--- /dev/null
+++ b/bundles/gcp-bigquery-table/src/main.tf
@@ -0,0 +1,83 @@
+terraform {
+  required_version = ">= 1.3"
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = "~> 6.0"
+    }
+    massdriver = {
+      source  = "massdriver-cloud/massdriver"
+      version = "~> 1.3"
+    }
+  }
+}
+
+provider "google" {
+  project     = var.gcp_authentication.project_id
+  credentials = jsonencode(var.gcp_authentication)
+}
+
+locals {
+  project_id  = var.bigquery_dataset.project_id
+  dataset_id  = var.bigquery_dataset.dataset_id
+  name_prefix = var.md_metadata.name_prefix
+
+  # Subscription name derived from the bundle name prefix for uniqueness.
+  subscription_name = "${local.name_prefix}-bq"
+
+  # Pub/Sub-compatible schema — used when schema_mode = "pubsub_default".
+  # Includes the five standard columns that BigQuery subscription write_metadata
+  # populates: subscription_name, message_id, publish_time, data, attributes.
+  pubsub_default_schema = jsonencode([
+    {
+      name = "subscription_name"
+      type = "STRING"
+      mode = "NULLABLE"
+    },
+    {
+      name = "message_id"
+      type = "STRING"
+      mode = "NULLABLE"
+    },
+    {
+      name = "publish_time"
+      type = "TIMESTAMP"
+      mode = "NULLABLE"
+    },
+    {
+      name = "data"
+      type = "STRING"
+      mode = "NULLABLE"
+    },
+    {
+      name = "attributes"
+      type = "JSON"
+      mode = "NULLABLE"
+    },
+  ])
+
+  # Resolved schema — prefer custom_schema when provided, otherwise use pubsub default.
+  resolved_schema = (
+    var.schema_mode == "custom_schema" && var.schema_json != null
+    ? var.schema_json
+    : local.pubsub_default_schema
+  )
+}
+
+# ─── BigQuery Table ────────────────────────────────────────────────────────────
+
+resource "google_bigquery_table" "main" {
+  project    = local.project_id
+  dataset_id = local.dataset_id
+  table_id   = var.table_id
+
+  description = var.description
+
+  # When deletion_protection = true, Terraform will refuse to destroy this table
+  # until the flag is first set to false and re-applied (a two-step destroy).
+  deletion_protection = var.deletion_protection
+
+  schema = local.resolved_schema
+
+  labels = var.md_metadata.default_tags
+}
diff --git a/bundles/gcp-bigquery-table/src/subscription.tf b/bundles/gcp-bigquery-table/src/subscription.tf
new file mode 100644
index 0000000..760466f
--- /dev/null
+++ b/bundles/gcp-bigquery-table/src/subscription.tf
@@ -0,0 +1,84 @@
+# ─── BigQuery Subscription (optional) ─────────────────────────────────────────
+#
+# This file is count-gated on var.pubsub_topic being non-null.
+# When a Pub/Sub topic is wired on the canvas, three resources are created:
+#
+# 1. google_bigquery_table_iam_member.pubsub_service_agent_data_editor
+# 2. google_bigquery_table_iam_member.pubsub_service_agent_metadata_viewer
+# 3. google_pubsub_subscription.bigquery
+#
+# IAM bindings grant the Pub/Sub service agent
+# (service-PROJECT_NUMBER@gcp-sa-pubsub.iam.gserviceaccount.com) the minimum
+# roles required to write messages into the BigQuery table. Bindings are
+# table-scoped, not dataset- or project-wide.
+
+# ─── Pub/Sub service agent project number ──────────────────────────────────────
+# The service agent that writes to BigQuery belongs to the project that owns the
+# SUBSCRIPTION (the topic's project), which may differ from the dataset's project.
+data "google_project" "this" {
+  project_id = local.pubsub_enabled ? var.pubsub_topic.project_id : local.project_id
+}
+
+locals {
+  pubsub_enabled         = var.pubsub_topic != null
+  pubsub_service_account = "serviceAccount:service-${data.google_project.this.number}@gcp-sa-pubsub.iam.gserviceaccount.com"
+
+  # BigQuery subscription table reference format: projectId:datasetId.tableId
+  # This is the format required by the Pub/Sub API and the Terraform provider.
+  bq_table_ref = local.pubsub_enabled ? "${local.project_id}:${local.dataset_id}.${var.table_id}" : null
+}
+
+# ─── IAM: dataEditor on this table ────────────────────────────────────────────
+# Required so the Pub/Sub service agent can INSERT rows into the target table.
+# Scoped to this specific table — not the full dataset — for least privilege.
+resource "google_bigquery_table_iam_member" "pubsub_service_agent_data_editor" {
+  count = local.pubsub_enabled ? 1 : 0
+
+  project    = local.project_id
+  dataset_id = local.dataset_id
+  table_id   = google_bigquery_table.main.table_id
+  role       = "roles/bigquery.dataEditor"
+  member     = local.pubsub_service_account
+}
+
+# ─── IAM: metadataViewer on this table ────────────────────────────────────────
+# Required so the Pub/Sub service agent can read the table schema and validate
+# message delivery configuration. Scoped to this specific table.
+resource "google_bigquery_table_iam_member" "pubsub_service_agent_metadata_viewer" {
+  count = local.pubsub_enabled ? 1 : 0
+
+  project    = local.project_id
+  dataset_id = local.dataset_id
+  table_id   = google_bigquery_table.main.table_id
+  role       = "roles/bigquery.metadataViewer"
+  member     = local.pubsub_service_account
+}
+
+# ─── Pub/Sub subscription with BigQuery delivery ───────────────────────────────
+resource "google_pubsub_subscription" "bigquery" {
+  count = local.pubsub_enabled ? 1 : 0
+
+  project = var.pubsub_topic.project_id
+  name    = local.subscription_name
+  topic   = var.pubsub_topic.topic_id
+
+  ack_deadline_seconds = var.bigquery_subscription.ack_deadline_seconds
+
+  bigquery_config {
+    table               = local.bq_table_ref
+    use_topic_schema    = var.bigquery_subscription.use_topic_schema
+    write_metadata      = var.bigquery_subscription.write_metadata
+    drop_unknown_fields = var.bigquery_subscription.drop_unknown_fields
+  }
+
+  labels = var.md_metadata.default_tags
+
+  # The table and IAM bindings must exist before Pub/Sub validates the subscription.
+  # Without the IAM bindings, Pub/Sub cannot write to BigQuery.
+  # IAM propagation is eventually consistent — depends_on mitigates but does not
+  # eliminate timing issues. If the subscription fails, a redeploy resolves it.
+  depends_on = [
+    google_bigquery_table_iam_member.pubsub_service_agent_data_editor,
+    google_bigquery_table_iam_member.pubsub_service_agent_metadata_viewer,
+  ]
+}
diff --git a/bundles/gcp-bigquery-table/src/variables.tf b/bundles/gcp-bigquery-table/src/variables.tf
new file mode 100644
index 0000000..debc628
--- /dev/null
+++ b/bundles/gcp-bigquery-table/src/variables.tf
@@ -0,0 +1,80 @@
+variable "md_metadata" {
+  type = object({
+    name_prefix  = string
+    default_tags = optional(map(string), {})
+  })
+}
+
+variable "gcp_authentication" {
+  type = object({
+    type                        = string
+    project_id                  = string
+    private_key_id              = string
+    private_key                 = string
+    client_email                = string
+    client_id                   = string
+    auth_uri                    = string
+    token_uri                   = string
+    auth_provider_x509_cert_url = string
+    client_x509_cert_url        = string
+  })
+  sensitive = true
+}
+
+variable "bigquery_dataset" {
+  type = object({
+    project_id        = string
+    dataset_id        = string
+    dataset_full_name = string
+    location          = string
+    friendly_name     = optional(string)
+  })
+}
+
+# Optional — only present when a Pub/Sub topic is wired on the canvas.
+variable "pubsub_topic" {
+  description = "Pub/Sub topic artifact. When wired, a BigQuery subscription is created to deliver messages into this table."
+  type = object({
+    project_id     = string
+    topic_name     = string
+    topic_id       = string
+    dlq_topic_id   = optional(string)
+    dlq_topic_name = optional(string)
+  })
+  default = null
+}
+
+variable "table_id" {
+  type = string
+}
+
+variable "description" {
+  type    = string
+  default = null
+}
+
+variable "deletion_protection" {
+  type    = bool
+  default = true
+}
+
+variable "schema_mode" {
+  type    = string
+  default = "pubsub_default"
+}
+
+variable "schema_json" {
+  type    = string
+  default = null
+}
+
+variable "bigquery_subscription" {
+  description = "Settings for the Pub/Sub BigQuery subscription. Consumed only when pubsub_topic is non-null."
+  type = object({
+    use_topic_schema     = optional(bool, false)
+    write_metadata       = optional(bool, true)
+    drop_unknown_fields  = optional(bool, false)
+    ack_deadline_seconds = optional(number, 60)
+  })
+  default = {}
+}
diff --git a/bundles/gcp-cloud-run-service/README.md b/bundles/gcp-cloud-run-service/README.md
new file mode 100644
index 0000000..f7484c0
--- /dev/null
+++ b/bundles/gcp-cloud-run-service/README.md
@@ -0,0 +1,121 @@
+# gcp-cloud-run-service
+
+Google Cloud Run v2 service. Each bundle instance creates its own runtime service account and automatically grants it the minimum-privilege role on any connected upstream artifact (Pub/Sub topic, BigQuery dataset, GCS bucket) — no manual IAM wiring required.
+
+## Use Cases
+
+- Internal APIs and microservices consuming Pub/Sub and BigQuery without internet exposure
+- Event-driven workers triggered by Pub/Sub push subscriptions or Cloud Scheduler
+- Public HTTPS APIs with anonymous or token-authenticated access
+- Data pipeline workers reading from GCS and writing to BigQuery or Pub/Sub
+
+## Use as a Runtime Template
+
+This bundle is an example runtime template — an opinionated standard for how Cloud Run services are provisioned. It encodes a security baseline (per-service workload identity, ingress controls, compliance skips with documented rationale) and auto-wires IAM for common data dependencies.
+
+Typical workflow:
+1. Platform team publishes this template bundle (or a fork) to Massdriver.
+2. Application developer runs `mass bundle new` pointing at the template to generate a bundle for their specific service.
+3. The developer customizes image, connections, environment variables, and app-specific dependencies. The platform baseline is inherited.
+ + + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_service_account.runtime` | Per-service runtime SA | This service's workload identity — one per bundle instance | +| `google_cloud_run_v2_service.main` | Cloud Run v2 service | Runs containers as the runtime SA | +| `google_cloud_run_v2_service_iam_member` (allUsers) | Public invoker IAM | Created only when `allow_unauthenticated = true` | +| `google_pubsub_topic_iam_member` | Pub/Sub publisher IAM | Created only when `pubsub_topic` is connected | +| `google_bigquery_dataset_iam_member` | BigQuery data editor IAM | Created only when `bigquery_dataset` is connected | +| `google_storage_bucket_iam_member` | GCS object user IAM | Created only when `storage_bucket` is connected | +| `google_service_account.push_invoker` | Push invoker SA | Created only when `incoming_topic` is connected — used by Pub/Sub for OIDC, separate from the runtime SA | +| `google_cloud_run_v2_service_iam_member` (push_invoker) | Push invoker IAM | Created only when `incoming_topic` is connected — grants `roles/run.invoker` to the push invoker SA | +| `google_pubsub_subscription.push` | Pub/Sub push subscription | Created only when `incoming_topic` is connected — delivers messages to this service's URL | + +## Connections + +### Required + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | GCP credentials used by Terraform to provision resources | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` and `network.region` | + +### Optional + +Connecting or disconnecting a canvas wire does not take effect until a Terraform apply runs. 
+ +**Outgoing data connections** — grant this service's runtime SA the listed IAM role on the upstream resource: + +| Connection | Artifact Type | IAM Role Granted | +|---|---|---| +| `pubsub_topic` | `catalog-demo/gcp-pubsub-topic` | `roles/pubsub.publisher` on the topic | +| `bigquery_dataset` | `catalog-demo/gcp-bigquery-dataset` | `roles/bigquery.dataEditor` on the dataset | +| `storage_bucket` | `catalog-demo/gcp-storage-bucket` | `roles/storage.objectUser` on the bucket | + +**Incoming message delivery** — creates a Pub/Sub push subscription that calls this service's URL: + +| Connection | Artifact Type | What Gets Created | +|---|---|---| +| `incoming_topic` | `catalog-demo/gcp-pubsub-topic` | Push subscription on the topic + a dedicated `push_invoker` SA granted `roles/run.invoker` on this service | + +The push subscription uses a separate `push_invoker` service account (not the runtime SA) for OIDC authentication. Pub/Sub attaches a signed OIDC token for that SA to every HTTP request. Cloud Run validates the token and the `roles/run.invoker` binding before routing the request to the container. The `push_ack_deadline_seconds` param (default 60, max 600) controls how long Pub/Sub waits for a 2xx before redelivering. + +**Private egress** — routes outbound traffic through a VPC for access to private endpoints: + +| Connection | Artifact Type | What Gets Created | +|---|---|---| +| `vpc_connector` | `catalog-demo/gcp-vpc-connector` | Attaches the connector to the Cloud Run service's `vpc_access` block | + +The `vpc_egress` param controls whether only RFC1918 traffic (`PRIVATE_RANGES_ONLY`) or all outbound traffic (`ALL_TRAFFIC`) goes through the connector. Use `ALL_TRAFFIC` when downstream services such as Kafka brokers are on private IPs reachable only through the VPC. The connector must be in the same GCP region as this Cloud Run service. 
+ +## Artifact Produced + +**Artifact type:** `catalog-demo/gcp-cloud-run-service` + +| Field | Type | Description | +|---|---|---| +| `project_id` | string | GCP project that owns the service | +| `service_name` | string | Short service name (used in gcloud commands) | +| `service_url` | string | HTTPS URL of the service (`.run.app` domain) | +| `location` | string | GCP region where the service is deployed | +| `latest_ready_revision` | string | Name of the currently-serving revision | +| `runtime_service_account_email` | string | Email of this service's own runtime SA | +| `runtime_service_account_member` | string | IAM principal string (`serviceAccount:<email>`) for downstream bindings | + +`runtime_service_account_member` is designed for downstream bundles (Scheduler, Pub/Sub push) that need to grant `roles/run.invoker` to this service's identity. + +## Compliance + +### Hardcoded controls + +| Control | Value | Reason | +|---|---|---| +| Per-service runtime identity | `google_service_account.runtime` (one per bundle instance) | Each service gets its own SA with bindings only to resources it connects to — no shared SA that grants access across all workloads | +| Resource labels | Massdriver default tags | Enforces cost attribution and environment tagging on all revisions | + +### Checkov skips + +| Check | Reason | +|---|---| +| `CKV_GCP_102` | Ingress is intentionally configurable. The check fires on any non-internal service without distinguishing IAM controls. Internal-preset services pass this check without the skip; only public-ingress services need it bypassed. | +| `CKV_GCP_103` | Binary Authorization requires a pre-configured attestor policy at the project level. Enabling it per-service without an attestor causes all deployments to fail. Teams requiring binary authorization should enforce it via `google_binary_authorization_policy`. 
| + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- The landing zone provides `project_id` and `network.region`. It does NOT provide a workload SA — this bundle creates its own. +- The runtime SA does not automatically have `roles/artifactregistry.reader`. If your image is in Artifact Registry, grant that role manually or add it to the bundle source. +- The VPC connector is consumed by this bundle (via the `vpc_connector` optional connection) but not provisioned here. Deploy a VPC connector bundle separately and wire it on the canvas. +- The default image (`gcr.io/cloudrun/hello`) is the Google-managed hello-world container. Replace it with your application image before a real deployment. + +## Presets + +| Preset | Ingress | Min Instances | Max Instances | CPU | Memory | Unauthenticated | +|---|---|---|---|---|---|---| +| Internal | `internal` | 0 | 10 | 1 | 512Mi | false | +| Public API | `all` | 1 | 100 | 2 | 1Gi | true | +| Worker | `internal` | 1 | 50 | 2 | 2Gi | false | diff --git a/bundles/gcp-cloud-run-service/massdriver.yaml b/bundles/gcp-cloud-run-service/massdriver.yaml new file mode 100644 index 0000000..fb1e74b --- /dev/null +++ b/bundles/gcp-cloud-run-service/massdriver.yaml @@ -0,0 +1,260 @@ +name: gcp-cloud-run-service +description: Google Cloud Run v2 service with auto-binding IAM for upstream data + artifacts. Provisions the Cloud Run service running as its own per-service runtime + service account, and automatically grants that service account the appropriate IAM + role on any connected upstream artifact (Pub/Sub publisher, BigQuery dataEditor, + GCS objectUser). Emits a gcp-cloud-run-service artifact for downstream event sources + (Scheduler, Pub/Sub push) to use for invoking the service. 
+source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-cloud-run-service +version: 0.1.1 + +params: + required: + - image + - port + - cpu + - memory + - min_instances + - max_instances + - ingress + - allow_unauthenticated + examples: + - __name: Internal + image: gcr.io/cloudrun/hello + port: 8080 + cpu: "1" + memory: 512Mi + min_instances: 0 + max_instances: 10 + ingress: internal + allow_unauthenticated: false + - __name: Public API + image: gcr.io/cloudrun/hello + port: 8080 + cpu: "2" + memory: 1Gi + min_instances: 1 + max_instances: 100 + ingress: all + allow_unauthenticated: true + - __name: Worker + image: gcr.io/cloudrun/hello + port: 8080 + cpu: "2" + memory: 2Gi + min_instances: 1 + max_instances: 50 + ingress: internal + allow_unauthenticated: false + + properties: + image: + title: Container Image + description: Fully-qualified container image reference to deploy. Supports + Docker Hub, Google Artifact Registry, and GCR images. Defaults to the Cloud + Run hello-world image so the bundle is deployable out of the box before your + application image is ready. Pin to a digest (image@sha256:...) rather than + a mutable tag for production deployments. + type: string + default: gcr.io/cloudrun/hello + + port: + title: Container Port + description: TCP port that your container listens on. Cloud Run sends all HTTP/2 + and HTTP/1.1 traffic to this port. Must match what the running process actually + binds — a mismatch causes revision-failed-readiness errors and Cloud Run will + roll back the revision. + type: integer + minimum: 1 + maximum: 65535 + default: 8080 + + cpu: + title: CPU + description: Number of vCPUs allocated to each container instance. Cloud Run + supports 1, 2, 4, and 8 vCPUs. Values above 1 require at least 512Mi memory. + CPU is only allocated while a request is being processed unless min_instances + is greater than 0 (always-on). 
Higher CPU allows more concurrent goroutines + or threads within a single instance before autoscaling triggers. + type: string + default: "1" + enum: + - "1" + - "2" + - "4" + - "8" + + memory: + title: Memory + description: Memory allocated to each container instance. Must be at least + 512Mi when CPU is 2 or higher. Cloud Run enforces CPU-to-memory ratios — + if you increase CPU, you may need to increase memory too to avoid a deploy + error. For workers processing large payloads, start with 2Gi and tune down. + type: string + default: 512Mi + enum: + - 256Mi + - 512Mi + - 1Gi + - 2Gi + - 4Gi + - 8Gi + - 16Gi + - 32Gi + + min_instances: + title: Minimum Instances + description: Minimum number of container instances to keep running at all times. + Set to 0 for scale-to-zero (cost-efficient for low-traffic or batch workloads). + Set to 1 or higher to eliminate cold starts. Any value above 0 disables + scale-to-zero — you are billed for idle capacity continuously. + type: integer + minimum: 0 + default: 0 + + max_instances: + title: Maximum Instances + description: Maximum number of container instances Cloud Run will scale to. + Cloud Run default is 100. Reduce this to cap costs or to protect downstream + databases from connection storms. Increasing beyond 100 requires a quota + request in GCP. + type: integer + minimum: 1 + default: 100 + + ingress: + title: Ingress + description: Controls which traffic sources can reach the service. `internal` + allows only internal sources such as same-project VPC networks and internal load balancers. + `internal-and-cloud-load-balancing` additionally allows traffic from external Application Load Balancers. + `all` allows all internet traffic to reach the service directly via its + .run.app URL. Changing ingress settings triggers a full revision replacement + with a brief cold start. 
+ type: string + default: internal + enum: + - all + - internal + - internal-and-cloud-load-balancing + + allow_unauthenticated: + title: Allow Unauthenticated Requests + description: When true, grants the `roles/run.invoker` IAM role to `allUsers` + on this service, making the .run.app URL publicly accessible without a Bearer + token. Required for the Public API preset when you want anonymous access. + When false, callers must present a valid GCP identity token — use this for + internal APIs and workers. Has no effect on ingress routing — set `ingress` + to `all` separately if you want public network access. + type: boolean + default: false + + push_ack_deadline_seconds: + title: Push Subscription Ack Deadline (seconds) + description: Maximum time Pub/Sub waits for the service to acknowledge a push + message before redelivering it. Range is 10–600 seconds. Only used when + `incoming_topic` is connected. Set this to at least as long as your handler's + maximum processing time. If your handler cannot finish within 600 seconds, + acknowledge early and process asynchronously. + type: integer + minimum: 10 + maximum: 600 + default: 60 + + vpc_egress: + title: VPC Egress + description: Controls which traffic is routed through the VPC connector. Only + used when `vpc_connector` is connected. `PRIVATE_RANGES_ONLY` routes RFC1918 + traffic (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16) through the connector; + public destinations still egress directly. `ALL_TRAFFIC` forces all outbound + traffic — including public API calls — through the connector and VPC. Use + `ALL_TRAFFIC` when the downstream endpoint (e.g., a Kafka broker) is on a + private IP behind the connector. + type: string + default: PRIVATE_RANGES_ONLY + enum: + - PRIVATE_RANGES_ONLY + - ALL_TRAFFIC + + deletion_protection: + title: Enable Deletion Protection + description: When enabled, the Cloud Run service cannot be destroyed until + deletion protection is first disabled in a prior apply (two-step destroy). 
+ Default is false — Cloud Run services are stateless compute, so destroying + one does not lose data. Set to true in production to prevent accidental + tear-down. + type: boolean + default: false + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + + # Optional upstream artifact connections — omit from canvas to skip IAM binding. + # When connected, the bundle automatically grants the workload service account + # the appropriate role on the upstream resource (see src/iam.tf for binding logic). + pubsub_topic: + $ref: catalog-demo/gcp-pubsub-topic + title: Pub/Sub Topic (optional) + + bigquery_dataset: + $ref: catalog-demo/gcp-bigquery-dataset + title: BigQuery Dataset (optional) + + storage_bucket: + $ref: catalog-demo/gcp-storage-bucket + title: Storage Bucket (optional) + + # incoming_topic: when wired, this bundle creates a Pub/Sub push subscription + # that delivers messages FROM this topic INTO this Cloud Run service's URL. + # The subscription uses a dedicated push_invoker SA for OIDC authentication — + # separate from the runtime SA that runs the container. + incoming_topic: + $ref: catalog-demo/gcp-pubsub-topic + title: Incoming Pub/Sub Topic (optional — creates push subscription) + + # vpc_connector: when wired, attaches the connector to this Cloud Run service + # for private VPC egress. The connector must be in the same region as the service. 
+ vpc_connector: + $ref: catalog-demo/gcp-vpc-connector + title: VPC Connector (optional — private egress) + +artifacts: + required: + - cloud_run_service + properties: + cloud_run_service: + $ref: catalog-demo/gcp-cloud-run-service + title: GCP Cloud Run Service + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - image + - port + - cpu + - memory + - min_instances + - max_instances + - ingress + - allow_unauthenticated + - vpc_egress + - push_ack_deadline_seconds + - "*" + properties: + allow_unauthenticated: + ui:widget: checkbox diff --git a/bundles/gcp-cloud-run-service/operator.md b/bundles/gcp-cloud-run-service/operator.md new file mode 100644 index 0000000..ecbc6d6 --- /dev/null +++ b/bundles/gcp-cloud-run-service/operator.md @@ -0,0 +1,138 @@ +--- +templating: mustache +--- + +# GCP Cloud Run Service — Operator Runbook + +## Non-obvious constraints + +**Each bundle instance creates its own service account.** The SA email is derived from the bundle's `name_prefix`. If the package is renamed, the SA is destroyed and recreated. Any out-of-band IAM bindings referencing the old SA email (e.g., manually granted Artifact Registry reader) must be reapplied. Canvas-wired bindings (Pub/Sub, BigQuery, GCS) are recreated automatically on the next deploy. + +**The push subscription uses a SEPARATE service account from the runtime SA.** When `incoming_topic` is connected, this bundle creates two SAs: the runtime SA (which the container runs as and which holds data-access IAM bindings) and a `push_invoker` SA (which Pub/Sub uses exclusively to OIDC-authenticate HTTP push deliveries). Do not confuse them — they have different emails, different roles, and different lifecycles. The push invoker SA is named `-p` in GCP. 
+ +**The VPC connector must be in the same region as this Cloud Run service.** The connector region is taken from the `catalog-demo/gcp-vpc-connector` artifact (`connector.region`). If the connector is in a different region than the landing zone's `network.region`, the Cloud Run deploy will fail with a region mismatch error. Deploy the connector bundle in the correct region before wiring. + +**`vpc_egress = PRIVATE_RANGES_ONLY` does NOT route all traffic through the VPC.** Only RFC1918 destinations (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16) are routed through the connector. Public API calls (e.g., Google APIs, external HTTP endpoints) still egress directly to the internet. If your downstream endpoint — such as a Kafka broker — is on a private IP behind the connector, use `ALL_TRAFFIC` to force all egress through the VPC. If using `ALL_TRAFFIC`, ensure the VPC has a Cloud NAT gateway configured, otherwise internet-bound traffic will have no route. + +**Push subscription ack deadline is capped at 600 seconds.** If a handler cannot complete within 600 seconds, it must acknowledge the message early (return HTTP 2xx immediately) and process asynchronously using a background task, Cloud Tasks, or another mechanism. Returning a non-2xx after the deadline causes Pub/Sub to redeliver the message, which leads to duplicate processing. + +**New deployments route 100% of traffic to the latest revision immediately.** Blue/green splits must be configured before deploying the new revision. You cannot retroactively split traffic between revisions once the new one is live at 100%. + +**Changing `ingress` triggers a new revision and a cold start.** Even if `min_instances > 0`, an ingress change forces revision replacement. + +**`min_instances > 0` means continuous billing.** You pay for idle capacity at the full CPU+memory rate at all times. + +**Container port must match what the image listens on.** A mismatch causes revision health check failure and Cloud Run rolls back. 
Error in logs: `Container failed to start. Failed to start and then listen on the port defined by the PORT environment variable.` + +**The runtime SA does not have `roles/artifactregistry.reader` by default.** If a revision fails with `image not found` or `permission denied` at startup, grant the role: +```bash +gcloud artifacts repositories add-iam-policy-binding <REPOSITORY> \ + --location={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} \ + --member="{{artifacts.cloud_run_service.runtime_service_account_member}}" \ + --role="roles/artifactregistry.reader" +``` + +**Canvas wire changes require a deploy to take effect.** Connecting or disconnecting a data artifact on the canvas does not grant or revoke IAM access. The Terraform apply must run to create or destroy the binding. + +## Troubleshooting + +**Revision fails to start (startup timeout).** +Default startup probe timeout is 240 seconds. Diagnose: +```bash +gcloud logging read \ + 'resource.type="cloud_run_revision" AND resource.labels.service_name="{{artifacts.cloud_run_service.service_name}}" AND (textPayload:"Container failed" OR textPayload:"failed to start")' \ + --project={{artifacts.cloud_run_service.project_id}} \ + --limit=20 +``` +Check for: missing environment variables, wrong port, failed startup connections. Test locally: `docker run -p 8080:<PORT> <IMAGE>` and confirm it starts quickly. + +**5xx errors in production.** +```bash +gcloud logging read \ + 'resource.type="cloud_run_revision" AND resource.labels.service_name="{{artifacts.cloud_run_service.service_name}}" AND httpRequest.status>=500' \ + --project={{artifacts.cloud_run_service.project_id}} \ + --limit=50 \ + --format="table(timestamp,httpRequest.status,httpRequest.requestUrl)" +``` + +**Service can't access a connected resource (Pub/Sub, BigQuery, GCS).** +Confirm the canvas wire is connected AND the package has been deployed since the wire was added. 
Check the specific IAM binding: +```bash +# Pub/Sub +gcloud pubsub topics get-iam-policy <TOPIC_NAME> \ + --project={{artifacts.cloud_run_service.project_id}} \ + --format="table(bindings.role,bindings.members)" + +# BigQuery +bq get-iam-policy {{artifacts.cloud_run_service.project_id}}:<DATASET_ID> + +# GCS +gcloud storage buckets get-iam-policy gs://<BUCKET_NAME> +``` +The member should be `{{artifacts.cloud_run_service.runtime_service_account_member}}`. + +## Day-2 operations + +**Rolling back to a prior revision:** +```bash +# List revisions to find the last known-good one +gcloud run revisions list \ + --service={{artifacts.cloud_run_service.service_name}} \ + --region={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} \ + --format="table(name,status.conditions[0].status)" + +# Shift 100% traffic to the prior revision +gcloud run services update-traffic {{artifacts.cloud_run_service.service_name}} \ + --region={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} \ + --to-revisions=<REVISION_NAME>=100 +``` +This rollback is manual and temporary. The next Massdriver deploy overrides it. Fix the image or config, then redeploy. + +**Pinning to a digest to prevent silent image changes:** +```bash +gcloud container images describe <IMAGE>:<TAG> \ + --format="value(image_summary.digest)" +# Use the output sha256:... in the image param: <IMAGE>@sha256:... +``` + +**Scaling changes:** Update `min_instances` or `max_instances` params and redeploy. In-place, safe. 
+ +## Useful commands + +```bash +# Describe the service (traffic splits, SA, status) +gcloud run services describe {{artifacts.cloud_run_service.service_name}} \ + --region={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} \ + --format="yaml(name,status,spec.template.spec.serviceAccountName,spec.traffic)" + +# List revisions with status +gcloud run revisions list \ + --service={{artifacts.cloud_run_service.service_name}} \ + --region={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} \ + --format="table(name,status.conditions[0].status,metadata.creationTimestamp)" + +# Tail recent application logs +gcloud logging read \ + 'resource.type="cloud_run_revision" AND resource.labels.service_name="{{artifacts.cloud_run_service.service_name}}"' \ + --project={{artifacts.cloud_run_service.project_id}} \ + --limit=100 \ + --format=json | jq '.[].textPayload // .[].jsonPayload' + +# Send a test request (authenticated) +curl -H "Authorization: Bearer $(gcloud auth print-identity-token)" \ + {{artifacts.cloud_run_service.service_url}}/healthz + +# Check IAM on the service +gcloud run services get-iam-policy {{artifacts.cloud_run_service.service_name}} \ + --region={{artifacts.cloud_run_service.location}} \ + --project={{artifacts.cloud_run_service.project_id}} + +# Describe the runtime service account +gcloud iam service-accounts describe {{artifacts.cloud_run_service.runtime_service_account_email}} \ + --project={{artifacts.cloud_run_service.project_id}} +``` diff --git a/bundles/gcp-cloud-run-service/src/.checkov.yml b/bundles/gcp-cloud-run-service/src/.checkov.yml new file mode 100644 index 0000000..b2157a6 --- /dev/null +++ b/bundles/gcp-cloud-run-service/src/.checkov.yml @@ -0,0 +1,20 @@ +skip-check: + # CKV_GCP_102: Ensure Cloud Run service is not publicly accessible + # This check flags any Cloud Run service that has ingress=all or is not restricted + # to internal traffic. 
This bundle intentionally exposes the ingress setting as a + # configurable parameter because Cloud Run services legitimately need to be public + # (Public API preset) or internal (Internal / Worker presets). The allow_unauthenticated + # param further controls IAM-level access. Blanket-skipping is appropriate here because + # the check does not distinguish between the three valid ingress modes — it fires on + # all non-internal services regardless of IAM controls. Operators requiring internal-only + # can set ingress=internal, which makes this check pass without the skip. + - CKV_GCP_102 + + # CKV_GCP_103: Ensure Cloud Run service requires Binary Authorization + # Binary Authorization enforces a deploy-time policy that container images must be + # attested (signed) before they can run. This is a valid control for strict supply-chain + # environments but requires a separate Binary Authorization policy infrastructure that + # is out of scope for this bundle. Enabling it without a configured attestor causes + # all deployments to fail. Teams that require binary authorization should implement + # it at the project level via google_binary_authorization_policy, not per-service. + - CKV_GCP_103 diff --git a/bundles/gcp-cloud-run-service/src/artifacts.tf b/bundles/gcp-cloud-run-service/src/artifacts.tf new file mode 100644 index 0000000..aca34fa --- /dev/null +++ b/bundles/gcp-cloud-run-service/src/artifacts.tf @@ -0,0 +1,21 @@ +# Cloud Run service artifact — matches catalog-demo/gcp-cloud-run-service schema. +# Emits after the service is fully deployed and the first revision is ready. +# Downstream bundles (Scheduler, Pub/Sub push subscriptions) consume service_url +# and runtime_service_account_member to configure invocation and IAM. +# +# runtime_service_account_email / runtime_service_account_member now reference +# THIS bundle's own runtime SA (created in main.tf), NOT the landing zone SA. 
+ +resource "massdriver_artifact" "cloud_run_service" { + field = "cloud_run_service" + name = "GCP Cloud Run Service ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + service_name = google_cloud_run_v2_service.main.name + service_url = google_cloud_run_v2_service.main.uri + location = google_cloud_run_v2_service.main.location + latest_ready_revision = google_cloud_run_v2_service.main.latest_ready_revision + runtime_service_account_email = local.runtime_sa_email + runtime_service_account_member = local.runtime_sa_member + }) +} diff --git a/bundles/gcp-cloud-run-service/src/iam.tf b/bundles/gcp-cloud-run-service/src/iam.tf new file mode 100644 index 0000000..b489baa --- /dev/null +++ b/bundles/gcp-cloud-run-service/src/iam.tf @@ -0,0 +1,75 @@ +# ─── Upstream Artifact IAM Auto-Binding ─────────────────────────────────────── +# +# This file implements the "auto-binding" pattern for Cloud Run services that +# consume upstream data artifacts. For each optional connection that IS wired on +# the canvas, Terraform grants THIS bundle's runtime service account the minimum- +# privilege role required to use that resource. +# +# The runtime SA (google_service_account.runtime in main.tf) is created by this +# bundle — not inherited from the landing zone. This means each Cloud Run service +# gets its own identity with bindings only to the resources it actually connects to. +# +# HOW IT WORKS +# ──────────── +# Massdriver passes optional connections as null when not wired on the canvas, +# or as a plain object when wired. We detect presence with: var.<connection> != null +# Then use `count = var.<connection> != null ? 1 : 0` to conditionally create +# the binding. No connection → no IAM change. Add connection → binding appears +# on next deploy. Remove connection → binding is destroyed on next deploy. +# +# ROLES GRANTED +# ───────────── +# Pub/Sub topic → roles/pubsub.publisher +# Allows the service to publish messages to the topic. 
Does NOT grant
+# subscription creation or management. For subscriber access, use a separate
+# binding with roles/pubsub.subscriber.
+#
+# BigQuery dataset → roles/bigquery.dataEditor
+# Allows reading, writing, and deleting table data, and creating/deleting
+# tables within the dataset. Does NOT allow dropping the dataset itself.
+# For read-only access, use roles/bigquery.dataViewer instead.
+#
+# Storage bucket → roles/storage.objectUser
+# Allows reading and writing objects (get, list, create, delete). Does NOT
+# grant bucket-level admin (lifecycle, IAM, metadata changes). For read-only
+# access, use roles/storage.objectViewer instead.
+
+# ── Pub/Sub Topic ─────────────────────────────────────────────────────────────
+# Lets the runtime SA publish to the connected topic (only when one is wired).
+# The binding lives on the topic itself, so no other topic is affected.
+
+resource "google_pubsub_topic_iam_member" "runtime_publisher" {
+  count = var.pubsub_topic == null ? 0 : 1
+
+  role    = "roles/pubsub.publisher"
+  member  = local.runtime_sa_member
+  project = var.pubsub_topic.project_id
+  topic   = var.pubsub_topic.topic_name
+}
+
+# ── BigQuery Dataset ───────────────────────────────────────────────────────────
+# Gives the runtime SA roles/bigquery.dataEditor on the connected dataset.
+# A dataset-level binding covers every table in it, present and future;
+# use google_bigquery_table_iam_member when per-table isolation is needed.
+
+resource "google_bigquery_dataset_iam_member" "runtime_data_editor" {
+  count = var.bigquery_dataset == null ? 0 : 1
+
+  role       = "roles/bigquery.dataEditor"
+  member     = local.runtime_sa_member
+  project    = var.bigquery_dataset.project_id
+  dataset_id = var.bigquery_dataset.dataset_id
+}
+
+# ── Storage Bucket ─────────────────────────────────────────────────────────────
+# Gives the runtime SA roles/storage.objectUser on the connected GCS bucket.
+# The binding is bucket-wide: every object in the bucket is readable and writable.
+# Swap in roles/storage.objectViewer when read-only access is enough.
+
+resource "google_storage_bucket_iam_member" "runtime_object_user" {
+  count = var.storage_bucket == null ? 0 : 1
+
+  role   = "roles/storage.objectUser"
+  member = local.runtime_sa_member
+  bucket = var.storage_bucket.bucket_name
+}
diff --git a/bundles/gcp-cloud-run-service/src/main.tf b/bundles/gcp-cloud-run-service/src/main.tf
new file mode 100644
index 0000000..b3f4a3c
--- /dev/null
+++ b/bundles/gcp-cloud-run-service/src/main.tf
@@ -0,0 +1,149 @@
+terraform {
+  required_version = ">= 1.0"
+  required_providers {
+    massdriver = {
+      version = "~> 1.3"
+      source  = "massdriver-cloud/massdriver"
+    }
+    google = {
+      version = "~> 6.0"
+      source  = "hashicorp/google"
+    }
+  }
+}
+
+provider "google" {
+  credentials = jsonencode(var.gcp_authentication)
+  project     = var.gcp_authentication.project_id
+}
+
+locals {
+  name_prefix = var.md_metadata.name_prefix
+  project_id  = var.landing_zone.project_id
+  region      = var.landing_zone.network.region
+
+  # This bundle owns its runtime SA; nothing is inherited from the landing zone.
+  # Both values derive from the google_service_account.runtime resource below.
+  # Reference these locals wherever an SA principal is needed (iam.tf, artifacts.tf).
+  runtime_sa_email  = google_service_account.runtime.email
+  runtime_sa_member = "serviceAccount:${local.runtime_sa_email}"
+}
+
+# ─── Runtime Service Account ──────────────────────────────────────────────────
+# Each Cloud Run service instance creates its own SA. This is the identity the
+# service runs as and the principal that IAM bindings in iam.tf grant access to.
+#
+# account_id is derived from name_prefix and capped at 30 chars (GCP limit is 30).
+# The SA is created in the landing zone's project — the project that owns the
+# Cloud Run service and the upstream data resources.
+#
+# IMPORTANT: This SA is destroyed and recreated if the name_prefix changes (e.g.,
+# if the package is renamed).
That is a destructive operation — downstream IAM
+# bindings referencing the old email are invalidated. Plan SA naming carefully
+# before first deploy; treat it as immutable after that.
+
+resource "google_service_account" "runtime" {
+  project      = local.project_id
+  account_id   = trimsuffix(substr(local.name_prefix, 0, 30), "-") # truncation must not leave a trailing "-" (invalid SA id)
+  display_name = "Cloud Run Runtime — ${local.name_prefix}"
+  description  = "Runtime identity for Cloud Run service ${local.name_prefix}. Managed by Massdriver."
+}
+
+# ─── Cloud Run v2 Service ──────────────────────────────────────────────────────
+# Uses the v2 API (google_cloud_run_v2_service), which is the current GA surface.
+# The v1 resource (google_cloud_run_service) is deprecated and lacks v2-only
+# features such as direct VPC egress and improved traffic management.
+
+resource "google_cloud_run_v2_service" "main" {
+  project  = local.project_id
+  name     = local.name_prefix
+  location = local.region
+
+  # ── Deletion protection ─────────────────────────────────────────────────────
+  # google provider v6+ defaults deletion_protection = true, which blocks tofu
+  # destroy until a prior apply sets it to false. That's fine for production but
+  # friction for dev/test. Expose as a param, default false — Cloud Run services
+  # are stateless compute, destroy does not lose data. Operators can flip to true
+  # in production to require a two-step destroy.
+  deletion_protection = var.deletion_protection
+
+  # ── Ingress ─────────────────────────────────────────────────────────────────
+  # Controls which sources can reach this service; any unrecognized value falls through to INGRESS_TRAFFIC_ALL.
+  # Changing ingress triggers a full revision replacement (cold start expected).
+  ingress = upper(var.ingress) == "INTERNAL-AND-CLOUD-LOAD-BALANCING" ? "INGRESS_TRAFFIC_INTERNAL_LOAD_BALANCER" : (
+    upper(var.ingress) == "INTERNAL" ? "INGRESS_TRAFFIC_INTERNAL_ONLY" : "INGRESS_TRAFFIC_ALL"
+  )
+
+  template {
+    # ── Runtime identity ──────────────────────────────────────────────────────
+    # Run every revision as this bundle's own runtime service account (created above).
+    # iam.tf grants this SA the minimum required roles on any connected upstream artifact.
+    service_account = local.runtime_sa_email
+
+    # ── Scaling ───────────────────────────────────────────────────────────────
+    # min_instance_count > 0 disables scale-to-zero. You pay for idle capacity.
+    scaling {
+      min_instance_count = var.min_instances
+      max_instance_count = var.max_instances
+    }
+
+    containers {
+      image = var.image
+
+      ports {
+        container_port = var.port
+      }
+
+      resources {
+        limits = {
+          cpu    = var.cpu
+          memory = var.memory
+        }
+      }
+    }
+
+    # ── VPC Connector ─────────────────────────────────────────────────────────
+    # Only configured when vpc_connector is wired on the canvas.
+    # connector_id is the fully-qualified resource name provided by the
+    # catalog-demo/gcp-vpc-connector artifact. egress is controlled by the
+    # vpc_egress param — use ALL_TRAFFIC to force all outbound traffic (including
+    # public destinations) through the VPC, e.g. for Kafka on a private endpoint.
+    dynamic "vpc_access" {
+      for_each = var.vpc_connector != null ? [1] : []
+      content {
+        connector = var.vpc_connector.connector_id
+        egress    = var.vpc_egress
+      }
+    }
+  }
+
+  labels = var.md_metadata.default_tags
+
+  lifecycle {
+    ignore_changes = [
+      # Ignore drift on the revision template's labels so labels written outside
+      # Terraform are not reverted. NOTE(review): this does not cover `traffic` splits.
+      template[0].labels,
+    ]
+  }
+}
+
+# ─── Public Invoker IAM ────────────────────────────────────────────────────────
+# Only created when allow_unauthenticated = true. Grants roles/run.invoker to
+# allUsers, making the .run.app URL publicly accessible without a Bearer token.
+# When false, callers must present a valid GCP identity token.
+#
+# Note: This IAM binding is independent of ingress. You can have:
+#   ingress=all + allow_unauthenticated=false → public network, authenticated
+#   ingress=all + allow_unauthenticated=true → fully public (anonymous access)
+#   ingress=internal + allow_unauthenticated=false → VPC-only, authenticated
+
+resource "google_cloud_run_v2_service_iam_member" "all_users_invoker" {
+  count = !var.allow_unauthenticated ? 0 : 1
+
+  member   = "allUsers"
+  role     = "roles/run.invoker"
+  name     = google_cloud_run_v2_service.main.name
+  location = local.region
+  project  = local.project_id
+}
diff --git a/bundles/gcp-cloud-run-service/src/push_subscription.tf b/bundles/gcp-cloud-run-service/src/push_subscription.tf
new file mode 100644
index 0000000..7a12041
--- /dev/null
+++ b/bundles/gcp-cloud-run-service/src/push_subscription.tf
@@ -0,0 +1,106 @@
+# ─── Pub/Sub Push Subscription ────────────────────────────────────────────────
+#
+# This file is active only when `incoming_topic` is wired on the canvas.
+# It creates everything needed for Pub/Sub to authenticate and invoke this
+# Cloud Run service via HTTP push.
+#
+# TWO SERVICE ACCOUNT PATTERN
+# ───────────────────────────
+# This uses two separate service accounts with distinct purposes:
+#
+# google_service_account.runtime (created in main.tf)
+#   ↳ The identity the container RUNS AS. It holds IAM bindings for
+#     Pub/Sub publisher, BigQuery, GCS, etc. — resources the app accesses.
+#     DO NOT use this SA for the push subscription OIDC token.
+#
+# google_service_account.push_invoker (created in THIS file)
+#   ↳ The identity Pub/Sub uses to INVOKE the Cloud Run service via HTTP.
+#     It is granted only roles/run.invoker on this specific service.
+#     Pub/Sub attaches an OIDC token for this SA to each push request,
+#     which Cloud Run validates before passing the request to the container.
+#
+# Separating these SAs means a compromised push subscription token cannot be
+# used to publish messages or access data resources, and the runtime SA cannot
+# be used to forge push deliveries from other topics.
+#
+# FLOW
+# ────
+# Pub/Sub publishes a message to incoming_topic
+#   → Pub/Sub's push delivery thread attaches an OIDC token for push_invoker SA
+#   → Cloud Run validates the token → roles/run.invoker check passes
+#   → Request is routed to the container (running as the runtime SA)
+#   → Container processes the message and returns 2xx to acknowledge
+
+# ─── Push Invoker Service Account ─────────────────────────────────────────────
+# Identity that Pub/Sub presents (via OIDC) when it pushes to this service.
+# The id is name_prefix cut to 28 chars plus "-p", keeping it within GCP's
+# 30-char limit and distinct from the runtime SA built from the same prefix.
+
+resource "google_service_account" "push_invoker" {
+  count = var.incoming_topic == null ? 0 : 1
+
+  account_id   = "${substr(local.name_prefix, 0, 28)}-p"
+  project      = local.project_id
+  description  = "Used by Pub/Sub to invoke Cloud Run service ${local.name_prefix} via OIDC push. Managed by Massdriver."
+  display_name = "Pub/Sub Push Invoker — ${local.name_prefix}"
+}
+
+# ─── Grant push_invoker SA run.invoker on THIS service ────────────────────────
+# The binding is attached to this one Cloud Run service, not to the project.
+# roles/run.invoker is the only role granted to the push invoker SA here.
+
+resource "google_cloud_run_v2_service_iam_member" "push_invoker" {
+  count = var.incoming_topic == null ? 0 : 1
+
+  member   = "serviceAccount:${google_service_account.push_invoker[0].email}"
+  role     = "roles/run.invoker"
+  name     = google_cloud_run_v2_service.main.name
+  location = local.region
+  project  = local.project_id
+}
+
+# ─── Pub/Sub Push Subscription ────────────────────────────────────────────────
+# Subscribes to incoming_topic and delivers messages to this service's URL.
+#
+# push_endpoint: the service's root URI (provided by the Cloud Run v2 API).
+# Append a path (e.g., /events) in the service code or override push_endpoint
+# to a path — Pub/Sub appends nothing by default.
+#
+# oidc_token: Pub/Sub attaches a signed OIDC token for push_invoker SA on every
+# request. Cloud Run validates the token and checks run.invoker before routing.
+# audience defaults to the push_endpoint URL, which is the correct value for
+# Cloud Run OIDC validation.
+#
+# ack_deadline_seconds: if the service does not return 2xx within this window,
+# Pub/Sub redelivers the message. Max is 600s. Long-running handlers must either
+# acknowledge early (return 2xx, then process async) or stay well under the limit.
+#
+# retry_policy: exponential backoff between redeliveries. 10s minimum and 600s
+# maximum are sensible defaults for most event-driven workloads. Tune if your
+# downstream has specific rate constraints.
+
+resource "google_pubsub_subscription" "push" {
+  count = var.incoming_topic == null ? 0 : 1
+
+  name    = "${local.name_prefix}-push"
+  topic   = var.incoming_topic.topic_id
+  project = var.incoming_topic.project_id
+
+  ack_deadline_seconds = var.push_ack_deadline_seconds
+
+  retry_policy {
+    maximum_backoff = "600s"
+    minimum_backoff = "10s"
+  }
+
+  push_config {
+    push_endpoint = google_cloud_run_v2_service.main.uri
+
+    oidc_token {
+      # no explicit audience: it defaults to push_endpoint, which Cloud Run accepts
+      service_account_email = google_service_account.push_invoker[0].email
+    }
+  }
+
+  labels = var.md_metadata.default_tags
+}
diff --git a/bundles/gcp-cloud-run-service/src/variables.tf b/bundles/gcp-cloud-run-service/src/variables.tf
new file mode 100644
index 0000000..6544ab4
--- /dev/null
+++ b/bundles/gcp-cloud-run-service/src/variables.tf
@@ -0,0 +1,171 @@
+variable "md_metadata" {
+  type = object({
+    name_prefix  = string
+    default_tags = optional(map(string), {})
+  })
+}
+
+variable "gcp_authentication" {
+  sensitive = true
+  type = object({
+    type                        = string
+    project_id                  = string
+    private_key_id              = string
+    private_key                 = string
+    client_email                = string
+    client_id                   = string
+    auth_uri                    = string
+    token_uri                   = string
+    auth_provider_x509_cert_url = string
+    client_x509_cert_url        = string
+  })
+}
+
+variable "landing_zone" {
+  type = object({
+    project_id = string
+    network = object({
+      network_name      = string
+      network_self_link = string
+      region            = string
+      primary_subnet = object({
+        name      = string
+        cidr      = string
+        self_link = string
+      })
+    })
+    enabled_apis = list(string)
+    budget = object({
+      enabled            = bool
+      budget_name        = optional(string)
+      billing_account_id = optional(string)
+      amount_usd         = optional(number)
+    })
+  })
+}
+
+# ─── Optional upstream artifact connections ────────────────────────────────────
+# These variables are null when the connection is not wired on the canvas.
+# Massdriver passes optional connections as a plain object or null — NOT a list.
+# iam.tf uses count = var.<connection> != null ?
1 : 0 to conditionally create IAM
+# bindings, and references fields directly (e.g., var.pubsub_topic.topic_name).
+
+variable "pubsub_topic" {
+  description = "Optional Pub/Sub topic connection. When provided, the runtime SA is granted roles/pubsub.publisher on the topic."
+  default     = null
+  type = object({
+    project_id     = string
+    topic_name     = string
+    topic_id       = string
+    dlq_topic_name = optional(string)
+    dlq_topic_id   = optional(string)
+  })
+}
+
+variable "bigquery_dataset" {
+  description = "Optional BigQuery dataset connection. When provided, the runtime SA is granted roles/bigquery.dataEditor on the dataset."
+  default     = null
+  type = object({
+    project_id        = string
+    dataset_id        = string
+    dataset_full_name = string
+    location          = string
+    friendly_name     = optional(string)
+  })
+}
+
+variable "storage_bucket" {
+  description = "Optional GCS bucket connection. When provided, the runtime SA is granted roles/storage.objectUser on the bucket."
+  default     = null
+  type = object({
+    project_id       = string
+    bucket_name      = string
+    bucket_url       = string
+    bucket_self_link = string
+    location         = string
+    storage_class    = string
+  })
+}
+
+variable "incoming_topic" {
+  description = "Optional Pub/Sub topic connection. When provided, a push subscription is created that delivers messages from this topic to this Cloud Run service's URL. Uses a dedicated push_invoker SA for OIDC authentication."
+  default     = null
+  type = object({
+    project_id     = string
+    topic_name     = string
+    topic_id       = string
+    dlq_topic_name = optional(string)
+    dlq_topic_id   = optional(string)
+  })
+}
+
+variable "vpc_connector" {
+  description = "Optional VPC connector connection. When provided, the Cloud Run service's vpc_access block is configured with the connector for private VPC egress."
+  default     = null
+  type = object({
+    project_id      = string
+    region          = string
+    name            = string
+    connector_id    = string
+    network         = optional(string)
+    ip_cidr_range   = optional(string)
+    egress_settings = optional(string)
+  })
+}
+
+# ─── Service params ────────────────────────────────────────────────────────────
+
+variable "image" {
+  default = "gcr.io/cloudrun/hello"
+  type    = string
+}
+
+variable "port" {
+  default = 8080
+  type    = number
+}
+
+variable "cpu" {
+  default = "1"
+  type    = string
+}
+
+variable "memory" {
+  default = "512Mi"
+  type    = string
+}
+
+variable "min_instances" {
+  default = 0
+  type    = number
+}
+
+variable "max_instances" {
+  default = 100
+  type    = number
+}
+
+variable "ingress" {
+  default = "internal"
+  type    = string
+}
+
+variable "allow_unauthenticated" {
+  default = false
+  type    = bool
+}
+
+variable "push_ack_deadline_seconds" {
+  default = 60
+  type    = number
+}
+
+variable "vpc_egress" {
+  default = "PRIVATE_RANGES_ONLY"
+  type    = string
+}
+
+variable "deletion_protection" {
+  default = false
+  type    = bool
+}
diff --git a/bundles/gcp-landing-zone/README.md b/bundles/gcp-landing-zone/README.md
new file mode 100644
index 0000000..53417fe
--- /dev/null
+++ b/bundles/gcp-landing-zone/README.md
@@ -0,0 +1,101 @@
+# gcp-landing-zone
+
+Project-level governance construct for a GCP data platform. Deploy this once per environment before any workload bundles.
+ +- Enables GCP service APIs required by your data platform stack +- Applies project-level IAM bindings for human operators and groups (e.g., `roles/viewer` to `group:data-analysts@example.com`) +- Enforces org-policy guardrails at the project level (e.g., disable SA key creation, block public GCS access) +- Optionally configures a billing budget with spend-threshold email alerts +- Folds the input `gcp-network` artifact into its own `landing_zone` output so downstream bundles need only one connection + +**This bundle does NOT provision workload service accounts.** Each consumer bundle (Cloud Run, Vertex Workbench) creates its own runtime SA with least-privilege bindings on the specific resources it uses. Project-level IAM here is for human operators and group access only. + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_project_service.apis` | API enablement (one per API) | `disable_on_destroy = false` — removing an API from params does not disable it in GCP | +| `google_project_iam_member.operators` | Project IAM bindings | One resource per `{role, member}` entry; additive, non-authoritative | +| `google_project_organization_policy.guardrails` | Org policy constraints | Project-scoped; one resource per constraint | +| `google_billing_budget.environment` | Billing budget | Created only when `budget.enabled = true` | +| `google_monitoring_notification_channel.budget_email` | Email alert channel | Created only when budget is enabled and `notification_emails` is non-empty | + +## Connections + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `network` | `catalog-demo/gcp-network` | Network metadata passed through into the `landing_zone` artifact for downstream use | + +## Artifact Produced + +**Artifact type:** `catalog-demo/gcp-landing-zone` + +Downstream bundles connect to this one 
artifact to get everything they need. + +| Field | Description | +|---|---| +| `project_id` | GCP project ID | +| `network.network_name` | VPC name (passed through from input) | +| `network.network_self_link` | VPC self-link URI | +| `network.region` | Subnet region | +| `network.primary_subnet.name` | Subnet name | +| `network.primary_subnet.cidr` | Subnet CIDR range | +| `network.primary_subnet.self_link` | Subnet self-link URI | +| `enabled_apis` | List of APIs enabled by this landing zone | +| `iam_bindings` | Informational list of project-level `{role, member}` bindings applied — audit trail only, not consumed by downstream bundles | +| `budget.enabled` | Whether a budget was configured | +| `budget.budget_name` | Budget display name (null when disabled) | +| `budget.billing_account_id` | Billing account the budget is attached to (null when disabled) | +| `budget.amount_usd` | Monthly budget limit in USD (null when disabled) | + +## IAM Pattern for Consumer Bundles + +Each consumer bundle creates its own service account and binds it to the specific resources it uses. The landing zone does not provide or share a workload SA. Example: + +```hcl +resource "google_service_account" "runtime" { + project = var.landing_zone.project_id + account_id = "${var.md_metadata.name_prefix}-sa" + display_name = "Runtime SA for ${var.md_metadata.name_prefix}" +} + +resource "google_bigquery_dataset_iam_member" "runtime_editor" { + dataset_id = var.bigquery_dataset.dataset_id + role = "roles/bigquery.dataEditor" + member = "serviceAccount:${google_service_account.runtime.email}" +} +``` + +The artifact policy comments in `gcp-pubsub-topic`, `gcp-bigquery-dataset`, and `gcp-storage-bucket` source files are the canonical role-binding reference. 
+ +## Compliance + +### Hardcoded security controls + +| Control | Mechanism | Reason | +|---|---|---| +| Additive IAM only | `google_project_iam_member` (per-binding, non-authoritative) | Avoids clobbering bindings set by GCP defaults or other automation | +| APIs not disabled on destroy | `disable_on_destroy = false` | Prevents accidental disruption of other resources that depend on the same APIs | + +### Checkov skips + +| Check | Reason | +|---|---| +| `CKV_GCP_118` | Skipped on `google_project_service` — API enablement resources do not accept IAM policies | + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- The GCP project already exists — this bundle does not create projects. +- The `gcp_authentication` credential has `iam.admin`, `serviceusage.serviceUsageAdmin`, `orgpolicy.policy.set` (project scope), and (if using budgets) `billing.budgets.create` IAM. +- Cloud Billing must be linked to the project before budgets can be created. +- `billingbudgets.googleapis.com` must be in `enabled_apis` when `budget.enabled = true`. + +## Presets + +| Preset | Budget | Notable APIs | +|---|---|---| +| Standard (no budget) | Disabled | compute, iam, resourcemanager, serviceusage, run, bigquery, storage, pubsub, aiplatform, notebooks, logging, monitoring | +| Standard (with budget) | Enabled — $500/mo, alerts at 50%/90%/100% | All of the above plus billingbudgets; example org policies: disable SA keys, block public GCS, require OS Login | diff --git a/bundles/gcp-landing-zone/massdriver.yaml b/bundles/gcp-landing-zone/massdriver.yaml new file mode 100644 index 0000000..f951ec3 --- /dev/null +++ b/bundles/gcp-landing-zone/massdriver.yaml @@ -0,0 +1,288 @@ +name: gcp-landing-zone +description: Project-level governance construct for a GCP data platform. 
Enables required + service APIs, applies project-level IAM bindings for human operators and groups, enforces + org-policy guardrails, configures a billing budget with threshold alerts, and emits a + single landing-zone artifact so downstream bundles only need one connection. Does NOT + provision workload service accounts — each consumer bundle creates its own. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-landing-zone +version: 0.1.1 + +params: + required: + - enabled_apis + - budget + - iam_bindings + - org_policies + examples: + - __name: Standard (no budget) + enabled_apis: + - compute.googleapis.com + - iam.googleapis.com + - cloudresourcemanager.googleapis.com + - serviceusage.googleapis.com + - run.googleapis.com + - bigquery.googleapis.com + - storage.googleapis.com + - pubsub.googleapis.com + - aiplatform.googleapis.com + - notebooks.googleapis.com + - logging.googleapis.com + - monitoring.googleapis.com + budget: + enabled: false + iam_bindings: [] + org_policies: [] + - __name: Standard (with budget) + enabled_apis: + - compute.googleapis.com + - iam.googleapis.com + - cloudresourcemanager.googleapis.com + - serviceusage.googleapis.com + - billingbudgets.googleapis.com + - run.googleapis.com + - bigquery.googleapis.com + - storage.googleapis.com + - pubsub.googleapis.com + - aiplatform.googleapis.com + - notebooks.googleapis.com + - logging.googleapis.com + - monitoring.googleapis.com + budget: + enabled: true + billing_account_id: "012345-ABCDEF-012345" + amount: 500 + threshold_percentages: + - 50 + - 90 + - 100 + notification_emails: + - platform-alerts@example.com + iam_bindings: [] + org_policies: + - constraint: constraints/iam.disableServiceAccountKeyCreation + enforced: true + - constraint: constraints/storage.publicAccessPrevention + enforced: true + - constraint: constraints/compute.requireOsLogin + enforced: true + properties: + enabled_apis: + title: Enabled APIs + description: GCP service APIs to 
enable in this project. Select from the list. + Add billingbudgets.googleapis.com if you enable a budget below. + type: array + uniqueItems: true + items: + type: string + enum: + - compute.googleapis.com + - iam.googleapis.com + - cloudresourcemanager.googleapis.com + - serviceusage.googleapis.com + - billingbudgets.googleapis.com + - run.googleapis.com + - bigquery.googleapis.com + - storage.googleapis.com + - pubsub.googleapis.com + - aiplatform.googleapis.com + - notebooks.googleapis.com + - logging.googleapis.com + - monitoring.googleapis.com + default: + - compute.googleapis.com + - iam.googleapis.com + - cloudresourcemanager.googleapis.com + - serviceusage.googleapis.com + + iam_bindings: + title: Project IAM Bindings + description: Project-level IAM bindings for human operators and groups. Each entry + grants a single member a role on this project. Uses google_project_iam_member + (additive) — will not clobber bindings set outside Terraform. Intended for + humans and groups, not workload service accounts (those are created by consumer bundles). + Empty array is valid; bindings are optional. + type: array + default: [] + items: + type: object + required: + - role + - member + properties: + role: + title: IAM Role + description: GCP role identifier, e.g. roles/viewer or roles/bigquery.dataViewer + type: string + examples: + - roles/viewer + - roles/bigquery.dataViewer + - roles/storage.objectViewer + member: + title: Member + description: IAM member string. Prefix with user:, group:, or serviceAccount:. + type: string + examples: + - user:alice@example.com + - group:data-analysts@example.com + + org_policies: + title: Org Policy Guardrails + description: Project-scoped org-policy constraints to enforce security posture. + Uses google_project_organization_policy applied at the project level (not org-wide). + Each entry enforces or denies a named constraint. Empty array is valid. + Common constraints are listed in the preset above. 
+ type: array + default: [] + items: + type: object + required: + - constraint + - enforced + properties: + constraint: + title: Constraint + description: Org policy constraint name, e.g. constraints/iam.disableServiceAccountKeyCreation + type: string + examples: + - constraints/iam.disableServiceAccountKeyCreation + - constraints/storage.publicAccessPrevention + - constraints/compute.requireOsLogin + - constraints/compute.vmExternalIpAccess + enforced: + title: Enforced + description: When true, the boolean constraint is enforced. For list constraints + (e.g. vmExternalIpAccess), use enforced=true to deny all values. + type: boolean + default: true + + budget: + title: Billing Budget + description: Optionally configure a GCP billing budget with spend alerts for this environment. + Enable billingbudgets.googleapis.com in the API list above when using this feature. + type: object + required: + - enabled + properties: + enabled: + type: boolean + title: Enable Budget + description: Create a GCP billing budget with threshold email alerts + default: false + dependencies: + enabled: + oneOf: + - properties: + enabled: + const: true + billing_account_id: + title: Billing Account ID + description: GCP billing account to attach the budget to. Find it in + Cloud Console under Billing > Account management (format XXXXXX-XXXXXX-XXXXXX). + Cloud Billing must be enabled on the account. + type: string + pattern: "^[0-9A-Fa-f]{6}-[0-9A-Za-z]{6}-[0-9A-Za-z]{6}$" + examples: + - 015537-E00AAA-3F7EDD + amount: + title: Budget Amount (USD) + description: Monthly spend limit in US dollars + type: number + minimum: 1 + default: 500 + threshold_percentages: + title: Alert Thresholds (%) + description: Percentage spend thresholds at which email alerts are triggered. + E.g. 50 = 50%, 90 = 90%, 100 = 100%. 
+ type: array + minItems: 1 + maxItems: 5 + default: + - 50 + - 90 + - 100 + items: + type: number + minimum: 1 + maximum: 150 + notification_emails: + title: Notification Emails + description: Email addresses to notify when spend thresholds are crossed (optional) + type: array + items: + type: string + format: email + required: + - billing_account_id + - amount + - threshold_percentages + - properties: + enabled: + const: false + +connections: + required: + - gcp_authentication + - network + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + network: + $ref: catalog-demo/gcp-network + title: GCP Network + +artifacts: + required: + - landing_zone + properties: + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - enabled_apis + - iam_bindings + - org_policies + - budget + - "*" + properties: + budget: + ui:order: + - enabled + - billing_account_id + - amount + - threshold_percentages + - notification_emails + - "*" + properties: + enabled: + ui:widget: checkbox + threshold_percentages: + ui:options: + orderable: false + items: + ui:title: "Threshold (%)" + iam_bindings: + items: + ui:order: + - role + - member + - "*" + org_policies: + items: + ui:order: + - constraint + - enforced + - "*" + properties: + enforced: + ui:widget: checkbox diff --git a/bundles/gcp-landing-zone/operator.md b/bundles/gcp-landing-zone/operator.md new file mode 100644 index 0000000..a8bb171 --- /dev/null +++ b/bundles/gcp-landing-zone/operator.md @@ -0,0 +1,101 @@ +--- +templating: mustache +--- + +# GCP Landing Zone — Operator Runbook + +## Non-obvious constraints + +**This bundle manages project-level IAM for humans and groups, not workload service accounts.** Consumer bundles (Cloud Run, Vertex Workbench) own their own runtime 
SAs. Do not add workload SAs here. + +**IAM bindings are additive and only removed when explicitly deleted from params.** `google_project_iam_member` does not reconcile the full project IAM policy. Removing a binding from params and redeploying destroys only that specific binding resource — all other project-level bindings remain untouched. + +**Org policies are project-scoped, not org-wide.** `google_project_organization_policy` applies constraints at the project level only. Org-wide enforcement requires setting the policy at the org node, which is out of scope for this bundle. + +**Removing an API from `enabled_apis` does not disable it in GCP.** `disable_on_destroy = false` means Terraform removes the state entry but never calls the GCP disable API. The API stays enabled. To actually disable it, run `gcloud services disable` manually after confirming no resources depend on it. + +**Budget requires Cloud Billing linked to the project.** If deploy fails with a billing budget error, confirm the project has a billing account attached in the GCP console before enabling the budget param. + +**Budget alert emails require a verified notification channel.** The Google Cloud Monitoring email channel must be verified in GCP before alerts deliver. Billing admins on the account always receive alerts regardless of channel configuration. + +**Newly added APIs can take 1–2 minutes to propagate.** If a downstream bundle deploy fails immediately after adding an API here, wait ~60 seconds and retry. + +## Troubleshooting + +**Downstream bundle fails with "API has not been used in project X."** +Add the required API to `enabled_apis` in this package, deploy, wait ~60 seconds, then retry the downstream bundle. 
+ +Common APIs for this data platform: +- `pubsub.googleapis.com` — required for gcp-pubsub-topic +- `bigquery.googleapis.com` — required for gcp-bigquery-dataset +- `run.googleapis.com` — required for gcp-cloud-run-service +- `storage.googleapis.com` — required for gcp-storage-bucket +- `billingbudgets.googleapis.com` — required when budget is enabled + +To check which APIs are currently enabled: +```bash +gcloud services list --enabled --project={{artifacts.landing_zone.project_id}} +``` + +**Budget not enabled because billing API is missing.** +```bash +gcloud services list --enabled --project={{artifacts.landing_zone.project_id}} | grep billingbudgets +``` +If nothing returns, add `billingbudgets.googleapis.com` to `enabled_apis` and redeploy before enabling the budget. + +**Org policy apply fails with "403 PERMISSION_DENIED".** +The deploy credential needs `orgpolicy.policy.set` at the project level: +```bash +gcloud projects add-iam-policy-binding {{artifacts.landing_zone.project_id}} \ + --member="serviceAccount:<DEPLOY_SA_EMAIL>" \ + --role="roles/orgpolicy.policyAdmin" +``` + +**An IAM binding appears in GCP but is not in params.** +If the binding was added outside Terraform, it will not be touched by Massdriver. To remove it, use `gcloud` or the Console. + +## Day-2 operations + +**Adding a human operator binding:** Add `{role, member}` to `iam_bindings` and redeploy. Additive — no existing bindings are touched. + +**Removing a human operator binding:** Remove the entry from `iam_bindings` and redeploy. Only that specific binding resource is destroyed. + +**Adding an org policy constraint:** Add `{constraint, enforced}` to `org_policies` and redeploy. + +**Removing an org policy constraint:** Remove the entry from `org_policies` and redeploy. The org's inherited policy (if any) applies after removal. + +**Adding APIs after initial deploy:** Update `enabled_apis` and redeploy. Existing APIs are not touched.
+ +**Disabling an API:** Remove it from `enabled_apis` and redeploy. Terraform drops the state entry but does NOT call the GCP disable API. Manually disable via `gcloud services disable` if needed. + +**Changing budget amount or alert thresholds:** Update params and redeploy. The `google_billing_budget` resource updates in-place. + +**Disabling the budget after it was enabled:** Set `budget.enabled = false` and redeploy. The budget and notification channel are destroyed. + +**Rotating the deploy credential:** Update the GCP credential in the Massdriver UI under environment credential settings, then redeploy. + +## Useful commands + +```bash +# List enabled APIs in the project +gcloud services list --enabled --project={{artifacts.landing_zone.project_id}} + +# Check all project-level IAM bindings +gcloud projects get-iam-policy {{artifacts.landing_zone.project_id}} \ + --format="table(bindings.role,bindings.members)" + +# List active org policy constraints on the project +gcloud resource-manager org-policies list \ + --project={{artifacts.landing_zone.project_id}} + +# Describe a specific org policy constraint +gcloud resource-manager org-policies describe constraints/iam.disableServiceAccountKeyCreation \ + --project={{artifacts.landing_zone.project_id}} + +# List all service accounts in the project (workload SAs are owned by consumer bundles) +gcloud iam service-accounts list --project={{artifacts.landing_zone.project_id}} + +# Manually disable an API (only needed if you removed it from enabled_apis and want it actually off) +gcloud services disable API_NAME.googleapis.com \ + --project={{artifacts.landing_zone.project_id}} +``` diff --git a/bundles/gcp-landing-zone/src/.checkov.yml b/bundles/gcp-landing-zone/src/.checkov.yml new file mode 100644 index 0000000..c43c062 --- /dev/null +++ b/bundles/gcp-landing-zone/src/.checkov.yml @@ -0,0 +1,3 @@ +skip-check: + # CKV_GCP_118: google_project_service — no IAM policy needed on API enablement resources + - CKV_GCP_118 diff
--git a/bundles/gcp-landing-zone/src/artifacts.tf b/bundles/gcp-landing-zone/src/artifacts.tf new file mode 100644 index 0000000..195dceb --- /dev/null +++ b/bundles/gcp-landing-zone/src/artifacts.tf @@ -0,0 +1,45 @@ +# Single landing-zone artifact — combines network, enabled APIs, budget reference, +# and an informational summary of the IAM bindings applied at project level. +# Downstream bundles connect to this one artifact to get project_id, network, and +# the list of enabled APIs. Each consumer bundle creates its own workload SA. + +resource "massdriver_artifact" "landing_zone" { + field = "landing_zone" + name = "GCP Landing Zone ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + + network = { + network_name = var.network.network_name + network_self_link = var.network.network_self_link + region = var.network.region + primary_subnet = var.network.primary_subnet + } + + enabled_apis = var.enabled_apis + + # iam_bindings carries an informational summary of what project-level IAM was applied. + # Downstream bundles do not consume this — it is an audit trail for operators. + iam_bindings = [ + for binding in var.iam_bindings : { + role = binding.role + member = binding.member + } + ] + + # budget is always present in the artifact for schema conformance. + # When disabled, fields carry null/empty sentinel values so downstream + # bundles can safely check landing_zone.budget.enabled before using them. + budget = var.budget.enabled ? 
{ + enabled = true + budget_name = google_billing_budget.environment[0].display_name + billing_account_id = var.budget.billing_account_id + amount_usd = var.budget.amount + } : { + enabled = false + budget_name = null + billing_account_id = null + amount_usd = null + } + }) +} diff --git a/bundles/gcp-landing-zone/src/main.tf b/bundles/gcp-landing-zone/src/main.tf new file mode 100644 index 0000000..b644149 --- /dev/null +++ b/bundles/gcp-landing-zone/src/main.tf @@ -0,0 +1,155 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +provider "google-beta" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.gcp_authentication.project_id + name_prefix = var.md_metadata.name_prefix +} + +# ─── Service APIs ───────────────────────────────────────────────────────────── + +resource "google_project_service" "apis" { + for_each = toset(var.enabled_apis) + + project = local.project_id + service = each.value + + # Do not disable the API on destroy — other resources in the project may depend on it + disable_on_destroy = false +} + +# ─── Project IAM Bindings (human operators / groups) ───────────────────────── +# Non-authoritative (google_project_iam_member) — one resource per binding. +# This will NOT remove any bindings set outside of Terraform. +# Intended for humans and groups who need project-level access (e.g., viewers, +# billing admins). Workload service accounts are NOT managed here — each consumer +# bundle creates its own runtime SA. 
+ +resource "google_project_iam_member" "operators" { + for_each = { + for binding in var.iam_bindings : + "${binding.role}/${binding.member}" => binding + } + + project = local.project_id + role = each.value.role + member = each.value.member + + depends_on = [google_project_service.apis] +} + +# ─── Org Policy Guardrails (project-scoped) ─────────────────────────────────── +# Applied at the project level — does not affect other projects in the org. +# Boolean constraints: enforce = true/false as configured. +# List constraints (e.g. vmExternalIpAccess) are NOT handled here: only boolean_policy is set, so no deny_all list policy is created. +# +# Common useful constraints: +# constraints/iam.disableServiceAccountKeyCreation — prevents user-managed SA keys +# constraints/storage.publicAccessPrevention — blocks public GCS bucket access +# constraints/compute.requireOsLogin — enforces OS Login on all VMs +# constraints/compute.vmExternalIpAccess — list constraint; needs a list_policy, which this resource does not set + +resource "google_project_organization_policy" "guardrails" { + for_each = { + for policy in var.org_policies : + policy.constraint => policy + } + + project = local.project_id + constraint = each.value.constraint + + boolean_policy { + enforced = each.value.enforced + } + + depends_on = [google_project_service.apis] +} + +# ─── Billing Budget ─────────────────────────────────────────────────────────── +# Requires billingbudgets.googleapis.com enabled and billing.budgets.create IAM. +# Only created when var.budget.enabled == true. The billingbudgets.googleapis.com +# API should be included in enabled_apis when budget is enabled. + +data "google_project" "current" { + project_id = local.project_id + + depends_on = [google_project_service.apis] +} + +resource "google_billing_budget" "environment" { + count = var.budget.enabled ?
1 : 0 + + billing_account = var.budget.billing_account_id + display_name = "Budget — ${local.name_prefix}" + + budget_filter { + projects = ["projects/${data.google_project.current.number}"] + } + + amount { + specified_amount { + currency_code = "USD" + units = tostring(floor(var.budget.amount)) + } + } + + dynamic "threshold_rules" { + for_each = var.budget.threshold_percentages + content { + # threshold_percentages are stored as whole numbers (50, 90, 100) in params + # and converted to fractions (0.5, 0.9, 1.0) for the GCP API + threshold_percent = threshold_rules.value / 100 + spend_basis = "CURRENT_SPEND" + } + } + + all_updates_rule { + monitoring_notification_channels = length(google_monitoring_notification_channel.budget_email) > 0 ? [google_monitoring_notification_channel.budget_email[0].id] : [] + disable_default_iam_recipients = false + } + + depends_on = [google_project_service.apis] +} + +# ─── Budget Email Alert via Monitoring Notification Channel ────────────────── +# Only provisioned when budget is enabled AND notification_emails is non-empty. +# Emails are optional — GCP will still send to billing admins via disable_default_iam_recipients=false. + +resource "google_monitoring_notification_channel" "budget_email" { + count = var.budget.enabled && length(var.budget.notification_emails) > 0 ? 
1 : 0 + + project = local.project_id + display_name = "Budget Alert — ${local.name_prefix}" + type = "email" + + labels = { + email_address = var.budget.notification_emails[0] + } + + depends_on = [google_project_service.apis] +} diff --git a/bundles/gcp-landing-zone/src/variables.tf b/bundles/gcp-landing-zone/src/variables.tf new file mode 100644 index 0000000..8d0580c --- /dev/null +++ b/bundles/gcp-landing-zone/src/variables.tf @@ -0,0 +1,68 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "network" { + type = object({ + project_id = string + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) +} + +variable "enabled_apis" { + type = list(string) +} + +variable "iam_bindings" { + description = "Project-level IAM bindings for human operators/groups. Non-authoritative (additive) — will not remove bindings set outside Terraform." + type = list(object({ + role = string + member = string + })) + default = [] +} + +variable "org_policies" { + description = "Project-scoped org-policy constraints. Boolean constraints are set to enforce=true/false. List constraints with enforced=true deny all values." 
+ type = list(object({ + constraint = string + enforced = bool + })) + default = [] +} + +variable "budget" { + type = object({ + enabled = bool + billing_account_id = optional(string) + amount = optional(number) + threshold_percentages = optional(list(number)) + notification_emails = optional(list(string), []) + }) +} diff --git a/bundles/gcp-log-sink/README.md b/bundles/gcp-log-sink/README.md new file mode 100644 index 0000000..95445cb --- /dev/null +++ b/bundles/gcp-log-sink/README.md @@ -0,0 +1,53 @@ +# gcp-log-sink + +Routes Cloud Logging entries from a GCP project to either a BigQuery dataset or a GCS bucket. Exactly one destination must be wired — the bundle enforces this with a Terraform precondition. The Google-managed sink writer service account is automatically granted the minimum required IAM role on the chosen destination. + +## Use Cases + +- Persistent audit log storage: pipe `cloudaudit.googleapis.com/activity` to GCS for long-term retention at low cost. +- Log-based analytics: route application or infrastructure logs to BigQuery for SQL queries and dashboards. +- Error alerting pipeline: filter `severity >= ERROR` to BigQuery, then query from Vertex Workbench or a BI tool. + +## Resources Created + +| Resource | Description | +|---|---| +| `google_logging_project_sink.main` | Project-scoped Cloud Logging sink with unique writer identity | +| `google_bigquery_dataset_iam_member.sink_writer` | (BigQuery only) Grants sink SA `roles/bigquery.dataEditor` on the dataset | +| `google_storage_bucket_iam_member.sink_writer` | (GCS only) Grants sink SA `roles/storage.objectCreator` on the bucket | + +## Connections + +### Required + +- **GCP Credentials** (`gcp-service-account`) — service account used by Terraform to create and manage the sink. +- **GCP Landing Zone** (`catalog-demo/gcp-landing-zone`) — provides the project ID where the sink is created. 
+ +### Optional Destinations (exactly one must be wired) + +- **BigQuery Dataset** (`catalog-demo/gcp-bigquery-dataset`) — route logs to this dataset. Logs land in tables named after the log type; date-partitioned when `use_partitioned_tables` is enabled. +- **GCS Bucket** (`catalog-demo/gcp-storage-bucket`) — route logs to this bucket. Cloud Logging batches entries hourly into JSON files organized by date and hour. + +If neither or both destinations are wired, `tofu plan` will fail with a clear error message. + +## Artifact Produced + +`catalog-demo/gcp-log-sink` — carries `project_id`, `sink_name`, `destination`, `writer_identity`, and `destination_type`. Downstream bundles rarely need to consume this artifact directly; it is published for observability and chaining. + +## Compliance + +Log sinks are low-risk infrastructure. No Checkov skips are expected. `halt_on_failure` is set to block deployments to `prod`, `prd`, and `production` environments on any compliance failure. + +## Presets + +| Preset | Filter | Partitioned Tables | Notes | +|---|---|---|---| +| Error Logs to BigQuery | `severity >= ERROR` | Yes | Recommended starting point for BigQuery destinations | +| Audit Logs to GCS | `logName = "projects/PROJECT/logs/cloudaudit.googleapis.com%2Factivity"` | No | Update PROJECT to your GCP project ID before deploying | +| All Logs (no filter) | (empty) | Yes | Routes every log entry — can generate significant storage costs | + +## Assumptions + +- This bundle creates a **project-level** sink. It does NOT capture logs from child projects, folders, or the organization. Folder or org sinks are out of scope. +- `unique_writer_identity = true` is non-negotiable. Sharing the project-level logging SA across sinks would mean IAM grants on one sink's destination affect all other sinks. +- Filter changes take effect immediately but do NOT backfill historical logs. Logs written before the filter change are not re-routed. 
diff --git a/bundles/gcp-log-sink/massdriver.yaml b/bundles/gcp-log-sink/massdriver.yaml new file mode 100644 index 0000000..63cff99 --- /dev/null +++ b/bundles/gcp-log-sink/massdriver.yaml @@ -0,0 +1,145 @@ +name: gcp-log-sink +description: Google Cloud Logging project-level sink with configurable filter and + destination. Routes log entries matching the filter to a BigQuery dataset or GCS + bucket. Automatically grants the Google-managed sink writer identity the minimum + required IAM role on the chosen destination. Enforces exactly-one destination via + a Terraform precondition. Emits a gcp-log-sink artifact with sink metadata for + downstream reference. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-log-sink +version: 0.1.1 + +params: + examples: + - __name: Error Logs to BigQuery + filter: "severity >= ERROR" + use_partitioned_tables: true + exclusions: [] + - __name: Audit Logs to GCS + filter: 'logName = "projects/PROJECT/logs/cloudaudit.googleapis.com%2Factivity"' + use_partitioned_tables: false + exclusions: [] + - __name: All Logs (no filter) + filter: "" + use_partitioned_tables: true + exclusions: [] + + properties: + filter: + title: Log Filter + description: >- + Cloud Logging filter query that determines which log entries are routed to + the destination. Leave empty to include all logs (can be expensive). Common + examples: severity >= ERROR / resource.type = "cloud_run_revision" / + logName = "projects/PROJECT/logs/cloudaudit.googleapis.com%2Factivity". + Filter syntax: https://cloud.google.com/logging/docs/view/logging-query-language + type: string + default: "" + + use_partitioned_tables: + title: Use Partitioned Tables (BigQuery only) + description: >- + When the destination is BigQuery, write log entries into date-partitioned + tables instead of a single monolithic table. Partitioning reduces query cost + and improves performance for time-bounded queries. 
Has no effect when the + destination is GCS — the setting is stored but ignored. + type: boolean + default: true + + exclusions: + title: Exclusions + description: >- + Optional list of log exclusions. Each exclusion drops log entries that match + its filter before they reach the destination. Useful for suppressing high-volume + low-value logs (e.g., health-check requests) from storage costs. Exclusions + are evaluated AFTER the sink filter, so they can only drop entries that would + otherwise be included. + type: array + default: [] + items: + type: object + required: + - name + - filter + properties: + name: + title: Exclusion Name + description: >- + Short identifier for this exclusion rule. Must be unique within the sink. + Used in Logging metrics and audit logs to identify which exclusion applied. + type: string + filter: + title: Exclusion Filter + description: >- + Cloud Logging query language filter. Log entries matching this filter + are dropped from the sink. Use the same syntax as the top-level sink + filter field. + type: string + description: + title: Description + description: Optional human-readable explanation of why these logs are excluded. + type: string + disabled: + title: Disabled + description: >- + When true, the exclusion is defined but not active — matching logs are + still routed to the destination. Useful for temporarily suspending an + exclusion without deleting it. 
+ type: boolean + default: false + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + + bigquery_dataset: + $ref: catalog-demo/gcp-bigquery-dataset + title: BigQuery Dataset (optional destination) + + storage_bucket: + $ref: catalog-demo/gcp-storage-bucket + title: GCS Bucket (optional destination) + +artifacts: + required: + - log_sink + properties: + log_sink: + $ref: catalog-demo/gcp-log-sink + title: GCP Log Sink + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - filter + - use_partitioned_tables + - exclusions + - "*" + properties: + use_partitioned_tables: + ui:widget: checkbox + exclusions: + items: + ui:order: + - name + - filter + - description + - disabled + - "*" + properties: + disabled: + ui:widget: checkbox diff --git a/bundles/gcp-log-sink/operator.md b/bundles/gcp-log-sink/operator.md new file mode 100644 index 0000000..82af3e2 --- /dev/null +++ b/bundles/gcp-log-sink/operator.md @@ -0,0 +1,65 @@ +# gcp-log-sink — Operator Runbook + +## Non-obvious constraints + +**Project scope only.** This sink captures logs from the project specified in the landing zone connection. Logs from other projects, child folders, or the organization are not captured. Folder-level and org-level sinks require a different Terraform resource (`google_logging_folder_sink` / `google_logging_organization_sink`) and are out of scope for this bundle. + +**`unique_writer_identity` is locked to `true`.** The Google-managed writer SA is unique per sink. If set to `false`, Cloud Logging would use the shared `cloud-logs@system.gserviceaccount.com` SA, which cannot be individually scoped to a single dataset or bucket. 
Changing this after the sink is created requires destroy and recreate — the writer identity changes. + +**Writer identity is generated at sink creation.** The `writer_identity` SA email is not known before `tofu apply`. It is provisioned by Cloud Logging when the sink resource is created. If the sink is destroyed and recreated (not updated in place), a NEW writer identity SA is generated and all prior IAM bindings on the destination become stale. This bundle re-creates the IAM binding from the new identity, but any manually added bindings on the destination will not. + +**Filter changes are non-backfilling.** Updating the `filter` takes effect immediately for new log entries. Historical logs already written to the destination are not touched. Entries that were routed before the filter change remain in BigQuery or GCS permanently. + +**BigQuery schema is auto-created and can drift.** Cloud Logging infers table schema from log entry structure. If Google changes the structure of a system log (e.g., adds or renames a field), existing tables are not migrated. Queries relying on specific field paths may break. Use `SELECT *` with caution in production pipelines. + +**GCS batching latency.** Cloud Logging batches log entries hourly before writing to GCS. The sink is not suitable for near-real-time querying or alerting. Use BigQuery with `use_partitioned_tables = true` for latency-sensitive use cases. + +**Exactly-one destination is a hard constraint.** The Terraform precondition blocks plan if both or neither optional connections are wired. This check fires before any API calls — you will see the error in the deployment log from the `tofu plan` step. + +## Troubleshooting + +**"precondition failed: Connect either a BigQuery dataset or a Storage bucket"** — Exactly one of the two optional connections (`bigquery_dataset`, `storage_bucket`) must be wired on the canvas. Check the canvas wiring and re-deploy. 
+ +**Sink exists but no logs appear in destination** — Verify the filter is correct by testing it in the Logs Explorer (`console.cloud.google.com/logs/query`) against live traffic before applying it to the sink. An overly restrictive filter results in a valid sink that routes nothing. + +**IAM error: "The caller does not have permission on the resource"** — The sink writer identity SA needs time to propagate after creation. If IAM bindings were applied but the sink was just created, wait 60-90 seconds and check again. If the sink was destroyed and recreated, the writer identity changed — check the artifact `writer_identity` field and verify the IAM binding reflects the new SA. + +**BigQuery tables not appearing after deploy** — Cloud Logging creates tables lazily: the first matching log entry triggers table creation. If no logs match the filter, no tables appear. Confirm by checking Logs Explorer for matching entries, then wait up to 5 minutes. + +**GCS files not appearing** — Cloud Logging writes hourly. Wait at least 90 minutes after deploy before concluding there is a problem. Check the Logs Explorer for entries matching the filter first. + +**"ALREADY_EXISTS" error on sink creation** — A sink with the same name (derived from `md_metadata.name_prefix`) already exists in the project. This happens if a previous deployment left a sink that Terraform state does not track. Import the existing sink: `tofu import google_logging_project_sink.main projects/PROJECT/sinks/SINK_NAME`. + +## Day-2 operations + +**Updating the filter** — Change the `filter` param in the package config and deploy. The sink is updated in place. Filter changes are immediate for new log entries. No restart or recreate needed. + +**Adding an exclusion** — Add an entry to the `exclusions` array and deploy. Exclusions are applied after the sink filter. Use the Logs Explorer to validate the exclusion filter matches what you intend before deploying to production. 
+ +**Switching destinations** — Changing from BigQuery to GCS (or vice versa) requires the opposite connection to be wired AND the currently wired connection to be unwired simultaneously. The precondition blocks any state where both or neither are active. Execute the connection change and re-deploy in a single operation. The old IAM binding is removed and a new one is created. The sink name and writer identity do not change. + +**Decommissioning** — Destroying the bundle removes the sink and the IAM binding. Log entries already in the destination (BigQuery tables or GCS objects) are NOT deleted — they remain in the destination resource and accrue storage cost until manually removed. + +## Useful Commands + +```bash +# List sinks in the project +gcloud logging sinks list --project=PROJECT_ID + +# Describe a specific sink +gcloud logging sinks describe SINK_NAME --project=PROJECT_ID + +# Check sink writer identity (useful for manual IAM debugging) +gcloud logging sinks describe SINK_NAME --project=PROJECT_ID --format="value(writerIdentity)" + +# Test a log filter from the CLI before applying it to the sink (prints up to 10 matching entries) +gcloud logging read 'severity >= ERROR' --project=PROJECT_ID --limit=10 + +# Check project-level IAM for sink writer SAs (the dataset-level grant made by this bundle is not listed here; inspect it with: bq show --format=prettyjson PROJECT_ID:DATASET_ID) +gcloud projects get-iam-policy PROJECT_ID --flatten="bindings[].members" \ + --format="table(bindings.role,bindings.members)" \ + --filter="bindings.members:gcp-sa-logging" + +# Import an orphaned sink into Terraform state +tofu import google_logging_project_sink.main projects/PROJECT_ID/sinks/SINK_NAME +``` diff --git a/bundles/gcp-log-sink/src/artifacts.tf b/bundles/gcp-log-sink/src/artifacts.tf new file mode 100644 index 0000000..17f3534 --- /dev/null +++ b/bundles/gcp-log-sink/src/artifacts.tf @@ -0,0 +1,13 @@ +# Log sink artifact — matches catalog-demo/gcp-log-sink schema.
+ +resource "massdriver_artifact" "log_sink" { + field = "log_sink" + name = "GCP Log Sink ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + sink_name = google_logging_project_sink.main.name + destination = google_logging_project_sink.main.destination + writer_identity = google_logging_project_sink.main.writer_identity + destination_type = local.destination_type + }) +} diff --git a/bundles/gcp-log-sink/src/main.tf b/bundles/gcp-log-sink/src/main.tf new file mode 100644 index 0000000..a790513 --- /dev/null +++ b/bundles/gcp-log-sink/src/main.tf @@ -0,0 +1,97 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + + # Resolve which destination connection is wired. Exactly one must be non-null. + # The precondition on google_logging_project_sink.main enforces this at plan time. + has_bigquery = var.bigquery_dataset != null + has_gcs = var.storage_bucket != null + + destination = local.has_bigquery ? ( + "bigquery.googleapis.com/projects/${var.bigquery_dataset.project_id}/datasets/${var.bigquery_dataset.dataset_id}" + ) : local.has_gcs ? ( + "storage.googleapis.com/${var.storage_bucket.bucket_name}" + ) : "" + + destination_type = local.has_bigquery ? "bigquery" : "gcs" +} + +# ─── Cloud Logging Project Sink ─────────────────────────────────────────────── + +resource "google_logging_project_sink" "main" { + project = local.project_id + name = local.name_prefix + destination = local.destination + filter = var.filter != "" ? 
var.filter : null + + # unique_writer_identity = true ensures the sink gets its own Google-managed SA + # rather than sharing the project-level logging SA. Required when granting the + # sink's writer access on a specific dataset or bucket (otherwise IAM bindings + # would affect ALL sinks in the project). This is non-negotiable. + unique_writer_identity = true + + dynamic "bigquery_options" { + for_each = local.has_bigquery ? [1] : [] + content { + use_partitioned_tables = var.use_partitioned_tables + } + } + + dynamic "exclusions" { + for_each = var.exclusions + content { + name = exclusions.value.name + filter = exclusions.value.filter + description = try(exclusions.value.description, null) + disabled = try(exclusions.value.disabled, false) + } + } + + lifecycle { + precondition { + condition = (var.bigquery_dataset != null) != (var.storage_bucket != null) + error_message = "Connect either a BigQuery dataset or a Storage bucket as the sink destination, not both and not neither." + } + } +} + +# ─── Sink Writer IAM Binding ────────────────────────────────────────────────── +# Grant the Google-managed sink writer SA the minimum role on the destination. +# writer_identity is not known until the sink is created — Terraform handles the +# dependency automatically via the reference below. + +resource "google_bigquery_dataset_iam_member" "sink_writer" { + count = local.has_bigquery ? 1 : 0 + + project = var.bigquery_dataset.project_id + dataset_id = var.bigquery_dataset.dataset_id + role = "roles/bigquery.dataEditor" + member = google_logging_project_sink.main.writer_identity +} + +resource "google_storage_bucket_iam_member" "sink_writer" { + count = local.has_gcs ? 
1 : 0 + + bucket = var.storage_bucket.bucket_name + role = "roles/storage.objectCreator" + member = google_logging_project_sink.main.writer_identity +} diff --git a/bundles/gcp-log-sink/src/variables.tf b/bundles/gcp-log-sink/src/variables.tf new file mode 100644 index 0000000..0e0766b --- /dev/null +++ b/bundles/gcp-log-sink/src/variables.tf @@ -0,0 +1,93 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "landing_zone" { + type = object({ + project_id = string + network = object({ + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) + enabled_apis = list(string) + budget = object({ + enabled = bool + budget_name = optional(string) + billing_account_id = optional(string) + amount_usd = optional(number) + }) + }) +} + +variable "bigquery_dataset" { + description = "Optional BigQuery dataset destination. Must be wired when the sink routes to BigQuery." + type = object({ + project_id = string + dataset_id = string + dataset_full_name = string + location = string + friendly_name = optional(string) + }) + default = null +} + +variable "storage_bucket" { + description = "Optional GCS bucket destination. Must be wired when the sink routes to GCS." + type = object({ + project_id = string + bucket_name = string + bucket_url = string + bucket_self_link = string + location = string + storage_class = string + }) + default = null +} + +variable "filter" { + description = "Cloud Logging query filter. Empty string means include all logs." 
+ type = string + default = "" +} + +variable "use_partitioned_tables" { + description = "Write to date-partitioned BigQuery tables. Ignored for GCS destinations." + type = bool + default = true +} + +variable "exclusions" { + description = "Log exclusion rules applied after the sink filter." + type = list(object({ + name = string + filter = string + description = optional(string) + disabled = optional(bool, false) + })) + default = [] +} diff --git a/bundles/gcp-network/README.md b/bundles/gcp-network/README.md new file mode 100644 index 0000000..a7ae332 --- /dev/null +++ b/bundles/gcp-network/README.md @@ -0,0 +1,65 @@ +# gcp-network + +Minimal GCP VPC network with a single regional subnet. Deploy this before `gcp-landing-zone` — the landing zone consumes the `gcp-network` artifact and passes it downstream so other bundles only need one connection. + +## Use Cases + +- Foundational networking for a GCP data platform stack +- Single regional subnet with Private Google Access so VMs reach GCP APIs without a NAT gateway +- Baseline deny-all ingress policy; workload bundles layer their own allow rules on top + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_compute_network.vpc` | VPC network | Custom subnet mode; GCP does not auto-create subnets in other regions | +| `google_compute_subnetwork.primary` | Regional subnet | Flow logging on (0.5 sampling), Private Google Access on | +| `google_compute_firewall.deny_all_ingress` | Firewall rule | Deny all ingress at priority 65534 | + +## Connections + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | + +## Artifact Produced + +**Artifact type:** `gcp-network` + +| Field | Description | +|---|---| +| `project_id` | GCP project the VPC belongs to | +| `network_name` | Name of the VPC network resource | +| `network_self_link` | Full self-link URI for 
the VPC | +| `region` | Region of the primary subnet | +| `primary_subnet.name` | Subnet resource name | +| `primary_subnet.cidr` | Primary IP range of the subnet | +| `primary_subnet.self_link` | Full self-link URI for the subnet | + +This artifact is consumed by `gcp-landing-zone`, which passes it through into its own artifact so downstream bundles (Cloud Run, Vertex Workbench) only need to connect to the landing zone. + +## Compliance + +### Hardcoded security controls + +| Control | Mechanism | Reason | +|---|---|---| +| Deny-all ingress | `google_compute_firewall.deny_all_ingress` at priority 65534 | Enforces explicit allowlisting per workload (Checkov CKV2_GCP_18) | +| Custom subnet mode | `auto_create_subnetworks = false` | Prevents GCP from auto-creating subnets in every region | +| Private Google Access | `private_ip_google_access = true` | VMs reach GCP APIs over internal IPs without egress or NAT | +| Flow logging | `log_config` block, 0.5 sampling | Network audit trail for traffic troubleshooting | + +No Checkov skips — all findings are satisfied by the hardcoded controls above or blocked in production via `halt_on_failure`. + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with remaining high-severity findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- The GCP project already exists — this bundle does not create projects. +- The `gcp_authentication` credential has `compute.admin` or equivalent IAM to create VPC resources and firewall rules. + +## Presets + +| Preset | Region | Network Name | Subnet CIDR | +|---|---|---|---| +| Standard | `us-central1` | `data-platform-vpc` | `10.0.0.0/20` | diff --git a/bundles/gcp-network/massdriver.yaml b/bundles/gcp-network/massdriver.yaml new file mode 100644 index 0000000..77579e1 --- /dev/null +++ b/bundles/gcp-network/massdriver.yaml @@ -0,0 +1,90 @@ +name: gcp-network +description: Minimal GCP VPC network with a single regional subnet. 
Produces a + gcp-network artifact consumed by landing-zone, Cloud Run, Vertex Workbench, and + other data-platform bundles. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-network +version: 0.1.1 + +params: + required: + - region + - network_name + - subnet_cidr + examples: + - __name: Standard + region: us-central1 + network_name: data-platform-vpc + subnet_cidr: "10.0.0.0/20" + properties: + region: + title: Region + description: GCP region to deploy the subnet into (the VPC is global). Cannot + be changed after the subnet is created — changing the region requires destroying + and recreating all resources in this bundle. + type: string + $md.immutable: true + default: us-central1 + enum: + - us-central1 + - us-east1 + - us-east4 + - us-west1 + - us-west2 + - us-west3 + - us-west4 + - europe-west1 + - europe-west2 + - europe-west4 + - asia-east1 + - asia-northeast1 + - asia-southeast1 + + network_name: + title: Network Name + description: Name for the VPC network resource. Cannot be changed after creation — + renaming the network requires destroying and recreating all dependent resources + (subnets, firewall rules). Limited to 46 characters so the derived subnet and firewall rule names stay within GCP's 63-character limit. + type: string + $md.immutable: true + default: data-platform-vpc + pattern: ^[a-z][a-z0-9-]{0,44}[a-z0-9]$ + + subnet_cidr: + title: Subnet CIDR + description: Primary IP range for the regional subnet. Cannot be changed after + creation — expanding or changing the range requires subnet recreation. 
+ type: string + $md.immutable: true + default: "10.0.0.0/20" + pattern: >- + ^(?:[0-9]|[0-9]{2}|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[0-9]{2}|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}(?:/(?:[0-9]|1[0-9]|2[0-9]|3[0-2]))$ + +connections: + required: + - gcp_authentication + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + +artifacts: + required: + - network + properties: + network: + $ref: gcp-network + title: GCP Network + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - region + - network_name + - subnet_cidr + - "*" diff --git a/bundles/gcp-network/operator.md b/bundles/gcp-network/operator.md new file mode 100644 index 0000000..e5cf85b --- /dev/null +++ b/bundles/gcp-network/operator.md @@ -0,0 +1,84 @@ +--- +templating: mustache +--- + +# GCP Network — Operator Runbook + +## Non-obvious constraints + +**Network name is immutable.** Changing it forces replacement of the entire VPC and all dependent resources (subnets, firewall rules, peerings). Treat it as permanent after first deploy. + +**Subnet CIDR is immutable.** GCP does not support in-place CIDR changes. To change it: decommission all resources in the subnet, destroy this package, then reprovision with the new range. + +**Subnet region is immutable.** The subnet's region cannot be changed after creation. A region change requires destroy and recreate. + +**Deny-all firewall is hardcoded at priority 65534.** This bundle creates a single baseline deny-all ingress rule. No ingress traffic is allowed by default. Workload bundles (Cloud Run, Vertex, etc.) layer their own allow rules at lower priority numbers. + +**VPC is global; the subnet is regional.** Cross-region resources can share the VPC but need their own regional subnets — extend the Terraform source if additional subnets are needed. 
+ +**Deleting the network fails if anything is still attached.** Terraform will error if VMs, Cloud Run VPC connectors, GKE nodes, or other resources are still using the network. Decommission all dependent packages first. + +## Troubleshooting + +**Subnet resources fail to delete ("resourceInUseByAnotherResource").** +Something is still attached. Find it: +```bash +gcloud compute networks list-associated-resources {{artifacts.network.network_name}} \ + --project={{artifacts.network.project_id}} +``` +Decommission those packages first, then retry destroy. + +**Firewall rules not taking effect.** +Rules are evaluated by priority (lowest number wins). Check the full rule list for conflicts: +```bash +gcloud compute firewall-rules list \ + --filter="network:{{artifacts.network.network_name}}" \ + --format="table(name,direction,priority,disabled,sourceRanges,allowed[].map().firewall_rule().list():label=ALLOW,denied[].map().firewall_rule().list():label=DENY)" \ + --sort-by=priority +``` + +**API quota or "permission denied" on VPC creation.** +Ensure `compute.googleapis.com` is enabled in the landing zone's `enabled_apis`. + +## Day-2 operations + +**Expanding or changing CIDR:** Not supported in-place. Must destroy and recreate. Decommission all resources in the subnet first. + +**Adding subnets:** This bundle provisions one regional subnet. For additional subnets, extend the Terraform source directly. + +**VPC peering:** Add a `google_compute_network_peering` resource to the bundle source. Ensure CIDR ranges don't overlap between peered VPCs. + +**Querying VPC flow logs:** +Flow logs are stored in Cloud Logging under resource type `gce_subnetwork`. Sampling is 50% at 5-second aggregation intervals. 
+```bash +gcloud logging read \ + 'resource.type="gce_subnetwork" AND resource.labels.subnetwork_name="{{artifacts.network.primary_subnet.name}}"' \ + --project={{artifacts.network.project_id}} \ + --limit=50 \ + --format=json +``` + +## Useful commands + +```bash +# List all firewall rules on this network +gcloud compute firewall-rules list \ + --filter="network:{{artifacts.network.network_name}}" \ + --format="table(name,direction,priority,disabled,sourceRanges,allowed[].map().firewall_rule().list():label=ALLOW,denied[].map().firewall_rule().list():label=DENY)" + +# Describe the primary subnet +gcloud compute networks subnets describe {{artifacts.network.primary_subnet.name}} \ + --region={{artifacts.network.region}} \ + --project={{artifacts.network.project_id}} + +# Describe the VPC +gcloud compute networks describe {{artifacts.network.network_name}} \ + --project={{artifacts.network.project_id}} + +# Tail recent VPC flow logs for this subnet +gcloud logging read \ + 'resource.type="gce_subnetwork" AND resource.labels.subnetwork_name="{{artifacts.network.primary_subnet.name}}"' \ + --project={{artifacts.network.project_id}} \ + --limit=20 \ + --format=json +``` diff --git a/bundles/gcp-network/src/_massdriver_variables.tf b/bundles/gcp-network/src/_massdriver_variables.tf new file mode 100644 index 0000000..f420e06 --- /dev/null +++ b/bundles/gcp-network/src/_massdriver_variables.tf @@ -0,0 +1,47 @@ +// This file is auto-generated by massdriver from your massdriver.yaml file. +// Any changes made directly to this file will be overwritten on the next build. +// To opt a variable out of regeneration, move it to another file (e.g. variables.tf). 
+variable "gcp_authentication" { + type = object({ + auth_provider_x509_cert_url = string + auth_uri = string + client_email = string + client_id = string + client_x509_cert_url = string + private_key = string + private_key_id = string + project_id = string + token_uri = string + type = string + }) +} +variable "md_metadata" { + type = object({ + default_tags = map(string) + deployment = object({ + id = string + }) + name_prefix = string + observability = object({ + alarm_webhook_url = string + }) + package = object({ + created_at = string + deployment_enqueued_at = string + previous_status = string + updated_at = string + }) + target = object({ + contact_email = string + }) + }) +} +variable "network_name" { + type = string +} +variable "region" { + type = string +} +variable "subnet_cidr" { + type = string +} diff --git a/bundles/gcp-network/src/artifacts.tf b/bundles/gcp-network/src/artifacts.tf new file mode 100644 index 0000000..e09d7a6 --- /dev/null +++ b/bundles/gcp-network/src/artifacts.tf @@ -0,0 +1,15 @@ +resource "massdriver_artifact" "network" { + field = "network" + name = "GCP Network ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = var.gcp_authentication.project_id + network_name = google_compute_network.vpc.name + network_self_link = google_compute_network.vpc.self_link + region = var.region + primary_subnet = { + name = google_compute_subnetwork.primary.name + cidr = google_compute_subnetwork.primary.ip_cidr_range + self_link = google_compute_subnetwork.primary.self_link + } + }) +} diff --git a/bundles/gcp-network/src/main.tf b/bundles/gcp-network/src/main.tf new file mode 100644 index 0000000..d24aa64 --- /dev/null +++ b/bundles/gcp-network/src/main.tf @@ -0,0 +1,60 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = 
var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) + region = var.region +} + +locals { + subnet_name = "${var.network_name}-${var.region}" +} + +resource "google_compute_network" "vpc" { + name = var.network_name + auto_create_subnetworks = false + description = "Data platform VPC managed by Massdriver — ${var.md_metadata.name_prefix}" +} + +resource "google_compute_subnetwork" "primary" { + name = local.subnet_name + ip_cidr_range = var.subnet_cidr + region = var.region + network = google_compute_network.vpc.id + private_ip_google_access = true + + log_config { + aggregation_interval = "INTERVAL_5_SEC" + flow_sampling = 0.5 + metadata = "INCLUDE_ALL_METADATA" + } +} + +# Baseline deny-all ingress firewall. Workload bundles add targeted allow rules +# (e.g. allow 443 from load balancer IP ranges). This satisfies CKV2_GCP_18 and +# enforces explicit allowlisting instead of relying on GCP's permissive defaults. +resource "google_compute_firewall" "deny_all_ingress" { + name = "${var.network_name}-deny-all-ingress" + network = google_compute_network.vpc.id + description = "Baseline deny-all ingress. Workload bundles add targeted allow rules." + direction = "INGRESS" + priority = 65534 + + deny { + protocol = "all" + } + + source_ranges = ["0.0.0.0/0"] +} diff --git a/bundles/gcp-pubsub-topic/README.md b/bundles/gcp-pubsub-topic/README.md new file mode 100644 index 0000000..6e6195c --- /dev/null +++ b/bundles/gcp-pubsub-topic/README.md @@ -0,0 +1,75 @@ +# gcp-pubsub-topic + +Google Cloud Pub/Sub topic with optional dead-letter queue (DLQ). Use this bundle to provision a managed message topic for event-driven workloads — Cloud Run services, Dataflow pipelines, BigQuery subscriptions, and similar. 
+ +## Use Cases + +- Decoupling producers from consumers in event-driven architectures +- Buffering messages for downstream workers that process at their own pace +- Capturing undeliverable messages in a DLQ for retry or inspection + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_pubsub_topic.main` | Main Pub/Sub topic | Retention and ordering label set at provision time | +| `google_pubsub_topic.dlq` | Dead-letter topic | Created only when `dlq.enabled = true` | + +This bundle does NOT create any IAM bindings. Consumer bundles (e.g., `gcp-cloud-run-service`) create their own service accounts and bind the appropriate roles on this topic when connected on the canvas. + +## Connections + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` for resource placement | + +## Artifact Produced + +**Artifact type:** `catalog-demo/gcp-pubsub-topic` + +| Field | Present | Description | +|---|---|---| +| `project_id` | Always | GCP project ID | +| `topic_name` | Always | Main topic resource name | +| `topic_id` | Always | Full topic resource ID | +| `dlq_topic_name` | Only when `dlq.enabled = true` | DLQ topic resource name | +| `dlq_topic_id` | Only when `dlq.enabled = true` | Full DLQ topic resource ID | + +Consumer bundles that need to publish or subscribe bind IAM roles using `topic_name` and `project_id` from this artifact. Example pattern in a consumer bundle: + +```hcl +resource "google_pubsub_topic_iam_member" "publisher" { + project = var.pubsub_topic.project_id + topic = var.pubsub_topic.topic_name + role = "roles/pubsub.publisher" + member = "serviceAccount:${google_service_account.runtime.email}" +} +``` + +## Message Ordering + +Message ordering is enforced at the publisher SDK level, not at the topic resource level. 
The `message_ordering_enabled` parameter writes a label (`message-ordering: enabled|disabled`) on the topic to record operator intent. Publishers must set `enable_message_ordering = true` and use ordering keys in their SDK client. Enabling ordering reduces maximum throughput. + +## Compliance + +### Checkov skips + +| Check | Reason | +|---|---| +| `CKV_GCP_83` | CSEK (Customer-Supplied Encryption Keys) skipped across all environments. CSEK requires callers to manage raw AES-256 keys on every API call. Google-managed encryption satisfies encryption-at-rest requirements for the workloads this bundle targets. If CMEK via Cloud KMS is required, use a separate bundle with a KMS connection. | + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- `pubsub.googleapis.com` must be enabled in the landing zone before deploying. Add it to `enabled_apis` in the `gcp-landing-zone` package. +- The `gcp_authentication` credential has `pubsub.admin` or equivalent IAM on the project. + +## Presets + +| Preset | Retention | DLQ | Max Delivery Attempts | Use Case | +|---|---|---|---|---| +| Low-volume | 7 days | Off | — | Dev or low-traffic topics where DLQ overhead is unnecessary | +| Standard | 7 days | On | 5 | Most production topics; catches undeliverable messages | +| High-throughput | 1 day | On | 10 | High-volume pipelines where shorter retention reduces storage cost | diff --git a/bundles/gcp-pubsub-topic/massdriver.yaml b/bundles/gcp-pubsub-topic/massdriver.yaml new file mode 100644 index 0000000..bc28cd6 --- /dev/null +++ b/bundles/gcp-pubsub-topic/massdriver.yaml @@ -0,0 +1,139 @@ +name: gcp-pubsub-topic +description: Google Cloud Pub/Sub topic with optional dead-letter queue. Provisions + the main topic and an optional DLQ topic. Creates no IAM bindings; consumer + bundles bind their own service accounts. 
Emits a gcp-pubsub-topic artifact for downstream + Cloud Run, Dataflow, and BigQuery bundles to consume. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-pubsub-topic +version: 0.1.1 + +params: + required: + - message_retention_duration + - dlq + - message_ordering_enabled + examples: + - __name: Low-volume + message_retention_duration: 604800 + message_ordering_enabled: false + dlq: + enabled: false + - __name: Standard + message_retention_duration: 604800 + message_ordering_enabled: false + dlq: + enabled: true + max_delivery_attempts: 5 + dlq_retention_duration: 604800 + - __name: High-throughput + message_retention_duration: 86400 + message_ordering_enabled: false + dlq: + enabled: true + max_delivery_attempts: 10 + dlq_retention_duration: 86400 + properties: + message_retention_duration: + title: Message Retention (seconds) + description: How long unacknowledged messages are retained on the topic, in + seconds. Minimum 600 (10 min), maximum 604800 (7 days). + type: integer + minimum: 600 + maximum: 604800 + default: 604800 + + message_ordering_enabled: + title: Enable Message Ordering + description: When enabled, messages with the same ordering key are delivered + to subscribers in the order they were published. Disabling improves throughput + at the cost of ordering guarantees. + type: boolean + default: false + + dlq: + title: Dead-Letter Queue + description: Configure a dead-letter queue to capture messages that cannot be + delivered after the maximum number of delivery attempts. + type: object + required: + - enabled + properties: + enabled: + title: Enable DLQ + description: Provision a separate dead-letter topic and configure max delivery + attempts on the main topic's default subscription. 
+ type: boolean + default: true + dependencies: + enabled: + oneOf: + - properties: + enabled: + const: true + max_delivery_attempts: + title: Max Delivery Attempts + description: Number of delivery attempts before a message is forwarded + to the dead-letter topic. Must be between 5 and 100. + type: integer + minimum: 5 + maximum: 100 + default: 5 + dlq_retention_duration: + title: DLQ Retention (seconds) + description: How long messages are retained on the dead-letter topic, + in seconds. Minimum 600, maximum 604800 (7 days). + type: integer + minimum: 600 + maximum: 604800 + default: 604800 + required: + - max_delivery_attempts + - properties: + enabled: + const: false + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + +artifacts: + required: + - pubsub_topic + properties: + pubsub_topic: + $ref: catalog-demo/gcp-pubsub-topic + title: GCP Pub/Sub Topic + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - message_retention_duration + - message_ordering_enabled + - dlq + - "*" + properties: + message_ordering_enabled: + ui:widget: checkbox + dlq: + ui:order: + - enabled + - max_delivery_attempts + - dlq_retention_duration + - "*" + properties: + enabled: + ui:widget: checkbox diff --git a/bundles/gcp-pubsub-topic/operator.md b/bundles/gcp-pubsub-topic/operator.md new file mode 100644 index 0000000..e81951c --- /dev/null +++ b/bundles/gcp-pubsub-topic/operator.md @@ -0,0 +1,93 @@ +--- +templating: mustache +--- + +# GCP Pub/Sub Topic — Operator Runbook + +## Non-obvious constraints + +**Topic name is immutable.** Renaming requires decommissioning this package, recreating with the new name, and updating all consumer subscriptions. 
Plan a maintenance window. + +**Message retention changes are safe in-place.** Updating `message_retention_duration` applies without disruption. In-flight messages are not affected. + +**Enabling DLQ after-the-fact does not update existing subscriptions.** When you enable the DLQ, Terraform creates the DLQ topic — but existing consumer subscriptions do not automatically gain a dead-letter policy. Consumer bundles must be updated separately to reference the new DLQ topic. + +**Disabling DLQ destroys the DLQ topic.** Consumer subscriptions that have a dead-letter policy pointing to the old DLQ topic will fail to deliver dead letters after the destroy. Remove dead-letter policies from consumer subscriptions before disabling the DLQ here. + +**Message ordering on the topic is a label, not enforcement.** Publishers must also set `enable_message_ordering = true` in their SDK client and pass an ordering key on every publish call. Without ordering keys from the publisher, messages are not ordered regardless of the topic label. + +**`max_delivery_attempts` is enforced at the subscription, not the topic.** This bundle provisions the DLQ topic. The delivery attempt limit lives on the consumer's subscription (managed by the consumer bundle). If messages aren't reaching the DLQ, check the consumer subscription's dead-letter policy first. + +**This bundle creates no IAM bindings.** Consumer bundles bind their own service accounts to this topic. If a service can't publish, the IAM binding is missing from the consumer bundle — not from here. + +## Troubleshooting + +**Messages not flowing to DLQ.** +Check that the consumer subscription has a dead-letter policy referencing `{{artifacts.pubsub_topic.dlq_topic_name}}` (replace `SUBSCRIPTION_ID` with the consumer's subscription): +```bash +gcloud pubsub subscriptions describe SUBSCRIPTION_ID \ + --project={{artifacts.pubsub_topic.project_id}} \ + --format="yaml(deadLetterPolicy)" +``` +If the field is absent, the consumer bundle is not configured to use the DLQ. 
+ +**Deploy fails with "pubsub.googleapis.com has not been used in project."** +Add `pubsub.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy the landing zone, wait ~60 seconds, then retry. + +**Publisher permission denied.** +Check the topic IAM policy — the publisher's SA must have `roles/pubsub.publisher`: +```bash +gcloud pubsub topics get-iam-policy {{artifacts.pubsub_topic.topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} +``` +If missing, the consumer bundle needs to be redeployed with the topic wired on the canvas. + +## Day-2 operations + +**Changing retention duration:** Update param and redeploy. In-place, no disruption. + +**Enabling DLQ on an existing topic:** Set `dlq.enabled = true`, configure `max_delivery_attempts`, redeploy. Then update consumer bundles to add dead-letter policies pointing to `{{artifacts.pubsub_topic.dlq_topic_name}}`. + +**Disabling DLQ:** Remove dead-letter policies from all consumer subscriptions first. Then set `dlq.enabled = false` and redeploy. The DLQ topic is destroyed. + +**Renaming the topic:** Destroy this package, recreate with the new name, update all consumers. No in-place rename is possible. + +**Pulling messages from the DLQ to inspect failures.** +A subscription on the DLQ topic is required (managed by a consumer bundle). 
If one exists (replace `SUBSCRIPTION_ID` with the DLQ subscription): +```bash +gcloud pubsub subscriptions pull SUBSCRIPTION_ID \ + --project={{artifacts.pubsub_topic.project_id}} \ + --limit=10 \ + --auto-ack +``` + +## Useful commands + +```bash +# Describe the main topic +gcloud pubsub topics describe {{artifacts.pubsub_topic.topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} + +# List subscriptions on the main topic +gcloud pubsub topics list-subscriptions {{artifacts.pubsub_topic.topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} + +{{#artifacts.pubsub_topic.dlq_topic_name}} +# Describe the DLQ topic +gcloud pubsub topics describe {{artifacts.pubsub_topic.dlq_topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} + +# List subscriptions on the DLQ topic +gcloud pubsub topics list-subscriptions {{artifacts.pubsub_topic.dlq_topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} +{{/artifacts.pubsub_topic.dlq_topic_name}} + +# Check IAM on the main topic +gcloud pubsub topics get-iam-policy {{artifacts.pubsub_topic.topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} + +# Publish a test message +gcloud pubsub topics publish {{artifacts.pubsub_topic.topic_name}} \ + --project={{artifacts.pubsub_topic.project_id}} \ + --message="test" +``` diff --git a/bundles/gcp-pubsub-topic/src/.checkov.yml b/bundles/gcp-pubsub-topic/src/.checkov.yml new file mode 100644 index 0000000..976ffde --- /dev/null +++ b/bundles/gcp-pubsub-topic/src/.checkov.yml @@ -0,0 +1,9 @@ +skip-check: + # CKV_GCP_83: PubSub Topics encrypted with Customer Supplied Encryption Keys (CSEK) + # CSEK requires the caller to manage raw AES-256 keys and pass them on every API + # call — an operational burden that GCP itself recommends against for most workloads. + # Google-managed encryption (default) and CMEK via Cloud KMS are both acceptable + # alternatives that satisfy encryption-at-rest requirements without CSEK complexity. 
+ # This skip applies to all environments; if CMEK is required, add a kms_key_name + # param and wire it to the google_pubsub_topic resources, then remove this skip. + - CKV_GCP_83 diff --git a/bundles/gcp-pubsub-topic/src/artifacts.tf b/bundles/gcp-pubsub-topic/src/artifacts.tf new file mode 100644 index 0000000..db49d55 --- /dev/null +++ b/bundles/gcp-pubsub-topic/src/artifacts.tf @@ -0,0 +1,18 @@ +# Pub/Sub topic artifact — flat schema matching catalog-demo/gcp-pubsub-topic. +# Includes DLQ fields only when the DLQ is enabled (conditional merge). + +resource "massdriver_artifact" "pubsub_topic" { + field = "pubsub_topic" + name = "GCP Pub/Sub Topic ${var.md_metadata.name_prefix}" + artifact = jsonencode(merge( + { + project_id = local.project_id + topic_name = google_pubsub_topic.main.name + topic_id = google_pubsub_topic.main.id + }, + var.dlq.enabled ? { + dlq_topic_name = google_pubsub_topic.dlq[0].name + dlq_topic_id = google_pubsub_topic.dlq[0].id + } : {} + )) +} diff --git a/bundles/gcp-pubsub-topic/src/main.tf b/bundles/gcp-pubsub-topic/src/main.tf new file mode 100644 index 0000000..b5007dd --- /dev/null +++ b/bundles/gcp-pubsub-topic/src/main.tf @@ -0,0 +1,76 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + + # GCP message_retention_duration requires the "s" suffix (e.g. "604800s") + retention_duration = "${var.message_retention_duration}s" + dlq_retention_duration = var.dlq.enabled ? 
"${var.dlq.dlq_retention_duration}s" : null + + topic_name = local.name_prefix + dlq_name = "${local.name_prefix}-dlq" +} + +# ─── Main Topic ─────────────────────────────────────────────────────────────── + +resource "google_pubsub_topic" "main" { + project = local.project_id + name = local.topic_name + + message_retention_duration = local.retention_duration + + # Message ordering is set at the publisher client level; the schema_settings + # field is not required. Ordering is enforced per-publisher, not at topic level. + # This label records the operator intent so Cloud Run and other publishers know + # whether to enable ordering keys when publishing. + labels = merge(var.md_metadata.default_tags, { + message-ordering = var.message_ordering_enabled ? "enabled" : "disabled" + }) +} + +# ─── Dead-Letter Queue Topic ────────────────────────────────────────────────── +# Only created when dlq.enabled == true. Pub/Sub requires the DLQ topic to exist +# before the subscription referencing it can be created by consumers. + +resource "google_pubsub_topic" "dlq" { + count = var.dlq.enabled ? 1 : 0 + + project = local.project_id + name = local.dlq_name + + message_retention_duration = local.dlq_retention_duration + + labels = var.md_metadata.default_tags +} + +# ─── No workload IAM binding here ──────────────────────────────────────────── +# Pub/Sub topics do not own a runtime identity. The landing zone no longer +# provides a shared workload SA. Consumer bundles (e.g. gcp-cloud-run-service) +# create their OWN service account and the Cloud Run bundle grants publisher/ +# subscriber access on this topic's artifact fields when connected on the canvas. 
+# +# Artifact policy pattern — grant a consumer's SA publisher access: +# resource "google_pubsub_topic_iam_member" "publisher" { +# project = var.pubsub_topic.project_id +# topic = var.pubsub_topic.topic_name +# role = "roles/pubsub.publisher" +# member = "serviceAccount:${google_service_account.runtime.email}" +# } diff --git a/bundles/gcp-pubsub-topic/src/variables.tf b/bundles/gcp-pubsub-topic/src/variables.tf new file mode 100644 index 0000000..2429da1 --- /dev/null +++ b/bundles/gcp-pubsub-topic/src/variables.tf @@ -0,0 +1,63 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "landing_zone" { + type = object({ + project_id = string + network = object({ + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) + enabled_apis = list(string) + budget = optional(object({ # budget is an optional field of the gcp-landing-zone artifact; only project_id is read by this bundle + enabled = bool + budget_name = optional(string) + billing_account_id = optional(string) + amount_usd = optional(number) + })) + }) +} + +variable "message_retention_duration" { + type = number + default = 604800 +} + +variable "message_ordering_enabled" { + type = bool + default = false +} + +variable "dlq" { + type = object({ + enabled = bool + max_delivery_attempts = optional(number) + dlq_retention_duration = optional(number, 604800) # same default as massdriver.yaml; main.tf appends "s" to this value when the DLQ is enabled, which fails on null + }) +} diff --git 
a/bundles/gcp-storage-bucket/README.md b/bundles/gcp-storage-bucket/README.md new file mode 100644 index 0000000..cc681a7 --- /dev/null +++ b/bundles/gcp-storage-bucket/README.md @@ -0,0 +1,82 @@ +# gcp-storage-bucket + +Google Cloud Storage bucket with configurable storage class, optional 
versioning, and lifecycle rules. Use this bundle to provision a managed object store for data platform workloads — Cloud Run pipelines, BigQuery exports, Vertex Workbench datasets, and similar. + +## Use Cases + +- Staging area for data ingestion before loading into BigQuery +- Durable dataset storage with versioning and access via scoped service accounts +- Archive tier for cost-optimized long-term retention +- Intermediate storage between pipeline stages + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_storage_bucket.main` | GCS bucket | Storage class, location, versioning, and lifecycle rules set at provision time | + +This bundle does NOT create any IAM bindings. Consumer bundles (e.g., `gcp-cloud-run-service`) create their own service accounts and bind the appropriate roles on this bucket when connected on the canvas. + +## Connections + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | Deploy credential — project ID and service account key for the Google provider | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id` for resource placement | + +## Artifact Produced + +**Artifact type:** `catalog-demo/gcp-storage-bucket` + +| Field | Description | +|---|---| +| `project_id` | GCP project ID that owns the bucket | +| `bucket_name` | Globally-unique GCS bucket name (derived from the Massdriver name prefix) | +| `bucket_url` | Canonical `gs://` URL for use with gsutil and client libraries | +| `bucket_self_link` | GCS REST API resource URL (`https://www.googleapis.com/storage/v1/b/`) | +| `location` | GCS location where the bucket is deployed | +| `storage_class` | Active storage class of the bucket | + +Consumer bundles bind IAM roles on the bucket using `bucket_name` and `project_id` from this artifact. 
Example pattern: + +```hcl +resource "google_storage_bucket_iam_member" "runtime_object_user" { + bucket = var.storage_bucket.bucket_name + role = "roles/storage.objectUser" + member = "serviceAccount:${google_service_account.runtime.email}" +} +``` + +## Compliance + +### Hardcoded security controls + +| Setting | Value | Reason | +|---|---|---| +| `uniform_bucket_level_access` | `true` | Disables legacy object-level ACLs. All access is IAM-only, which prevents split access-control models that are difficult to audit (Checkov CKV_GCP_29). | +| `public_access_prevention` | `"enforced"` | Blocks all public object access regardless of IAM or ACLs. Prevents accidental data exposure via `allUsers` or `allAuthenticatedUsers` (Checkov CKV_GCP_114). | + +### Checkov skips + +| Check | Reason | +|---|---| +| `CKV_GCP_62` | Bucket access logging requires a separate log-sink bucket not in scope here. Enabling logging without a target bucket causes a plan-time error. Operators who need access logs should provision a dedicated log bucket and wire `logging.log_bucket` manually. | +| `CKV_GCP_63` | Checks that a bucket does not log to itself. Because no `logging` block is configured, this check fires as a false positive. | +| `CKV_GCP_78` | Retention lock (WORM) makes objects immutable for a fixed duration and cannot be removed once set. It is not appropriate for all workloads. Add a `retention_policy` param if your workload requires WORM. | + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- `storage.googleapis.com` must be enabled in the landing zone before deploying. Add it to `enabled_apis` in the `gcp-landing-zone` package. +- The `gcp_authentication` credential has `storage.admin` or equivalent IAM on the project. 
+- Bucket names are derived from the Massdriver `name_prefix` and are globally unique — operators do not choose the raw bucket name. +- Bucket location cannot be changed after creation. Choosing the wrong location requires decommissioning and reprovisioning with data migration. + +## Presets + +| Preset | Storage Class | Location | Versioning | Lifecycle | +|---|---|---|---|---| +| Staging | STANDARD | US | Off | Delete objects after 30 days | +| Durable | STANDARD | US | On | None — retain all versions | +| Archive | COLDLINE | US | On | Transition to ARCHIVE after 365 days | diff --git a/bundles/gcp-storage-bucket/massdriver.yaml b/bundles/gcp-storage-bucket/massdriver.yaml new file mode 100644 index 0000000..7b5d29b --- /dev/null +++ b/bundles/gcp-storage-bucket/massdriver.yaml @@ -0,0 +1,221 @@ +name: gcp-storage-bucket +description: Google Cloud Storage bucket with configurable storage class, optional + versioning, and lifecycle rules. Enforces uniform bucket-level access and public + access prevention as non-negotiable security baselines. Creates no IAM bindings; + consumer bundles bind their own service accounts. Emits a gcp-storage-bucket artifact + for downstream Cloud Run, BigQuery, and Vertex Workbench bundles.
+source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-storage-bucket +version: 0.1.1 + +params: + required: + - storage_class + - location + - versioning_enabled + - lifecycle_rules + examples: + - __name: Staging + storage_class: STANDARD + location: US + versioning_enabled: false + lifecycle_rules: + - action: + type: Delete + condition: + age_days: 30 + - __name: Durable + storage_class: STANDARD + location: US + versioning_enabled: true + lifecycle_rules: [] + - __name: Archive + storage_class: COLDLINE + location: US + versioning_enabled: true + lifecycle_rules: + - action: + type: SetStorageClass + storage_class: ARCHIVE + condition: + age_days: 365 + properties: + storage_class: + title: Storage Class + description: Storage class controls cost vs. access latency trade-offs. STANDARD + for frequently accessed data. NEARLINE for monthly access. COLDLINE for + quarterly access. ARCHIVE for annual or less frequent access. + type: string + enum: + - STANDARD + - NEARLINE + - COLDLINE + - ARCHIVE + default: STANDARD + + location: + title: Location + description: GCS location for the bucket. Multi-regions (US, EU, ASIA) give + highest availability. Dual-regions (NAM4, EUR4, ASIA1) offer low-latency + redundancy. Single regions (e.g. us-central1) co-locate storage with compute. + Location cannot be changed after bucket creation. + type: string + $md.immutable: true + default: US + enum: + - US + - EU + - ASIA + - NAM4 + - EUR4 + - ASIA1 + - us-central1 + - us-east1 + - us-east4 + - us-west1 + - us-west2 + - europe-west1 + - europe-west2 + - europe-west3 + - europe-west4 + - asia-east1 + - asia-northeast1 + - asia-south1 + - asia-southeast1 + - australia-southeast1 + - southamerica-east1 + + versioning_enabled: + title: Enable Versioning + description: When enabled, GCS retains previous versions of objects when they + are overwritten or deleted. Needed for lifecycle rules that target non-current (ARCHIVED) versions and + recommended for durable datasets.
+ type: boolean + default: false + + lifecycle_rules: + title: Lifecycle Rules + description: List of lifecycle rules applied to objects in this bucket. + Rule order does not matter; if several rules match an object, Delete takes precedence over SetStorageClass. Leave empty + for no automated lifecycle management. + type: array + default: [] + items: + type: object + required: + - action + - condition + properties: + action: + title: Action + description: What to do when the condition is met. + type: object + required: + - type + properties: + type: + title: Action Type + description: "Delete: permanently removes matched objects. SetStorageClass: + transitions matched objects to the target storage class." + type: string + enum: + - Delete + - SetStorageClass + dependencies: + type: + oneOf: + - properties: + type: + const: SetStorageClass + storage_class: + title: Target Storage Class + description: Storage class to transition matched objects into. + Only valid when action type is SetStorageClass. + type: string + enum: + - NEARLINE + - COLDLINE + - ARCHIVE + required: + - storage_class + - properties: + type: + const: Delete + condition: + title: Condition + description: When to apply the action. + type: object + required: + - age_days + properties: + age_days: + title: Age (days) + description: Apply the action to objects that are at least this many + days old. + type: integer + minimum: 1 + with_state: + title: Object State Filter + description: "Restrict the rule to objects in a particular versioning + state. LIVE: current versions only. ARCHIVED: non-current versions + only. ANY: all versions. Omit to match all states."
+ type: string + enum: + - LIVE + - ARCHIVED + - ANY + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + +artifacts: + required: + - storage_bucket + properties: + storage_bucket: + $ref: catalog-demo/gcp-storage-bucket + title: GCP Storage Bucket + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - storage_class + - location + - versioning_enabled + - lifecycle_rules + - "*" + properties: + versioning_enabled: + ui:widget: checkbox + lifecycle_rules: + items: + ui:order: + - action + - condition + - "*" + properties: + action: + ui:order: + - type + - storage_class + - "*" + condition: + ui:order: + - age_days + - with_state + - "*" diff --git a/bundles/gcp-storage-bucket/operator.md b/bundles/gcp-storage-bucket/operator.md new file mode 100644 index 0000000..e9fe990 --- /dev/null +++ b/bundles/gcp-storage-bucket/operator.md @@ -0,0 +1,93 @@ +--- +templating: mustache +--- + +# GCP Storage Bucket — Operator Runbook + +## Non-obvious constraints + +**Bucket name is globally unique and immutable.** The name is derived from the Massdriver name prefix. Renaming requires decommissioning and recreating the package, then migrating all objects. + +**Location is immutable.** Bucket location cannot be changed after creation. To move: copy all objects to a new bucket in the target location, update consumers, then decommission this package. Use `gcloud storage cp -r` or a Dataflow job for large datasets. + +**Public access prevention is hardcoded.** `public_access_prevention = "enforced"` cannot be overridden through bundle configuration. 
Any attempt to grant `allUsers` or `allAuthenticatedUsers` via IAM is rejected by GCP even if the IAM call appears to succeed. + +**Uniform bucket-level access is enabled.** Object-level ACLs are disabled. All access is IAM-only. Granting access to specific objects via ACLs is not possible. + +**This bundle creates no IAM bindings.** Consumer bundles bind their own service accounts to this bucket. If a service can't read or write objects, the IAM binding is missing from the consumer bundle — not from here. + +**Turning versioning off does not delete existing non-current versions.** GCS stops creating new versions, but existing non-current versions are retained and continue to incur storage charges. Add a lifecycle rule targeting `with_state: ARCHIVED` to clean them up. + +**Lifecycle rules evaluate once daily, not in real time.** A rule set to delete objects after 30 days may not take effect until the next evaluation window. + +**`Delete` action on a versioned bucket turns the live object into a non-current version — it does not immediately remove storage.** Add a second lifecycle rule targeting `with_state: ARCHIVED` with a shorter `age_days` to purge non-current versions and reclaim storage. + +## Troubleshooting + +**Permission denied on object read/write.** +Uniform bucket-level access is on — check bucket IAM, not object ACLs: +```bash +gcloud storage buckets get-iam-policy {{artifacts.storage_bucket.bucket_url}} +``` +The workload SA needs `roles/storage.objectUser` to read and write, or `roles/storage.objectViewer` for read-only. If the binding is absent, redeploy the consumer bundle with the bucket wired on the canvas. + +**Objects not being deleted by lifecycle rules.** +Lifecycle rules evaluate once daily. Wait up to 24 hours after a rule change.
Inspect current lifecycle config: +```bash +gcloud storage buckets describe {{artifacts.storage_bucket.bucket_url}} \ + --format="yaml(lifecycle)" +``` + +**Storage costs unexpectedly high after disabling versioning.** +Old non-current versions are still present. List them: +```bash +gcloud storage ls -a {{artifacts.storage_bucket.bucket_url}} +``` +Add a lifecycle rule with `with_state: ARCHIVED` to purge them. + +**Deploy fails with "storage.googleapis.com has not been used in project."** +Add `storage.googleapis.com` to `enabled_apis` in the `gcp-landing-zone` package, redeploy, wait ~60 seconds, then retry. + +## Day-2 operations + +**Changing storage class:** Update `storage_class` param and redeploy. The bucket updates in-place. Existing objects retain their current storage class — only new writes use the new class. Use a lifecycle `SetStorageClass` rule to migrate existing objects. + +**Enabling versioning:** Safe in-place change. Objects written before versioning was enabled have a single version. Objects overwritten or deleted afterward accumulate versions. + +**Disabling versioning:** In-place change, but existing non-current versions are retained. Add a lifecycle rule targeting `with_state: ARCHIVED` to clean up. + +**Granting access to another service account** (outside Terraform — not tracked in state, so it is lost if the bucket is recreated): +```bash +gcloud storage buckets add-iam-policy-binding {{artifacts.storage_bucket.bucket_url}} \ + --member="serviceAccount:SA_EMAIL" \ + --role="roles/storage.objectViewer" +``` +For permanent bindings, add a `google_storage_bucket_iam_member` resource to the consumer bundle source.
+ +**Migrating objects to a new bucket:** +```bash +gcloud storage cp -r {{artifacts.storage_bucket.bucket_url}}/* gs://NEW_BUCKET_NAME/ +``` + +## Useful commands + +```bash +# List objects in the bucket +gcloud storage ls {{artifacts.storage_bucket.bucket_url}} + +# List all objects including non-current versions +gcloud storage ls -a {{artifacts.storage_bucket.bucket_url}} + +# Check bucket IAM policy +gcloud storage buckets get-iam-policy {{artifacts.storage_bucket.bucket_url}} + +# Inspect lifecycle rules +gcloud storage buckets describe {{artifacts.storage_bucket.bucket_url}} \ + --format="yaml(lifecycle)" + +# Copy a local file into the bucket +gcloud storage cp ./myfile.txt {{artifacts.storage_bucket.bucket_url}}/myfile.txt + +# Sync a local directory to the bucket +gcloud storage rsync ./local-dir {{artifacts.storage_bucket.bucket_url}}/remote-dir --recursive +``` diff --git a/bundles/gcp-storage-bucket/src/.checkov.yml b/bundles/gcp-storage-bucket/src/.checkov.yml new file mode 100644 index 0000000..731dcc2 --- /dev/null +++ b/bundles/gcp-storage-bucket/src/.checkov.yml @@ -0,0 +1,27 @@ +skip-check: + # CKV_GCP_62: Bucket should log access requests + # Access logging writes log objects to a separate GCS bucket. For this demo data + # platform series the log sink bucket is not provisioned, so enabling logging here + # would cause a plan-time error (the target bucket does not exist). Operators who + # need access logs should provision a dedicated log bucket and set the + # logging.log_bucket field on this resource. This skip is appropriate because the + # check requires an out-of-band dependency that is not part of this bundle's scope. + - CKV_GCP_62 + + # CKV_GCP_63: Bucket should not log to itself + # This check verifies a bucket is not configured to log access requests to itself + # (which would cause infinite log growth).
Because we have no logging block at all + # (access logging is skipped per CKV_GCP_62 — no log-sink bucket is in scope), + # Checkov incorrectly fails this check. The bucket cannot log to itself if logging + # is not configured. Both CKV_GCP_62 and CKV_GCP_63 require a log-sink bucket as + # an out-of-band dependency not provided by this bundle. + - CKV_GCP_63 + + # CKV_GCP_78: Ensure Cloud storage has lock retention policy enabled + # Retention lock makes a bucket's objects immutable for a specified duration. + # While valuable for compliance/WORM use cases, it is not universally appropriate: + # it prevents deletion of any object (including accidental uploads) and cannot be + # shortened or removed once set. Exposing it as a param is the right approach for + # workloads that need it. This skip applies across all environments in this series; + # add a retention_policy param if your workload requires WORM guarantees. + - CKV_GCP_78 diff --git a/bundles/gcp-storage-bucket/src/artifacts.tf b/bundles/gcp-storage-bucket/src/artifacts.tf new file mode 100644 index 0000000..8eb550c --- /dev/null +++ b/bundles/gcp-storage-bucket/src/artifacts.tf @@ -0,0 +1,14 @@ +# Storage bucket artifact — flat schema matching catalog-demo/gcp-storage-bucket. 
+ +resource "massdriver_artifact" "storage_bucket" { + field = "storage_bucket" + name = "GCP Storage Bucket ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + bucket_name = google_storage_bucket.main.name + bucket_url = "gs://${google_storage_bucket.main.name}" + bucket_self_link = google_storage_bucket.main.self_link + location = google_storage_bucket.main.location + storage_class = google_storage_bucket.main.storage_class + }) +} diff --git a/bundles/gcp-storage-bucket/src/main.tf b/bundles/gcp-storage-bucket/src/main.tf new file mode 100644 index 0000000..f62fa05 --- /dev/null +++ b/bundles/gcp-storage-bucket/src/main.tf @@ -0,0 +1,84 @@ +terraform { + required_version = ">= 1.3" # variables.tf uses optional() attribute defaults, which need 1.3+ + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + + # GCS bucket names must be globally unique. The name_prefix already incorporates + # the Massdriver environment slug, so we use it directly as the bucket name. + bucket_name = local.name_prefix +} + +# ─── GCS Bucket ─────────────────────────────────────────────────────────────── + +resource "google_storage_bucket" "main" { + project = local.project_id + name = local.bucket_name + location = var.location + + storage_class = var.storage_class + + # ── Security baselines — NOT configurable ──────────────────────────────────── + # uniform_bucket_level_access: Disables legacy object-level ACLs and enforces + # IAM-only access control. This is a GCP best practice and a Checkov requirement + # (CKV_GCP_29). Allowing ACLs alongside IAM creates split access control models + # that are difficult to audit and easy to misconfigure.
+ uniform_bucket_level_access = true + + # public_access_prevention: Set to "enforced" to block all public object access + # regardless of IAM policies or ACLs. This prevents accidental data exposure via + # allUsers/allAuthenticatedUsers grants (CKV_GCP_114). This is a non-negotiable + # baseline for all environments in this data platform series. + public_access_prevention = "enforced" + # ───────────────────────────────────────────────────────────────────────────── + + versioning { + enabled = var.versioning_enabled + } + + dynamic "lifecycle_rule" { + for_each = var.lifecycle_rules + content { + action { + type = lifecycle_rule.value.action.type + storage_class = try(lifecycle_rule.value.action.storage_class, null) + } + condition { + age = lifecycle_rule.value.condition.age_days + with_state = try(lifecycle_rule.value.condition.with_state, null) + } + } + } + + labels = var.md_metadata.default_tags +} + +# ─── No workload IAM binding here ──────────────────────────────────────────── +# GCS buckets do not own a runtime identity. The landing zone no longer provides +# a shared workload SA. Consumer bundles (e.g. gcp-cloud-run-service) create their +# OWN service account and the Cloud Run bundle grants objectUser access on this +# bucket when connected on the canvas.
+# +# Artifact policy pattern — grant a consumer's SA object access: +# resource "google_storage_bucket_iam_member" "runtime_object_user" { +# bucket = var.storage_bucket.bucket_name +# role = "roles/storage.objectUser" +# member = "serviceAccount:" +# } diff --git a/bundles/gcp-storage-bucket/src/variables.tf b/bundles/gcp-storage-bucket/src/variables.tf new file mode 100644 index 0000000..ab231c4 --- /dev/null +++ b/bundles/gcp-storage-bucket/src/variables.tf @@ -0,0 +1,74 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "landing_zone" { + type = object({ + project_id = string + network = object({ + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) + enabled_apis = list(string) + budget = object({ + enabled = bool + budget_name = optional(string) + billing_account_id = optional(string) + amount_usd = optional(number) + }) + }) +} + +variable "storage_class" { + type = string + default = "STANDARD" +} + +variable "location" { + type = string + default = "US" +} + +variable "versioning_enabled" { + type = bool + default = false +} + +variable "lifecycle_rules" { + type = list(object({ + action = object({ + type = string + storage_class = optional(string) + }) + condition = object({ + age_days = number + with_state = optional(string) + }) + })) + default = [] +} diff --git a/bundles/gcp-vertex-workbench/README.md b/bundles/gcp-vertex-workbench/README.md new file mode 100644 index 0000000..b6bb4b0 --- /dev/null +++ b/bundles/gcp-vertex-workbench/README.md @@ 
-0,0 +1,82 @@ +# gcp-vertex-workbench + +Vertex AI Workbench instance for interactive data science. Each bundle instance provisions a dedicated per-instance service account and a managed JupyterLab environment running on GCE. When a BigQuery dataset is connected, the instance SA is automatically granted read-only access — no manual IAM wiring required. + +## Use Cases + +- Exploratory data analysis with scoped, auditable IAM access to BigQuery datasets +- ML model development in GPU-accelerated notebook environments +- Platform-managed data science environments enforcing Shielded VM, no public IP, and per-instance identity + +## Resources Created + +| Resource | Type | Notes | +|---|---|---| +| `google_service_account.instance` | Per-instance SA | This instance's workload identity — one per bundle instance | +| `google_workbench_instance.main` | Vertex AI Workbench instance | Workbench Instances API v2 (`google_workbench_instance`) | +| `google_bigquery_dataset_iam_member.dataset_viewer` | BigQuery read-only IAM | Created only when BigQuery dataset is connected — grants `roles/bigquery.dataViewer` to instance SA | + +## Connections + +### Required + +| Connection | Artifact Type | How It Is Used | +|---|---|---| +| `gcp_authentication` | `gcp-service-account` | GCP credentials used by Terraform to provision resources | +| `landing_zone` | `catalog-demo/gcp-landing-zone` | Provides `project_id`, `network.region`, and subnet self-link for instance placement | + +### Optional + +| Connection | Artifact Type | IAM Role Granted | +|---|---|---| +| `bigquery_dataset` | `catalog-demo/gcp-bigquery-dataset` | `roles/bigquery.dataViewer` (read-only) on the dataset | + +Connecting or disconnecting the BigQuery dataset on the canvas does not take effect until a Terraform apply runs. 
+ +## Artifact Produced + +**Artifact type:** `catalog-demo/gcp-vertex-workbench` + +| Field | Type | Description | +|---|---|---| +| `project_id` | string | GCP project that owns the instance | +| `instance_name` | string | Short instance name (used in gcloud commands) | +| `location` | string | GCP zone where the instance is deployed (e.g., `us-central1-a`) | +| `proxy_url` | string | JupyterLab HTTPS proxy URL — open in a browser to access the notebook. Empty while the instance is starting. | +| `instance_service_account_email` | string | Email of this instance's own SA | +| `instance_service_account_member` | string | IAM principal string (`serviceAccount:`) for downstream bindings | + +## Compliance + +### Hardcoded controls + +| Control | Value | Reason | +|---|---|---| +| Shielded VM — Secure Boot | `enable_secure_boot = true` | Prevents unsigned kernel modules and boot-time malware from loading | +| Shielded VM — vTPM | `enable_vtpm = true` | Enables measured boot and key attestation | +| Shielded VM — Integrity Monitoring | `enable_integrity_monitoring = true` | Detects tampering with the boot sequence | +| No public IP | `disable_public_ip = true` | The Workbench proxy handles browser access; no external IP is exposed | +| Per-instance service account | `google_service_account.instance` (one per bundle instance) | Each instance gets its own SA — no shared SA across Workbench notebooks | +| Read-only BigQuery access | `roles/bigquery.dataViewer` (not dataEditor) | Workbench is an exploration environment. Write access would allow ad-hoc schema mutations from notebook cells. Users who need to write back should use their personal GCP identity or a separate pipeline bundle. | +| Resource labels | Massdriver default tags | Enforces cost attribution and environment tagging | + +### Checkov skips + +None. 
Existing Vertex AI Workbench Checkov checks (CKV_GCP_89, CKV_GCP_126, CKV_GCP_127) target the deprecated `google_notebooks_instance` resource and do not fire against `google_workbench_instance`. CMEK for disk encryption is intentionally out of scope — Google-managed encryption is used. + +The `halt_on_failure` expression in `massdriver.yaml` blocks deployments with unresolved high-severity Checkov findings when the environment target matches `prod`, `prd`, or `production`. + +## Assumptions + +- The landing zone provides `project_id`, `network.region`, and `primary_subnet.self_link`. The instance is placed in the landing zone's subnet, zone `REGION-a` (the landing zone region with `-a` appended, e.g. `us-central1-a`). +- The subnet must have Private Google Access enabled for the instance to reach GCP APIs (BigQuery, GCS) without a public IP. The `gcp-network` bundle enables this by default. +- GPU availability is zone-dependent. If a GPU type is not available in `REGION-a`, change `local.zone` in `src/main.tf` to a zone with quota. +- The `proxy_url` artifact field may be empty immediately after deploy. It populates within 2–5 minutes after the instance reaches ACTIVE state. + +## Presets + +| Preset | Machine Type | Disk | GPU | Idle Timeout | +|---|---|---|---|---| +| Small | `e2-standard-4` | 150 GB | none | 3 hours | +| Medium | `n1-standard-8` | 200 GB | none | 3 hours | +| GPU | `n1-standard-8` | 200 GB | NVIDIA_TESLA_T4 x 1 | 3 hours | diff --git a/bundles/gcp-vertex-workbench/massdriver.yaml b/bundles/gcp-vertex-workbench/massdriver.yaml new file mode 100644 index 0000000..ff05896 --- /dev/null +++ b/bundles/gcp-vertex-workbench/massdriver.yaml @@ -0,0 +1,139 @@ +name: gcp-vertex-workbench +description: Vertex AI Workbench instance for interactive data science. Provisions + a dedicated per-instance service account with scoped IAM access to any connected + data resources. Automatically grants the instance SA read-only access to a connected + BigQuery dataset when wired.
Emits a gcp-vertex-workbench artifact carrying the + instance name, zone, JupyterLab proxy URL, and SA identity. +source_url: https://github.com/massdriver-cloud/massdriver-catalog/tree/main/bundles/gcp-vertex-workbench +version: 0.1.1 + +params: + required: + - machine_type + - boot_disk_size_gb + - idle_shutdown_timeout_minutes + examples: + - __name: Small + machine_type: e2-standard-4 + boot_disk_size_gb: 150 + idle_shutdown_timeout_minutes: 180 + - __name: Medium + machine_type: n1-standard-8 + boot_disk_size_gb: 200 + idle_shutdown_timeout_minutes: 180 + - __name: GPU + machine_type: n1-standard-8 + boot_disk_size_gb: 200 + idle_shutdown_timeout_minutes: 180 + accelerator_type: NVIDIA_TESLA_T4 + accelerator_count: 1 + + properties: + machine_type: + title: Machine Type + description: GCP machine type for the Workbench instance. E2 types are cost-efficient + general-purpose machines. N1 types are required when attaching GPUs. N2 types + offer higher per-core performance for CPU-intensive workloads. Machine type + can be changed in-place by redeploying — the instance is stopped and restarted. + type: string + default: e2-standard-4 + enum: + - e2-standard-2 + - e2-standard-4 + - e2-standard-8 + - e2-standard-16 + - n1-standard-4 + - n1-standard-8 + - n1-standard-16 + - n1-standard-32 + - n2-standard-4 + - n2-standard-8 + - n2-standard-16 + + boot_disk_size_gb: + title: Boot Disk Size (GB) + description: Size of the boot disk in gigabytes. The boot disk holds the OS, + JupyterLab environment, conda/pip packages, and local notebook files. Increase + if you expect to install large libraries (e.g., TensorFlow with CUDA) or store + interim data locally. Boot disk size can only be increased in-place, not decreased. + Minimum 150 GB (enforced by the Workbench base image), maximum 4000 GB. 
+ type: integer + minimum: 150 + maximum: 4000 + default: 150 + + idle_shutdown_timeout_minutes: + title: Idle Shutdown Timeout (minutes) + description: Number of minutes of kernel inactivity before the instance automatically + shuts down. Set to 0 to disable idle shutdown entirely (not recommended in + shared projects — you will be billed continuously). Default 180 minutes (3 hours) + is a good balance for interactive data science sessions. The instance can be + manually restarted from the Massdriver canvas or via gcloud after an idle shutdown. + type: integer + minimum: 0 + default: 180 + + accelerator_type: + title: GPU Accelerator Type + description: GPU accelerator to attach to the instance. Leave empty for CPU-only + workloads. T4 and V100 GPUs require an N1 machine type — do not combine with E2 or N2 types. + A100 and L4 GPUs run only on A2 and G2 machine types, which the machine_type list above does not offer. Attaching a GPU changes the underlying VM and will cause a brief interruption + if changed in-place. + type: string + enum: + - NVIDIA_TESLA_T4 + - NVIDIA_TESLA_V100 + - NVIDIA_TESLA_A100 + - NVIDIA_L4 + + accelerator_count: + title: GPU Count + description: Number of GPU accelerators to attach. Must be set alongside accelerator_type. + Typical values are 1, 2, or 4 depending on the GPU type and quota. + type: integer + minimum: 1 + maximum: 8 + default: 1 + +connections: + required: + - gcp_authentication + - landing_zone + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + + # Optional — when wired, the bundle automatically grants the instance SA + # roles/bigquery.dataViewer on this dataset (read-only). Disconnect to remove.
+ bigquery_dataset: + $ref: catalog-demo/gcp-bigquery-dataset + title: BigQuery Dataset (optional — grants read-only access to instance SA) + +artifacts: + required: + - vertex_workbench + properties: + vertex_workbench: + $ref: catalog-demo/gcp-vertex-workbench + title: GCP Vertex AI Workbench Instance + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +ui: + ui:order: + - machine_type + - boot_disk_size_gb + - idle_shutdown_timeout_minutes + - accelerator_type + - accelerator_count + - "*" diff --git a/bundles/gcp-vertex-workbench/operator.md b/bundles/gcp-vertex-workbench/operator.md new file mode 100644 index 0000000..2586591 --- /dev/null +++ b/bundles/gcp-vertex-workbench/operator.md @@ -0,0 +1,147 @@ +--- +templating: mustache +--- + +# GCP Vertex AI Workbench — Operator Runbook + +## Non-obvious constraints + +**Location is a zone, not a region.** This bundle appends `-a` to the landing zone region (e.g., `us-central1` → `us-central1-a`). GPU quota is zone-specific — if you get a quota error, check availability in the zone and request quota or change `local.zone` in `src/main.tf`. + +**E2 machine types do not support GPUs.** If `accelerator_type` is set, the machine type must be N1 (`n1-standard-*`). Attempting to attach a GPU to an E2 machine fails at apply time. + +**Machine type changes stop and restart the instance.** The instance shuts down, resizes, and restarts. Expect 5–10 minutes of downtime. Open notebooks are saved to disk and are available after restart. + +**Shielded VM settings are hardcoded and not changeable in-place.** Changing `enable_secure_boot`, `enable_vtpm`, or `enable_integrity_monitoring` would require destroying and recreating the instance. These are always `true` and are not exposed as params. 
+ +**Idle shutdown requires the Workbench agent running.** The `idle-timeout-seconds` metadata key is only honoured if the Workbench agent is active. If the agent crashes or the instance was reimaged externally, idle shutdown will not fire. + +**Per-instance SA recreates if the package is renamed.** The SA `account_id` is derived from `name_prefix`. Renaming destroys the old SA and creates a new one. Canvas-wired IAM bindings are recreated automatically on the next deploy. Out-of-band bindings must be reapplied manually. + +**Canvas wires require a deploy to take effect.** Connecting or disconnecting the BigQuery dataset on the canvas does NOT grant or revoke IAM access immediately — a Massdriver deploy must run. + +**`proxy_url` is empty until the instance is ACTIVE.** `{{artifacts.vertex_workbench.proxy_url}}` is only populated after the instance boots and the proxy registers. This takes 2–5 minutes after the Terraform apply completes. + +## Troubleshooting + +**Instance stuck in PROVISIONING or STARTING.** +Check the GCE instance serial console for boot errors: +```bash +gcloud compute instances get-serial-port-output {{artifacts.vertex_workbench.instance_name}} \ + --zone={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} +``` +Common causes: GPU quota exceeded, subnet CIDR exhausted, missing API enablement (`notebooks.googleapis.com`). + +**`proxy_url` is empty after 10 minutes.** +```bash +gcloud workbench instances describe {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} \ + --format="yaml(state,proxyUri,healthInfo)" +``` +If `state` is ACTIVE but `proxyUri` is empty, the Workbench proxy failed to register. 
Stop and start the instance: +```bash +gcloud workbench instances stop {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} + +gcloud workbench instances start {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} +``` + +**Notebook can't query BigQuery — Access Denied.** +Confirm the canvas wire is connected AND the package has been redeployed since the wire was added. Verify the IAM binding exists: +```bash +bq get-iam-policy {{artifacts.vertex_workbench.project_id}}:<dataset_id> \ + --format=prettyjson | grep -A3 "dataViewer" +``` +The member should be `{{artifacts.vertex_workbench.instance_service_account_member}}`. + +**GPU not available in zone.** +```bash +gcloud compute accelerator-types list \ + --filter="zone:{{artifacts.vertex_workbench.location}}" \ + --project={{artifacts.vertex_workbench.project_id}} +``` +If the GPU type is absent, request quota for a different zone, then update `local.zone` in `src/main.tf` and redeploy. + +**Instance not shutting down after idle timeout.** +Confirm the `idle-timeout-seconds` metadata key was set: +```bash +gcloud compute instances describe {{artifacts.vertex_workbench.instance_name}} \ + --zone={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} \ + --format="yaml(metadata.items)" +``` +If missing, the `idle_shutdown_timeout_minutes` param was 0 (disabled). The metadata key is only written when the value is > 0.
+ +## Day-2 operations + +**Stopping and starting the instance (e.g., to save costs overnight):** +```bash +# Stop +gcloud workbench instances stop {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} + +# Start +gcloud workbench instances start {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} +``` +Starting after an idle shutdown or manual stop takes 2–5 minutes. The proxy URL remains the same. + +**Resizing the instance:** Update `machine_type` or `boot_disk_size_gb` params and redeploy. The instance stops, resizes, and restarts. Disk size can only be increased, not decreased. + +**Adding a GPU after initial deploy:** Change `machine_type` to an N1 type, set `accelerator_type` and `accelerator_count`, and redeploy. This recreates the underlying GCE VM. + +**Granting a user access to the JupyterLab UI:** +```bash +gcloud workbench instances add-iam-policy-binding {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} \ + --role="roles/notebooks.viewer" \ + --member="user:alice@example.com" +``` + +## Useful commands + +```bash +# Describe instance state and proxy URL +gcloud workbench instances describe {{artifacts.vertex_workbench.instance_name}} \ + --location={{artifacts.vertex_workbench.location}} \ + --project={{artifacts.vertex_workbench.project_id}} \ + --format="yaml(state,proxyUri,healthInfo,gceSetup.machineType,gceSetup.serviceAccounts)" + +# List all Workbench instances in the project +gcloud workbench instances list \ + --location=- \ + --project={{artifacts.vertex_workbench.project_id}} \ + --format="table(name,location,state,proxyUri)" + +# Describe the instance service account +gcloud iam service-accounts describe 
{{artifacts.vertex_workbench.instance_service_account_email}} \ + --project={{artifacts.vertex_workbench.project_id}} + +# Check IAM bindings granted to the instance SA +gcloud projects get-iam-policy {{artifacts.vertex_workbench.project_id}} \ + --flatten="bindings[].members" \ + --filter="bindings.members:{{artifacts.vertex_workbench.instance_service_account_member}}" \ + --format="table(bindings.role)" + +# Check runtime logs from the Workbench agent +gcloud logging read \ + 'resource.type="gce_instance" AND labels."compute.googleapis.com/resource_name"="{{artifacts.vertex_workbench.instance_name}}"' \ + --project={{artifacts.vertex_workbench.project_id}} \ + --limit=50 \ + --format="table(timestamp,textPayload)" + +# Check GCP Audit Logs for BigQuery access by the instance SA +gcloud logging read \ + 'protoPayload.authenticationInfo.principalEmail="{{artifacts.vertex_workbench.instance_service_account_email}}" AND protoPayload.serviceName="bigquery.googleapis.com"' \ + --project={{artifacts.vertex_workbench.project_id}} \ + --limit=20 \ + --format="table(timestamp,protoPayload.methodName,protoPayload.resourceName)" +``` diff --git a/bundles/gcp-vertex-workbench/src/.checkov.yml b/bundles/gcp-vertex-workbench/src/.checkov.yml new file mode 100644 index 0000000..2f59817 --- /dev/null +++ b/bundles/gcp-vertex-workbench/src/.checkov.yml @@ -0,0 +1,14 @@ +skip-check: + # CKV2_GCP_27: Ensure Vertex AI workbench instance disks are encrypted with a Customer Managed Key (CMK) + # CMEK is intentionally out of scope for this bundle. Boot and data disks use Google-managed + # encryption (GOOGLE_MANAGED_ENCRYPTION), which is appropriate for the interactive data-science + # workloads this bundle targets. Checkov fires this check whenever a kms_key is absent from the + # boot_disk and data_disks blocks, making it a false positive here. 
If CMEK is required for a + # specific regulatory workload, a separate bundle with a Cloud KMS key connection should be used — + # it requires provisioning a KMS key, key ring, and granting the Compute Engine SA CryptoKey + # Encrypter/Decrypter access, which is out of scope for this general-purpose bundle. + - CKV2_GCP_27 + + # Note: As of checkov 3.2.x, legacy Vertex AI Notebook checks (CKV_GCP_89, CKV_GCP_126, + # CKV_GCP_127) target google_notebooks_instance (deprecated API v1) and do not fire against + # google_workbench_instance. No skip needed for those here. diff --git a/bundles/gcp-vertex-workbench/src/artifacts.tf b/bundles/gcp-vertex-workbench/src/artifacts.tf new file mode 100644 index 0000000..b91902c --- /dev/null +++ b/bundles/gcp-vertex-workbench/src/artifacts.tf @@ -0,0 +1,18 @@ +# Workbench instance artifact — matches catalog-demo/gcp-vertex-workbench schema. +# Emitted after the instance is provisioned and the proxy_uri is known. +# The proxy_url may be empty on first deploy if the instance is still starting. +# Downstream connections can use instance_service_account_member to grant the +# Workbench additional IAM roles on resources outside this bundle. 
+ +resource "massdriver_artifact" "vertex_workbench" { + field = "vertex_workbench" + name = "GCP Vertex Workbench ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + instance_name = google_workbench_instance.main.name + location = google_workbench_instance.main.location + proxy_url = google_workbench_instance.main.proxy_uri + instance_service_account_email = local.instance_sa_email + instance_service_account_member = local.instance_sa_member + }) +} diff --git a/bundles/gcp-vertex-workbench/src/iam.tf b/bundles/gcp-vertex-workbench/src/iam.tf new file mode 100644 index 0000000..0e97f25 --- /dev/null +++ b/bundles/gcp-vertex-workbench/src/iam.tf @@ -0,0 +1,50 @@ +# ─── Upstream Artifact IAM Auto-Binding ─────────────────────────────────────── +# +# This file implements the "auto-binding" pattern for Workbench instances that +# consume upstream data artifacts. For each optional connection that IS wired on +# the canvas, Terraform grants THIS bundle's instance service account the +# minimum-privilege read-only role required to use that resource. +# +# The instance SA (google_service_account.instance in main.tf) is created by +# this bundle — not inherited from the landing zone. Each Workbench instance gets +# its own identity with bindings only to the resources it actually connects to. +# +# HOW IT WORKS +# ──────────── +# Massdriver passes optional connections as null when not wired on the canvas, +# or as a plain object when wired. We detect presence with: var.<connection> != null +# Then use `count = var.<connection> != null ? 1 : 0` to conditionally create +# the binding. No connection → no IAM change. Add connection → binding appears +# on next deploy. Remove connection → binding is destroyed on next deploy. +# +# ROLES GRANTED +# ───────────── +# BigQuery dataset → roles/bigquery.dataViewer (read-only) +# Allows the Workbench instance to SELECT from tables and list tables within +# the dataset.
Does NOT allow writing, updating, deleting rows, or creating +# tables. This is intentionally restrictive — Workbench is a read-and-explore +# environment, not a write path. If a notebook needs to write results back, a +# separate BigQuery writer service (Cloud Run, Dataflow) should own that role. +# +# HARDCODED POLICY: read-only access for BigQuery dataset connections +# The decision to grant only roles/bigquery.dataViewer (not dataEditor) is +# deliberate and non-configurable. Workbench instances are interactive exploration +# tools — granting write access would allow ad-hoc schema mutations and data +# deletion from notebook cells, bypassing any pipeline governance. If a user needs +# write access to BigQuery from Workbench, they should authenticate with their +# personal GCP identity (via Application Default Credentials), which is subject +# to IAM policy for their user account and provides a full audit trail. + +# ── BigQuery Dataset ─────────────────────────────────────────────────────────── +# Grant the instance SA read-only access to the connected BigQuery dataset. +# Binding is dataset-scoped — propagates to all current and future tables. +# For table-level isolation, use google_bigquery_table_iam_member. + +resource "google_bigquery_dataset_iam_member" "dataset_viewer" { + count = var.bigquery_dataset != null ? 
1 : 0 + + project = var.bigquery_dataset.project_id + dataset_id = var.bigquery_dataset.dataset_id + role = "roles/bigquery.dataViewer" + member = local.instance_sa_member +} diff --git a/bundles/gcp-vertex-workbench/src/main.tf b/bundles/gcp-vertex-workbench/src/main.tf new file mode 100644 index 0000000..552691e --- /dev/null +++ b/bundles/gcp-vertex-workbench/src/main.tf @@ -0,0 +1,168 @@ +terraform { + required_version = ">= 1.3" # variables.tf uses optional() attribute defaults, which need 1.3+ + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} + +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + region = var.landing_zone.network.region + # Workbench instances are zonal resources. Default to the first zone of the region. + zone = "${local.region}-a" + + # Instance SA is created by THIS bundle — scoped to this specific Workbench instance. + # See google_service_account.instance below for the design rationale. + instance_sa_email = google_service_account.instance.email + instance_sa_member = "serviceAccount:${google_service_account.instance.email}" + + # Idle shutdown is configured via GCE metadata. The Workbench agent reads + # "idle-timeout-seconds" and shuts down the instance after the specified + # number of seconds of kernel inactivity. 0 = never shut down. + idle_shutdown_seconds = var.idle_shutdown_timeout_minutes * 60 + + # Detect whether a GPU is requested. + has_gpu = var.accelerator_type != null && var.accelerator_type != "" +} + +# ─── Instance Service Account ────────────────────────────────────────────────── +# DESIGN DECISION: This bundle always creates a dedicated per-instance service +# account. Workbench instances are intended for data-science exploration with +# scoped, auditable access.
Sharing a single SA across multiple Workbench +# instances makes post-hoc access auditing impossible — you can't tell which +# instance accessed a resource. By issuing one SA per instance, every IAM action +# in Cloud Audit Logs is traceable to a specific instance and its owner. +# +# The SA is granted ONLY the roles it needs for resources explicitly connected +# on the Massdriver canvas — no standing access to datasets or buckets it does +# not use. Roles are bound and unbound automatically as connections are added +# or removed. +# +# account_id is derived from name_prefix and capped at 28 chars (GCP limit is 30; +# we reserve 2 chars for future suffix use). The SA lives in the landing zone project. +# +# WARNING: Changing the package name_prefix recreates the SA with a new email. +# Any out-of-band IAM bindings referencing the old SA email are invalidated. Canvas- +# wired bindings are recreated automatically on the next deploy. + +resource "google_service_account" "instance" { + project = local.project_id + account_id = substr(local.name_prefix, 0, 28) + display_name = "Workbench Instance — ${local.name_prefix}" + description = "Runtime identity for Workbench instance ${local.name_prefix}. Managed by Massdriver." +} + +# ─── Vertex AI Workbench Instance ───────────────────────────────────────────── +# Uses google_workbench_instance (current Vertex AI Workbench Instances API v2). +# Do NOT use google_notebooks_instance — that resource targets the deprecated +# Notebooks API v1 and is scheduled for removal. +# +# Location is a ZONE, not a region. We derive it from the landing zone region +# by appending "-a". Most regions have an "-a" zone, but some (e.g. us-east1) +# do not; if apply fails on the zone, adjust local.zone above. +# +# Shielded VM (secure boot, vTPM, integrity monitoring) is enabled by default +# as a hardcoded security baseline. Disabling these requires explicit override +# and is not exposed as a param — see compliance notes in README.md.
+# +# Public IP is disabled (disable_public_ip = true). Workbench instances reach +# GCP APIs via Private Google Access on the landing zone subnet. No external +# IP is required for normal JupyterLab use — the proxy URL handles browser access. + +resource "google_workbench_instance" "main" { + project = local.project_id + name = local.name_prefix + location = local.zone + + gce_setup { + machine_type = var.machine_type + + # ── Shielded VM ────────────────────────────────────────────────────────── + # Hardcoded security baseline — not configurable. Secure Boot prevents + # unsigned code from running during startup. vTPM enables measured boot and + # key attestation. Integrity Monitoring detects tampering of the boot sequence. + # All three are standard security hygiene for data science VMs. + shielded_instance_config { + enable_secure_boot = true + enable_vtpm = true + enable_integrity_monitoring = true + } + + # ── Network ────────────────────────────────────────────────────────────── + # Place the instance on the landing zone's primary subnet. + # disable_public_ip prevents an external IP from being assigned — the + # JupyterLab proxy handles browser access without a public IP. + disable_public_ip = true + + network_interfaces { + network = var.landing_zone.network.network_self_link + subnet = var.landing_zone.network.primary_subnet.self_link + nic_type = "GVNIC" + } + + # ── Service Account ─────────────────────────────────────────────────────── + # Run as the per-instance SA created above. IAM bindings in iam.tf grant + # this SA the minimum required roles on any connected upstream data artifact. + service_accounts { + email = local.instance_sa_email + } + + # ── GPU Accelerator ─────────────────────────────────────────────────────── + # Only created when accelerator_type is set. GPUs require N1 machine types. + # E2 and N2 machine types do not support GPU attachment. + dynamic "accelerator_configs" { + for_each = local.has_gpu ? 
[1] : [] + content { + type = var.accelerator_type + core_count = var.accelerator_count + } + } + + # ── Boot Disk ───────────────────────────────────────────────────────────── + boot_disk { + disk_size_gb = var.boot_disk_size_gb + disk_type = "PD_SSD" + } + + # ── Metadata ────────────────────────────────────────────────────────────── + # idle-timeout-seconds: Workbench agent shuts down the instance after this + # many seconds of kernel inactivity. 0 = never (not recommended — continuous billing). + # serial-port-logging-enable: disabled by default; enable only for deep debugging. + metadata = merge( + { + "serial-port-logging-enable" = "false" + }, + local.idle_shutdown_seconds > 0 ? { + "idle-timeout-seconds" = tostring(local.idle_shutdown_seconds) + } : {} + ) + } + + labels = var.md_metadata.default_tags + + # Google adds auto-managed keys to metadata post-creation (for example + # enable-jupyterlab4, proxy-mode). Terraform sees those as drift and wants to + # prune them, which triggers a gce_setup update. The Workbench API then + # rejects the apply because updates to gce_setup require the instance to be + # stopped first — even when no restricted field (machine_type, shielded + # config, etc.) is actually changing. Ignoring metadata keeps redeploys a + # no-op, but later idle-timeout changes are also ignored until recreation.
+ lifecycle { + ignore_changes = [ + gce_setup[0].metadata, + ] + } +} diff --git a/bundles/gcp-vertex-workbench/src/variables.tf b/bundles/gcp-vertex-workbench/src/variables.tf new file mode 100644 index 0000000..9d3baf6 --- /dev/null +++ b/bundles/gcp-vertex-workbench/src/variables.tf @@ -0,0 +1,90 @@ +variable "md_metadata" { + type = object({ + name_prefix = string + default_tags = optional(map(string), {}) + }) +} + +variable "gcp_authentication" { + type = object({ + type = string + project_id = string + private_key_id = string + private_key = string + client_email = string + client_id = string + auth_uri = string + token_uri = string + auth_provider_x509_cert_url = string + client_x509_cert_url = string + }) + sensitive = true +} + +variable "landing_zone" { + type = object({ + project_id = string + network = object({ + network_name = string + network_self_link = string + region = string + primary_subnet = object({ + name = string + cidr = string + self_link = string + }) + }) + enabled_apis = list(string) + budget = object({ + enabled = bool + budget_name = optional(string) + billing_account_id = optional(string) + amount_usd = optional(number) + }) + }) +} + +# ─── Optional upstream artifact connections ──────────────────────────────────── +# These variables are null when the connection is not wired on the canvas. +# Massdriver passes optional connections as a plain object or null — NOT a list. +# iam.tf uses count = var.<connection> != null ? 1 : 0 to conditionally create IAM +# bindings, and references fields directly (e.g., var.bigquery_dataset.dataset_id). + +variable "bigquery_dataset" { + description = "Optional BigQuery dataset connection. When provided, the instance SA is granted roles/bigquery.dataViewer (read-only) on the dataset."
+ type = object({ + project_id = string + dataset_id = string + dataset_full_name = string + location = string + friendly_name = optional(string) + }) + default = null +} + +# ─── Instance params ─────────────────────────────────────────────────────────── + +variable "machine_type" { + type = string + default = "e2-standard-4" +} + +variable "boot_disk_size_gb" { + type = number + default = 150 +} + +variable "idle_shutdown_timeout_minutes" { + type = number + default = 180 +} + +variable "accelerator_type" { + type = string + default = null +} + +variable "accelerator_count" { + type = number + default = 1 +} diff --git a/templates/gcp-cloud-run-service/README.md b/templates/gcp-cloud-run-service/README.md new file mode 100644 index 0000000..c08ff69 --- /dev/null +++ b/templates/gcp-cloud-run-service/README.md @@ -0,0 +1,26 @@ +# GCP Cloud Run Service — Application Template + +Scaffold a new application bundle for a Cloud Run service with a per-service runtime identity and pick any upstream data artifacts you want this service to consume. + +## Use with `mass bundle new` + +``` +mass bundle new --template gcp-cloud-run-service +``` + +The CLI will prompt for: +- The bundle's `name` and `description` +- Any connections to add (you'll see a list of artifact definitions published in your Massdriver org — pick the upstream resources this service needs, e.g. 
a `gcp-pubsub-topic`, `gcp-bigquery-dataset`, or `gcp-storage-bucket`) + +## What you get + +- Cloud Run v2 service running as its own per-service service account +- Sensible defaults baked in: 1 vCPU, 512Mi memory, internal ingress, port 8080 +- Artifact output so downstream bundles can discover the service URL and runtime SA +- Example IAM bindings in `src/iam.tf` for common upstream data resources (Pub/Sub publisher, BigQuery writer, GCS object user) — commented out, ready to uncomment based on which connections you picked +- Example push subscription in `src/push_subscription.tf` — uncomment if you want this service to receive messages from a Pub/Sub topic. Uses a dedicated push-invoker SA and OIDC for authenticated delivery. +- Example VPC connector wiring in `src/main.tf` — uncomment if you want egress to flow through a Serverless VPC Access connector (required for reaching private endpoints like on-prem Kafka via peered networks). + +## What to customize + +The template is intentionally lean. Only `image` is exposed as a param. Add more params to `massdriver.yaml` as your application needs them (port, memory, environment variables, min/max instances, ingress, etc.). Everything in `src/main.tf` is hardcoded defaults — move anything you want operators to tune into `var.*` and add it to the params block. 
diff --git a/templates/gcp-cloud-run-service/icon.png b/templates/gcp-cloud-run-service/icon.png new file mode 100644 index 0000000..29aefb8 Binary files /dev/null and b/templates/gcp-cloud-run-service/icon.png differ diff --git a/templates/gcp-cloud-run-service/massdriver.yaml b/templates/gcp-cloud-run-service/massdriver.yaml new file mode 100644 index 0000000..ec79b21 --- /dev/null +++ b/templates/gcp-cloud-run-service/massdriver.yaml @@ -0,0 +1,57 @@ +# Bundle YAML Spec: https://docs.massdriver.cloud/guides/bundle-yaml-spec +# Module Patterns: https://docs.massdriver.cloud/guides/module-patterns +# Bundle Templates: https://docs.massdriver.cloud/guides/bundle-templates + +name: "{{ name }}" +description: "{{ description }}" +source_url: https://github.com/YOUR_ORG/YOUR_REPO/tree/main/bundles/{{ name }} +version: 0.0.0 + +steps: + - path: src + provisioner: opentofu:1.10 + config: + checkov: + halt_on_failure: '.params.md_metadata.default_tags["md-target"] | test("^(prod|prd|production)$")' + +params: + required: + - image + properties: + image: + type: string + title: Container Image + description: Fully-qualified container image reference. Pin to a digest + (image@sha256:...) for production. 
+ +connections: + required: + - gcp_authentication + - landing_zone + {%- for conn in connections %} + - {{ conn.name }} + {%- endfor %} + properties: + gcp_authentication: + $ref: gcp-service-account + title: GCP Credentials + landing_zone: + $ref: catalog-demo/gcp-landing-zone + title: GCP Landing Zone + {%- for conn in connections %} + {{ conn.name }}: + $ref: {{ conn.artifact_definition }} + {%- endfor %} + +artifacts: + required: + - cloud_run_service + properties: + cloud_run_service: + $ref: catalog-demo/gcp-cloud-run-service + title: GCP Cloud Run Service + +ui: + ui:order: + - image + - "*" diff --git a/templates/gcp-cloud-run-service/operator.md b/templates/gcp-cloud-run-service/operator.md new file mode 100644 index 0000000..88b892a --- /dev/null +++ b/templates/gcp-cloud-run-service/operator.md @@ -0,0 +1,22 @@ +# {{ name }} + +{{ description }} + +## Non-obvious constraints + +- Cloud Run revisions are immutable. New config triggers a new revision; traffic defaults to 100% on latest. +- Service account name derives from the bundle's name prefix, capped at 30 characters. Renaming the package destroys and recreates the SA. +- Ingress changes trigger a full revision replacement (cold start on next request). + +## Troubleshooting + +- Revision fails readiness check: the container port in `src/main.tf` must match what the running process binds to. +- Image pull errors: the runtime service account needs `roles/artifactregistry.reader` on the image repo. 
+ +## Useful commands + +``` +gcloud run services describe $SERVICE --region $REGION +gcloud run services logs read $SERVICE --region $REGION --limit 100 +gcloud run services update-traffic $SERVICE --to-revisions $REVISION=100 --region $REGION +``` diff --git a/templates/gcp-cloud-run-service/src/.checkov.yml b/templates/gcp-cloud-run-service/src/.checkov.yml new file mode 100644 index 0000000..33d6926 --- /dev/null +++ b/templates/gcp-cloud-run-service/src/.checkov.yml @@ -0,0 +1,12 @@ +skip-check: + # CKV_GCP_102: Cloud Run services should use private endpoint or VPC connector. + # Ingress is configurable; the default is internal-only. If your service needs + # a VPC connector, add the relevant google_vpc_access_connector resource and + # update the service's vpc_access block. This skip acknowledges the check is + # not applicable to services that use ingress restrictions instead. + - CKV_GCP_102 + # CKV_GCP_103: Cloud Run services should use Binary Authorization. Binary + # Authorization requires separate attestor infrastructure and image signing + # pipelines that are out of scope for a per-service bundle. Enforce at the + # organization level via org policy if needed. 
+ - CKV_GCP_103 diff --git a/templates/gcp-cloud-run-service/src/_providers.tf b/templates/gcp-cloud-run-service/src/_providers.tf new file mode 100644 index 0000000..cad113c --- /dev/null +++ b/templates/gcp-cloud-run-service/src/_providers.tf @@ -0,0 +1,18 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 6.0" + } + massdriver = { + source = "massdriver-cloud/massdriver" + version = "~> 1.3" + } + } +} + +provider "google" { + project = var.gcp_authentication.project_id + credentials = jsonencode(var.gcp_authentication) +} diff --git a/templates/gcp-cloud-run-service/src/artifacts.tf b/templates/gcp-cloud-run-service/src/artifacts.tf new file mode 100644 index 0000000..d8cc131 --- /dev/null +++ b/templates/gcp-cloud-run-service/src/artifacts.tf @@ -0,0 +1,13 @@ +resource "massdriver_artifact" "cloud_run_service" { + field = "cloud_run_service" + name = "GCP Cloud Run Service ${var.md_metadata.name_prefix}" + artifact = jsonencode({ + project_id = local.project_id + service_name = google_cloud_run_v2_service.main.name + service_url = google_cloud_run_v2_service.main.uri + location = google_cloud_run_v2_service.main.location + latest_ready_revision = google_cloud_run_v2_service.main.latest_ready_revision + runtime_service_account_email = local.runtime_sa_email + runtime_service_account_member = local.runtime_sa_member + }) +} diff --git a/templates/gcp-cloud-run-service/src/iam.tf b/templates/gcp-cloud-run-service/src/iam.tf new file mode 100644 index 0000000..2ca37b2 --- /dev/null +++ b/templates/gcp-cloud-run-service/src/iam.tf @@ -0,0 +1,28 @@ +# Grant this service's runtime SA the minimum role it needs on each upstream +# resource it consumes. Each connection you picked at scaffold time is available +# as var.<connection_name>.
+# +# Examples — uncomment and adapt based on which connections you selected: +# +# --- Outgoing: publish to a Pub/Sub topic --- +# resource "google_pubsub_topic_iam_member" "publisher" { +# project = var.pubsub_topic.project_id +# topic = var.pubsub_topic.topic_name +# role = "roles/pubsub.publisher" +# member = local.runtime_sa_member +# } +# +# --- Outgoing: write to BigQuery --- +# resource "google_bigquery_dataset_iam_member" "data_editor" { +# project = var.bigquery_dataset.project_id +# dataset_id = var.bigquery_dataset.dataset_id +# role = "roles/bigquery.dataEditor" +# member = local.runtime_sa_member +# } +# +# --- Outgoing: read/write GCS objects --- +# resource "google_storage_bucket_iam_member" "object_user" { +# bucket = var.storage_bucket.bucket_name +# role = "roles/storage.objectUser" +# member = local.runtime_sa_member +# } diff --git a/templates/gcp-cloud-run-service/src/main.tf b/templates/gcp-cloud-run-service/src/main.tf new file mode 100644 index 0000000..4a79fd3 --- /dev/null +++ b/templates/gcp-cloud-run-service/src/main.tf @@ -0,0 +1,53 @@ +locals { + project_id = var.landing_zone.project_id + name_prefix = var.md_metadata.name_prefix + region = var.landing_zone.network.region + + runtime_sa_email = google_service_account.runtime.email + runtime_sa_member = "serviceAccount:${google_service_account.runtime.email}" +} + +resource "google_service_account" "runtime" { + project = local.project_id + account_id = substr(local.name_prefix, 0, 30) + display_name = "Cloud Run Runtime — ${local.name_prefix}" + description = "Runtime identity for ${local.name_prefix}. Managed by Massdriver." +} + +resource "google_cloud_run_v2_service" "main" { + project = local.project_id + name = local.name_prefix + location = local.region + + ingress = "INGRESS_TRAFFIC_INTERNAL_ONLY" + + template { + service_account = local.runtime_sa_email + + # Optional: route egress through a Serverless VPC Access connector. 
+ # Uncomment if you picked a gcp-vpc-connector connection named `vpc_connector` + # at scaffold time. + # + # vpc_access { + # connector = var.vpc_connector.connector_id + # egress = "PRIVATE_RANGES_ONLY" # or ALL_TRAFFIC to force all egress through VPC + # } + + containers { + image = var.image + + ports { + container_port = 8080 + } + + resources { + limits = { + cpu = "1" + memory = "512Mi" + } + } + } + } + + labels = var.md_metadata.default_tags +} diff --git a/templates/gcp-cloud-run-service/src/push_subscription.tf b/templates/gcp-cloud-run-service/src/push_subscription.tf new file mode 100644 index 0000000..6b65d4a --- /dev/null +++ b/templates/gcp-cloud-run-service/src/push_subscription.tf @@ -0,0 +1,43 @@ +# Optional: receive messages from a Pub/Sub topic via push subscription. +# +# If you picked a gcp-pubsub-topic connection named `incoming_topic` at scaffold +# time, uncomment the resources below. This creates a push subscription that +# invokes this service's URL using OIDC with a dedicated invoker SA. +# +# The push invoker SA is separate from this service's runtime SA. Pub/Sub uses +# the invoker SA to call the service; the runtime SA is the identity the +# container runs as once the request lands. 
+# +# resource "google_service_account" "push_invoker" { +# project = local.project_id +# account_id = "${substr(local.name_prefix, 0, 28)}-p" +# display_name = "Push Invoker — ${local.name_prefix}" +# } +# +# resource "google_cloud_run_v2_service_iam_member" "push_invoker" { +# project = local.project_id +# location = google_cloud_run_v2_service.main.location +# name = google_cloud_run_v2_service.main.name +# role = "roles/run.invoker" +# member = "serviceAccount:${google_service_account.push_invoker.email}" +# } +# +# resource "google_pubsub_subscription" "push" { +# project = var.incoming_topic.project_id +# name = "${local.name_prefix}-push" +# topic = var.incoming_topic.topic_id +# +# ack_deadline_seconds = 60 +# +# push_config { +# push_endpoint = google_cloud_run_v2_service.main.uri +# oidc_token { +# service_account_email = google_service_account.push_invoker.email +# } +# } +# +# retry_policy { +# minimum_backoff = "10s" +# maximum_backoff = "600s" +# } +# }