From 9d3af406f0790dd0d55c242c7e7a8e13910b33c5 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Mar 2026 02:51:26 +0000 Subject: [PATCH 01/25] refactor: Unify bin scripts and refactor core libs - Replaced separate create/destroy/audit scripts for standard and private clusters with unified scripts in bin/ (e.g., create-dpgce, destroy-dpgce, audit-dpgce). - Moved common functions to lib/script-utils.sh. - Updated lib/env.sh with new variables and logic. - Added new utility scripts like connectivity-test and debug-init-action. --- gcloud/bin/audit-dpgce | 51 ++++++++++ gcloud/bin/audit-dpgke | 123 +++++++++++++++++++++++++ gcloud/bin/connectivity-test | 1 + gcloud/bin/create-dpgce | 164 ++++++++++++++++----------------- gcloud/bin/debug-init-action | 21 +++++ gcloud/bin/destroy-dpgce | 143 ++++++++++++++++------------ gcloud/bin/recreate-cluster.sh | 65 +++++++++++++ gcloud/bin/scp-m | 16 ++-- gcloud/bin/ssh-m | 16 ++-- gcloud/lib/env.sh | 79 ++++++++++++---- gcloud/lib/script-utils.sh | 112 ++++++---------------- 11 files changed, 535 insertions(+), 256 deletions(-) create mode 100755 gcloud/bin/audit-dpgce create mode 100755 gcloud/bin/audit-dpgke create mode 100644 gcloud/bin/connectivity-test create mode 100755 gcloud/bin/debug-init-action create mode 100755 gcloud/bin/recreate-cluster.sh diff --git a/gcloud/bin/audit-dpgce b/gcloud/bin/audit-dpgce new file mode 100755 index 00000000..cc725f7d --- /dev/null +++ b/gcloud/bin/audit-dpgce @@ -0,0 +1,51 @@ +#!/bin/bash +# +# Universal audit script for all Dataproc on GCE environment variations. +# +# This script generates a state.json file that is the canonical source of truth +# for the environment's state. 
+ +# Exit on failure +set -e + +# --- Get script's real directory --- +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" +source "${GCLOUD_DIR}/lib/env.sh" +source "${GCLOUD_DIR}/lib/script-utils.sh" +source "${GCLOUD_DIR}/lib/network/network.sh" +source "${GCLOUD_DIR}/lib/network/subnet.sh" +source "${GCLOUD_DIR}/lib/network/router.sh" +source "${GCLOUD_DIR}/lib/network/firewall.sh" +source "${GCLOUD_DIR}/lib/gcp/iam.sh" +source "${GCLOUD_DIR}/lib/dataproc/cluster.sh" +source "${GCLOUD_DIR}/lib/dataproc/private-cluster.sh" +source "${GCLOUD_DIR}/lib/dataproc/autoscaling.sh" +source "${GCLOUD_DIR}/lib/gcp/misc.sh" + +# --- Main Audit Logic --- +# This script will now generate a state.json file. +# It does not need to print a human-readable report, as that can be done by +# another script that reads the state file. + +# Initialize an empty JSON object +echo "{}" > "${STATE_FILE}" + +# --- Infrastructure State Checks --- +update_state "project" "$(_check_exists "gcloud projects describe '${PROJECT_ID}' --format='json(lifecycleState,projectId)'")" +update_state "billing" "$(_check_exists "gcloud beta billing projects describe '${PROJECT_ID}' --format='json(billingEnabled)'")" + +# --- Resource Existence Checks --- +update_state "vpcNetwork" "$(exists_network)" +update_state "standardSubnet" "$(exists_subnet "${SUBNET}")" +update_state "privateSubnet" "$(exists_subnet "${PRIVATE_SUBNET}")" +update_state "cloudRouter" "$(exists_router)" +update_state "firewallRule" "$(exists_firewall)" +update_state "routes" "$(_check_exists "gcloud compute routes list --project='${PROJECT_ID}' --filter='network~\"/${NETWORK}$\"' --format='json(name,selfLink)'" | jq 'if . == [] then null else . 
end')" +update_state "serviceAccount" "$(exists_service_account)" +update_state "autoscalingPolicy" "$(exists_autoscaling_policy)" +update_state "dataprocCluster" "$(exists_dpgce_cluster)" + +# --- Final Output --- +# For convenience, pretty-print the state file to stdout +jq . < "${STATE_FILE}" diff --git a/gcloud/bin/audit-dpgke b/gcloud/bin/audit-dpgke new file mode 100755 index 00000000..11bd8c7b --- /dev/null +++ b/gcloud/bin/audit-dpgke @@ -0,0 +1,123 @@ +#!/bin/bash +# +# Universal audit script for Dataproc on GKE (DPGKE) environments. +# +# This script inspects the live state of GCP resources to generate a report, +# ignoring potentially stale local sentinel files. + +# Exit on failure +set -e + +# --- Get script's real directory --- +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" +source "${GCLOUD_DIR}/lib/env.sh" +source "${GCLOUD_DIR}/lib/script-utils.sh" +source "${GCLOUD_DIR}/lib/gke.sh" +source "${GCLOUD_DIR}/lib/dpgke.sh" +source "${GCLOUD_DIR}/lib/gcp/misc.sh" + +# --- Argument Parsing --- +AUDIT_DESTROY=false +OUTPUT_FORMAT="text" + +while [[ "$#" -gt 0 ]]; do + case $1 in + --destroy) AUDIT_DESTROY=true ;; + --format=json) OUTPUT_FORMAT="json" ;; + *) echo "Unknown parameter passed: $1"; exit 1 ;; + esac + shift +done + +# --- Main Audit Logic --- +if [[ "${OUTPUT_FORMAT}" != "json" ]]; then + configure_gcloud +fi + +declare -A AUDIT_RESULTS + +if [[ "${OUTPUT_FORMAT}" != "json" ]]; then + echo "=================================================" + echo "Starting Comprehensive DPGKE Environment Audit" + echo "=================================================" +fi + +if [[ "$AUDIT_DESTROY" == "true" ]]; then + # --- Verify Destruction --- + if [[ "${OUTPUT_FORMAT}" != "json" ]]; then + echo + echo "Verifying Complete Resource Destruction..." 
+ fi + check_not_exists "Dataproc on GKE Cluster '${DPGKE_CLUSTER_NAME}'" "exists_dpgke_cluster" + check_not_exists "GKE Cluster '${GKE_CLUSTER_NAME}'" "exists_gke_cluster" + check_not_exists "Default NodePool" "exists_gke_nodepool '${DP_POOLNAME_DEFAULT}'" + check_not_exists "Spark Driver NodePool" "exists_gke_nodepool '${DP_DRIVER_POOLNAME}'" + check_not_exists "Spark Executor NodePool" "exists_gke_nodepool '${DP_EXEC_POOLNAME}'" + + # Correct sentinels + if [[ "${OUTPUT_FORMAT}" != "json" ]]; then + print_status "Updating sentinel files based on audit findings..." + report_result "Done" + fi +else + # --- Verify Creation --- + if [[ "${OUTPUT_FORMAT}" != "json" ]]; then + print_status "Auditing environment state..." + fi + AUDIT_RESULTS["GKE Cluster"]=$(_check_exists "GKE Cluster '${GKE_CLUSTER_NAME}'" "exists_gke_cluster") + AUDIT_RESULTS["Dataproc on GKE Cluster"]=$(_check_exists "Dataproc on GKE Cluster '${DPGKE_CLUSTER_NAME}'" "exists_dpgke_cluster") + if [[ ${AUDIT_RESULTS["GKE Cluster"]} == "Exists" ]]; then + AUDIT_RESULTS["Default NodePool"]=$(_check_exists "Default NodePool" "exists_gke_nodepool '${DP_POOLNAME_DEFAULT}'") + AUDIT_RESULTS["Spark Driver NodePool"]=$(_check_exists "Spark Driver NodePool" "exists_gke_nodepool '${DP_DRIVER_POOLNAME}'") + AUDIT_RESULTS["Spark Executor NodePool"]=$(_check_exists "Spark Executor NodePool" "exists_gke_nodepool '${DP_EXEC_POOLNAME}'") + fi + if [[ "${OUTPUT_FORMAT}" != "json" ]]; then + report_result "Done" + fi + + # Correct sentinels + if [[ "${OUTPUT_FORMAT}" != "json" ]]; then + print_status "Updating sentinel files based on audit findings..." 
+ fi + if [[ ${AUDIT_RESULTS["Dataproc on GKE Cluster"]} == "Exists" ]]; then + else + fi + if [[ "${OUTPUT_FORMAT}" != "json" ]]; then + report_result "Done" + fi +fi + +# --- Output Results --- +if [[ "${OUTPUT_FORMAT}" == "json" ]]; then + # Build a JSON object with jq + json_output=$(jq -n \ + --arg gke_cluster "${AUDIT_RESULTS["GKE Cluster"]:-"Not Checked"}" \ + --arg dpgke_cluster "${AUDIT_RESULTS["Dataproc on GKE Cluster"]:-"Not Checked"}" \ + --arg default_nodepool "${AUDIT_RESULTS["Default NodePool"]:-"Not Checked"}" \ + --arg driver_nodepool "${AUDIT_RESULTS["Spark Driver NodePool"]:-"Not Checked"}" \ + --arg exec_nodepool "${AUDIT_RESULTS["Spark Executor NodePool"]:-"Not Checked"}" \ + '{ + "resources": { + "gkeCluster": $gke_cluster, + "dataprocOnGkeCluster": $dpgke_cluster, + "defaultNodePool": $default_nodepool, + "sparkDriverNodePool": $driver_nodepool, + "sparkExecutorNodePool": $exec_nodepool + } + }') + echo "${json_output}" +else + # Print the human-readable summary + echo + echo "=================================================" + echo " Audit Summary" + echo "=================================================" + for resource in "GKE Cluster" "Dataproc on GKE Cluster" "Default NodePool" "Spark Driver NodePool" "Spark Executor NodePool"; do + if [[ -v AUDIT_RESULTS[${resource}] ]]; then + printf "%-25s: %s +" "${resource}" "${AUDIT_RESULTS[${resource}]}" + fi + done + echo "=================================================" +fi diff --git a/gcloud/bin/connectivity-test b/gcloud/bin/connectivity-test new file mode 100644 index 00000000..c3f22da8 --- /dev/null +++ b/gcloud/bin/connectivity-test @@ -0,0 +1 @@ +#!/bin/bashsource lib/env.shgcloud network-services gateways describe ${SWP_INSTANCE_NAME} --location ${REGION}gcloud network-security gateway-security-policies rules export allow-all-rule \ --gateway-security-policy=${SWP_POLICY_NAME} \ --location ${REGION}#gcloud logging read \# "resource.type=\"networkservices.googleapis.com/Gateway\" \# 
resource.labels.location=\"${REGION}\" \# resource.labels.gateway_id=\"${SWP_INSTANCE_NAME}\"" \# --project="${PROJECT_ID}" \# --limit=100 \# --format=json # Or text, defaultgcloud logging read \ "resource.type=\"networkservices.googleapis.com/Gateway\" \ resource.labels.location=\"${REGION}\" \ resource.labels.gateway_id=\"${SWP_INSTANCE_NAME}\"" \ --project="${PROJECT_ID}" \ --limit=500 \ --freshness=1d # Look back over the last day# Delete the old test if it existsgcloud network-management connectivity-tests delete swp-test --quiet# Create a new onegcloud network-management connectivity-tests create swp-test \ --source-instance=projects/${PROJECT_ID}/zones/${ZONE}/instances/${CLUSTER_NAME}-m \ --destination-ip-address=10.43.79.245 \ --destination-port=3128 \ --protocol=TCP \ --source-network=projects/${PROJECT_ID}/global/networks/${NETWORK}# Wait and describesleep 5# After a moment, get the results:gcloud network-management connectivity-tests describe swp-test \ No newline at end of file diff --git a/gcloud/bin/create-dpgce b/gcloud/bin/create-dpgce index e91ad095..0942c6ac 100755 --- a/gcloud/bin/create-dpgce +++ b/gcloud/bin/create-dpgce @@ -1,112 +1,110 @@ #!/bin/bash -# -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS-IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# # Exit on failure set -e -source lib/env.sh +# --- Get script's real directory --- +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" -parse_args "$@" +# --- Source environment variables and utility functions --- +source "${GCLOUD_DIR}/lib/env.sh" + +# --- Argument Parsing --- +IS_CUSTOM=false +IS_PRIVATE=false +CREATE_CLUSTER=true + +while [[ "$#" -gt 0 ]]; do + case $1 in + --custom) IS_CUSTOM=true ;; + --private) IS_PRIVATE=true ;; + --no-create-cluster) CREATE_CLUSTER=false ;; + *) echo "Unknown parameter passed: $1"; exit 1 ;; + esac + shift +done if (( DEBUG != 0 )); then set -x fi -# Source function files -source lib/gcp/project.sh -source lib/gcp/misc.sh -source lib/misc.sh -source lib/gcp/iam.sh -source lib/gcp/gcs.sh -source lib/network/network.sh -source lib/network/subnet.sh -source lib/network/router.sh -source lib/network/routes.sh -source lib/network/firewall.sh -source lib/dataproc/cluster.sh -source lib/dataproc/autoscaling.sh -# Add others as needed, e.g.: -# source lib/network/peering.sh -# source lib/bigtable.sh -# source lib/database/mysql.sh -# source lib/database/mssql.sh - -create_project +# Source all function files needed for checks and creation +source "${GCLOUD_DIR}/lib/script-utils.sh" +source "${GCLOUD_DIR}/lib/gcp/project.sh" +source "${GCLOUD_DIR}/lib/gcp/gcs.sh" +source "${GCLOUD_DIR}/lib/network/network.sh" +source "${GCLOUD_DIR}/lib/network/subnet.sh" +source "${GCLOUD_DIR}/lib/network/router.sh" +source "${GCLOUD_DIR}/lib/network/firewall.sh" +source "${GCLOUD_DIR}/lib/gcp/iam.sh" +source "${GCLOUD_DIR}/lib/gcp/misc.sh" +source "${GCLOUD_DIR}/lib/misc.sh" +source "${GCLOUD_DIR}/lib/dataproc/autoscaling.sh" +source "${GCLOUD_DIR}/lib/dataproc/cluster.sh" +source "${GCLOUD_DIR}/lib/dataproc/private-cluster.sh" + + +# --- Main Logic --- +print_status "Auditing environment to determine current state..." 
+"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null +report_result "Done" configure_gcloud - +check_project +check_billing set_cluster_name - enable_services -create_service_account - -create_bucket - -create_vpc_network - -#create_ip_allocation - -#create_vpc_peering - -# Create a cloud router - -create_vpc_network - -create_subnet - -create_router - -create_default_route - -add_nat_policy - -create_firewall_rules - -# Create logging firewall rules - -#create_logging_firewall_rules +if [[ $(jq -r '.serviceAccount == null' "${STATE_FILE}") == "true" ]]; then + create_service_account +else + print_status "Skipping Service Account creation (already exists)..." + report_result "Exists" +fi -#create_bigtable_instance +create_gcs_bucket "${BUCKET}" "Standard" +grant_gcs_bucket_perms "${BUCKET}" +create_gcs_bucket "${TEMP_BUCKET}" "Standard" +grant_gcs_bucket_perms "${TEMP_BUCKET}" +upload_init_actions -#create_mysql_instance -#create_legacy_mssql_instance +if [[ $(jq -r '.vpcNetwork == null' "${STATE_FILE}") == "true" ]]; then + create_vpc_network +fi -# Create PHS dataproc cluster +if [[ $(jq -r '.standardSubnet == null' "${STATE_FILE}") == "true" ]]; then + create_subnet +fi -#create_phs_cluster +if [[ $(jq -r '.cloudRouter == null' "${STATE_FILE}") == "true" ]]; then + create_router + add_nat_to_router +fi -# Create normal dataproc cluster +if [[ $(jq -r '.firewallRule == null' "${STATE_FILE}") == "true" ]]; then + create_firewall_rules +fi -create_autoscaling_policy +if [[ $(jq -r '.autoscalingPolicy == null' "${STATE_FILE}") == "true" ]]; then + create_autoscaling_policy +fi +# --- Conditional Cluster Creation --- if [[ "${CREATE_CLUSTER}" = true ]]; then - print_status "Creating Dataproc Cluster ${CLUSTER_NAME}..." 
- if create_dpgce_cluster; then - print_result "Pass" + if [[ "$IS_PRIVATE" == "true" ]]; then + source "${GCLOUD_DIR}/lib/gcp/private-network.sh" + create_private_subnet + create_dpgce_private_cluster else - print_result "Fail" - exit 1 + if [[ "$IS_CUSTOM" == "true" ]]; then + source "${GCLOUD_DIR}/lib/dataproc/cluster-custom.sh" + fi + create_dpgce_cluster fi + + # After creation, run audit again to update state file with new resource details + "${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null else echo -e "${YELLOW}Skipping Cluster Creation due to --no-create-cluster flag.${NC}" fi - -# Perform some connectivity tests - -#perform_connectivity_tests - diff --git a/gcloud/bin/debug-init-action b/gcloud/bin/debug-init-action new file mode 100755 index 00000000..5c060f59 --- /dev/null +++ b/gcloud/bin/debug-init-action @@ -0,0 +1,21 @@ +#!/bin/bash +source lib/env.sh + +echo "Uploading latest init scripts to gs://${BUCKET}/dataproc-initialization-actions..." +gsutil -m cp -r init/* gs://${BUCKET}/dataproc-initialization-actions + +echo " " +echo "SSH to the -m node:" +echo "gcloud compute ssh --zone ${ZONE} ${CLUSTER_NAME}-m --project ${PROJECT_ID}" +echo " " +echo "Once SSHed, run these commands:" +echo "----" +echo "sudo -i" +echo "rm -f /tmp/install_gpu_driver.sh /tmp/install.log" +echo "gsutil cp gs://${BUCKET}/dataproc-initialization-actions/gpu/install_gpu_driver.sh /tmp/" +echo "chmod +x /tmp/install_gpu_driver.sh" +echo "time bash -x /tmp/install_gpu_driver.sh 2>&1 | tee /tmp/install.log" +echo "----" +echo " " +echo "After the script finishes on the VM, download the log on your local machine:" +echo "gcloud compute scp --zone ${ZONE} ${CLUSTER_NAME}-m:/tmp/install.log ./tmp --tunnel-through-iap --project ${PROJECT_ID}" diff --git a/gcloud/bin/destroy-dpgce b/gcloud/bin/destroy-dpgce index 858248c7..6b7eedba 100755 --- a/gcloud/bin/destroy-dpgce +++ b/gcloud/bin/destroy-dpgce @@ -1,77 +1,106 @@ #!/bin/bash -# -# Copyright 2021 Google LLC -# -# Licensed under the 
Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS-IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# +# Exit on failure set -e -source lib/env.sh - -# Source function files -source lib/dataproc/cluster.sh -source lib/gcp/iam.sh -source lib/dataproc/autoscaling.sh -source lib/network/routes.sh -source lib/network/router.sh -source lib/network/firewall.sh -source lib/network/subnet.sh -source lib/network/network.sh -source lib/gcp/gcs.sh -# source lib/database/mysql.sh -# source lib/database/mssql.sh - -parse_args "$@" - -if (( DEBUG != 0 )); then - set -x +# --- Get script's real directory --- +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" + +# --- Source environment variables and utility functions --- +source "${GCLOUD_DIR}/lib/env.sh" +source "${GCLOUD_DIR}/lib/script-utils.sh" +source "${GCLOUD_DIR}/lib/gcp/misc.sh" +source "${GCLOUD_DIR}/lib/network/network.sh" +source "${GCLOUD_DIR}/lib/network/subnet.sh" +source "${GCLOUD_DIR}/lib/network/router.sh" +source "${GCLOUD_DIR}/lib/network/routes.sh" +source "${GCLOUD_DIR}/lib/network/firewall.sh" +source "${GCLOUD_DIR}/lib/gcp/iam.sh" +source "${GCLOUD_DIR}/lib/gcp/gcs.sh" +source "${GCLOUD_DIR}/lib/dataproc/cluster.sh" +source "${GCLOUD_DIR}/lib/dataproc/private-cluster.sh" +source "${GCLOUD_DIR}/lib/dataproc/autoscaling.sh" + +# --- Argument Parsing --- +FORCE_DELETE=false +while [[ "$#" -gt 0 ]]; do + case $1 in + --force) FORCE_DELETE=true ;; + *) echo "Unknown parameter passed: $1"; exit 1 ;; + esac + shift +done + +# --- Main Logic 
--- +configure_gcloud + +echo "========================================" +echo "Starting DPGCE Environment Teardown" +echo "========================================" + +# Run audit to get the current state +"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null + +# --- Teardown Logic --- +# Read the state file and delete resources that are not null. +# The order is important to handle dependencies. + +if [[ $(jq -r '.dataprocCluster != null' "${STATE_FILE}") == "true" ]]; then + delete_dpgce_cluster fi -delete_dpgce_cluster - -delete_service_account - -delete_autoscaling_policy - -#delete_phs_cluster() - -#delete_mysql_instance -#delete_legacy_mssql_instance - -delete_default_route - -delete_nat_configs # Should be called before delete_router +if [[ $(jq -r '.autoscalingPolicy != null' "${STATE_FILE}") == "true" ]]; then + delete_autoscaling_policy +fi -delete_router +if [[ $(jq -r '.cloudRouter != null' "${STATE_FILE}") == "true" ]]; then + delete_router +fi -delete_firewall_rules +if [[ $(jq -r '.firewallRule != null' "${STATE_FILE}") == "true" ]]; then + delete_firewall_rules +fi -#delete_logging_firewall_rules +# Delete routes after firewall rules +if [[ $(jq -r '.routes | length > 0' "${STATE_FILE}") == "true" ]]; then + mapfile -t route_names < <(jq -r '.routes[].name' "${STATE_FILE}") + for route_name in "${route_names[@]}"; do + delete_route "${route_name}" + done +fi -#delete_ip_allocation +if [[ $(jq -r '.privateSubnet != null' "${STATE_FILE}") == "true" ]]; then + delete_subnet "${PRIVATE_SUBNET}" +fi -delete_subnet +if [[ $(jq -r '.standardSubnet != null' "${STATE_FILE}") == "true" ]]; then + delete_subnet "${SUBNET}" +fi -delete_vpc_network +if [[ $(jq -r '.serviceAccount != null' "${STATE_FILE}") == "true" ]]; then + delete_service_account +fi -#delete_vpc_peering +# Finally, attempt to delete the network. It should be empty now. 
+if [[ $(jq -r '.vpcNetwork != null' "${STATE_FILE}") == "true" ]]; then + delete_vpc_network +fi +# Conditionally delete buckets if --force is specified if [[ "${FORCE_DELETE}" = true ]]; then - delete_bucket + delete_gcs_bucket "${BUCKET}" + delete_gcs_bucket "${TEMP_BUCKET}" else print_status "Skipping Bucket Deletion. Use --force to delete buckets." report_result "Skipped" fi -set +x \ No newline at end of file +# After attempting deletion, run audit one last time to generate the final, clean state file. +"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null + +echo "========================================" +echo "DPGCE Environment teardown complete" +echo "Final state written to ${STATE_FILE}" +echo "========================================" +jq . < "${STATE_FILE}" diff --git a/gcloud/bin/recreate-cluster.sh b/gcloud/bin/recreate-cluster.sh new file mode 100755 index 00000000..ea47bc82 --- /dev/null +++ b/gcloud/bin/recreate-cluster.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Exit on failure +set -e + +# --- Get script's real directory --- +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" + +# --- Source environment variables and utility functions --- +source "${GCLOUD_DIR}/lib/env.sh" +source "${GCLOUD_DIR}/lib/script-utils.sh" +source "${GCLOUD_DIR}/lib/dataproc/cluster.sh" +source "${GCLOUD_DIR}/lib/dataproc/cluster-custom.sh" +source "${GCLOUD_DIR}/lib/dataproc/private-cluster.sh" +source "${GCLOUD_DIR}/lib/gcp/misc.sh" + +# --- Argument Parsing --- +IS_CUSTOM=false +IS_PRIVATE=false + +while [[ "$#" -gt 0 ]]; do + case $1 in + --custom) IS_CUSTOM=true ;; + --private) IS_PRIVATE=true ;; + *) echo "Unknown parameter passed: $1"; exit 1 ;; + esac + shift +done + +if (( DEBUG != 0 )); then + set -x +fi + +# --- Main Logic --- +configure_gcloud + +echo "========================================" +echo "Starting DPGCE Cluster Recreation" +echo "========================================" + +# Run audit to get the current state 
of the cluster +print_status "Auditing environment to determine current state..." +"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null +report_result "Done" + +# Check if a cluster exists and delete it +if [[ $(jq -r '.dataprocCluster != null' "${STATE_FILE}") == "true" ]]; then + delete_dpgce_cluster +fi + +# Re-create the cluster based on the flags provided +if [[ "$IS_PRIVATE" == "true" ]]; then + create_dpgce_private_cluster "$@" +else + create_dpgce_cluster "$@" +fi + +# After creation, run audit again to update state file with new resource details +"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null + +echo "========================================" +echo "DPGCE Cluster re-created" +echo "========================================" +print_cluster_details diff --git a/gcloud/bin/scp-m b/gcloud/bin/scp-m index 09c78937..d228985c 100755 --- a/gcloud/bin/scp-m +++ b/gcloud/bin/scp-m @@ -17,25 +17,25 @@ source lib/env.sh if [[ -z "$1" ]]; then - echo "$0 [master number] " + echo "$0 [-m node index] " exit 1 fi -# If the first argument is a number, assume it indicates which master to send file to -MASTER_HOSTNAME="${CLUSTER_NAME}-m" +# If the first argument is a number, assume it indicates which m node to send file to +M_HOSTNAME="${CLUSTER_NAME}-m" if [[ $(perl -e "print 1 if q{$1} =~ /^\d+$/") == "1" ]]; then - master_num="$1" - echo "master_num: $master_num" - MASTER_HOSTNAME="${MASTER_HOSTNAME}-${master_num}" + m_num="$1" + echo "m_num: $m_num" + M_HOSTNAME="${M_HOSTNAME}-${m_num}" shift 1 else - MASTER_HOSTNAME="${MASTER_HOSTNAME}" + M_HOSTNAME="${M_HOSTNAME}" fi date gcloud compute scp --recurse "$*" \ --zone ${ZONE} \ - ${MASTER_HOSTNAME}:/tmp \ + ${M_HOSTNAME}:/tmp \ --tunnel-through-iap \ --project ${PROJECT_ID} diff --git a/gcloud/bin/ssh-m b/gcloud/bin/ssh-m index 4bfbfbb9..0b847cd5 100755 --- a/gcloud/bin/ssh-m +++ b/gcloud/bin/ssh-m @@ -16,12 +16,14 @@ # source lib/env.sh -master_num="$1" - -MASTER_HOSTNAME="${CLUSTER_NAME}-m" -if [[ -n "$master_num" ]]; then - echo 
"master_num: $master_num" - MASTER_HOSTNAME="${MASTER_HOSTNAME}-${master_num}" +M_HOSTNAME="${CLUSTER_NAME}-m" +# If the first argument is a number, treat it as the master index for HA +if [[ "$1" =~ ^[0-9]+$ ]]; then + M_HOSTNAME="${M_HOSTNAME}-${1}" + shift # Remove the index from the arguments fi -gcloud compute ssh --zone ${ZONE} ${MASTER_HOSTNAME} --tunnel-through-iap --project ${PROJECT_ID} -- -o ConnectTimeout=360 +# The rest of the arguments are the command to be executed +COMMAND_TO_RUN="$@" + +gcloud compute ssh --zone "${ZONE}" "${M_HOSTNAME}" --tunnel-through-iap --project "${PROJECT_ID}" -- -t -o ConnectTimeout=360 -AY ${COMMAND_TO_RUN} diff --git a/gcloud/lib/env.sh b/gcloud/lib/env.sh index a4fe4d5d..88973797 100644 --- a/gcloud/lib/env.sh +++ b/gcloud/lib/env.sh @@ -17,15 +17,17 @@ # Set RESOURCE_SUFFIX based on TIMESTAMP env var or generate new if [[ -n "${TIMESTAMP}" ]]; then export RESOURCE_SUFFIX="${TIMESTAMP}" - echo "Using provided TIMESTAMP for resources: ${RESOURCE_SUFFIX}" + echo "Using provided TIMESTAMP for resources: ${RESOURCE_SUFFIX}" >&2 else export RESOURCE_SUFFIX="$(date +%s)" - echo "Generated new TIMESTAMP for resources: ${RESOURCE_SUFFIX}" + echo "Generated new TIMESTAMP for resources: ${RESOURCE_SUFFIX}" >&2 fi export REPRO_TMPDIR="${REPRO_TMPDIR:-/tmp/dataproc-repro/${RESOURCE_SUFFIX}}" mkdir -p "${REPRO_TMPDIR}" -export SENTINEL_DIR="${SENTINEL_DIR:-${REPRO_TMPDIR}/sentinels}" -mkdir -p "${SENTINEL_DIR}" +export LOG_DIR="${LOG_DIR:-${REPRO_TMPDIR}/logs}" +mkdir -p "${LOG_DIR}" +export STATE_FILE="${REPRO_TMPDIR}/state.json" + source lib/script-utils.sh @@ -44,13 +46,23 @@ export CLUSTER_NAME="$(jq -r .CLUSTER_NAME env.json)" export BUCKET="$(jq -r .BUCKET env.json)" export TEMP_BUCKET="$(jq -r .TEMP_BUCKET env.json)" export RANGE="$(jq -r .RANGE env.json)" +export PRIVATE_RANGE="$(jq -r .PRIVATE_RANGE env.json)" +export PRIVATE_SUBNET="private-subnet-${CLUSTER_NAME}" +export SWP_RANGE="$(jq -r .SWP_RANGE env.json)" +export 
SWP_SUBNET="swp-subnet-${CLUSTER_NAME}" export IDLE_TIMEOUT="$(jq -r .IDLE_TIMEOUT env.json)" export ASN_NUMBER="$(jq -r .ASN_NUMBER env.json)" export IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" export REGION="$(jq -r .REGION env.json)" +export SWP_IP="$(jq -r .SWP_IP env.json)" +export SWP_PORT="$(jq -r .SWP_PORT env.json)" +export SWP_HOSTNAME="$(jq -r .SWP_HOSTNAME env.json)" +export SWP_POLICY_NAME="swp-policy-${CLUSTER_NAME}" +export SWP_INSTANCE_NAME="swp-gateway-${CLUSTER_NAME}" +export SWP_CERT_NAME="swp-cert-${CLUSTER_NAME}-${RESOURCE_SUFFIX}" export DEBUG="${DEBUG:-0}" -export ZONE="${REGION}-b" +export ZONE="${REGION}-a" #export ZONE="${REGION}-b" #export IMAGE_VERSION="2.0" #export IMAGE_VERSION="2.0.67-debian10" # final proprietary gpu support - April 26, 2024 - 5.10.0-0.deb10.16-amd64 @@ -151,7 +163,12 @@ export MR_HISTORY_BUCKET="${BUCKET}/*/mapreduce-job-history/done" #export MACHINE_TYPE="n1-highmem-8" #export MACHINE_TYPE="n1-standard-16" export MACHINE_TYPE="n1-standard-32" +#export MACHINE_TYPE="n1-standard-64" #export MACHINE_TYPE="n1-standard-96" +#export MACHINE_TYPE="n1-highmem-32" +#export MACHINE_TYPE="a4x-highgpu-4g" +#export MACHINE_TYPE="a4x-maxgpu-4g-metal" +#export MACHINE_TYPE="a4-highgpu-8g" #export MACHINE_TYPE="e2-standard-2" # g2- are for l4 GPUs #export MACHINE_TYPE="g2-standard-4" @@ -161,15 +178,20 @@ export MACHINE_TYPE="n1-standard-32" #export MACHINE_TYPE="a2-highgpu-1g" #export MACHINE_TYPE="a2-highgpu-2g" # a3- are for h100 GPUs +# export MACHINE_TYPE="g4-standard-48" # (1 GPU) +# export MACHINE_TYPE="g4-standard-96" # (2 GPUs) +# export MACHINE_TYPE="g4-standard-192" # (4 GPUs) +# export MACHINE_TYPE="g4-standard-384" # (8 GPUs) #export MACHINE_TYPE="a3-highgpu-8g" +#export MACHINE_TYPE="a3-highgpu-1g" #export MACHINE_TYPE="a3-highgpu-2g" #export MACHINE_TYPE="a3-highgpu-4g" #export MACHINE_TYPE="n2d-standard-8" -export MASTER_MACHINE_TYPE="${MACHINE_TYPE}" -#export MASTER_MACHINE_TYPE="n1-standard-96" -#export 
MASTER_MACHINE_TYPE="n1-standard-8" -#export MASTER_MACHINE_TYPE="a2-highgpu-8g" -#export MASTER_MACHINE_TYPE="a3-highgpu-8g" +export M_MACHINE_TYPE="${MACHINE_TYPE}" +#export M_MACHINE_TYPE="n1-standard-96" +#export M_MACHINE_TYPE="n1-standard-8" +#export M_MACHINE_TYPE="a2-highgpu-8g" +#export M_MACHINE_TYPE="a3-highgpu-8g" export PRIMARY_MACHINE_TYPE="${MACHINE_TYPE}" #export PRIMARY_MACHINE_TYPE="n1-standard-8" #export PRIMARY_MACHINE_TYPE="g2-standard-4" @@ -183,19 +205,29 @@ export SECONDARY_MACHINE_TYPE="${PRIMARY_MACHINE_TYPE}" #export ACCELERATOR_TYPE="nvidia-tesla-p100" #export ACCELERATOR_TYPE="nvidia-tesla-a100" #export ACCELERATOR_TYPE="nvidia-tesla-a100,count=2" -export ACCELERATOR_TYPE="nvidia-tesla-t4" +#export ACCELERATOR_TYPE="nvidia-tesla-t4" +export ACCELERATOR_TYPE="nvidia-tesla-t4,count=4" +#export ACCELERATOR_TYPE="nvidia-rtx-pro-6000,count=1" +#export ACCELERATOR_TYPE="nvidia-rtx-pro-6000,count=2" +#export ACCELERATOR_TYPE="nvidia-rtx-pro-6000,count=4" # works +#export ACCELERATOR_TYPE="nvidia-rtx-pro-6000,count=8" +#export ACCELERATOR_TYPE="nvidia-gb300,count=4" +#export ACCELERATOR_TYPE="nvidia-gb200,count=8" +#export ACCELERATOR_TYPE="nvidia-b200,count=4" #export ACCELERATOR_TYPE="nvidia-l4" #export ACCELERATOR_TYPE="nvidia-tesla-p4,count=2" #export ACCELERATOR_TYPE="nvidia-tesla-p100,count=2" #export ACCELERATOR_TYPE="nvidia-tesla-v100,count=4" +#export ACCELERATOR_TYPE="nvidia-h100-80gb,count=8" #export ACCELERATOR_TYPE="nvidia-h100-80gb,count=4" #export ACCELERATOR_TYPE="nvidia-h100-80gb,count=2" -#export MASTER_ACCELERATOR_TYPE="nvidia-tesla-t4,count=4" -#export MASTER_ACCELERATOR_TYPE="nvidia-tesla-t4" -#export MASTER_ACCELERATOR_TYPE="nvidia-tesla-a100,count=2" -#export MASTER_ACCELERATOR_TYPE="nvidia-tesla-a100,count=8" -#export MASTER_ACCELERATOR_TYPE="nvidia-h100-80gb,count=8" -export MASTER_ACCELERATOR_TYPE="${ACCELERATOR_TYPE}" +#export ACCELERATOR_TYPE="nvidia-h100-80gb,count=1" +#export 
M_ACCELERATOR_TYPE="nvidia-tesla-t4,count=4" +#export M_ACCELERATOR_TYPE="nvidia-tesla-t4" +#export M_ACCELERATOR_TYPE="nvidia-tesla-a100,count=2" +#export M_ACCELERATOR_TYPE="nvidia-tesla-a100,count=8" +#export M_ACCELERATOR_TYPE="nvidia-h100-80gb,count=8" +export M_ACCELERATOR_TYPE="${ACCELERATOR_TYPE}" export PRIMARY_ACCELERATOR_TYPE="${ACCELERATOR_TYPE}" export SECONDARY_ACCELERATOR_TYPE="${ACCELERATOR_TYPE}" #export CUDA_VERSION=10.2.89 @@ -225,6 +257,8 @@ export SECONDARY_ACCELERATOR_TYPE="${ACCELERATOR_TYPE}" #export CUDA_VERSION="12.6" #export CUDA_VERSION="12.6.2" #export CUDA_VERSION="12.6.3" +export CUDA_VERSION="13.1.0" +export DRIVER_VERSION="590.48.01" #export DRIVER_VERSION="550.142" #export DRIVER_VERSION="460.73.01" #export DRIVER_VERSION="550.54.14" @@ -267,6 +301,17 @@ export HIVE_DATA_BUCKET="${BUCKET}" export WAREHOUSE_BUCKET="gs://${HIVE_DATA_BUCKET}" export HIVE_METASTORE_WAREHOUSE_DIR="${WAREHOUSE_BUCKET}/datasets" +# CI/CD Variables +export CI_PROJECT_ID="$(jq -r .CI_PROJECT_ID env.json)" +export CI_GCP_CREDENTIALS_PATH="$(jq -r .CI_GCP_CREDENTIALS_PATH env.json)" +export CI_CSR_REPO_NAME="$(jq -r .CI_CSR_REPO_NAME env.json)" +export CI_CSR_REGION="$(jq -r .CI_CSR_REGION env.json)" +export CI_GITHUB_CONNECTION_NAME="$(jq -r .CI_GITHUB_CONNECTION_NAME env.json)" +export CI_TRIGGER_BRANCH="$(jq -r .CI_TRIGGER_BRANCH env.json)" +export CUSTOM_IMAGE_URI="$(jq -r .CUSTOM_IMAGE_URI env.json)" +export CI_REPO_OWNER="$(jq -r .CI_REPO_OWNER env.json)" +export CI_BYOSA_EMAIL="$(jq -r .CI_BYOSA_EMAIL env.json)" + function configure_environment() { dataproc_repro_configure_environment=1 diff --git a/gcloud/lib/script-utils.sh b/gcloud/lib/script-utils.sh index 30ec8ff9..e8180152 100644 --- a/gcloud/lib/script-utils.sh +++ b/gcloud/lib/script-utils.sh @@ -79,95 +79,39 @@ function parse_args() { } export -f parse_args -# --- Sentinel Functions --- -function get_sentinel_file() { - local phase_name="$1" - local sentinel_name="$2" - echo 
"${SENTINEL_DIR}/${phase_name}-${sentinel_name}" -} -export -f get_sentinel_file - -function create_sentinel() { - local phase_name="$1" - local sentinel_name="$2" - touch "$(get_sentinel_file "${phase_name}" "${sentinel_name}")" -} -export -f create_sentinel - -function check_sentinel() { - local phase_name="$1" - local sentinel_name="$2" - [[ -f "$(get_sentinel_file "${phase_name}" "${sentinel_name}")" ]] -} -export -f check_sentinel - -function remove_sentinel() { - local phase_name="$1" - local sentinel_name="$2" - rm -f "$(get_sentinel_file "${phase_name}" "${sentinel_name}")" +# --- State Management Functions --- +function get_state() { + if [[ ! -f "${STATE_FILE}" ]]; then + echo "{}" + return + fi + cat "${STATE_FILE}" } -export -f remove_sentinel -function clear_sentinels() { - local phase_name="$1" - rm -f "${SENTINEL_DIR}/${phase_name}-*" +function update_state() { + local resource_key=$1 + local resource_value=$2 # This should be a JSON string or "null" + + local current_state=$(get_state) + local new_state=$(jq --arg key "${resource_key}" --argjson value "${resource_value}" '.[$key] = $value' <<< "${current_state}") + echo "${new_state}" > "${STATE_FILE}" } -export -f clear_sentinels # --- Audit Check Functions --- -function check_resource() { - local test_name="$1" - local command_to_run="$2" - local grep_pattern="$3" - local optional="${4:-false}" - local log_file="${LOG_DIR}/$(echo "$test_name" | tr ' /:' '___').log" - - print_status "Checking: ${test_name}... 
" - - eval "${command_to_run}" > "${log_file}" 2>&1 - - if grep -q "${grep_pattern}" "${log_file}"; then - if [[ "${optional}" == "true" && "${FORCE_AUDIT}" == "false" ]]; then - report_result "Kept" - return 0 - else - report_result "Fail" - return 1 - fi - else - report_result "Not Found" - return 0 - fi -} -export -f check_resource - -function check_resource_exact() { - local test_name="$1" - local command_to_run="$2" - local optional="${3:-false}" - local log_file="${LOG_DIR}/$(echo "$test_name" | tr ' /:' '___').log" - - print_status "Checking: ${test_name}... " - if eval "${command_to_run}" > "${log_file}" 2>&1; then - # Command succeeded, check if it produced output - if [[ $(wc -l < "${log_file}") -gt 0 ]]; then - # Output found - if [[ "${optional}" == "true" && "${FORCE_AUDIT}" == "false" ]]; then - report_result "Kept" - return 0 - else - report_result "Fail" - return 1 - fi - else - # Command succeeded but no output, so Not Found - report_result "Not Found" - return 0 - fi +# These functions are now designed to be called by the audit script. +# They return a JSON object with details if a resource is found, or the string "null". +function _check_exists() { + local command_to_run="$1" + local json_output + + # The command_to_run should be a gcloud command with --format=json + # that returns a JSON object if the resource exists and fails otherwise. + json_output=$(eval "${command_to_run}" 2>/dev/null) + + if [[ -n "${json_output}" ]]; then + echo "${json_output}" else - # Command failed, resource likely does not exist - report_result "Not Found" - return 0 + echo "null" fi } -export -f check_resource_exact +export -f _check_exists From 07e19dde1893895ab014921587faa218e8efddcb Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Mar 2026 02:51:56 +0000 Subject: [PATCH 02/25] feat: Implement remaining logic and add CI/CD scripts - Added cluster creation logic for custom and private cluster types in lib/dataproc/. 
- Updated various library functions in lib/ to support the new unified scripts. - Added new scripts for CI/CD setup (bin/setup-cicd.sh) and GitHub triggers (bin/create-github-trigger.sh). --- gcloud/bin/create-github-trigger.sh | 41 +++ gcloud/bin/setup-cicd.sh | 222 ++++++++++++++++ gcloud/lib/bigtable.sh | 54 +--- gcloud/lib/database/mssql.sh | 116 +++----- gcloud/lib/database/mysql.sh | 50 ++-- gcloud/lib/database/oracle.sh | 126 +++++++++ gcloud/lib/database/pgsql.sh | 52 ++-- gcloud/lib/dataproc/autoscaling.sh | 42 +-- gcloud/lib/dataproc/cluster-custom.sh | 188 +++++++++++++ gcloud/lib/dataproc/cluster.sh | 115 ++++---- gcloud/lib/dataproc/private-cluster.sh | 98 +++++++ gcloud/lib/gcp/gcr.sh | 27 +- gcloud/lib/gcp/gcs.sh | 68 ++--- gcloud/lib/gcp/iam.sh | 117 ++------- gcloud/lib/gcp/kms.sh | 139 +++------- gcloud/lib/gcp/misc.sh | 80 +++--- gcloud/lib/gcp/project.sh | 10 - gcloud/lib/gke.sh | 76 ++---- gcloud/lib/kerberos.sh | 142 ++++------ gcloud/lib/network/firewall.sh | 28 +- gcloud/lib/network/network.sh | 58 ++-- gcloud/lib/network/router.sh | 129 +++------ gcloud/lib/network/routes.sh | 47 +--- gcloud/lib/network/subnet.sh | 56 ++-- gcloud/lib/phs.sh | 52 ++-- gcloud/lib/secure-boot/create-key-pair.sh | 2 +- gcloud/lib/shared-functions.sh | 16 +- gcloud/lib/swp/certs.sh | 307 ++++++---------------- gcloud/lib/swp/firewall.sh | 108 +++----- gcloud/lib/swp/gateway.sh | 62 ++--- gcloud/lib/swp/policy.sh | 121 +++------ gcloud/lib/swp/subnet.sh | 104 +++----- 32 files changed, 1335 insertions(+), 1518 deletions(-) create mode 100755 gcloud/bin/create-github-trigger.sh create mode 100755 gcloud/bin/setup-cicd.sh create mode 100644 gcloud/lib/database/oracle.sh create mode 100644 gcloud/lib/dataproc/cluster-custom.sh create mode 100644 gcloud/lib/dataproc/private-cluster.sh diff --git a/gcloud/bin/create-github-trigger.sh b/gcloud/bin/create-github-trigger.sh new file mode 100755 index 00000000..8252dcf8 --- /dev/null +++ 
b/gcloud/bin/create-github-trigger.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +source lib/env.sh + +# --- Configuration --- +PROJECT_ID="${CI_PROJECT_ID}" +REPO_NAME="${CI_CSR_REPO_NAME}" +REPO_OWNER="${CI_REPO_OWNER}" +# BRANCH_NAME is not used for PR triggers with comment control +REGION="${CI_CSR_REGION}" +BYOSA_EMAIL="${CI_BYOSA_EMAIL}" + +if [[ -z "$PROJECT_ID" || "$PROJECT_ID" == "your-ci-test-project-id" ]]; then + echo "ERROR: CI_PROJECT_ID not set or is placeholder in env.json" >&2 + exit 1 +fi +if [[ -z "$REPO_NAME" ]]; then + echo "ERROR: CI_CSR_REPO_NAME not set in env.json" >&2 + exit 1 +fi +if [[ -z "$REPO_OWNER" ]]; then + echo "ERROR: CI_REPO_OWNER not set in env.json" >&2 + exit 1 +fi +if [[ -z "$BYOSA_EMAIL" ]]; then + echo "ERROR: CI_BYOSA_EMAIL not set in env.json" >&2 + exit 1 +fi + +gcloud beta builds triggers create github --project="$PROJECT_ID" \ + --name="${REPO_NAME}-pr-trigger" \ + --repo-name="$REPO_NAME" \ + --repo-owner="$REPO_OWNER" \ + --pull-request-pattern=".*" \ + --comment-control=COMMENTS_ENABLED \ + --build-config="gcloud/cloudbuild.yaml" \ + --region="$REGION" \ + --service-account="projects/${PROJECT_ID}/serviceAccounts/${BYOSA_EMAIL}" + +echo "Trigger creation command for '/gcbrun' comments on PRs executed." +echo "Ensure the Google Cloud Build GitHub App has permissions to read PRs and comments." 
diff --git a/gcloud/bin/setup-cicd.sh b/gcloud/bin/setup-cicd.sh new file mode 100755 index 00000000..b1f5b290 --- /dev/null +++ b/gcloud/bin/setup-cicd.sh @@ -0,0 +1,222 @@ +#!/bin/bash + +# One-time setup script for CI/CD environment using Cloud Build + +set -e +set -u + +export TIMESTAMP="${TIMESTAMP:-$(date +%s)}" +source lib/env.sh # Sources variables from env.json + +# --- Configuration --- +CI_PROJECT_ID="${CI_PROJECT_ID}" +CI_GCP_CREDENTIALS_PATH="${CI_GCP_CREDENTIALS_PATH}" +CI_CSR_REPO_NAME="${CI_CSR_REPO_NAME}" +CI_CSR_REGION="${CI_CSR_REGION}" +CI_GITHUB_CONNECTION_NAME="${CI_GITHUB_CONNECTION_NAME}" +CI_TRIGGER_BRANCH="${CI_TRIGGER_BRANCH}" + +if [[ "$CI_PROJECT_ID" == "your-ci-test-project-id" || -z "$CI_PROJECT_ID" ]]; then + echo "ERROR: Please update CI_PROJECT_ID in env.json" + exit 1 +fi +if [[ "$CI_GCP_CREDENTIALS_PATH" == "/path/to/your/ci-service-account.json" || ! -f "$CI_GCP_CREDENTIALS_PATH" ]]; then + echo "ERROR: Please update CI_GCP_CREDENTIALS_PATH in env.json and ensure the file exists." + exit 1 +fi +if [[ -z "$CI_CSR_REPO_NAME" ]]; then + echo "ERROR: Please update CI_CSR_REPO_NAME in env.json." + exit 1 +fi + +echo "Setting up CI/CD for project: $CI_PROJECT_ID" +gcloud config set project "$CI_PROJECT_ID" + +# --- Enable Services --- +echo "Enabling necessary services in $CI_PROJECT_ID..." +gcloud services enable \ + secretmanager.googleapis.com \ + cloudbuild.googleapis.com \ + containerregistry.googleapis.com \ + compute.googleapis.com \ + dataproc.googleapis.com \ + container.googleapis.com \ + networksecurity.googleapis.com \ + networkservices.googleapis.com \ + privateca.googleapis.com \ + certificatemanager.googleapis.com \ + sourcerepo.googleapis.com \ + --project="$CI_PROJECT_ID" + +# --- Create Secrets --- +echo "Creating secrets in $CI_PROJECT_ID..." 
+ +# Use the main env.json for the test environment secret +gcloud secrets create test-env-json --replication-policy=automatic --project="$CI_PROJECT_ID" --quiet || echo "Secret test-env-json already exists." +gcloud secrets versions add test-env-json --data-file="env.json" --project="$CI_PROJECT_ID" + +gcloud secrets create gcp-credentials --replication-policy=automatic --project="$CI_PROJECT_ID" --quiet || echo "Secret gcp-credentials already exists." +gcloud secrets versions add gcp-credentials --data-file="$CI_GCP_CREDENTIALS_PATH" --project="$CI_PROJECT_ID" + +echo "Secrets created." + +# --- Create BYOSA Service Account --- +BYOSA_EMAIL="${CI_BYOSA_EMAIL}" +BYOSA_NAME=$(echo "$BYOSA_EMAIL" | cut -d @ -f 1) +echo "Checking for BYOSA: $BYOSA_EMAIL" +if ! gcloud iam service-accounts describe "$BYOSA_EMAIL" --project="$CI_PROJECT_ID" > /dev/null 2>&1; then + echo "Creating BYOSA: $BYOSA_EMAIL..." + gcloud iam service-accounts create "$BYOSA_NAME" \ + --description="Service Account for Cloud Build CI/CD" \ + --display-name="Dataproc Repro CICD" \ + --project="$CI_PROJECT_ID" +else + echo "BYOSA $BYOSA_EMAIL already exists." +fi + +# Grant Owner role to BYOSA (SCOPE DOWN FOR PRODUCTION) +echo "WARNING: Granting roles/owner to the BYOSA $BYOSA_EMAIL in $CI_PROJECT_ID." 
+gcloud projects add-iam-policy-binding "$CI_PROJECT_ID" \ + --member="serviceAccount:$BYOSA_EMAIL" \ + --role='roles/owner' + +# --- Grant Permissions to Cloud Build SA --- +PROJECT_NUMBER=$(gcloud projects describe "$CI_PROJECT_ID" --format="value(projectNumber)") +CLOUD_BUILD_SA="${PROJECT_NUMBER}@cloudbuild.gserviceaccount.com" + +echo "Granting permissions to Cloud Build service account: $CLOUD_BUILD_SA" + +# Grant access to secrets +gcloud secrets add-iam-policy-binding test-env-json \ + --member="serviceAccount:${CLOUD_BUILD_SA}" \ + --role='roles/secretmanager.secretAccessor' \ + --project="$CI_PROJECT_ID" +gcloud secrets add-iam-policy-binding gcp-credentials \ + --member="serviceAccount:${CLOUD_BUILD_SA}" \ + --role='roles/secretmanager.secretAccessor' \ + --project="$CI_PROJECT_ID" + +echo "WARNING: Granting roles/owner to the Cloud Build SA in $CI_PROJECT_ID. Scope down for production." +gcloud projects add-iam-policy-binding "$CI_PROJECT_ID" \ + --member="serviceAccount:${CLOUD_BUILD_SA}" \ + --role='roles/owner' + +# Grant Cloud Build SA permission to ACT AS the BYOSA +echo "Granting Service Account User role to $CLOUD_BUILD_SA on $BYOSA_EMAIL..." +gcloud iam service-accounts add-iam-policy-binding "$BYOSA_EMAIL" \ + --member="serviceAccount:${CLOUD_BUILD_SA}" \ + --role='roles/iam.serviceAccountUser' \ + --project="$CI_PROJECT_ID" + +echo "Permissions granted." + +# --- Write cloudbuild.yaml --- +CLOUDBUILD_FILE="cloudbuild.yaml" +echo "Writing $CLOUDBUILD_FILE..." 
+cat > "$CLOUDBUILD_FILE" << EOF +steps: + - name: 'gcr.io/cloud-builders/gcloud' + entrypoint: 'bash' + args: + - '-c' + - | + echo "\$TEST_ENV_JSON" > env.json + echo "CI/CD env.json content:" + cat env.json + # --- Test Standard DPGCE --- + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Standard Create' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/create-dpgce && ./bin/audit-dpgce-create'] + env: + - 'PROJECT_ID=\$PROJECT_ID' + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Standard Destroy' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/destroy-dpgce --force && ./bin/audit-dpgce-destroy --force'] + env: + - 'PROJECT_ID=\$PROJECT_ID' + waitFor: ['Test Standard Create'] + + # --- Test Private DPGCE --- + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Private Create' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/create-dpgce-private && ./bin/audit-private-create'] + env: + - 'PROJECT_ID=\$PROJECT_ID' + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Private Destroy' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/destroy-dpgce-private --force && ./bin/audit-private-destroy --force'] + env: + - 'PROJECT_ID=\$PROJECT_ID' + waitFor: ['Test Private Create'] + + # --- Test Custom Standard DPGCE --- + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Custom Std Create' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/create-dpgce-custom && ./bin/audit-dpgce-create-custom'] + env: + - 'PROJECT_ID=\$PROJECT_ID' + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Custom Std Destroy' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/destroy-dpgce --force && ./bin/audit-dpgce-destroy --force'] + env: + - 'PROJECT_ID=\$PROJECT_ID' + waitFor: ['Test Custom Std Create'] + + # --- Test Custom Private DPGCE --- + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Custom Pvt Create' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && 
./bin/create-dpgce-custom-private && ./bin/audit-dpgce-create-custom-private'] + env: + - 'PROJECT_ID=\$PROJECT_ID' + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Custom Pvt Destroy' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/destroy-dpgce-private --force && ./bin/audit-private-destroy --force'] + env: + - 'PROJECT_ID=\$PROJECT_ID' + waitFor: ['Test Custom Pvt Create'] + +availableSecrets: + secretManager: + - versionName: projects/\$PROJECT_ID/secrets/test-env-json/versions/latest + env: 'TEST_ENV_JSON' + +options: + env: + - 'PROJECT_ID=$CI_PROJECT_ID' # Use CI_PROJECT_ID here + - 'CLOUDSDK_CORE_DISABLE_PROMPTS=1' +substitutions: + _CI_PROJECT_ID: "$CI_PROJECT_ID" +timeout: 3600s # 60 minutes +EOF +echo "$CLOUDBUILD_FILE written." + +# --- Configure Triggers --- +echo "Instructions to create Cloud Build Trigger:" +echo "1. Go to Cloud Build Triggers in project $CI_PROJECT_ID." +echo "2. Connect your repository if not already done." +echo "3. Create a trigger:" +echo " - Name: ${CI_CSR_REPO_NAME}-trigger" +echo " - Event: Push to a branch" +echo " - Source: Your repository, Branch: ^${CI_TRIGGER_BRANCH}$" +echo " - Configuration: Cloud Build configuration file (yaml)" +echo " - Location: /cloudbuild.yaml" +echo " - Service Account: Default Cloud Build SA should be fine with above permissions." + +echo "Alternative gcloud command to create trigger (adjust connection details if not using CSR):" +echo "gcloud beta builds triggers create github --project="$CI_PROJECT_ID" " +echo " --name="${CI_CSR_REPO_NAME}-trigger" " +echo " --repo-name="${CI_CSR_REPO_NAME}" " +echo " --repo-owner="YOUR_GITHUB_USERNAME" " +echo " --branch-pattern="^${CI_TRIGGER_BRANCH}$" " +echo " --build-config="cloudbuild.yaml" " +echo " --region="$CI_CSR_REGION"" + +echo "Setup script finished. PLEASE REVIEW AND ADJUST env.json and the trigger commands." 
diff --git a/gcloud/lib/bigtable.sh b/gcloud/lib/bigtable.sh index 1744d914..0ae0b849 100644 --- a/gcloud/lib/bigtable.sh +++ b/gcloud/lib/bigtable.sh @@ -1,60 +1,32 @@ #!/bin/bash +# # Bigtable functions function exists_bigtable_instance() { - BIGTABLE_INSTANCES="$(gcloud bigtable instances list --format=json)" - JQ_CMD=".[] | select(.name | test(\"${BIGTABLE_INSTANCE}$\"))" - OUR_INSTANCE=$(echo ${BIGTABLE_INSTANCES} | jq -c "${JQ_CMD}") - - if [[ -z "${OUR_INSTANCE}" ]]; then - return 1 - else - return 0 - fi + _check_exists "gcloud bigtable instances describe '${BIGTABLE_INSTANCE}' --format='json(name,displayName)'" } -export -f exists_bigtable_instance function create_bigtable_instance() { - local phase_name="create_bigtable_instance" - if check_sentinel "${phase_name}" "done"; then - print_status "Checking Bigtable Instance ${BIGTABLE_INSTANCE}..." - report_result "Exists" - return 0 - fi - print_status "Creating Bigtable Instance ${BIGTABLE_INSTANCE}..." - if exists_bigtable_instance; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_bigtable_${BIGTABLE_INSTANCE}.log" + if run_gcloud "${log_file}" gcloud bigtable instances create "${BIGTABLE_INSTANCE}" \ + --display-name="${BIGTABLE_DISPLAY_NAME}" \ + --cluster-config="${BIGTABLE_CLUSTER_CONFIG}"; then + report_result "Created" else - local log_file="create_bigtable_${BIGTABLE_INSTANCE}.log" - if run_gcloud "${log_file}" gcloud bigtable instances create ${BIGTABLE_INSTANCE} \ - --display-name "${BIGTABLE_DISPLAY_NAME}" \ - --cluster-config="${BIGTABLE_CLUSTER_CONFIG}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_bigtable_instance function delete_bigtable_instance() { - local phase_name="create_bigtable_instance" - remove_sentinel "${phase_name}" "done" - print_status "Deleting Bigtable Instance ${BIGTABLE_INSTANCE}..." 
local log_file="delete_bigtable_${BIGTABLE_INSTANCE}.log" - if exists_bigtable_instance; then - if run_gcloud "${log_file}" gcloud bigtable instances delete --quiet ${BIGTABLE_INSTANCE}; then - report_result "Deleted" - else - report_result "Fail" - fi + if run_gcloud "${log_file}" gcloud bigtable instances delete --quiet "${BIGTABLE_INSTANCE}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } -export -f delete_bigtable_instance \ No newline at end of file +export -f delete_bigtable_instance diff --git a/gcloud/lib/database/mssql.sh b/gcloud/lib/database/mssql.sh index 10d1f88d..339ebb1b 100644 --- a/gcloud/lib/database/mssql.sh +++ b/gcloud/lib/database/mssql.sh @@ -3,107 +3,67 @@ # MS SQL Cloud SQL functions function create_legacy_mssql_instance() { - local phase_name="create_legacy_mssql_instance" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Legacy MSSQL Instance ${MSSQL_INSTANCE}..." - report_result "Exists" - return 0 - fi - print_status "Creating Legacy MSSQL Instance ${MSSQL_INSTANCE}..." 
- if gcloud compute instances describe "${MSSQL_INSTANCE}" --zone "${ZONE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_legacy_mssql_${MSSQL_INSTANCE}.log" + local METADATA="kdc-root-passwd=${INIT_ACTIONS_ROOT}/${KDC_ROOT_PASSWD_KEY}.encrypted" + METADATA="${METADATA},kms-keyring=${KMS_KEYRING}" + METADATA="${METADATA},kdc-root-passwd-key=${KDC_ROOT_PASSWD_KEY}" + METADATA="${METADATA},startup-script-url=${INIT_ACTIONS_ROOT}/kdc-server.sh" + METADATA="${METADATA},service-account-user=${GSA}" + if run_gcloud "${log_file}" gcloud compute instances create "${MSSQL_INSTANCE}" \ + --zone "${ZONE}" \ + --subnet "${SUBNET}" \ + --service-account="${GSA}" \ + --boot-disk-type pd-ssd \ + --image-family="${MSSQL_IMAGE_FAMILY}" \ + --image-project="${MSSQL_IMAGE_PROJECT}" \ + --machine-type="${MSSQL_MACHINE_TYPE}" \ + --scopes='cloud-platform' \ + --metadata "${METADATA}"; then + report_result "Created" else - local log_file="create_legacy_mssql_${MSSQL_INSTANCE}.log" - local METADATA="kdc-root-passwd=${INIT_ACTIONS_ROOT}/${KDC_ROOT_PASSWD_KEY}.encrypted" - METADATA="${METADATA},kms-keyring=${KMS_KEYRING}" - METADATA="${METADATA},kdc-root-passwd-key=${KDC_ROOT_PASSWD_KEY}" - METADATA="${METADATA},startup-script-url=${INIT_ACTIONS_ROOT}/kdc-server.sh" - METADATA="${METADATA},service-account-user=${GSA}" - if run_gcloud "${log_file}" gcloud compute instances create "${MSSQL_INSTANCE}" \ - --zone "${ZONE}" \ - --subnet "${SUBNET}" \ - --service-account="${GSA}" \ - --boot-disk-type pd-ssd \ - --image-family="${MSSQL_IMAGE_FAMILY}" \ - --image-project="${MSSQL_IMAGE_PROJECT}" \ - --machine-type="${MSSQL_MACHINE_TYPE}" \ - --scopes='cloud-platform' \ - --metadata "${METADATA}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_legacy_mssql_instance function 
delete_legacy_mssql_instance() { - local phase_name="create_legacy_mssql_instance" - remove_sentinel "${phase_name}" "done" - print_status "Deleting Legacy MSSQL Instance ${MSSQL_INSTANCE}..." - if gcloud compute instances describe "${MSSQL_INSTANCE}" --zone "${ZONE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="delete_legacy_mssql_${MSSQL_INSTANCE}.log" - if run_gcloud "${log_file}" gcloud compute instances delete "${MSSQL_INSTANCE}" --zone "${ZONE}" --project="${PROJECT_ID}" --quiet; then - report_result "Deleted" - else - report_result "Fail" - fi + local log_file="delete_legacy_mssql_${MSSQL_INSTANCE}.log" + if run_gcloud "${log_file}" gcloud compute instances delete "${MSSQL_INSTANCE}" --zone "${ZONE}" --project="${PROJECT_ID}" --quiet; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_legacy_mssql_instance function create_mssql_instance() { - local phase_name="create_mssql_instance" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating MSSQL Instance ${MSSQL_INSTANCE}..." - report_result "Exists" - return 0 - fi - print_status "Creating MSSQL Instance ${MSSQL_INSTANCE}..." 
- if gcloud sql instances describe "${MSSQL_INSTANCE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_mssql_${MSSQL_INSTANCE}.log" + if run_gcloud "${log_file}" gcloud sql instances create "${MSSQL_INSTANCE}" \ + --no-assign-ip \ + --project="${PROJECT_ID}" \ + --network="${NETWORK_URI_PARTIAL}" \ + --database-version="${MSSQL_DATABASE_VERSION}" \ + --activation-policy=ALWAYS \ + --zone "${ZONE}"; then + report_result "Created" else - local log_file="create_mssql_${MSSQL_INSTANCE}.log" - if run_gcloud "${log_file}" gcloud sql instances create "${MSSQL_INSTANCE}" \ - --no-assign-ip \ - --project="${PROJECT_ID}" \ - --network="${NETWORK_URI_PARTIAL}" \ - --database-version="${MSSQL_DATABASE_VERSION}" \ - --activation-policy=ALWAYS \ - --zone "${ZONE}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_mssql_instance function delete_mssql_instance() { - local phase_name="create_mssql_instance" - remove_sentinel "${phase_name}" "done" - print_status "Deleting MSSQL Instance ${MSSQL_INSTANCE}..." 
- if gcloud sql instances describe "${MSSQL_INSTANCE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="delete_mssql_${MSSQL_INSTANCE}.log" - if run_gcloud "${log_file}" gcloud sql instances delete --quiet "${MSSQL_INSTANCE}" --project="${PROJECT_ID}"; then - report_result "Deleted" - else - report_result "Fail" - fi + local log_file="delete_mssql_${MSSQL_INSTANCE}.log" + if run_gcloud "${log_file}" gcloud sql instances delete --quiet "${MSSQL_INSTANCE}" --project="${PROJECT_ID}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_mssql_instance diff --git a/gcloud/lib/database/mysql.sh b/gcloud/lib/database/mysql.sh index 76202d6e..e48bcb33 100644 --- a/gcloud/lib/database/mysql.sh +++ b/gcloud/lib/database/mysql.sh @@ -3,50 +3,30 @@ # MySQL Cloud SQL functions function create_mysql_instance() { - local phase_name="create_mysql_instance" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating MySQL Instance ${MYSQL_INSTANCE}..." - report_result "Exists" - return 0 - fi - print_status "Creating MySQL Instance ${MYSQL_INSTANCE}..." 
- if gcloud sql instances describe "${MYSQL_INSTANCE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_mysql_${MYSQL_INSTANCE}.log" + if run_gcloud "${log_file}" gcloud sql instances create "${MYSQL_INSTANCE}" \ + --no-assign-ip \ + --project="${PROJECT_ID}" \ + --network="${NETWORK_URI_PARTIAL}" \ + --database-version="${MYSQL_DATABASE_VERSION}" \ + --activation-policy=ALWAYS \ + --zone "${ZONE}"; then + report_result "Created" else - local log_file="create_mysql_${MYSQL_INSTANCE}.log" - if run_gcloud "${log_file}" gcloud sql instances create "${MYSQL_INSTANCE}" \ - --no-assign-ip \ - --project="${PROJECT_ID}" \ - --network="${NETWORK_URI_PARTIAL}" \ - --database-version="${MYSQL_DATABASE_VERSION}" \ - --activation-policy=ALWAYS \ - --zone "${ZONE}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_mysql_instance function delete_mysql_instance() { - local phase_name="create_mysql_instance" - remove_sentinel "${phase_name}" "done" - print_status "Deleting MySQL Instance ${MYSQL_INSTANCE}..." 
- if gcloud sql instances describe "${MYSQL_INSTANCE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="delete_mysql_${MYSQL_INSTANCE}.log" - if run_gcloud "${log_file}" gcloud sql instances delete --quiet "${MYSQL_INSTANCE}" --project="${PROJECT_ID}"; then - report_result "Deleted" - else - report_result "Fail" - fi + local log_file="delete_mysql_${MYSQL_INSTANCE}.log" + if run_gcloud "${log_file}" gcloud sql instances delete --quiet "${MYSQL_INSTANCE}" --project="${PROJECT_ID}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_mysql_instance diff --git a/gcloud/lib/database/oracle.sh b/gcloud/lib/database/oracle.sh new file mode 100644 index 00000000..95f005dc --- /dev/null +++ b/gcloud/lib/database/oracle.sh @@ -0,0 +1,126 @@ +#!/bin/bash +# +# Oracle DB on GCE VM functions + +function create_oracle_vm() { + local phase_name="create_oracle_vm" + print_status "Checking Oracle VM ${ORACLE_VM_NAME}..." + report_result "Exists" + return 0 + fi + + print_status "Creating Oracle VM ${ORACLE_VM_NAME}..." + local log_file="create_oracle_vm_${ORACLE_VM_NAME}.log" + + local ova_url="https://yum.oracle.com/templates/OracleLinux/OL10/u0/x86_64/OL10U0_x86_64-olvm-b266.ova" + local ova_file="OL10U0_x86_64-olvm-b266.ova" + local machine_image_name="oracle-linux-10u0" + local os_type="oraclelinux-10" + local gcs_uri="gs://${BUCKET}/${ova_file}" + local startup_script="${REPRO_TMPDIR}/oracle_startup.sh" + + # Create startup script content + cat << 'EOF' > "${startup_script}" +#!/bin/bash +# oracle_startup.sh +echo "Starting Oracle DB setup..." + +# !!! IMPORTANT: These commands are placeholders. You need to add the actual +# Oracle Database installation steps here for your specific version. !!! + +# 1. Add Oracle Yum Repos if needed +# Example for 19c: +# sudo yum install -y oracle-database-preinstall-19c + +# 2. Download Oracle Database Software (e.g., from Oracle website or GCS) +# Example: wget ... 
or gsutil cp ... + +# 3. Unzip and Run Oracle Installer +# Example: unzip ... +# Example: ./runInstaller -silent -responseFile ... + +# 4. Configure Listener +# Example: Using netca + +# 5. Create Database +# Example: Using dbca + +# 6. Open Firewall Port +sudo firewall-cmd --zone=public --add-port=1521/tcp --permanent +sudo firewall-cmd --reload + +echo "Oracle DB setup script finished." +EOF + + print_status " Downloading OVA..." + if wget -O "${REPRO_TMPDIR}/${ova_file}" "${ova_url}" > "${REPRO_TMPDIR}/wget_${ova_file}.log" 2>&1; then + report_result "Pass" + else + report_result "Fail" + return 1 + fi + + print_status " Uploading OVA to GCS..." + if run_gcloud "${log_file}" gsutil cp "${REPRO_TMPDIR}/${ova_file}" "${gcs_uri}"; then + report_result "Pass" + else + report_result "Fail" + return 1 + fi + + print_status " Importing Machine Image ${machine_image_name}..." + if run_gcloud "${log_file}" gcloud compute machine-images import "${machine_image_name}" \ + --project="${PROJECT_ID}" \ + --source-uri="${gcs_uri}" \ + --os="${os_type}" \ + --zone="${ZONE}"; then + report_result "Imported" + else + # Continue if image already exists + if gcloud compute machine-images describe "${machine_image_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; then + report_result "Exists" + else + report_result "Fail" + return 1 + fi + fi + + print_status " Creating VM instance ${ORACLE_VM_NAME}..." + if run_gcloud "${log_file}" gcloud compute instances create "${ORACLE_VM_NAME}" \ + --project="${PROJECT_ID}" \ + --zone="${ZONE}" \ + --source-machine-image="${machine_image_name}" \ + --metadata-from-file=startup-script="${startup_script}" \ + --scopes=cloud-platform; then + report_result "Created" + else + report_result "Fail" + return 1 + fi + + print_status " Cleaning up local OVA file..." 
+ rm "${REPRO_TMPDIR}/${ova_file}" + report_result "Done" +} +export -f create_oracle_vm + +function delete_oracle_vm() { + local phase_name="create_oracle_vm" + print_status "Deleting Oracle VM ${ORACLE_VM_NAME}..." + local log_file="delete_oracle_vm_${ORACLE_VM_NAME}.log" + + if gcloud compute instances describe "${ORACLE_VM_NAME}" --zone "${ZONE}" --project "${PROJECT_ID}" > /dev/null 2>&1; then + if run_gcloud "${log_file}" gcloud compute instances delete "${ORACLE_VM_NAME}" \ + --project="${PROJECT_ID}" \ + --zone="${ZONE}" \ + --quiet; then + report_result "Deleted" + else + report_result "Fail" + return 1 + fi + else + report_result "Not Found" + fi +} +export -f delete_oracle_vm diff --git a/gcloud/lib/database/pgsql.sh b/gcloud/lib/database/pgsql.sh index f1cdcb93..e7349cab 100644 --- a/gcloud/lib/database/pgsql.sh +++ b/gcloud/lib/database/pgsql.sh @@ -3,51 +3,31 @@ # PostgreSQL Cloud SQL functions function create_pgsql_instance() { - local phase_name="create_pgsql_instance" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating PostgreSQL Instance ${PGSQL_INSTANCE}..." - report_result "Exists" - return 0 - fi - print_status "Creating PostgreSQL Instance ${PGSQL_INSTANCE}..." 
- if gcloud sql instances describe "${PGSQL_INSTANCE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_pgsql_${PGSQL_INSTANCE}.log" + if run_gcloud "${log_file}" gcloud sql instances create "${PGSQL_INSTANCE}" \ + --no-assign-ip \ + --project="${PROJECT_ID}" \ + --network="${NETWORK_URI_PARTIAL}" \ + --database-version="${PGSQL_DATABASE_VERSION}" \ + --activation-policy=ALWAYS \ + --root-password="${PGSQL_ROOT_PASSWORD}" \ + --zone "${ZONE}"; then + report_result "Created" else - local log_file="create_pgsql_${PGSQL_INSTANCE}.log" - if run_gcloud "${log_file}" gcloud sql instances create "${PGSQL_INSTANCE}" \ - --no-assign-ip \ - --project="${PROJECT_ID}" \ - --network="${NETWORK_URI_PARTIAL}" \ - --database-version="${PGSQL_DATABASE_VERSION}" \ - --activation-policy=ALWAYS \ - --root-password="${PGSQL_ROOT_PASSWORD}" \ - --zone "${ZONE}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_pgsql_instance function delete_pgsql_instance() { - local phase_name="create_pgsql_instance" - remove_sentinel "${phase_name}" "done" - print_status "Deleting PostgreSQL Instance ${PGSQL_INSTANCE}..." 
- if gcloud sql instances describe "${PGSQL_INSTANCE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="delete_pgsql_${PGSQL_INSTANCE}.log" - if run_gcloud "${log_file}" gcloud sql instances delete --quiet "${PGSQL_INSTANCE}" --project="${PROJECT_ID}"; then - report_result "Deleted" - else - report_result "Fail" - fi + local log_file="delete_pgsql_${PGSQL_INSTANCE}.log" + if run_gcloud "${log_file}" gcloud sql instances delete --quiet "${PGSQL_INSTANCE}" --project="${PROJECT_ID}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_pgsql_instance diff --git a/gcloud/lib/dataproc/autoscaling.sh b/gcloud/lib/dataproc/autoscaling.sh index fee77024..52c832b9 100644 --- a/gcloud/lib/dataproc/autoscaling.sh +++ b/gcloud/lib/dataproc/autoscaling.sh @@ -2,45 +2,29 @@ # # Dataproc Autoscaling Policy functions -function create_autoscaling_policy() { - local phase_name="create_autoscaling_policy" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Autoscaling Policy ${AUTOSCALING_POLICY_NAME}..." - report_result "Exists" - return 0 - fi +function exists_autoscaling_policy() { + _check_exists "gcloud dataproc autoscaling-policies describe '${AUTOSCALING_POLICY_NAME}' --region='${REGION}' --format='json(id,name)'" +} +function create_autoscaling_policy() { print_status "Creating Autoscaling Policy ${AUTOSCALING_POLICY_NAME}..." 
local log_file="create_autoscaling_${AUTOSCALING_POLICY_NAME}.log" - if gcloud dataproc autoscaling-policies describe ${AUTOSCALING_POLICY_NAME} --region ${REGION} > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + if run_gcloud "${log_file}" gcloud dataproc autoscaling-policies import "${AUTOSCALING_POLICY_NAME}" --region="${REGION}" --source=autoscaling-policy.yaml; then + report_result "Created" else - if run_gcloud "${log_file}" gcloud dataproc autoscaling-policies import ${AUTOSCALING_POLICY_NAME} --region ${REGION} --source autoscaling-policy.yaml; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_autoscaling_policy function delete_autoscaling_policy() { - local phase_name="create_autoscaling_policy" - remove_sentinel "${phase_name}" "done" - print_status "Deleting Autoscaling Policy ${AUTOSCALING_POLICY_NAME}..." - if gcloud dataproc autoscaling-policies describe ${AUTOSCALING_POLICY_NAME} --region ${REGION} > /dev/null 2>&1; then - local log_file="delete_autoscaling_${AUTOSCALING_POLICY_NAME}.log" - if run_gcloud "${log_file}" gcloud dataproc autoscaling-policies delete --quiet ${AUTOSCALING_POLICY_NAME} --region ${REGION}; then - report_result "Deleted" - else - report_result "Fail" - fi + local log_file="delete_autoscaling_${AUTOSCALING_POLICY_NAME}.log" + if run_gcloud "${log_file}" gcloud dataproc autoscaling-policies delete --quiet "${AUTOSCALING_POLICY_NAME}" --region="${REGION}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } -export -f delete_autoscaling_policy \ No newline at end of file +export -f delete_autoscaling_policy diff --git a/gcloud/lib/dataproc/cluster-custom.sh b/gcloud/lib/dataproc/cluster-custom.sh new file mode 100644 index 00000000..fb399a1b --- /dev/null +++ b/gcloud/lib/dataproc/cluster-custom.sh @@ -0,0 +1,188 @@ 
+#!/bin/bash +# +# Dataproc Cluster Management Functions + +function exists_dpgce_cluster() { + # print_status " Checking if cluster ${CLUSTER_NAME} exists..." + if gcloud dataproc clusters describe "${CLUSTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; + then + # report_result "Exists" + return 0 # Found + else + # report_result "Not Found" + return 1 # Not found + fi +} +export -f exists_dpgce_cluster + +function create_dpgce_cluster() { + local phase_name="create_dpgce_cluster" + # Note: No sentinel check here because this is the main resource being created/recreated. + # We rely on the --no-create-cluster flag to skip this if needed. + + print_status "Creating Dataproc Cluster ${CLUSTER_NAME}..." + + local metadata_array=( + "public_secret_name=${public_secret_name}" + "private_secret_name=${private_secret_name}" + "secret_project=${secret_project}" + "secret_version=${secret_version}" + "modulus_md5sum=${modulus_md5sum}" + "install-gpu-agent=true" + "gpu-driver-provider=NVIDIA" + "cuda-version=${CUDA_VERSION}" + "gpu-driver-version=${DRIVER_VERSION}" + "cuda-url=https://developer.download.nvidia.com/compute/cuda/13.1.0/local_installers/cuda_13.1.0_590.44.01_linux.run" + "gpu-driver-url=https://us.download.nvidia.com/XFree86/Linux-x86_64/590.48.01/NVIDIA-Linux-x86_64-590.48.01.run" + "gpu-conda-env=dpgce" + "init-actions-repo=${INIT_ACTIONS_ROOT}" + "debug=true" + "include-pytorch=yes" + "enable-oslogin=TRUE" +# "nfs-kerberos-users=ext_cjac_google_com,ext_dgodhia_google_com" + "dask-runtime=standalone" + "rapids-runtime=SPARK" + "bigtable-instance=${BIGTABLE_INSTANCE}" + "include-gpus=1" +# "startup-script-url=gs://dataproc-staging-us-west4-kf7bmp/dataproc-initialization-actions/gce-proxy-setup.sh" +# "rapids-mirror-disk=${RAPIDS_MIRROR_DISK_NAME}" +# "rapids-mirror-host=${RAPIDS_REGIONAL_MIRROR_ADDR[${REGION}]}" +# "dask-cloud-logging=true" + +# "http-proxy=${SWP_IP}:${SWP_PORT}" +# "https-proxy=${SWP_IP}:${SWP_PORT}" +# 
"proxy-uri=${SWP_IP}:${SWP_PORT}" +# "startup-script-url=gs://dataproc-staging-us-west4-kf7bmp/dataproc-initialization-actions/gce-proxy-setup.sh" +# "dask-runtime=standalone" +# "rapids-runtime=SPARK" +# "bigtable-instance=${BIGTABLE_INSTANCE}" +# "include-gpus=1" + + ) + + local all_metadata +# Join with specified separator +# all_metadata=$(IFS=,; echo "${metadata_array[*]}") + all_metadata="$(IFS='|'; echo "${metadata_array[*]}")" +# all_metadata=$(IFS='~~'; echo "${metadata_array[*]}") + # Prefix with ^|^ to tell gcloud about the new separator + all_metadata="^|^${all_metadata}" + + # Logic to determine whether to use a custom image, build one, or use a stock image. + local image_args=() + if check_image_exists "${CUSTOM_IMAGE_URI}"; then + print_status "Found existing custom image: ${CUSTOM_IMAGE_URI}. Using it." + image_args=("--image" "${CUSTOM_IMAGE_URI}") + else + # CUSTOM_IMAGE_URI from env.json does not exist. + # Check if a sentinel file for a freshly built image exists. + local custom_image_sentinel_file="${SENTINEL_DIR}/custom_image_uri.txt" + if [[ -f "${custom_image_sentinel_file}" ]]; then + local fresh_image_uri + fresh_image_uri=$(cat "${custom_image_sentinel_file}") + if check_image_exists "${fresh_image_uri}"; then + print_status "Found freshly built custom image: ${fresh_image_uri}. Using it." + image_args=("--image" "${fresh_image_uri}") + else + # Sentinel file points to a non-existent image, something is wrong. + print_status "Image from sentinel file not found: ${fresh_image_uri}. Falling back to build." + # Fall through to build logic + fi + fi + fi + + # If no image has been selected, attempt to build one. + if [[ ${#image_args[@]} -eq 0 ]]; then + print_status "No existing custom image found. Attempting to build a new one..." 
+ if (cd ../../custom-images/examples/secure-boot && ./build-and-run-podman.sh); then + local custom_image_sentinel_file="${SENTINEL_DIR}/custom_image_uri.txt" + if [[ -f "${custom_image_sentinel_file}" ]]; then + local fresh_image_uri + fresh_image_uri=$(cat "${custom_image_sentinel_file}") + print_status "Successfully built new image: ${fresh_image_uri}. Using it." + image_args=("--image" "${fresh_image_uri}") + else + print_status "Build script ran, but sentinel file not found. Falling back to stock image version." + image_args=("--image-version" "${IMAGE_VERSION}") + fi + else + print_status "Failed to build custom image. Falling back to stock image version." + image_args=("--image-version" "${IMAGE_VERSION}") + fi + fi + + local gcloud_cmd=( + gcloud dataproc clusters create "${CLUSTER_NAME}" + --single-node +# --num-masters=1 +# --num-workers=2 + --master-accelerator "type=${M_ACCELERATOR_TYPE}" + # --worker-accelerator "type=${PRIMARY_ACCELERATOR_TYPE}" + # --secondary-worker-accelerator "type=${SECONDARY_ACCELERATOR_TYPE}" + --master-machine-type "${M_MACHINE_TYPE}" + # --worker-machine-type "${PRIMARY_MACHINE_TYPE}" + --master-boot-disk-size 600 +# --worker-boot-disk-size 60 +# --secondary-worker-boot-disk-size 60 + --master-local-ssd-interface=NVME + --num-master-local-ssds=1 + --master-boot-disk-type pd-ssd +# --master-boot-disk-type hyperdisk-balanced +# --worker-boot-disk-type pd-ssd +# --secondary-worker-boot-disk-type pd-ssd + --region "${REGION}" + --zone "${ZONE}" + --subnet "${SUBNET}" + --no-address + --service-account "${GSA}" + --tags "${TAGS}" + --bucket "${BUCKET}" + --temp-bucket "${TEMP_BUCKET}" + --enable-component-gateway +# --enable-kerberos + + --metadata "${all_metadata}" + "${image_args[@]}" + +# --no-shielded-secure-boot + --shielded-secure-boot +# --initialization-actions "${INIT_ACTIONS_ROOT}/gpu/install_gpu_driver.sh" +# --initialization-actions "${INIT_ACTIONS_ROOT}/spark-rapids/spark-rapids.sh" +# --initialization-actions 
"${INIT_ACTIONS_ROOT}/nfs/nfs.sh" + --initialization-action-timeout 90m + --optional-components "DOCKER,JUPYTER" +# --max-idle="${IDLE_TIMEOUT}" + --properties "spark:spark.history.fs.logDirectory=gs://${BUCKET}/phs/eventLog" + --scopes 'https://www.googleapis.com/auth/cloud-platform,sql-admin' + ) + + if [[ "${GCLOUD_QUIET}" != "true" ]]; then + echo + echo "Command to be executed:" + printf "%s\n" "${gcloud_cmd[@]}" | perl -pe 's/ --/ --/g' + fi + + if time "${gcloud_cmd[@]}"; then + report_result "Created" + else + report_result "Fail" + return 1 + fi +} +export -f create_dpgce_cluster + +function delete_dpgce_cluster() { + print_status "Deleting Dataproc Cluster ${CLUSTER_NAME}..." + if exists_dpgce_cluster; + then + local log_file="delete_dpgce_cluster_${CLUSTER_NAME}.log" + if run_gcloud "${log_file}" gcloud dataproc clusters delete --quiet --region ${REGION} ${CLUSTER_NAME}; then + report_result "Deleted" + else + report_result "Fail" + fi + else + report_result "Not Found" + fi +} +export -f delete_dpgce_cluster diff --git a/gcloud/lib/dataproc/cluster.sh b/gcloud/lib/dataproc/cluster.sh index a0953dd2..65ca06ff 100644 --- a/gcloud/lib/dataproc/cluster.sh +++ b/gcloud/lib/dataproc/cluster.sh @@ -3,89 +3,79 @@ # Dataproc Cluster Management Functions function exists_dpgce_cluster() { - # print_status " Checking if cluster ${CLUSTER_NAME} exists..." - if gcloud dataproc clusters describe "${CLUSTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; - then - # report_result "Exists" - return 0 # Found - else - # report_result "Not Found" - return 1 # Not found - fi + _check_exists "gcloud dataproc clusters describe '${CLUSTER_NAME}' --region='${REGION}' --project='${PROJECT_ID}' --format='json(clusterName,clusterUuid,status.selfLink)'" } export -f exists_dpgce_cluster function create_dpgce_cluster() { - local phase_name="create_dpgce_cluster" - # Note: No sentinel check here because this is the main resource being created/recreated. 
- # We rely on the --no-create-cluster flag to skip this if needed. - print_status "Creating Dataproc Cluster ${CLUSTER_NAME}..." + local metadata_array=( + "public_secret_name=${public_secret_name}" + "private_secret_name=${private_secret_name}" + "secret_project=${secret_project}" + "secret_version=${secret_version}" + "modulus_md5sum=${modulus_md5sum}" + "install-gpu-agent=true" + "gpu-driver-provider=NVIDIA" + "cuda-version=${CUDA_VERSION}" + "gpu-driver-version=${DRIVER_VERSION}" + "cuda-url=https://developer.download.nvidia.com/compute/cuda/13.1.0/local_installers/cuda_13.1.0_590.44.01_linux.run" + "gpu-driver-url=https://us.download.nvidia.com/XFree86/Linux-x86_64/590.48.01/NVIDIA-Linux-x86_64-590.48.01.run" + "gpu-conda-env=dpgce" + "init-actions-repo=${INIT_ACTIONS_ROOT}" + "debug=true" + "include-pytorch=yes" + "enable-oslogin=TRUE" + "dask-runtime=standalone" + "rapids-runtime=SPARK" + "bigtable-instance=${BIGTABLE_INSTANCE}" + "include-gpus=1" + "http-proxy=${SWP_IP}:${SWP_PORT}" + "https-proxy=${SWP_IP}:${SWP_PORT}" + "proxy-uri=${SWP_IP}:${SWP_PORT}" + ) + + local all_metadata + all_metadata="$(IFS='|'; echo "${metadata_array[*]}")" + all_metadata="^|^${all_metadata}" + local gcloud_cmd=( gcloud dataproc clusters create "${CLUSTER_NAME}" --single-node - --master-accelerator "type=${MASTER_ACCELERATOR_TYPE}" - --worker-accelerator "type=${PRIMARY_ACCELERATOR_TYPE}" - --secondary-worker-accelerator "type=${SECONDARY_ACCELERATOR_TYPE}" - --master-machine-type "${MASTER_MACHINE_TYPE}" - --worker-machine-type "${PRIMARY_MACHINE_TYPE}" - --master-boot-disk-size 60 - --worker-boot-disk-size 60 - --secondary-worker-boot-disk-size 60 + --master-accelerator "type=${M_ACCELERATOR_TYPE}" + --master-machine-type "${M_MACHINE_TYPE}" + --master-boot-disk-size 600 + --master-local-ssd-interface=NVME + --num-master-local-ssds=1 --master-boot-disk-type pd-ssd - --worker-boot-disk-type pd-ssd - --secondary-worker-boot-disk-type pd-ssd --region "${REGION}" --zone 
"${ZONE}" --subnet "${SUBNET}" --no-address - --service-account="${GSA}" - --tags="${TAGS}" + --service-account "${GSA}" + --tags "${TAGS}" --bucket "${BUCKET}" --temp-bucket "${TEMP_BUCKET}" --enable-component-gateway - --metadata "public_secret_name=${public_secret_name}" - --metadata "private_secret_name=${private_secret_name}" - --metadata "secret_project=${secret_project}" - --metadata "secret_version=${secret_version}" - --metadata "modulus_md5sum=${modulus_md5sum}" - --metadata "install-gpu-agent=true" - --metadata "gpu-driver-provider=NVIDIA" - --metadata "gpu-conda-env=dpgce" - --metadata "rapids-mirror-disk=${RAPIDS_MIRROR_DISK_NAME}" - --metadata "rapids-mirror-host=${RAPIDS_REGIONAL_MIRROR_ADDR[${REGION}]}" - --metadata "init-actions-repo=${INIT_ACTIONS_ROOT}" - --metadata "dask-cloud-logging=true" - --metadata "debug=true" - --metadata "http-proxy=${SWP_IP}:${SWP_PORT}" - --metadata dask-runtime="standalone" - --metadata rapids-runtime="SPARK" - --metadata bigtable-instance=${BIGTABLE_INSTANCE} - --metadata include-gpus=1 - --image "projects/${PROJECT_ID}/global/images/dataproc-2-2-deb12-20251108-180659-tf" \ - --initialization-action-timeout=90m - --optional-components DOCKER,JUPYTER - --max-idle="${IDLE_TIMEOUT}" + --metadata "${all_metadata}" + --no-shielded-secure-boot + --image-version "${IMAGE_VERSION}" + --initialization-action-timeout 90m + --optional-components "DOCKER,JUPYTER" --properties "spark:spark.history.fs.logDirectory=gs://${BUCKET}/phs/eventLog" --scopes 'https://www.googleapis.com/auth/cloud-platform,sql-admin' ) -# --no-shielded-secure-boot -# --image-version "${IMAGE_VERSION}" -# --initialization-actions ${INIT_ACTIONS_ROOT}/gpu/install_gpu_driver.sh -# --metadata=startup-script-url="gs://dataproc-staging-us-west4-kf7bmp/dataproc-initialization-actions/gce-proxy-setup.sh" - - if [[ "${GCLOUD_QUIET}" != "true" ]]; then echo echo "Command to be executed:" cmd_str=$(printf "%s " "${gcloud_cmd[@]}") - # Replace " --" with " \\\n --" 
for pretty printing - echo "${cmd_str}" | perl -pe 's/ --/ \\\n --/g' + echo "${cmd_str}" | perl -pe 's/ --/ + --/g' fi - if "${gcloud_cmd[@]}"; then + if time "${gcloud_cmd[@]}"; then report_result "Created" else report_result "Fail" @@ -96,16 +86,11 @@ export -f create_dpgce_cluster function delete_dpgce_cluster() { print_status "Deleting Dataproc Cluster ${CLUSTER_NAME}..." - if exists_dpgce_cluster; - then - local log_file="delete_dpgce_cluster_${CLUSTER_NAME}.log" - if run_gcloud "${log_file}" gcloud dataproc clusters delete --quiet --region ${REGION} ${CLUSTER_NAME}; then - report_result "Deleted" - else - report_result "Fail" - fi + local log_file="delete_dpgce_cluster_${CLUSTER_NAME}.log" + if run_gcloud "${log_file}" gcloud dataproc clusters delete --quiet "${CLUSTER_NAME}" --region "${REGION}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_dpgce_cluster diff --git a/gcloud/lib/dataproc/private-cluster.sh b/gcloud/lib/dataproc/private-cluster.sh new file mode 100644 index 00000000..f3c9c845 --- /dev/null +++ b/gcloud/lib/dataproc/private-cluster.sh @@ -0,0 +1,98 @@ +#!/bin/bash +# +# Dataproc Private Cluster Management Functions + +source lib/dataproc/cluster.sh # Source the base cluster functions to reuse exists_dpgce_cluster + +function create_dpgce_private_cluster() { + print_status "Creating Private Dataproc Cluster ${CLUSTER_NAME}..." 
+ + # Load CUSTOM_IMAGE_URI from env.json + CUSTOM_IMAGE_URI=$(jq -r .CUSTOM_IMAGE_URI env.json) + if [[ -z "${CUSTOM_IMAGE_URI}" || "${CUSTOM_IMAGE_URI}" == "null" ]]; then + echo "ERROR: CUSTOM_IMAGE_URI is not set in env.json" >&2 + return 1 + fi + + local metadata_array=( + "public_secret_name=${public_secret_name}" + "private_secret_name=${private_secret_name}" + "secret_project=${secret_project}" + "secret_version=${secret_version}" + "modulus_md5sum=${modulus_md5sum}" + "install-gpu-agent=true" + "gpu-driver-provider=NVIDIA" + "gpu-conda-env=dpgce" + "init-actions-repo=${INIT_ACTIONS_ROOT}" + "debug=true" + "include-pytorch=yes" + "enable-oslogin=TRUE" + "nfs-kerberos-users=ext_cjac_google_com,ext_dgodhia_google_com" + "http-proxy=${SWP_IP}:${SWP_PORT}" + "https-proxy=${SWP_IP}:${SWP_PORT}" + "proxy-uri=${SWP_IP}:${SWP_PORT}" + "dask-runtime=standalone" + "rapids-runtime=SPARK" + "bigtable-instance=${BIGTABLE_INSTANCE}" + "include-gpus=1" + ) + + local all_metadata + all_metadata="$(IFS='|'; echo "${metadata_array[*]}")" + # Prefix with ^|^ to tell gcloud about the new separator + all_metadata="^|^${all_metadata}" + + local gcloud_cmd=( + gcloud dataproc clusters create "${CLUSTER_NAME}" + --single-node + --region "${REGION}" + --zone "${ZONE}" + --subnet "${PRIVATE_SUBNET}" + --no-address + --service-account="${GSA}" + --tags "${TAGS}" + --bucket "${BUCKET}" + --temp-bucket "${TEMP_BUCKET}" + --enable-component-gateway + --metadata "${all_metadata}" + --shielded-secure-boot + --image "${CUSTOM_IMAGE_URI}" + --initialization-action-timeout=90m + --optional-components DOCKER,JUPYTER + --max-idle="${IDLE_TIMEOUT}" + --properties "spark:spark.history.fs.logDirectory=gs://${BUCKET}/phs/eventLog" + --scopes 'https://www.googleapis.com/auth/cloud-platform,sql-admin' + ) + + # Add machine type and accelerator flags + if [[ -n "${M_MACHINE_TYPE}" ]]; then + gcloud_cmd+=(--master-machine-type "${M_MACHINE_TYPE}") + fi + if [[ -n "${M_ACCELERATOR_TYPE}" ]]; then + 
gcloud_cmd+=(--master-accelerator "type=${M_ACCELERATOR_TYPE}") + fi + + if [[ "${GCLOUD_QUIET}" != "true" ]]; then + echo + echo "Command to be executed:" + cmd_str=$(printf "%s " "${gcloud_cmd[@]}") + echo "${cmd_str}" | perl -pe 's/ --/ + --/g' + fi + + if "${gcloud_cmd[@]}"; then + report_result "Created" + else + report_result "Fail" + return 1 + fi +} +export -f create_dpgce_private_cluster + +# The standard delete function works for private clusters as well. +# We just need to make sure we source this file in destroy-dpgce. +# We alias it here for clarity, though it's not strictly necessary. +delete_dpgce_private_cluster() { + delete_dpgce_cluster +} +export -f delete_dpgce_private_cluster diff --git a/gcloud/lib/gcp/gcr.sh b/gcloud/lib/gcp/gcr.sh index b2735f2c..5805829e 100644 --- a/gcloud/lib/gcp/gcr.sh +++ b/gcloud/lib/gcp/gcr.sh @@ -2,28 +2,15 @@ # GCR functions function create_artifacts_repository(){ - local phase_name="create_artifacts_repository" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Artifact Repository ${ARTIFACT_REPOSITORY}..." - report_result "Exists" - return 0 - fi - print_status "Creating Artifact Repository ${ARTIFACT_REPOSITORY}..." 
- if gcloud artifacts repositories describe "${ARTIFACT_REPOSITORY}" --location="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_artifacts_repository_${ARTIFACT_REPOSITORY}.log" + if run_gcloud "${log_file}" gcloud artifacts repositories create "${ARTIFACT_REPOSITORY}" \ + --repository-format=docker \ + --location="${REGION}" --project="${PROJECT_ID}"; then + report_result "Created" else - local log_file="create_artifacts_repository_${ARTIFACT_REPOSITORY}.log" - if run_gcloud "${log_file}" gcloud artifacts repositories create "${ARTIFACT_REPOSITORY}" \ - --repository-format=docker \ - --location="${REGION}" --project="${PROJECT_ID}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_artifacts_repository diff --git a/gcloud/lib/gcp/gcs.sh b/gcloud/lib/gcp/gcs.sh index edf24a4c..376655bd 100644 --- a/gcloud/lib/gcp/gcs.sh +++ b/gcloud/lib/gcp/gcs.sh @@ -2,18 +2,13 @@ # # GCS Bucket functions -function create_bucket () { - local phase_name="create_bucket" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating GCS Buckets gs://${BUCKET} & gs://${TEMP_BUCKET}..." - report_result "Exists" - return 0 - fi - - print_status "Creating GCS Staging Bucket gs://${BUCKET}..." - local log_file="create_bucket_${BUCKET}.log" - if ! gsutil ls -b "gs://${BUCKET}" > /dev/null 2>&1 ; then - if run_gcloud "${log_file}" gsutil mb -l ${REGION} gs://${BUCKET}; then +function create_gcs_bucket () { + local bucket_name="$1" + local storage_class="$2" + print_status "Creating GCS Bucket gs://${bucket_name}..." + local log_file="create_bucket_${bucket_name}.log" + if ! 
gsutil ls -b "gs://${bucket_name}" > /dev/null 2>&1 ; then + if run_gcloud "${log_file}" gsutil mb -c "${storage_class}" -l "${REGION}" "gs://${bucket_name}"; then report_result "Created" else report_result "Fail" @@ -22,35 +17,20 @@ function create_bucket () { else report_result "Exists" fi - # Grant SA permissions on BUCKET - print_status " Granting Storage Admin on gs://${BUCKET}..." - if run_gcloud "${log_file}" gsutil iam ch "serviceAccount:${GSA}:roles/storage.admin" "gs://${BUCKET}"; then - report_result "Pass" - else - report_result "Fail" - fi +} - print_status "Creating GCS Temp Bucket gs://${TEMP_BUCKET}..." - local temp_log_file="create_bucket_${TEMP_BUCKET}.log" - if ! gsutil ls -b "gs://${TEMP_BUCKET}" > /dev/null 2>&1 ; then - if run_gcloud "${temp_log_file}" gsutil mb -l ${REGION} gs://${TEMP_BUCKET}; then - report_result "Created" +function grant_gcs_bucket_perms() { + local bucket_name="$1" + local log_file="grant_perms_${bucket_name}.log" + print_status " Granting Storage Admin on gs://${bucket_name}..." + if run_gcloud "${log_file}" gsutil iam ch "serviceAccount:${GSA}:roles/storage.admin" "gs://${bucket_name}"; then + report_result "Pass" else - report_result "Fail" - return 1 + report_result "Fail" fi - else - report_result "Exists" - fi - # Grant SA permissions on TEMP_BUCKET - print_status " Granting Storage Admin on gs://${TEMP_BUCKET}..." - if run_gcloud "${temp_log_file}" gsutil iam ch "serviceAccount:${GSA}:roles/storage.admin" "gs://${TEMP_BUCKET}"; then - report_result "Pass" - else - report_result "Fail" - fi +} - # Copy initialization action scripts +function upload_init_actions() { if [[ -d init ]] ; then print_status "Copying init scripts to ${INIT_ACTIONS_ROOT}..." local cp_log="copy_init_scripts.log" @@ -60,21 +40,19 @@ function create_bucket () { report_result "Fail" fi fi - create_sentinel "${phase_name}" "done" } -function delete_bucket () { - print_status "Deleting GCS Bucket gs://${BUCKET}..." 
- local log_file="delete_bucket_${BUCKET}.log" - if gsutil ls -b "gs://${BUCKET}" > /dev/null 2>&1; then - if run_gcloud "${log_file}" gsutil -m rm -r "gs://${BUCKET}"; then +function delete_gcs_bucket () { + local bucket_name="$1" + print_status "Deleting GCS Bucket gs://${bucket_name}..." + local log_file="delete_bucket_${bucket_name}.log" + if gsutil ls -b "gs://${bucket_name}" > /dev/null 2>&1; then + if run_gcloud "${log_file}" gsutil -m rm -r "gs://${bucket_name}"; then report_result "Deleted" - remove_sentinel "create_bucket" "done" else report_result "Fail" fi else report_result "Not Found" fi - # gsutil -m rm -r "gs://${TEMP_BUCKET}" > /dev/null 2>&1 || true # huge cache here, not so great to lose it } diff --git a/gcloud/lib/gcp/iam.sh b/gcloud/lib/gcp/iam.sh index d00116a0..7a38a6ca 100644 --- a/gcloud/lib/gcp/iam.sh +++ b/gcloud/lib/gcp/iam.sh @@ -2,16 +2,11 @@ # # IAM related functions -function create_service_account() { - local phase_name="create_service_account" - local sentinel_name="${SA_NAME}_done" - - if check_sentinel "${phase_name}" "${sentinel_name}"; then - print_status "Checking Service Account ${GSA} and roles..." - report_result "Exists" - return 0 - fi +function exists_service_account() { + _check_exists "gcloud iam service-accounts describe '${GSA}' --project='${PROJECT_ID}' --format='json(email,name)'" +} +function create_service_account() { print_status "Creating/Verifying Service Account ${GSA}..." local log_file="create_service_account_${SA_NAME}.log" @@ -52,18 +47,15 @@ function create_service_account() { for role in "${ROLES[@]}"; do local role_file_name=$(echo "${role}" | tr '/' '_') local role_log="bind_roles/bind_${role_file_name}_${SA_NAME}.log" - # print_status " Binding ${role}..." if ! 
run_gcloud "${role_log}" gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="${role}" --condition=None; then all_roles_bound=false - # report_result "Fail" fi done if [[ "${all_roles_bound}" = true ]]; then - report_result "Pass" # Overall role binding status - create_sentinel "${phase_name}" "${sentinel_name}" + report_result "Pass" else report_result "Fail" return 1 @@ -73,23 +65,12 @@ export -f create_service_account function delete_service_account() { print_status "Deleting Service Account ${GSA}..." - SA_EXISTS=$(gcloud iam service-accounts list \ - --project="${PROJECT_ID}" \ - --filter="email=${GSA}" \ - --format="value(email)" 2>/dev/null) - - if [[ -z "${SA_EXISTS}" ]]; then - report_result "Not Found" - return 0 - fi - local log_file="delete_service_account_${SA_NAME}.log" - # Attempt to remove bindings - ignore errors if not found + for svc in spark-executor spark-driver agent ; do - gcloud iam service-accounts remove-iam-policy-binding \ + run_gcloud "${log_file}" gcloud iam service-accounts remove-iam-policy-binding "${GSA}" \ --role=roles/iam.workloadIdentityUser \ - --member="serviceAccount:${PROJECT_ID}.svc.id.goog[${DPGKE_NAMESPACE}/${svc}]" \ - "${GSA}" > /dev/null 2>&1 || true + --member="serviceAccount:${PROJECT_ID}.svc.id.goog[${DPGKE_NAMESPACE}/${svc}]" || true done ROLES=( @@ -105,89 +86,23 @@ function delete_service_account() { roles/iam.serviceAccountUser ) for role in "${ROLES[@]}"; do - gcloud projects remove-iam-policy-binding \ + run_gcloud "${log_file}" gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \ --role="${role}" \ --member="serviceAccount:${GSA}" \ - "${PROJECT_ID}" --condition=None > /dev/null 2>&1 || true + --condition=None || true done - gcloud iam service-accounts remove-iam-policy-binding "${GSA}" \ + run_gcloud "${log_file}" gcloud iam service-accounts remove-iam-policy-binding "${GSA}" \ --member="serviceAccount:${GSA}" \ - --role=roles/iam.serviceAccountUser > 
/dev/null 2>&1 || true + --role=roles/iam.serviceAccountUser || true - # Delete the service account if run_gcloud "${log_file}" gcloud iam service-accounts delete --quiet "${GSA}"; then report_result "Deleted" - remove_sentinel "create_service_account" "${SA_NAME}_done" else report_result "Fail" + echo " - Failed to delete service account ${GSA}. Log content:" >&2 + cat "${log_file}" >&2 + return 1 fi } - -function grant_kms_roles(){ - print_status "Granting KMS Roles to ${GSA}..." - local log_file="grant_kms_roles_${SA_NAME}.log" - if run_gcloud "${log_file}" gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ - --member="serviceAccount:${GSA}" \ - --role=roles/cloudkms.cryptoKeyDecrypter; then - report_result "Pass" - else - report_result "Fail" - fi - } - export -f grant_kms_roles - - function grant_mysql_roles(){ - print_status "Granting MySQL/Cloud SQL Roles to ${GSA}..." - local log_file="grant_mysql_roles_${SA_NAME}.log" - if run_gcloud "${log_file}" gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ - --member="serviceAccount:${GSA}" \ - --role=roles/cloudsql.editor; then - report_result "Pass" - else - report_result "Fail" - fi - } - export -f grant_mysql_roles - - function grant_bigtables_roles(){ - print_status "Granting Bigtable Roles to ${GSA}..." - local log_file="grant_bigtable_roles_${SA_NAME}.log" - local all_ok=true - if ! run_gcloud "${log_file}" gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ - --member="serviceAccount:${GSA}" \ - --role=roles/bigtable.user; then - all_ok=false - fi - if ! run_gcloud "${log_file}" gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ - --member="serviceAccount:${GSA}" \ - --role=roles/bigtable.admin; then - all_ok=false - fi - if [[ "${all_ok}" = true ]]; then report_result "Pass"; else report_result "Fail"; fi - } - export -f grant_bigtables_roles - - function grant_gke_roles(){ - print_status "Granting GKE Roles to ${GSA}..." 
- local log_file="grant_gke_roles_${SA_NAME}.log" - local all_ok=true - for svc in agent spark-driver spark-executor ; do - if ! run_gcloud "${log_file}" gcloud iam service-accounts add-iam-policy-binding \ - --role=roles/iam.workloadIdentityUser \ - --member="serviceAccount:${PROJECT_ID}.svc.id.goog[${DPGKE_NAMESPACE}/${svc}]" \ - "${GSA}"; then - all_ok=false - fi - done - if ! run_gcloud "${log_file}" gcloud artifacts repositories add-iam-policy-binding "${ARTIFACT_REPOSITORY}" \ - --location="${REGION}" \ - --member="serviceAccount:${GSA}" \ - --role=roles/artifactregistry.writer; then - all_ok=false - fi - if [[ "${all_ok}" = true ]]; then report_result "Pass"; else report_result "Fail"; fi - } - export -f grant_gke_roles - - +export -f delete_service_account diff --git a/gcloud/lib/gcp/kms.sh b/gcloud/lib/gcp/kms.sh index c5f69ebb..600344c2 100644 --- a/gcloud/lib/gcp/kms.sh +++ b/gcloud/lib/gcp/kms.sh @@ -3,20 +3,12 @@ # KMS and Secret Manager functions function enable_secret_manager() { - local phase_name="enable_secret_manager" - if check_sentinel "${phase_name}" "done"; then - print_status "Enabling Secret Manager API..." - report_result "Exists" - return 0 - fi - print_status "Enabling Secret Manager API..." local log_file="enable_secret_manager.log" if run_gcloud "${log_file}" gcloud services enable \ secretmanager.googleapis.com \ --project=${PROJECT_ID}; then report_result "Pass" - create_sentinel "${phase_name}" "done" else report_result "Fail" fi @@ -25,101 +17,57 @@ export -f enable_secret_manager function create_secret() { local secret_name="${1:-${MYSQL_SECRET_NAME}}" - local phase_name="create_secret_${secret_name}" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Secret ${secret_name}..." - report_result "Exists" - return 0 - fi - print_status "Creating Secret ${secret_name}..." 
local log_file="create_secret_${secret_name}.log" - if gcloud secrets describe "${secret_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + if echo -n "super secret" | run_gcloud "${log_file}" gcloud secrets create "${secret_name}" \ + --project="${PROJECT_ID}" \ + --replication-policy="automatic" \ + --data-file=-; then + report_result "Created" else - if echo -n "super secret" | run_gcloud "${log_file}" gcloud secrets create "${secret_name}" \ - --project="${PROJECT_ID}" \ - --replication-policy="automatic" \ - --data-file=-; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - fi + report_result "Fail" fi } export -f create_secret function create_kms_keyring() { - local phase_name="create_kms_keyring" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating KMS Keyring ${KMS_KEYRING}..." - report_result "Exists" - return 0 - fi print_status "Creating KMS Keyring ${KMS_KEYRING}..." - if gcloud kms keyrings list --location global --project="${PROJECT_ID}" | grep "${KMS_KEYRING}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_kms_keyring_${KMS_KEYRING}.log" + if run_gcloud "${log_file}" gcloud kms keyrings create "${KMS_KEYRING}" --location=global --project="${PROJECT_ID}"; then + report_result "Created" else - local log_file="create_kms_keyring_${KMS_KEYRING}.log" - if run_gcloud "${log_file}" gcloud kms keyrings create "${KMS_KEYRING}" --location=global --project="${PROJECT_ID}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_kms_keyring function create_kerberos_kdc_key() { - local phase_name="create_kerberos_kdc_key" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating KMS Key ${KDC_ROOT_PASSWD_KEY}..." 
- report_result "Exists" - return 0 - fi print_status "Creating KMS Key ${KDC_ROOT_PASSWD_KEY}..." - if gcloud kms keys list --location global --keyring="${KMS_KEYRING}" --project="${PROJECT_ID}" | grep "${KDC_ROOT_PASSWD_KEY}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_kms_key_${KDC_ROOT_PASSWD_KEY}.log" + if run_gcloud "${log_file}" gcloud kms keys create "${KDC_ROOT_PASSWD_KEY}" \ + --location=global \ + --keyring="${KMS_KEYRING}" \ + --purpose=encryption --project="${PROJECT_ID}"; then + report_result "Created" else - local log_file="create_kms_key_${KDC_ROOT_PASSWD_KEY}.log" - if run_gcloud "${log_file}" gcloud kms keys create "${KDC_ROOT_PASSWD_KEY}" \ - --location=global \ - --keyring="${KMS_KEYRING}" \ - --purpose=encryption --project="${PROJECT_ID}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_kerberos_kdc_key function create_mysql_admin_password() { - local phase_name="create_mysql_admin_password" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Encrypted MySQL Admin Password..." - report_result "Exists" - return 0 - fi print_status "Creating Encrypted MySQL Admin Password..." 
local log_file="create_mysql_admin_password.log" if dd if=/dev/urandom bs=8 count=4 | xxd -p | \ run_gcloud "${log_file}" gcloud kms encrypt \ --location=global \ - --keyring=projects/${PROJECT_ID}/locations/global/keyRings/${KMS_KEYRING} \ - --key=projects/${PROJECT_ID}/locations/global/keyRings/${KMS_KEYRING}/cryptoKeys/${KDC_ROOT_PASSWD_KEY} \ + --keyring="projects/${PROJECT_ID}/locations/global/keyRings/${KMS_KEYRING}" \ + --key="projects/${PROJECT_ID}/locations/global/keyRings/${KMS_KEYRING}/cryptoKeys/${KDC_ROOT_PASSWD_KEY}" \ --plaintext-file=- \ --ciphertext-file=init/mysql_admin_password.encrypted; then report_result "Created" - create_sentinel "${phase_name}" "done" else report_result "Fail" return 1 @@ -128,48 +76,33 @@ function create_mysql_admin_password() { export -f create_mysql_admin_password function create_kerberos_kdc_password() { - local phase_name="create_kerberos_kdc_password" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Encrypted KDC Root Password..." - report_result "Exists" - return 0 - fi - if [[ -f init/${KDC_ROOT_PASSWD_KEY}.encrypted ]]; then - print_status "Creating Encrypted KDC Root Password..." - report_result "Exists" - create_sentinel "${phase_name}" "done" + print_status "Creating Encrypted KDC Root Password..." + local log_file="create_kdc_root_password.log" + if dd if=/dev/urandom bs=8 count=4 | xxd -p | \ + run_gcloud "${log_file}" gcloud kms encrypt \ + --location=global \ + --keyring="${KMS_KEYRING}" \ + --key="${KDC_ROOT_PASSWD_KEY}" \ + --plaintext-file=- \ + --ciphertext-file="init/${KDC_ROOT_PASSWD_KEY}.encrypted"; then + report_result "Created" else - print_status "Creating Encrypted KDC Root Password..." 
- local log_file="create_kdc_root_password.log" - if dd if=/dev/urandom bs=8 count=4 | xxd -p | \ - run_gcloud "${log_file}" gcloud kms encrypt \ - --location=global \ - --keyring=${KMS_KEYRING} \ - --key=${KDC_ROOT_PASSWD_KEY} \ - --plaintext-file=- \ - --ciphertext-file=init/${KDC_ROOT_PASSWD_KEY}.encrypted; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_kerberos_kdc_password function create_kerberos_sa_password() { - local phase_name="create_kerberos_sa_password" - # This one always re-creates, so no sentinel check print_status "Creating Encrypted KDC SA Password..." local log_file="create_kdc_sa_password.log" if dd if=/dev/urandom bs=8 count=4 | xxd -p | \ run_gcloud "${log_file}" gcloud kms encrypt \ --location=global \ - --keyring=${KMS_KEYRING} \ - --key=${KDC_ROOT_PASSWD_KEY} \ + --keyring="${KMS_KEYRING}" \ + --key="${KDC_ROOT_PASSWD_KEY}" \ --plaintext-file=- \ - --ciphertext-file=init/${KDC_SA_PASSWD_KEY}.encrypted; then + --ciphertext-file="init/${KDC_SA_PASSWD_KEY}.encrypted"; then report_result "Created" else report_result "Fail" diff --git a/gcloud/lib/gcp/misc.sh b/gcloud/lib/gcp/misc.sh index 60754447..f896c2ff 100644 --- a/gcloud/lib/gcp/misc.sh +++ b/gcloud/lib/gcp/misc.sh @@ -38,14 +38,36 @@ function configure_gcloud() { fi } -function enable_services () { - local phase_name="enable_services" - if check_sentinel "${phase_name}" "done"; then - print_status "Enabling GCP Services..." - report_result "Exists" - return 0 - fi +function check_project() { + print_status "Verifying project ${PROJECT_ID}..." + local project_state + project_state=$(jq -r '.project.lifecycleState // "NOT_FOUND"' "${STATE_FILE}") + + if [[ "${project_state}" == "ACTIVE" ]]; then + report_result "Pass" + else + report_result "Fail" + echo " - Project ${PROJECT_ID} is not ACTIVE or does not exist (state: ${project_state})." 
>&2 + exit 1 + fi +} + +function check_billing() { + print_status "Verifying billing for ${PROJECT_ID}..." + local billing_enabled + billing_enabled=$(jq -r '.billing.billingEnabled // false' "${STATE_FILE}") + + if [[ "${billing_enabled}" == "true" ]]; then + report_result "Pass" + else + report_result "Fail" + echo " - Billing is not enabled for project ${PROJECT_ID} according to state file." >&2 + echo " - Please run: gcloud beta billing projects link ${PROJECT_ID} --billing-account " >&2 + exit 1 + fi +} +function enable_services () { print_status "Enabling GCP Services..." local log_file="enable_services.log" if run_gcloud "${log_file}" gcloud services enable \ @@ -59,27 +81,18 @@ function enable_services () { privateca.googleapis.com \ --project=${PROJECT_ID}; then report_result "Pass" - create_sentinel "${phase_name}" "done" else report_result "Fail" fi } function enable_secret_manager() { - local phase_name="enable_secret_manager" - if check_sentinel "${phase_name}" "done"; then - print_status "Enabling Secret Manager API..." - report_result "Exists" - return 0 - fi - print_status "Enabling Secret Manager API..." local log_file="enable_secret_manager.log" if run_gcloud "${log_file}" gcloud services enable \ secretmanager.googleapis.com \ --project=${PROJECT_ID}; then report_result "Pass" - create_sentinel "${phase_name}" "done" else report_result "Fail" fi @@ -87,27 +100,24 @@ function enable_secret_manager() { function create_secret() { local secret_name="${1:-${MYSQL_SECRET_NAME}}" - local phase_name="create_secret_${secret_name}" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Secret ${secret_name}..." - report_result "Exists" - return 0 - fi - print_status "Creating Secret ${secret_name}..." 
local log_file="create_secret_${secret_name}.log" - if gcloud secrets describe "${secret_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + if echo -n "super secret" | run_gcloud "${log_file}" gcloud secrets create "${secret_name}" \ + --project="${PROJECT_ID}" \ + --replication-policy="automatic" \ + --data-file=-; then + report_result "Created" else - if echo -n "super secret" | run_gcloud "${log_file}" gcloud secrets create "${secret_name}" \ - --project="${PROJECT_ID}" \ - --replication-policy=\"automatic\" \ - --data-file=-; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - fi + report_result "Fail" + fi +} + +function check_image_exists() { + local image_uri="$1" + if [[ -z "${image_uri}" || "${image_uri}" == "null" ]]; then + return 1 # Not found if URI is empty or null fi + # Extracts image name from full URI if necessary + local image_name=$(basename "${image_uri}") + gcloud compute images describe "${image_name}" --project="${PROJECT_ID}" > /dev/null 2>&1 } diff --git a/gcloud/lib/gcp/project.sh b/gcloud/lib/gcp/project.sh index ad11425d..b4a92282 100644 --- a/gcloud/lib/gcp/project.sh +++ b/gcloud/lib/gcp/project.sh @@ -3,13 +3,6 @@ # GCP Project related functions function create_project(){ - local phase_name="create_project" - if check_sentinel "${phase_name}" "done"; then - print_status "Checking Project ${PROJECT_ID}..." - report_result "Exists" - return 0 - fi - print_status "Checking Project ${PROJECT_ID}..." local log_file="create_project_${PROJECT_ID}.log" local PROJ_DESCRIPTION=$(gcloud projects describe ${PROJECT_ID} --format json 2>/dev/null) @@ -67,13 +60,11 @@ EOF else report_result "Pass" fi - create_sentinel "${phase_name}" "done" fi } export -f create_project function delete_project() { - local phase_name="create_project" print_status "Deleting Project ${PROJECT_ID}..." 
local log_file="delete_project_${PROJECT_ID}.log" @@ -97,7 +88,6 @@ function delete_project() { print_status " Deleting project ${PROJECT_ID}... " if gcloud projects delete --quiet ${PROJECT_ID} >> "${REPRO_TMPDIR}/${log_file}" 2>&1; then report_result "Deleted" - remove_sentinel "${phase_name}" "done" else report_result "Fail" fi diff --git a/gcloud/lib/gke.sh b/gcloud/lib/gke.sh index 62ed0223..9b65e998 100644 --- a/gcloud/lib/gke.sh +++ b/gcloud/lib/gke.sh @@ -2,73 +2,45 @@ # GKE functions function create_gke_cluster() { - local phase_name="create_gke_cluster" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating GKE Cluster ${GKE_CLUSTER_NAME}..." - report_result "Exists" - return 0 - fi - print_status "Creating GKE Cluster ${GKE_CLUSTER_NAME}..." - if gcloud container clusters describe "${GKE_CLUSTER_NAME}" --zone "${ZONE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_gke_cluster_${GKE_CLUSTER_NAME}.log" + if run_gcloud "${log_file}" gcloud container clusters create "${GKE_CLUSTER_NAME}" \ + --service-account="${GSA}" \ + --workload-pool="${PROJECT_ID}.svc.id.goog" \ + --tags "${TAGS}" \ + --subnetwork "${SUBNET}" \ + --network "${NETWORK}" \ + --zone "${ZONE}" --project="${PROJECT_ID}"; then + report_result "Created" else - local log_file="create_gke_cluster_${GKE_CLUSTER_NAME}.log" - if run_gcloud "${log_file}" gcloud container clusters create "${GKE_CLUSTER_NAME}" \ - --service-account="${GSA}" \ - --workload-pool="${PROJECT_ID}.svc.id.goog" \ - --tags "${TAGS}" \ - --subnetwork "${SUBNET}" \ - --network "${NETWORK}" \ - --zone "${ZONE}" --project="${PROJECT_ID}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_gke_cluster function delete_gke_cluster() { - local phase_name="create_gke_cluster" - remove_sentinel 
"${phase_name}" "done" - print_status "Deleting GKE Cluster ${GKE_CLUSTER_NAME}..." local log_file="delete_gke_cluster_${GKE_CLUSTER_NAME}.log" - if gcloud container clusters describe "${GKE_CLUSTER_NAME}" --zone "${ZONE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - for pn in "${DP_CTRL_POOLNAME}" "${DP_DRIVER_POOLNAME}" "${DP_EXEC_POOLNAME}" ; do - print_status " Deleting Node Pool ${pn}..." - run_gcloud "delete_nodepool_${pn}.log" gcloud container node-pools delete --quiet "${pn}" \ - --zone "${ZONE}" \ - --cluster "${GKE_CLUSTER_NAME}" --project="${PROJECT_ID}" || true - done + + for pn in "${DP_CTRL_POOLNAME}" "${DP_DRIVER_POOLNAME}" "${DP_EXEC_POOLNAME}" ; do + print_status " Deleting Node Pool ${pn}..." + run_gcloud "delete_nodepool_${pn}.log" gcloud container node-pools delete --quiet "${pn}" \ + --zone "${ZONE}" \ + --cluster "${GKE_CLUSTER_NAME}" --project="${PROJECT_ID}" || true + done - if run_gcloud "${log_file}" gcloud container clusters delete --quiet "${GKE_CLUSTER_NAME}" --zone "${ZONE}" --project="${PROJECT_ID}"; then - report_result "Deleted" - else - report_result "Fail" - fi + if run_gcloud "${log_file}" gcloud container clusters delete --quiet "${GKE_CLUSTER_NAME}" \ + --zone "${ZONE}" --project="${PROJECT_ID}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_gke_cluster function create_dpgke_cluster() { - local phase_name="create_dpgke_cluster" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating DPGKE Cluster ${DPGKE_CLUSTER_NAME}..." - report_result "Exists" - return 0 - fi - print_status "Creating DPGKE Cluster ${DPGKE_CLUSTER_NAME}..." - # How to check if DPGKE cluster exists? - # gcloud dataproc clusters describe does not work for GKE based clusters. - # Let's assume for now, if the sentinel is not there, we create. 
local log_file="create_dpgke_cluster_${DPGKE_CLUSTER_NAME}.log" if run_gcloud "${log_file}" gcloud dataproc clusters gke create "${DPGKE_CLUSTER_NAME}" \ --project="${PROJECT_ID}" \ @@ -82,7 +54,6 @@ function create_dpgke_cluster() { --pools="name=${DP_DRIVER_POOLNAME},min=1,max=3,roles=spark-driver,machineType=n2-standard-4" \ --pools="name=${DP_EXEC_POOLNAME},min=1,max=10,roles=spark-executor,machineType=n2-standard-8"; then report_result "Created" - create_sentinel "${phase_name}" "done" else report_result "Fail" return 1 @@ -91,11 +62,8 @@ function create_dpgke_cluster() { export -f create_dpgke_cluster function delete_dpgke_cluster() { - local phase_name="create_dpgke_cluster" - remove_sentinel "${phase_name}" "done" print_status "Deleting DPGKE Cluster ${DPGKE_CLUSTER_NAME}..." local log_file="delete_dpgke_cluster_${DPGKE_CLUSTER_NAME}.log" - # How to check existence? Assume delete will fail if not found. if run_gcloud "${log_file}" gcloud dataproc clusters delete --quiet "${DPGKE_CLUSTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}"; then report_result "Deleted" else diff --git a/gcloud/lib/kerberos.sh b/gcloud/lib/kerberos.sh index c4cd592c..6cfbf3dd 100644 --- a/gcloud/lib/kerberos.sh +++ b/gcloud/lib/kerberos.sh @@ -2,122 +2,82 @@ # Kerberos functions function create_kdc_server() { - local phase_name="create_kdc_server" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating KDC Server ${KDC_NAME}..." - report_result "Exists" - return 0 - fi - print_status "Creating KDC Server ${KDC_NAME}..." 
- if gcloud compute instances describe "${KDC_NAME}" --zone "${ZONE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_kdc_server_${KDC_NAME}.log" + local METADATA="kdc-root-passwd=${INIT_ACTIONS_ROOT}/${KDC_ROOT_PASSWD_KEY}.encrypted" + METADATA="${METADATA},kms-keyring=${KMS_KEYRING}" + METADATA="${METADATA},kdc-root-passwd-key=${KDC_ROOT_PASSWD_KEY}" + METADATA="${METADATA},startup-script-url=${INIT_ACTIONS_ROOT}/kdc-server.sh" + METADATA="${METADATA},service-account-user=${GSA}" + if run_gcloud "${log_file}" gcloud compute instances create "${KDC_NAME}" \ + --zone "${ZONE}" \ + --subnet "${SUBNET}" \ + --service-account="${GSA}" \ + --boot-disk-type pd-ssd \ + --image-family="${KDC_IMAGE_FAMILY}" \ + --image-project="${KDC_IMAGE_PROJECT}" \ + --machine-type="${KDC_MACHINE_TYPE}" \ + --scopes='cloud-platform' \ + --hostname="${KDC_FQDN}" \ + --metadata "${METADATA}"; then + report_result "Created" else - local log_file="create_kdc_server_${KDC_NAME}.log" - local METADATA="kdc-root-passwd=${INIT_ACTIONS_ROOT}/${KDC_ROOT_PASSWD_KEY}.encrypted" - METADATA="${METADATA},kms-keyring=${KMS_KEYRING}" - METADATA="${METADATA},kdc-root-passwd-key=${KDC_ROOT_PASSWD_KEY}" - METADATA="${METADATA},startup-script-url=${INIT_ACTIONS_ROOT}/kdc-server.sh" - METADATA="${METADATA},service-account-user=${GSA}" - if run_gcloud "${log_file}" gcloud compute instances create "${KDC_NAME}" \ - --zone "${ZONE}" \ - --subnet "${SUBNET}" \ - --service-account="${GSA}" \ - --boot-disk-type pd-ssd \ - --image-family="${KDC_IMAGE_FAMILY}" \ - --image-project="${KDC_IMAGE_PROJECT}" \ - --machine-type="${KDC_MACHINE_TYPE}" \ - --scopes='cloud-platform' \ - --hostname="${KDC_FQDN}" \ - --metadata "${METADATA}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_kdc_server function 
delete_kdc_server() { - local phase_name="create_kdc_server" - remove_sentinel "${phase_name}" "done" - print_status "Deleting KDC Server ${KDC_NAME}..." local log_file="delete_kdc_server_${KDC_NAME}.log" - if gcloud compute instances describe "${KDC_NAME}" --zone "${ZONE}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - if run_gcloud "${log_file}" gcloud compute instances delete "${KDC_NAME}" --zone "${ZONE}" --project="${PROJECT_ID}" --quiet; then - report_result "Deleted" - else - report_result "Fail" - fi + if run_gcloud "${log_file}" gcloud compute instances delete "${KDC_NAME}" --zone "${ZONE}" --project="${PROJECT_ID}" --quiet; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_kdc_server function create_kerberos_cluster() { - local phase_name="create_kerberos_cluster" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Kerberos Cluster ${CLUSTER_NAME}..." - report_result "Exists" - return 0 - fi - print_status "Creating Kerberos Cluster ${CLUSTER_NAME}..." 
- if gcloud dataproc clusters describe "${CLUSTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_kerberos_cluster_${CLUSTER_NAME}.log" + if run_gcloud "${log_file}" gcloud dataproc clusters create "${CLUSTER_NAME}" \ + --region "${REGION}" \ + --zone "${ZONE}" \ + --subnet "${SUBNET}" \ + --no-address \ + --service-account="${GSA}" \ + --master-machine-type n1-standard-4 \ + --master-boot-disk-type pd-ssd \ + --master-boot-disk-size 50 \ + --image-version "${IMAGE_VERSION}" \ + --bucket "${BUCKET}" \ + --initialization-action-timeout=10m \ + --max-idle="${IDLE_TIMEOUT}" \ + --enable-component-gateway \ + --scopes='cloud-platform' \ + --enable-kerberos \ + --kerberos-root-principal-password-uri="${INIT_ACTIONS_ROOT}/${KDC_ROOT_PASSWD_KEY}.encrypted" \ + --kerberos-kms-key="${KDC_ROOT_PASSWD_KEY}" \ + --kerberos-kms-key-keyring="${KMS_KEYRING}" \ + --kerberos-kms-key-location=global \ + --kerberos-kms-key-project="${PROJECT_ID}"; then + report_result "Created" else - local log_file="create_kerberos_cluster_${CLUSTER_NAME}.log" - if run_gcloud "${log_file}" gcloud dataproc clusters create "${CLUSTER_NAME}" \ - --region "${REGION}" \ - --zone "${ZONE}" \ - --subnet "${SUBNET}" \ - --no-address \ - --service-account="${GSA}" \ - --master-machine-type n1-standard-4 \ - --master-boot-disk-type pd-ssd \ - --master-boot-disk-size 50 \ - --image-version "${IMAGE_VERSION}" \ - --bucket "${BUCKET}" \ - --initialization-action-timeout=10m \ - --max-idle="${IDLE_TIMEOUT}" \ - --enable-component-gateway \ - --scopes='cloud-platform' \ - --enable-kerberos \ - --kerberos-root-principal-password-uri="${INIT_ACTIONS_ROOT}/${KDC_ROOT_PASSWD_KEY}.encrypted" \ - --kerberos-kms-key="${KDC_ROOT_PASSWD_KEY}" \ - --kerberos-kms-key-keyring="${KMS_KEYRING}" \ - --kerberos-kms-key-location=global \ - --kerberos-kms-key-project="${PROJECT_ID}"; then - report_result "Created" 
- create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_kerberos_cluster function delete_kerberos_cluster() { - local phase_name="create_kerberos_cluster" - remove_sentinel "${phase_name}" "done" - print_status "Deleting Kerberos Cluster ${CLUSTER_NAME}..." local log_file="delete_kerberos_cluster_${CLUSTER_NAME}.log" - if gcloud dataproc clusters describe "${CLUSTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - if run_gcloud "${log_file}" gcloud dataproc clusters delete --quiet --region "${REGION}" "${CLUSTER_NAME}"; then - report_result "Deleted" - else - report_result "Fail" - fi + if run_gcloud "${log_file}" gcloud dataproc clusters delete --quiet --region "${REGION}" "${CLUSTER_NAME}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_kerberos_cluster \ No newline at end of file diff --git a/gcloud/lib/network/firewall.sh b/gcloud/lib/network/firewall.sh index 16a51fd2..34172ad4 100644 --- a/gcloud/lib/network/firewall.sh +++ b/gcloud/lib/network/firewall.sh @@ -2,14 +2,12 @@ # # Firewall rule functions -function create_firewall_rules() { - local phase_name="create_firewall_rules" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating base Firewall Rules for ${NETWORK}..." - report_result "Exists" - return 0 - fi +function exists_firewall() { + # This is a basic check. A more robust version might check for a list of rules. + _check_exists "gcloud compute firewall-rules describe '${FIREWALL}-in-ssh' --project='${PROJECT_ID}' --format='json(name,selfLink)'" +} +function create_firewall_rules() { print_status "Creating base Firewall Rules for ${NETWORK}..." 
local log_file="create_firewalls_${NETWORK}.log" local created_some=false @@ -83,15 +81,11 @@ function create_firewall_rules() { else report_result "Exists" fi - create_sentinel "${phase_name}" "done" fi } export -f create_firewall_rules function delete_firewall_rules () { - local phase_name="create_firewall_rules" - remove_sentinel "${phase_name}" "done" - print_status "Deleting Cluster Firewall Rules..." local log_file="delete_firewalls_${NETWORK}.log" # Delete any rule containing the cluster name @@ -124,15 +118,3 @@ function delete_firewall_rules () { fi } export -f delete_firewall_rules - -function create_logging_firewall_rules () { - print_status "Creating Logging Firewall Rules for ${NETWORK}..." - # ... (implementation with run_gcloud and report_result) ... -} -export -f create_logging_firewall_rules - -function delete_logging_firewall_rules () { - print_status "Deleting Logging Firewall Rules for ${NETWORK}..." - # ... (implementation with run_gcloud and report_result) ... -} -export -f delete_logging_firewall_rules \ No newline at end of file diff --git a/gcloud/lib/network/network.sh b/gcloud/lib/network/network.sh index d4aa0c4b..59f3ba51 100644 --- a/gcloud/lib/network/network.sh +++ b/gcloud/lib/network/network.sh @@ -2,59 +2,31 @@ # # VPC Network functions -function create_vpc_network () { - local phase_name="create_vpc_network" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating VPC Network ${NETWORK}..." - report_result "Exists" - return 0 - fi +function exists_network() { + _check_exists "gcloud compute networks describe '${NETWORK}' --project='${PROJECT_ID}' --format='json(name,selfLink)'" +} +function create_vpc_network () { print_status "Creating VPC Network ${NETWORK}..." 
local log_file="create_vpc_${NETWORK}.log" - - if gcloud compute networks describe "${NETWORK}" --project="${PROJECT_ID}" > /dev/null 2>&1; - then - report_result "Exists" - create_sentinel "${phase_name}" "done" + if run_gcloud "${log_file}" gcloud compute networks create "${NETWORK}" \ + --project="${PROJECT_ID}" \ + --subnet-mode=custom \ + --bgp-routing-mode="regional" \ + --description="network for use with Dataproc cluster ${CLUSTER_NAME}"; then + report_result "Created" else - if run_gcloud "${log_file}" gcloud compute networks create "${NETWORK}" \ - --project="${PROJECT_ID}" \ - --subnet-mode=custom \ - --bgp-routing-mode="regional" \ - --description="network for use with Dataproc cluster ${CLUSTER_NAME}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } function delete_vpc_network () { print_status "Deleting VPC Network ${NETWORK}..." local log_file="delete_vpc_${NETWORK}.log" - local network_check=$(gcloud compute networks list --project="${PROJECT_ID}" --filter="name = ${NETWORK}" --format="value(name)" 2>/dev/null) - - if [[ -n "${network_check}" ]]; then - if run_gcloud "${log_file}" gcloud compute networks delete --quiet "${NETWORK}" --project="${PROJECT_ID}"; then - report_result "Deleted" - remove_sentinel "create_vpc_network" "done" - else - report_result "Fail" - local dep_log="${REPRO_TMPDIR}/VPC_Network_Delete_Failed_${NETWORK}_${RESOURCE_SUFFIX}.log" - echo "--- Firewall Rules in ${NETWORK} ---" > "${dep_log}" - gcloud compute firewall-rules list --project="${PROJECT_ID}" --filter="network ~ ${NETWORK}$" --format="value(name)" >> "${dep_log}" 2>&1 - echo "--- Routes in ${NETWORK} ---" >> "${dep_log}" - gcloud compute routes list --project="${PROJECT_ID}" --filter="network ~ ${NETWORK}$" --format="value(name, nextHopGateway)" >> "${dep_log}" 2>&1 - echo "--- Routers in ${REGION} ---" >> "${dep_log}" - gcloud compute routers list 
--regions="${REGION}" --project="${PROJECT_ID}" --format="value(name, network)" >> "${dep_log}" 2>&1 - echo "--- Subnets in ${NETWORK} ---" >> "${dep_log}" - gcloud compute networks subnets list --network="${NETWORK}" --project="${PROJECT_ID}" --format="value(name)" >> "${dep_log}" 2>&1 - fi + if run_gcloud "${log_file}" gcloud compute networks delete --quiet "${NETWORK}" --project="${PROJECT_ID}"; then + report_result "Deleted" else - report_result "Not Found" - remove_sentinel "create_vpc_network" "done" + report_result "Fail" fi } diff --git a/gcloud/lib/network/router.sh b/gcloud/lib/network/router.sh index 2a9589ae..2edea63f 100644 --- a/gcloud/lib/network/router.sh +++ b/gcloud/lib/network/router.sh @@ -2,115 +2,60 @@ # # Router and NAT functions -function create_router () { - local phase_name="create_router" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Router ${ROUTER_NAME}..." - report_result "Exists" - return 0 - fi +function exists_router() { + _check_exists "gcloud compute routers describe '${ROUTER_NAME}' --region='${REGION}' --project='${PROJECT_ID}' --format='json(name,selfLink)'" +} +function create_router () { print_status "Creating Router ${ROUTER_NAME}..." 
local log_file="create_router_${ROUTER_NAME}.log" - if gcloud compute routers describe "${ROUTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; - then - report_result "Exists" - create_sentinel "${phase_name}" "done" + if run_gcloud "${log_file}" gcloud compute routers create "${ROUTER_NAME}" \ + --project="${PROJECT_ID}" \ + --network="${NETWORK}" \ + --asn="${ASN_NUMBER}" \ + --region="${REGION}"; then + report_result "Created" else - if run_gcloud "${log_file}" gcloud compute routers create ${ROUTER_NAME} \ - --project=${PROJECT_ID} \ - --network=${NETWORK} \ - --asn=${ASN_NUMBER} \ - --region=${REGION}; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_router -function add_nat_policy () { - local phase_name="add_nat_policy" - if check_sentinel "${phase_name}" "done"; then - print_status "Adding NAT to Router ${ROUTER_NAME}..." - report_result "Exists" - return 0 - fi - +function add_nat_to_router () { print_status "Adding NAT to Router ${ROUTER_NAME}..." local log_file="add_nat_${ROUTER_NAME}.log" - if gcloud compute routers nats describe nat-config --router="${ROUTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; - then - report_result "Exists" - create_sentinel "${phase_name}" "done" - else - if run_gcloud "${log_file}" gcloud compute routers nats create nat-config \ - --router-region ${REGION} \ - --router ${ROUTER_NAME} \ - --project="${PROJECT_ID}" \ - --nat-custom-subnet-ip-ranges "${SUBNET}" \ - --auto-allocate-nat-external-ips; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi - fi -} -export -f add_nat_policy - -function delete_nat_configs() { - local phase_name="add_nat_policy" - print_status "Deleting NAT Configs from ${ROUTER_NAME}..." 
- local log_file="delete_nats_${ROUTER_NAME}.log" - local found_some=false - local all_ok=true - - if gcloud compute routers describe "${ROUTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - NATS=$(gcloud compute routers nats list --router="${ROUTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" --format="value(name)" 2>/dev/null || true) - if [[ -n "${NATS}" ]]; then - found_some=true - while read -r nat_name; do - # print_status " Deleting NAT ${nat_name} from ${ROUTER_NAME}..." - if ! run_gcloud "${log_file}" gcloud compute routers nats delete "${nat_name}" --router="${ROUTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" --quiet; then - all_ok=false - fi - done <<< "${NATS}" - fi - fi - - if [[ "${found_some}" = false ]]; then - report_result "Not Found" - remove_sentinel "${phase_name}" "done" - elif [[ "${all_ok}" = true ]]; then - report_result "Deleted" - remove_sentinel "${phase_name}" "done" + if run_gcloud "${log_file}" gcloud compute routers nats create "nat-config" \ + --router-region "${REGION}" \ + --router "${ROUTER_NAME}" \ + --project="${PROJECT_ID}" \ + --nat-custom-subnet-ip-ranges "${SUBNET}" \ + --auto-allocate-nat-external-ips; then + report_result "Created" else report_result "Fail" + return 1 fi } -export -f delete_nat_configs +export -f add_nat_to_router function delete_router () { - local phase_name="create_router" - print_status "Deleting Router ${ROUTER_NAME}..." - local log_file="delete_router_${ROUTER_NAME}.log" + print_status "Deleting NAT from Router ${ROUTER_NAME}..." 
+ local log_file="delete_nat_${ROUTER_NAME}.log" + # Don't fail if the NAT doesn't exist + run_gcloud "${log_file}" gcloud compute routers nats delete "nat-config" \ + --router-region "${REGION}" \ + --router "${ROUTER_NAME}" \ + --project="${PROJECT_ID}" --quiet || true - local router_check=$(gcloud compute routers list --regions="${REGION}" --project="${PROJECT_ID}" --filter="name = ${ROUTER_NAME}" --format="value(name)" 2>/dev/null) - if [[ -n "${router_check}" ]]; then - if run_gcloud "${log_file}" gcloud compute routers delete --quiet --region ${REGION} "${ROUTER_NAME}" --project="${PROJECT_ID}"; then - report_result "Deleted" - remove_sentinel "${phase_name}" "done" - else - report_result "Fail" - fi + print_status "Deleting Router ${ROUTER_NAME}..." + log_file="delete_router_${ROUTER_NAME}.log" + if run_gcloud "${log_file}" gcloud compute routers delete --quiet "${ROUTER_NAME}" \ + --region="${REGION}" \ + --project="${PROJECT_ID}"; then + report_result "Deleted" else - report_result "Not Found" - remove_sentinel "${phase_name}" "done" + report_result "Fail" fi } -export -f delete_router \ No newline at end of file +export -f delete_router diff --git a/gcloud/lib/network/routes.sh b/gcloud/lib/network/routes.sh index ca14fa2e..db929fda 100644 --- a/gcloud/lib/network/routes.sh +++ b/gcloud/lib/network/routes.sh @@ -1,41 +1,18 @@ #!/bin/bash # -# Route functions +# Route Management Functions -function create_default_route() { - local route_name="default-internet-${NETWORK}" - print_status "Creating Default Route ${route_name}..." 
- local log_file="create_route_${route_name}.log" - - if gcloud compute routes describe "${route_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; - then - report_result "Exists" - else - if run_gcloud "${log_file}" gcloud compute routes create "${route_name}" \ - --project="${PROJECT_ID}" \ - --network="${NETWORK}" \ - --destination-range=0.0.0.0/0 \ - --next-hop-gateway=default-internet-gateway; then - report_result "Created" - else - report_result "Fail" - fi - fi -} - -function delete_default_route() { - local route_name="default-internet-${NETWORK}" - print_status "Deleting Default Route ${route_name}..." - if gcloud compute routes describe "${route_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; - then - local log_file="delete_route_${route_name}.log" - if run_gcloud "${log_file}" gcloud compute routes delete --quiet "${route_name}" --project="${PROJECT_ID}"; then - report_result "Deleted" - else - report_result "Fail" - fi +function delete_route() { + local route_name="$1" + print_status "Deleting Route ${route_name}..." + local log_file="delete_route_${route_name}.log" + if run_gcloud "${log_file}" gcloud compute routes delete --quiet "${route_name}" --project="${PROJECT_ID}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" + echo " - Failed to delete route ${route_name}. Log content:" >&2 + cat "${log_file}" >&2 + return 1 fi } - +export -f delete_route diff --git a/gcloud/lib/network/subnet.sh b/gcloud/lib/network/subnet.sh index 83f2b037..d6d19f18 100644 --- a/gcloud/lib/network/subnet.sh +++ b/gcloud/lib/network/subnet.sh @@ -2,49 +2,35 @@ # # Subnet functions -function create_subnet () { - local phase_name="create_subnet" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Subnet ${SUBNET}..." 
- report_result "Exists" - return 0 - fi +function exists_subnet() { + local subnet_name="$1" + _check_exists "gcloud compute networks subnets describe '${subnet_name}' --region='${REGION}' --project='${PROJECT_ID}' --format='json(name,selfLink)'" +} +function create_subnet () { print_status "Creating Subnet ${SUBNET}..." local log_file="create_subnet_${SUBNET}.log" - if gcloud compute networks subnets describe "${SUBNET}" --region="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; - then - report_result "Exists" - create_sentinel "${phase_name}" "done" + if run_gcloud "${log_file}" gcloud compute networks subnets create "${SUBNET}" \ + --project="${PROJECT_ID}" \ + --network="${NETWORK}" \ + --range="${RANGE}" \ + --enable-private-ip-google-access \ + --region="${REGION}" \ + --description="subnet for use with Dataproc cluster ${CLUSTER_NAME}"; then + report_result "Created" else - if run_gcloud "${log_file}" gcloud compute networks subnets create ${SUBNET} \ - --project="${PROJECT_ID}" \ - --network=${NETWORK} \ - --range="$RANGE" \ - --enable-private-ip-google-access \ - --region=${REGION} \ - --description="subnet for use with Dataproc cluster ${CLUSTER_NAME}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } function delete_subnet () { - print_status "Deleting Subnet ${SUBNET}..." - if gcloud compute networks subnets describe "${SUBNET}" --region "${REGION}" > /dev/null 2>&1; - then - local log_file="delete_subnet_${SUBNET}.log" - if run_gcloud "${log_file}" gcloud compute networks subnets delete --quiet --region ${REGION} ${SUBNET}; then - report_result "Deleted" - remove_sentinel "create_subnet" "done" - else - report_result "Fail" - fi + local subnet_name="$1" + print_status "Deleting Subnet ${subnet_name}..." 
+ local log_file="delete_subnet_${subnet_name}.log" + if run_gcloud "${log_file}" gcloud compute networks subnets delete --quiet --region "${REGION}" "${subnet_name}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } diff --git a/gcloud/lib/phs.sh b/gcloud/lib/phs.sh index e83c34b2..957dcc75 100644 --- a/gcloud/lib/phs.sh +++ b/gcloud/lib/phs.sh @@ -3,53 +3,33 @@ function create_phs_cluster() { local phs_cluster_name="${CLUSTER_NAME}-phs" - local phase_name="create_phs_cluster" - if check_sentinel "${phase_name}" "done"; then - print_status "Creating PHS Cluster ${phs_cluster_name}..." - report_result "Exists" - return 0 - fi - print_status "Creating PHS Cluster ${phs_cluster_name}..." - if gcloud dataproc clusters describe "${phs_cluster_name}" --region="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_phs_cluster_${phs_cluster_name}.log" + if run_gcloud "${log_file}" gcloud dataproc clusters create "${phs_cluster_name}" \ + --region="${REGION}" \ + --single-node \ + --image-version="${IMAGE_VERSION}" \ + --subnet="${SUBNET}" \ + --tags="${TAGS}" \ + --properties="spark:spark.history.fs.logDirectory=gs://${PHS_BUCKET},spark:spark.eventLog.dir=gs://${PHS_BUCKET}" \ + --properties="mapred:mapreduce.jobhistory.read-only.dir-pattern=gs://${MR_HISTORY_BUCKET}" \ + --enable-component-gateway; then + report_result "Created" else - local log_file="create_phs_cluster_${phs_cluster_name}.log" - if run_gcloud "${log_file}" gcloud dataproc clusters create "${phs_cluster_name}" \ - --region="${REGION}" \ - --single-node \ - --image-version="${IMAGE_VERSION}" \ - --subnet="${SUBNET}" \ - --tags="${TAGS}" \ - --properties="spark:spark.history.fs.logDirectory=gs://${PHS_BUCKET},spark:spark.eventLog.dir=gs://${PHS_BUCKET}" \ - --properties="mapred:mapreduce.jobhistory.read-only.dir-pattern=gs://${MR_HISTORY_BUCKET}" \ - 
--enable-component-gateway; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_phs_cluster function delete_phs_cluster() { local phs_cluster_name="${CLUSTER_NAME}-phs" - local phase_name="create_phs_cluster" - remove_sentinel "${phase_name}" "done" - print_status "Deleting PHS Cluster ${phs_cluster_name}..." local log_file="delete_phs_cluster_${phs_cluster_name}.log" - if gcloud dataproc clusters describe "${phs_cluster_name}" --region="${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - if run_gcloud "${log_file}" gcloud dataproc clusters delete --quiet --region "${REGION}" "${phs_cluster_name}"; then - report_result "Deleted" - else - report_result "Fail" - fi + if run_gcloud "${log_file}" gcloud dataproc clusters delete --quiet --region "${REGION}" "${phs_cluster_name}"; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_phs_cluster \ No newline at end of file diff --git a/gcloud/lib/secure-boot/create-key-pair.sh b/gcloud/lib/secure-boot/create-key-pair.sh index 9ed8e49e..0b8e7d82 100644 --- a/gcloud/lib/secure-boot/create-key-pair.sh +++ b/gcloud/lib/secure-boot/create-key-pair.sh @@ -25,7 +25,7 @@ set -e # https://wiki.archlinux.org/title/Unified_Extensible_Firmware_Interface/Secure_Boot#Creating_keys -ITERATION=042 +ITERATION=0009 CURRENT_PROJECT_ID="$(gcloud config get project)" if [[ -z "${CURRENT_PROJECT_ID}" ]]; then diff --git a/gcloud/lib/shared-functions.sh b/gcloud/lib/shared-functions.sh index 6e72a5f0..84196731 100644 --- a/gcloud/lib/shared-functions.sh +++ b/gcloud/lib/shared-functions.sh @@ -23,10 +23,10 @@ function create_dpgce_cluster() { date time gcloud dataproc clusters create ${CLUSTER_NAME} \ --single-node \ - --master-accelerator "type=${MASTER_ACCELERATOR_TYPE}" \ + --master-accelerator "type=${M_ACCELERATOR_TYPE}" \ --worker-accelerator 
"type=${PRIMARY_ACCELERATOR_TYPE}" \ --secondary-worker-accelerator "type=${SECONDARY_ACCELERATOR_TYPE}" \ - --master-machine-type "${MASTER_MACHINE_TYPE}" \ + --master-machine-type "${M_MACHINE_TYPE}" \ --worker-machine-type "${PRIMARY_MACHINE_TYPE}" \ --master-boot-disk-size 50 \ --worker-boot-disk-size 50 \ @@ -90,7 +90,7 @@ function create_dpgce_cluster() { # --initialization-actions "${INIT_ACTIONS_ROOT}/gpu/install_gpu_driver.sh" \ # --num-masters=1 \ # --num-workers=2 \ -# --master-machine-type "${MASTER_MACHINE_TYPE}" \ +# --master-machine-type "${M_MACHINE_TYPE}" \ # --worker-machine-type "${PRIMARY_MACHINE_TYPE}" \ # --metadata cuda-url="https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run" \ # --metadata gpu-driver-url="https://us.download.nvidia.com/XFree86/Linux-x86_64/550.135/NVIDIA-Linux-x86_64-550.135.run" \ @@ -99,7 +99,7 @@ function create_dpgce_cluster() { # --initialization-actions ${INIT_ACTIONS_ROOT}/rapids/rapids.sh \ # --metadata rapids-runtime="SPARK" \ # --worker-accelerator "type=${PRIMARY_ACCELERATOR_TYPE}" \ -# --master-accelerator "type=${MASTER_ACCELERATOR_TYPE}" \ +# --master-accelerator "type=${M_ACCELERATOR_TYPE}" \ # --single-node \ # --num-masters=1 \ # --num-workers=2 \ @@ -206,7 +206,7 @@ function create_dpgce_cluster() { # # Oozie # -# --metadata startup-script-url="${INIT_ACTIONS_ROOT}/delay-masters-startup.sh" \ +# --metadata startup-script-url="${INIT_ACTIONS_ROOT}/delay-ms-startup.sh" \ # --initialization-actions "${INIT_ACTIONS_ROOT}/oozie/oozie.sh" \ # --properties "dataproc:dataproc.master.custom.init.actions.mode=RUN_AFTER_SERVICES" \ @@ -217,7 +217,7 @@ function create_dpgce_cluster() { # complex init actions on 2.1 repro - # --metadata startup-script-url="${INIT_ACTIONS_ROOT}/delay-masters-startup.sh" \ + # --metadata startup-script-url="${INIT_ACTIONS_ROOT}/delay-ms-startup.sh" \ # --initialization-actions 
"${INIT_ACTIONS_ROOT}/oozie/oozie.sh,${INIT_ACTIONS_ROOT}/bigtable/bigtable.sh,${INIT_ACTIONS_ROOT}/sqoop/sqoop.sh" \ # --initialization-action-timeout=15m \ # --optional-components ZOOKEEPER \ @@ -229,14 +229,14 @@ function create_dpgce_cluster() { # --scopes 'https://www.googleapis.com/auth/cloud-platform,sql-admin' # --enable-component-gateway \ -# --metadata startup-script-url="${INIT_ACTIONS_ROOT}/delay-masters-startup.sh" \ +# --metadata startup-script-url="${INIT_ACTIONS_ROOT}/delay-ms-startup.sh" \ # --initialization-actions "${INIT_ACTIONS_ROOT}/oozie/oozie.sh,${INIT_ACTIONS_ROOT}/bigtable/bigtable.sh,${INIT_ACTIONS_ROOT}/sqoop/sqoop.sh" \ # --initialization-action-timeout=15m \ # --properties "dataproc:dataproc.master.custom.init.actions.mode=RUN_AFTER_SERVICES" \ # --initialization-action-timeout=15m \ # --metadata bigtable-instance=${BIGTABLE_INSTANCE} \ -# --metadata startup-script-url="${INIT_ACTIONS_ROOT}/delay-masters-startup.sh" \ +# --metadata startup-script-url="${INIT_ACTIONS_ROOT}/delay-ms-startup.sh" \ # --properties "dataproc:dataproc.master.custom.init.actions.mode=RUN_AFTER_SERVICES" \ # --initialization-actions "${INIT_ACTION_PATHS}" \ diff --git a/gcloud/lib/swp/certs.sh b/gcloud/lib/swp/certs.sh index f39621ea..b8496047 100644 --- a/gcloud/lib/swp/certs.sh +++ b/gcloud/lib/swp/certs.sh @@ -1,284 +1,133 @@ #!/bin/bash - function create_managed_certificate() { - local cert_name="swp-cert" # Static name for the final cert + local cert_name="${SWP_CERT_NAME}" # Use unique name local region="${1:-${REGION}}" local project_id="${2:-${PROJECT_ID}}" local swp_hostname="${3:-${SWP_HOSTNAME}}" - - local phase_name="swp_managed_cert" - local log_file="${phase_name}_${CLUSTER_NAME}-${RESOURCE_SUFFIX}.log" - - local ca_pool_prefix="swp-ca-pool-${CLUSTER_NAME}-" - local cic_prefix="swp-cic-${CLUSTER_NAME}-" - local ca_prefix="swp-root-ca-${CLUSTER_NAME}-" - + local log_file="swp_managed_cert_${CLUSTER_NAME}-${RESOURCE_SUFFIX}.log" local 
suffix="${RESOURCE_SUFFIX}" - local ca_pool_name="${ca_pool_prefix}${suffix}" - local cic_name="${cic_prefix}${suffix}" - local ca_name="${ca_prefix}${suffix}" + local ca_pool_name="swp-ca-pool-${CLUSTER_NAME}-${suffix}" + local cic_name="swp-cic-${CLUSTER_NAME}-${suffix}" + local ca_name="swp-root-ca-${CLUSTER_NAME}-${suffix}" local ca_pool_full_name="projects/${project_id}/locations/${region}/caPools/${ca_pool_name}" - print_status "Ensuring SWP Certificate components for ${CLUSTER_NAME} (Suffix: ${suffix})..." - report_result "" # Newline - - # 1. CA Pool - print_status " Checking CA Pool ${ca_pool_name}..." - if check_sentinel "${phase_name}" "01_ca_pool_created"; then - report_result "Exists" - elif gcloud privateca pools describe "${ca_pool_name}" --location="${region}" --project="${project_id}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "01_ca_pool_created" - else - report_result "Not Found" - print_status " Creating CA Pool ${ca_pool_name}..." - if run_gcloud "${log_file}" gcloud privateca pools create "${ca_pool_name}" --location="${region}" --tier=devops --project="${project_id}"; then - report_result "Created" - create_sentinel "${phase_name}" "01_ca_pool_created" - echo " -> Waiting 60s for CA Pool creation and IAM propagation..." - sleep 60 - # Grant permissions - these are idempotent so okay to re-run - local project_number=$(gcloud projects describe "${project_id}" --format="value(projectNumber)") - local network_security_sa="service-${project_number}@gcp-sa-networksecurity.iam.gserviceaccount.com" - if ! gcloud beta services identity describe --service=networksecurity.googleapis.com --project="${project_id}" > /dev/null 2>&1; then - print_status " Creating Network Security P4SA for ${project_id}..." - run_gcloud "${log_file}" gcloud beta services identity create --service=networksecurity.googleapis.com --project="${project_id}" && report_result "Pass" - echo " -> Waiting 30s for IAM propagation..." 
- sleep 30 - fi - print_status " Granting privateca.certificateManager role to P4SA on ${ca_pool_name}..." - run_gcloud "${log_file}" gcloud privateca pools add-iam-policy-binding "${ca_pool_name}" --location="${region}" --project="${project_id}" --member="serviceAccount:${network_security_sa}" --role='roles/privateca.certificateManager' && report_result "Pass" - else - report_result "Fail"; return 1; + print_status "Creating SWP Certificate components for ${CLUSTER_NAME} (Suffix: ${suffix})..." + report_result "" + + print_status " Creating CA Pool ${ca_pool_name}..." + if run_gcloud "${log_file}" gcloud privateca pools create "${ca_pool_name}" --location="${region}" --tier=devops --project="${project_id}"; then + report_result "Created" + echo " -> Waiting 60s for CA Pool creation and IAM propagation..." + sleep 60 + local project_number=$(gcloud projects describe "${project_id}" --format="value(projectNumber)") + local network_security_sa="service-${project_number}@gcp-sa-networksecurity.iam.gserviceaccount.com" + if ! gcloud beta services identity describe --service=networksecurity.googleapis.com --project="${project_id}" > /dev/null 2>&1; then + print_status " Creating Network Security P4SA for ${project_id}..." + run_gcloud "${log_file}" gcloud beta services identity create --service=networksecurity.googleapis.com --project="${project_id}" && report_result "Pass" + echo " -> Waiting 30s for IAM propagation..." + sleep 30 fi + print_status " Granting privateca.certificateManager role to P4SA on ${ca_pool_name}..." + run_gcloud "${log_file}" gcloud privateca pools add-iam-policy-binding "${ca_pool_name}" \ + --location="${region}" \ + --project="${project_id}" \ + --member="serviceAccount:${network_security_sa}" \ + --role='roles/privateca.certificateManager' && report_result "Pass" + else + report_result "Fail"; return 1; fi - # 2. Root CA - print_status " Checking Root CA ${ca_name} in ${ca_pool_name}..." 
- if check_sentinel "${phase_name}" "02_root_ca_created"; then - report_result "Exists" - elif gcloud privateca roots describe "${ca_name}" --pool="${ca_pool_name}" --location="${region}" --project="${project_id}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "02_root_ca_created" + print_status " Creating Root CA ${ca_name} in ${ca_pool_name}..." + if run_gcloud "${log_file}" gcloud privateca roots create "${ca_name}" --pool="${ca_pool_name}" \ + --location="${region}" \ + --project="${project_id}" \ + --subject="CN=swp-ca.internal.local, O=Dataproc SWP Test" --auto-enable --quiet; then + report_result "Created" else - report_result "Not Found" - print_status " Creating Root CA ${ca_name}..." - if run_gcloud "${log_file}" gcloud privateca roots create "${ca_name}" --pool="${ca_pool_name}" --location="${region}" --project="${project_id}" --subject="CN=swp-ca.internal.local, O=Dataproc SWP Test" --auto-enable --quiet; then - report_result "Created" - create_sentinel "${phase_name}" "02_root_ca_created" - else - report_result "Fail"; return 1; - fi + report_result "Fail"; return 1; fi - # 3. CIC - print_status " Checking CIC ${cic_name}..." - if check_sentinel "${phase_name}" "03_cic_created"; then - report_result "Exists" - elif gcloud certificate-manager issuance-configs describe "${cic_name}" --location="${region}" --project="${project_id}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "03_cic_created" + print_status " Creating CIC ${cic_name}..." + if run_gcloud "${log_file}" gcloud certificate-manager issuance-configs create "${cic_name}" \ + --location="${region}" \ + --project="${project_id}" \ + --ca-pool="${ca_pool_full_name}" \ + --lifetime="2592000s" \ + --rotation-window-percentage=66 \ + --key-algorithm="rsa-2048"; then + report_result "Created" else - report_result "Not Found" - print_status " Creating CIC ${cic_name}..." 
- if run_gcloud "${log_file}" gcloud certificate-manager issuance-configs create "${cic_name}" --location="${region}" --project="${project_id}" --ca-pool="${ca_pool_full_name}" --lifetime="2592000s" --rotation-window-percentage=66 --key-algorithm="rsa-2048"; then - report_result "Created" - create_sentinel "${phase_name}" "03_cic_created" - else - report_result "Fail"; return 1; - fi + report_result "Fail"; return 1; fi - # 4. Certificate Manager Certificate (swp-cert) - print_status " Checking Certificate Manager Certificate ${cert_name}..." + print_status " Creating Certificate Manager Certificate ${cert_name}..." local cert_log_file="create_managed_cert_${cert_name}.log" local desired_cic="projects/${project_id}/locations/${region}/certificateIssuanceConfigs/${cic_name}" - if check_sentinel "${phase_name}" "04_cert_created"; then - report_result "Exists" - elif gcloud certificate-manager certificates describe "${cert_name}" --location="${region}" --project="${project_id}" > /dev/null 2>&1; then - local current_cic=$(gcloud certificate-manager certificates describe "${cert_name}" --location="${region}" --project="${project_id}" --format="value(managed.issuanceConfig)") - if [[ "${current_cic}" == "${desired_cic}" ]]; then - report_result "Exists" - create_sentinel "${phase_name}" "04_cert_created" - else - report_result "Fail" - return 1 - fi + if run_gcloud "${cert_log_file}" gcloud certificate-manager certificates create "${cert_name}" \ + --location="${region}" \ + --project="${project_id}" \ + --domains="${swp_hostname}" \ + --issuance-config="${desired_cic}"; then + report_result "Created" else - report_result "Not Found" - print_status " Creating Certificate ${cert_name}..." 
- if run_gcloud "${cert_log_file}" gcloud certificate-manager certificates create "${cert_name}" --location="${region}" --project="${project_id}" --domains="${swp_hostname}" --issuance-config="${desired_cic}"; then - report_result "Created" - create_sentinel "${phase_name}" "04_cert_created" - else - report_result "Fail"; return 1; - fi + report_result "Fail"; return 1; fi export SWP_CERT_URI_PARTIAL="projects/${project_id}/locations/${region}/certificates/${cert_name}" } export -f create_managed_certificate -function _delete_swp_ca_resources() { +function delete_managed_certificate() { local region="${1:-${REGION}}" local project_id="${2:-${PROJECT_ID}}" - local log_file="${3:-delete_managed_certificate_${CLUSTER_NAME}.log}" - + local log_file="delete_managed_certificate_${CLUSTER_NAME}.log" local cic_prefix="swp-cic-${CLUSTER_NAME}-" local pool_prefix="swp-ca-pool-${CLUSTER_NAME}-" local ca_prefix="swp-root-ca-${CLUSTER_NAME}-" - local overall_status="Pass" - local found_some=false + local cert_prefix="swp-cert-${CLUSTER_NAME}-" + + print_status "Deleting SWP Certificate components for ${CLUSTER_NAME}..." + report_result "" + + local cert_names=$(gcloud certificate-manager certificates list --location="${region}" --project="${project_id}" --filter="name ~ /${cert_prefix}" --format="value(name)" 2>/dev/null) + if [[ -n "${cert_names}" ]]; then + while read -r cert_full_name; do + local short_cert_name=$(basename "${cert_full_name}") + print_status " Deleting versioned certificate ${short_cert_name}..." + if ! 
run_gcloud "delete_managed_cert_${short_cert_name}.log" gcloud certificate-manager certificates delete "${short_cert_name}" --location="${region}" --project="${project_id}" --quiet; then + report_result "Fail"; + else + report_result "Deleted"; + fi + done <<< "${cert_names}" + fi - # --- Deleting Certificate Issuance Config(s) --- local cic_names=$(gcloud certificate-manager issuance-configs list --location="${region}" --project="${project_id}" --format="value(name)" 2>/dev/null | grep "${cic_prefix}" || true) if [[ -n "${cic_names}" ]]; then - found_some=true while read -r cic; do local short_cic_name=$(basename "${cic}") print_status " Deleting CIC ${short_cic_name}..." - if ! run_gcloud "${log_file}" gcloud certificate-manager issuance-configs delete "${short_cic_name}" --location="${region}" --project="${project_id}" --quiet; then overall_status="Fail"; report_result "Fail"; else report_result "Deleted"; fi + run_gcloud "${log_file}" gcloud certificate-manager issuance-configs delete "${short_cic_name}" --location="${region}" --project="${project_id}" --quiet || true done <<< "${cic_names}" - else - print_status " No CICs found with prefix ${cic_prefix}..." - report_result "Not Found" fi - # --- Deleting CA Pool(s) and Root CA(s) --- local pool_names=$(gcloud privateca pools list --location="${region}" --project="${project_id}" --format="value(name)" 2>/dev/null | grep "${pool_prefix}" || true) if [[ -n "${pool_names}" ]]; then - found_some=true while read -r pool_full_name; do local short_pool_name=$(basename "${pool_full_name}") - print_status " Deleting CA Pool ${short_pool_name}..." 
local ca_names=$(gcloud privateca roots list --pool="${short_pool_name}" --location="${region}" --project="${project_id}" --format="value(name)" 2>/dev/null | grep "${ca_prefix}" || true) if [[ -n "${ca_names}" ]]; then while read -r ca_full_name; do local short_ca_name=$(basename "${ca_full_name}") local ca_log_file="delete_ca_${short_ca_name}.log" - print_status " Disabling CA ${short_ca_name}..." - if run_gcloud "${ca_log_file}" gcloud privateca roots disable "${short_ca_name}" --pool="${short_pool_name}" --location="${region}" --project="${project_id}" --quiet; then - report_result "Pass" - print_status " Deleting CA ${short_ca_name}..." - if ! run_gcloud "${ca_log_file}" gcloud privateca roots delete "${short_ca_name}" --pool="${short_pool_name}" --location="${region}" --project="${project_id}" --quiet --skip-grace-period; then - report_result "Fail" - overall_status="Fail" - else - report_result "Deleted" - fi - else - report_result "Fail" # Disable failed - overall_status="Fail" - fi + print_status " Disabling and Deleting CA ${short_ca_name}..." + run_gcloud "${ca_log_file}" gcloud privateca roots disable "${short_ca_name}" --pool="${short_pool_name}" --location="${region}" --project="${project_id}" --quiet || true + run_gcloud "${ca_log_file}" gcloud privateca roots delete "${short_ca_name}" --pool="${short_pool_name}" --location="${region}" --project="${project_id}" --quiet --skip-grace-period || true done <<< "${ca_names}" fi print_status " Attempting to delete CA Pool ${short_pool_name}..." - if ! 
run_gcloud "${log_file}" gcloud privateca pools delete "${short_pool_name}" --location="${region}" --project="${project_id}" --quiet --ignore-dependent-resources; then overall_status="Fail"; report_result "Fail"; else report_result "Deleted"; fi + run_gcloud "${log_file}" gcloud privateca pools delete "${short_pool_name}" --location="${region}" --project="${project_id}" --quiet --ignore-dependent-resources || true done <<< "${pool_names}" - else - print_status " No CA Pools found with prefix ${pool_prefix}..." - report_result "Not Found" fi - return $([[ "${overall_status}" == "Pass" ]] && echo 0 || echo 1) } -export -f _delete_swp_ca_resources - -function delete_managed_certificate() { - local region="${1:-${REGION}}" - local project_id="${2:-${PROJECT_ID}}" - local phase_name="swp_managed_cert" - local log_file="delete_managed_certificate_${CLUSTER_NAME}.log" - - print_status "Deleting SWP Certificate components for ${CLUSTER_NAME}..." - report_result "" # Newline - - # Clear all sentinels for this cluster and phase, regardless of suffix - if [[ -d "${SENTINEL_DIR}" ]]; then - find "${SENTINEL_DIR}" -type f -name "${phase_name}-*" -exec rm -f {} + > /dev/null 2>&1 - print_status " Cleared sentinels for ${phase_name}..." - report_result "Pass" - fi - - # --- Deleting the static-named Certificate Manager Certificate --- - local static_cert_name="swp-cert" - print_status " Checking for static certificate ${static_cert_name}..." - local cert_check=$(gcloud certificate-manager certificates list --location="${region}" --project="${project_id}" --filter="name='projects/${project_id}/locations/${region}/certificates/${static_cert_name}'" --format="value(name)" 2>/dev/null) - - if [[ -n "${cert_check}" ]]; then - print_status " Deleting static certificate ${static_cert_name}..." - if ! 
run_gcloud "${log_file}" gcloud certificate-manager certificates delete "${static_cert_name}" --location="${region}" --project="${project_id}" --quiet; then - report_result "Fail"; - else - report_result "Deleted"; - fi - else - report_result "Not Found" - fi - - if [[ "${FORCE_DELETE}" == "true" ]]; then - print_status " --force specified, deleting versioned CA/CIC components..." - report_result "" - _delete_swp_ca_resources "${region}" "${project_id}" "${log_file}" - else - print_status " Skipping deletion of versioned CA/CIC components. Use --force to delete." - report_result "Skipped" - fi -} -export -f delete_managed_certificate - -function create_certificate() { - local cert_name="${1:-${SWP_CERT_NAME}}" - local region="${2:-${REGION}}" - local project_id="${3:-${PROJECT_ID}}" - local swp_ip="${4:-${SWP_IP}}" - local swp_hostname="${5:-${SWP_HOSTNAME}}" - - local ca_key_file="tls/swp_ca.key" - local ca_cert_file="tls/swp_ca.crt" - local server_key_file="tls/swp.key" - local server_csr_file="tls/swp.csr" - local server_cert_file="tls/swp.crt" - local gcs_ca_cert_uri="${INIT_ACTIONS_ROOT}/swp_ca.crt" - - # ... (rest of create_certificate) ... -} - -function delete_certificate() { - local cert_name="${1:-${SWP_CERT_NAME}}" - local region="${2:-${REGION}}" - - print_status "Deleting Self-Signed Certificate ${cert_name}..." 
- if gcloud certificate-manager certificates describe "${cert_name}" --location="${region}" --project="${project_id}" > /dev/null 2>&1; then - local log_file="delete_certificate_${cert_name}.log" - if run_gcloud "${log_file}" gcloud certificate-manager certificates delete "${cert_name}" --location="${region}" --quiet; then - report_result "Deleted" - else - report_result "Fail" - fi - else - report_result "Not Found" - fi -} - -function delete_ca_pool() { - local pool_name="${1:-swp-ca-pool-${CLUSTER_NAME}}" - local region="${2:-${REGION}}" - local project_id="${3:-${PROJECT_ID}}" - local ca_name="swp-root-ca-${CLUSTER_NAME}" - local log_file="delete_ca_pool_${pool_name}.log" - - print_status "Deleting CA Pool ${pool_name}..." - if gcloud privateca pools describe "${pool_name}" --location="${region}" --project="${project_id}" > /dev/null 2>&1; then - if gcloud privateca roots describe "${ca_name}" --pool="${pool_name}" --location="${region}" --project="${project_id}" > /dev/null 2>&1; then - run_gcloud "${log_file}" gcloud privateca roots disable "${ca_name}" --pool="${pool_name}" --location="${region}" --project="${project_id}" --quiet - run_gcloud "${log_file}" gcloud privateca roots delete "${ca_name}" --pool="${pool_name}" --location="${region}" --project="${project_id}" --quiet --skip-grace-period - fi - if run_gcloud "${log_file}" gcloud privateca pools delete "${pool_name}" --location="${region}" --project="${project_id}" --quiet; then - report_result "Deleted" - else - report_result "Fail" - fi - else - report_result "Not Found" - fi -} \ No newline at end of file +export -f delete_managed_certificate \ No newline at end of file diff --git a/gcloud/lib/swp/firewall.sh b/gcloud/lib/swp/firewall.sh index bf0a7da3..c42effaa 100644 --- a/gcloud/lib/swp/firewall.sh +++ b/gcloud/lib/swp/firewall.sh @@ -4,53 +4,32 @@ function create_allow_swp_ingress_rule() { local rule_name="${1:-allow-swp-ingress-${CLUSTER_NAME}}" local network_name="${2:-${NETWORK}}" local 
source_range="${3:-${PRIVATE_RANGE}}" - local phase_name="create_allow_swp_ingress_rule" - - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Firewall Rule ${rule_name}..." - report_result "Exists" - return 0 - fi - print_status "Creating Firewall Rule ${rule_name}..." - if ! gcloud compute firewall-rules describe "${rule_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="create_firewall_${rule_name}.log" - if run_gcloud "${log_file}" gcloud compute firewall-rules create "${rule_name}" \ - --project="${PROJECT_ID}" \ - --network="${network_name}" \ - --direction=INGRESS \ - --action=ALLOW \ - --rules=tcp:${SWP_PORT} \ - --source-ranges="${source_range}" \ - --destination-ranges="${SWP_RANGE}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + local log_file="create_firewall_${rule_name}.log" + if run_gcloud "${log_file}" gcloud compute firewall-rules create "${rule_name}" \ + --project="${PROJECT_ID}" \ + --network="${network_name}" \ + --direction=INGRESS \ + --action=ALLOW \ + --rules=tcp:${SWP_PORT} \ + --source-ranges="${source_range}" \ + --destination-ranges="${SWP_RANGE}"; then + report_result "Created" else - report_result "Exists" - create_sentinel "${phase_name}" "done" + report_result "Fail" + return 1 fi } export -f create_allow_swp_ingress_rule function delete_allow_swp_ingress_rule() { local rule_name="${1:-allow-swp-ingress-${CLUSTER_NAME}}" - local phase_name="create_allow_swp_ingress_rule" - remove_sentinel "${phase_name}" "done" - print_status "Deleting Firewall Rule ${rule_name}..." 
- if gcloud compute firewall-rules describe "${rule_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="delete_firewall_${rule_name}.log" - if run_gcloud "${log_file}" gcloud compute firewall-rules delete "${rule_name}" --project="${PROJECT_ID}" --quiet; then - report_result "Deleted" - else - report_result "Fail" - fi + local log_file="delete_firewall_${rule_name}.log" + if run_gcloud "${log_file}" gcloud compute firewall-rules delete "${rule_name}" --project="${PROJECT_ID}" --quiet; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_allow_swp_ingress_rule @@ -60,54 +39,33 @@ function create_allow_internal_subnets_rule() { local network_name="${2:-${NETWORK}}" local source_range="${3:-${PRIVATE_RANGE}}" local dest_range="${4:-${SWP_RANGE}}" - local phase_name="create_allow_internal_subnets_rule" - - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Firewall Rule ${rule_name}..." - report_result "Exists" - return 0 - fi - print_status "Creating Firewall Rule ${rule_name}..." - if ! 
gcloud compute firewall-rules describe "${rule_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="create_firewall_${rule_name}.log" - if run_gcloud "${log_file}" gcloud compute firewall-rules create "${rule_name}" \ - --project="${PROJECT_ID}" \ - --network="${network_name}" \ - --direction=INGRESS \ - --action=ALLOW \ - --rules=all \ - --source-ranges="${source_range}" \ - --destination-ranges="${dest_range}" \ - --priority=100; then # High priority - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + local log_file="create_firewall_${rule_name}.log" + if run_gcloud "${log_file}" gcloud compute firewall-rules create "${rule_name}" \ + --project="${PROJECT_ID}" \ + --network="${network_name}" \ + --direction=INGRESS \ + --action=ALLOW \ + --rules=all \ + --source-ranges="${source_range}" \ + --destination-ranges="${dest_range}" \ + --priority=100; then # High priority + report_result "Created" else - report_result "Exists" - create_sentinel "${phase_name}" "done" + report_result "Fail" + return 1 fi } export -f create_allow_internal_subnets_rule function delete_allow_internal_subnets_rule() { local rule_name="${1:-allow-internal-${CLUSTER_NAME}}" - local phase_name="create_allow_internal_subnets_rule" - remove_sentinel "${phase_name}" "done" - print_status "Deleting Firewall Rule ${rule_name}..." 
- if gcloud compute firewall-rules describe "${rule_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="delete_firewall_${rule_name}.log" - if run_gcloud "${log_file}" gcloud compute firewall-rules delete "${rule_name}" --project="${PROJECT_ID}" --quiet; then - report_result "Deleted" - else - report_result "Fail" - fi + local log_file="delete_firewall_${rule_name}.log" + if run_gcloud "${log_file}" gcloud compute firewall-rules delete "${rule_name}" --project="${PROJECT_ID}" --quiet; then + report_result "Deleted" else - report_result "Not Found" + report_result "Fail" fi } export -f delete_allow_internal_subnets_rule \ No newline at end of file diff --git a/gcloud/lib/swp/gateway.sh b/gcloud/lib/swp/gateway.sh index fc0eadff..657335c1 100644 --- a/gcloud/lib/swp/gateway.sh +++ b/gcloud/lib/swp/gateway.sh @@ -8,23 +8,8 @@ function create_swp_gateway() { local certificate_url="${5:-${SWP_CERT_URI_PARTIAL}}" local gateway_security_policy_url="${6:-${SWP_POLICY_URI_PARTIAL}}" local project_id="${7:-${PROJECT_ID}}" - local phase_name="create_swp_gateway" - - if check_sentinel "${phase_name}" "done"; then - print_status "Creating SWP Gateway ${swp_instance_name}..." - report_result "Exists" - return 0 - fi - print_status "Creating SWP Gateway ${swp_instance_name}..." 
local log_file="create_swp_gateway_${swp_instance_name}.log" - - if gcloud network-services gateways describe "${swp_instance_name}" --location="${region}" --project="${project_id}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" - return 0 - fi - local swp_address="${SWP_IP}" local full_network_name="projects/${project_id}/global/networks/${network_name}" local full_client_subnet_name="projects/${project_id}/regions/${region}/subnetworks/${client_subnet_name}" @@ -47,10 +32,9 @@ routingMode: EXPLICIT_ROUTING_MODE EOF ) if echo "${gateway_yaml}" | run_gcloud "${log_file}" gcloud network-services gateways import "${swp_instance_name}" \ - --source=- \ - --location="${region}" --project="${project_id}"; then + --source=- \ + --location="${region}" --project="${project_id}"; then report_result "Created" - create_sentinel "${phase_name}" "done" else report_result "Fail" return 1 @@ -61,35 +45,25 @@ export -f create_swp_gateway function delete_swp_gateway() { local swp_instance_name="${1:-${SWP_INSTANCE_NAME}}" local region="${2:-${REGION}}" - local phase_name="create_swp_gateway" - print_status "Deleting SWP Gateway ${swp_instance_name}..." 
- if gcloud network-services gateways describe "${swp_instance_name}" --location="${region}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="delete_swp_gateway_${swp_instance_name}.log" - if run_gcloud "${log_file}" gcloud network-services gateways delete "${swp_instance_name}" --location="${region}" --project="${PROJECT_ID}" --quiet; then - report_result "Deleted" - remove_sentinel "${phase_name}" "done" - - # Attempt to delete the autogen router - local autogen_router_prefix="swg-autogen-router-" - local autogen_routers=$(gcloud compute routers list --regions="${region}" --project="${PROJECT_ID}" --filter="network ~ /${NETWORK}$ AND name ~ ^${autogen_router_prefix}" --format="value(name)" 2>/dev/null || true) - if [[ -n "${autogen_routers}" ]]; then - while read -r router_name; do - print_status " Deleting Autogen Router ${router_name}..." - local delete_router_log="delete_autogen_router_${router_name}.log" - if run_gcloud "${delete_router_log}" gcloud compute routers delete "${router_name}" --region="${region}" --project="${PROJECT_ID}" --quiet; then - report_result "Deleted" - else - report_result "Fail" - fi - done <<< "${autogen_routers}" - fi - else - report_result "Fail" + local log_file="delete_swp_gateway_${swp_instance_name}.log" + if run_gcloud "${log_file}" gcloud network-services gateways delete "${swp_instance_name}" --location="${region}" --project="${PROJECT_ID}" --quiet; then + report_result "Deleted" + local autogen_router_prefix="swg-autogen-router-" + local autogen_routers=$(gcloud compute routers list --regions="${region}" --project="${PROJECT_ID}" --filter="network ~ /${NETWORK}$ AND name ~ ^${autogen_router_prefix}" --format="value(name)" 2>/dev/null || true) + if [[ -n "${autogen_routers}" ]]; then + while read -r router_name; do + print_status " Deleting Autogen Router ${router_name}..." 
+ local delete_router_log="delete_autogen_router_${router_name}.log" + if run_gcloud "${delete_router_log}" gcloud compute routers delete "${router_name}" --region="${region}" --project="${PROJECT_ID}" --quiet; then + report_result "Deleted" + else + report_result "Fail" + fi + done <<< "${autogen_routers}" fi else - report_result "Not Found" - remove_sentinel "${phase_name}" "done" + report_result "Fail" fi } export -f delete_swp_gateway \ No newline at end of file diff --git a/gcloud/lib/swp/policy.sh b/gcloud/lib/swp/policy.sh index 97dc4b98..109e33f6 100644 --- a/gcloud/lib/swp/policy.sh +++ b/gcloud/lib/swp/policy.sh @@ -7,42 +7,26 @@ function create_gateway_security_policy() { local rule_name="allow-all-rule" local policy_full_name="projects/${project_id}/locations/${region}/gatewaySecurityPolicies/${policy_name}" local log_file="create_swp_policy_${policy_name}.log" - local phase_name="create_gateway_security_policy" - - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Gateway Security Policy ${policy_name}..." - report_result "Exists" - return 0 - fi - print_status "Creating Gateway Security Policy ${policy_name}..." 
- local policy_exists=false - if gcloud network-security gateway-security-policies describe "${policy_name}" --location="${region}" --project="${project_id}" > /dev/null 2>&1; then - policy_exists=true - report_result "Exists" - else - policy_yaml=$(cat << EOF + + policy_yaml=$(cat << EOF name: ${policy_full_name} description: "Allow all policy for SWP" EOF - ) - if echo "${policy_yaml}" | run_gcloud "${log_file}" gcloud network-security gateway-security-policies import "${policy_name}" \ - --location="${region}" --project="${project_id}" \ - --source=- - then - report_result "Created" - policy_exists=true - else - report_result "Fail" - return 1 - fi +) + if echo "${policy_yaml}" | run_gcloud "${log_file}" gcloud network-security gateway-security-policies import "${policy_name}" \ + --location="${region}" \ + --project="${project_id}" \ + --source=-; then + report_result "Created" + else + report_result "Fail" + return 1 fi - if [[ "${policy_exists}" = true ]]; then - print_status " Ensuring allow-all rule in ${policy_name}..." - local rule_log_file="create_swp_policy_rule_${policy_name}.log" - if ! gcloud network-security gateway-security-policies rules describe "${rule_name}" --gateway-security-policy="${policy_name}" --location="${region}" --project="${project_id}" > /dev/null 2>&1; then - rule_yaml=$(cat << EOF + print_status " Ensuring allow-all rule in ${policy_name}..." 
+ local rule_log_file="create_swp_policy_rule_${policy_name}.log" + rule_yaml=$(cat << EOF name: ${policy_full_name}/rules/${rule_name} description: "Allow all traffic" priority: 1000 @@ -50,22 +34,18 @@ enabled: true basicProfile: ALLOW sessionMatcher: "host() != 'none'" EOF - ) - if echo "${rule_yaml}" | run_gcloud "${rule_log_file}" gcloud network-security gateway-security-policies rules import "${rule_name}" \ - --gateway-security-policy="${policy_name}" \ - --location="${region}" --project="${project_id}" \ - --source=- - then - report_result "Created" - else - report_result "Fail" - return 1 # Fail the whole function if rule creation fails - fi - else - report_result "Exists" - fi - create_sentinel "${phase_name}" "done" +) + if echo "${rule_yaml}" | run_gcloud "${rule_log_file}" gcloud network-security gateway-security-policies rules import "${rule_name}" \ + --gateway-security-policy="${policy_name}" \ + --location="${region}" \ + --project="${project_id}" \ + --source=-; then + report_result "Created" + else + report_result "Fail" + return 1 fi + export SWP_POLICY_URI_PARTIAL="${policy_full_name}" } export -f create_gateway_security_policy @@ -74,44 +54,25 @@ function delete_gateway_security_policy() { local region="${2:-${REGION}}" local project_id="${3:-${PROJECT_ID}}" local rule_name="allow-all-rule" - local phase_name="create_gateway_security_policy" - print_status "Deleting Gateway Security Policy ${policy_name}..." + local rule_log="delete_swp_policy_rule_${policy_name}.log" + local policy_log="delete_swp_policy_${policy_name}.log" + + print_status " Deleting rule ${rule_name}..." 
+ run_gcloud "${rule_log}" gcloud network-security gateway-security-policies rules delete "${rule_name}" \ + --gateway-security-policy="${policy_name}" \ + --location="${region}" \ + --project="${project_id}" \ + --quiet || true - local policy_check=$(gcloud network-security gateway-security-policies list --location="${region}" --project="${project_id}" --filter="name ~ /${policy_name}$" --format="value(name)" 2>/dev/null) - - if [[ -n "${policy_check}" ]]; then - local rule_log="delete_swp_policy_rule_${policy_name}.log" - local policy_log="delete_swp_policy_${policy_name}.log" - # Delete the rule first - print_status " Deleting rule ${rule_name}..." - local rule_check=$(gcloud network-security gateway-security-policies rules list --gateway-security-policy="${policy_name}" --location="${region}" --project="${project_id}" --filter="name ~ /${rule_name}$" --format="value(name)" 2>/dev/null) - if [[ -n "${rule_check}" ]]; then - if run_gcloud "${rule_log}" gcloud network-security gateway-security-policies rules delete "${rule_name}" \ - --gateway-security-policy="${policy_name}" \ - --location="${region}" --project="${project_id}" \ - --quiet; then - report_result "Deleted" - else - report_result "Fail" - fi - else - report_result "Not Found" - fi - - # Delete the policy - print_status " Deleting policy ${policy_name}..." - if run_gcloud "${policy_log}" gcloud network-security gateway-security-policies delete "${policy_name}" \ - --location="${region}" --project="${project_id}" \ - --quiet; then - report_result "Deleted" - remove_sentinel "${phase_name}" "done" - else - report_result "Fail" - fi + print_status " Deleting policy ${policy_name}..." 
+ if run_gcloud "${policy_log}" gcloud network-security gateway-security-policies delete "${policy_name}" \ + --location="${region}" \ + --project="${project_id}" \ + --quiet; then + report_result "Deleted" else - report_result "Not Found" - remove_sentinel "${phase_name}" "done" + report_result "Fail" fi } export -f delete_gateway_security_policy diff --git a/gcloud/lib/swp/subnet.sh b/gcloud/lib/swp/subnet.sh index 3c2605e0..3392814f 100644 --- a/gcloud/lib/swp/subnet.sh +++ b/gcloud/lib/swp/subnet.sh @@ -5,33 +5,19 @@ function create_swp_subnet() { local region="${2:-${REGION}}" local network_name="${3:-${NETWORK}}" local range="${4:-${SWP_RANGE}}" - local phase_name="create_swp_subnet" - - if check_sentinel "${phase_name}" "done"; then - print_status "Creating SWP Subnet ${subnet_name}..." - report_result "Exists" - return 0 - fi - print_status "Creating SWP Subnet ${subnet_name}..." - if gcloud compute networks subnets describe "${subnet_name}" --region="${region}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_swp_subnet_${subnet_name}.log" + if run_gcloud "${log_file}" gcloud compute networks subnets create "${subnet_name}" \ + --project="${PROJECT_ID}" \ + --purpose=REGIONAL_MANAGED_PROXY \ + --role=ACTIVE \ + --region="${region}" \ + --network="${network_name}" \ + --range="${range}"; then + report_result "Created" else - local log_file="create_swp_subnet_${subnet_name}.log" - if run_gcloud "${log_file}" gcloud compute networks subnets create "${subnet_name}" \ - --project="${PROJECT_ID}" \ - --purpose=REGIONAL_MANAGED_PROXY \ - --role=ACTIVE \ - --region="${region}" \ - --network="${network_name}" \ - --range="${range}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_swp_subnet @@ -39,20 +25,12 @@ export -f create_swp_subnet function 
delete_swp_subnet() { local subnet_name="${1:-${SWP_SUBNET}}" local region="${2:-${REGION}}" - local phase_name="create_swp_subnet" - print_status "Deleting SWP Subnet ${subnet_name}..." - if gcloud compute networks subnets describe "${subnet_name}" --region="${region}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="delete_swp_subnet_${subnet_name}.log" - if run_gcloud "${log_file}" gcloud compute networks subnets delete "${subnet_name}" --region="${region}" --quiet; then - report_result "Deleted" - remove_sentinel "${phase_name}" "done" - else - report_result "Fail" - fi + local log_file="delete_swp_subnet_${subnet_name}.log" + if run_gcloud "${log_file}" gcloud compute networks subnets delete "${subnet_name}" --region="${region}" --quiet; then + report_result "Deleted" else - report_result "Not Found" - remove_sentinel "${phase_name}" "done" # Remove sentinel if not found + report_result "Fail" fi } export -f delete_swp_subnet @@ -62,33 +40,19 @@ function create_private_subnet () { local region="${2:-${REGION}}" local network_name="${3:-${NETWORK}}" local range="${4:-${PRIVATE_RANGE}}" - local phase_name="create_private_subnet" - - if check_sentinel "${phase_name}" "done"; then - print_status "Creating Private Subnet ${subnet_name}..." - report_result "Exists" - return 0 - fi - print_status "Creating Private Subnet ${subnet_name}..." 
- if gcloud compute networks subnets describe "${subnet_name}" --region "${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - report_result "Exists" - create_sentinel "${phase_name}" "done" + local log_file="create_private_subnet_${subnet_name}.log" + if run_gcloud "${log_file}" gcloud compute networks subnets create "${subnet_name}" \ + --project="${PROJECT_ID}" \ + --network="${network_name}" \ + --range="${range}" \ + --enable-private-ip-google-access \ + --region="${region}" \ + --description="subnet for use with Dataproc cluster ${CLUSTER_NAME}"; then + report_result "Created" else - local log_file="create_private_subnet_${subnet_name}.log" - if run_gcloud "${log_file}" gcloud compute networks subnets create "${subnet_name}" \ - --project="${PROJECT_ID}" \ - --network="${network_name}" \ - --range="${range}" \ - --enable-private-ip-google-access \ - --region="${region}" \ - --description="subnet for use with Dataproc cluster ${CLUSTER_NAME}"; then - report_result "Created" - create_sentinel "${phase_name}" "done" - else - report_result "Fail" - return 1 - fi + report_result "Fail" + return 1 fi } export -f create_private_subnet @@ -96,20 +60,12 @@ export -f create_private_subnet function delete_private_subnet () { local subnet_name="${1:-${PRIVATE_SUBNET}}" local region="${2:-${REGION}}" - local phase_name="create_private_subnet" - print_status "Deleting Private Subnet ${subnet_name}..." 
- if gcloud compute networks subnets describe "${subnet_name}" --region "${REGION}" --project="${PROJECT_ID}" > /dev/null 2>&1; then - local log_file="delete_private_subnet_${subnet_name}.log" - if run_gcloud "${log_file}" gcloud compute networks subnets delete --quiet --region "${REGION}" "${subnet_name}"; then - report_result "Deleted" - remove_sentinel "${phase_name}" "done" - else - report_result "Fail" - fi + local log_file="delete_private_subnet_${subnet_name}.log" + if run_gcloud "${log_file}" gcloud compute networks subnets delete --quiet --region "${REGION}" "${subnet_name}"; then + report_result "Deleted" else - report_result "Not Found" - remove_sentinel "${phase_name}" "done" # Remove sentinel if not found + report_result "Fail" fi } export -f delete_private_subnet \ No newline at end of file From c667991712d15ed923c8841fa3a30177e7487ea8 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Fri, 13 Mar 2026 02:57:56 +0000 Subject: [PATCH 03/25] docs: Update docs, add config, tests, and project files - Updated README.md to reflect the new script organization. - Enhanced .gitignore. - Added env.json and updated env.json.sample. - Added cloudbuild.yaml for CI/CD. - Updated init/gce-proxy-setup.sh. - Added test scripts in t/. - Added project findings and work narratives. 
--- gcloud/.gitignore | 35 +- gcloud/README.md | 103 ++--- gcloud/bin/audit-dpgce-create | 135 ------- gcloud/bin/audit-dpgce-destroy | 71 ---- gcloud/bin/audit-private-create | 78 ---- gcloud/bin/audit-private-destroy | 139 ------- gcloud/bin/create-dpgce-private | 120 ------ gcloud/bin/destroy-dpgce-private | 90 ----- gcloud/bin/recreate-dpgce | 42 --- gcloud/cloudbuild.yaml | 81 ++++ gcloud/env.json.sample | 11 +- gcloud/init/gce-proxy-setup.sh | 355 ++++++++++++------ ...0T222921-gcloud-metadata-and-gpu-config.md | 13 + .../20260312T035246-audit-script-gaps.md | 19 + ...13T021216-llm-git-staging-unreliability.md | 27 ++ gcloud/t/scripts/verify_tensorflow.py | 43 +++ gcloud/t/scripts/verify_torch.py | 8 + gcloud/t/spark-gpu-test.sh | 125 ++++++ ...310T222921-review-gpu-custom-image-prep.md | 13 + .../20260312T035246-script-org-and-audit.md | 33 ++ ...-git-reconstruction-failure-and-handoff.md | 39 ++ .../comprehensive-work-journal.md | 9 + 22 files changed, 744 insertions(+), 845 deletions(-) delete mode 100644 gcloud/bin/audit-dpgce-create delete mode 100644 gcloud/bin/audit-dpgce-destroy delete mode 100644 gcloud/bin/audit-private-create delete mode 100644 gcloud/bin/audit-private-destroy delete mode 100755 gcloud/bin/create-dpgce-private delete mode 100755 gcloud/bin/destroy-dpgce-private delete mode 100755 gcloud/bin/recreate-dpgce create mode 100644 gcloud/cloudbuild.yaml create mode 100644 gcloud/project/findings/2026-W11/20260310T222921-gcloud-metadata-and-gpu-config.md create mode 100644 gcloud/project/findings/2026-W11/20260312T035246-audit-script-gaps.md create mode 100644 gcloud/project/findings/2026-W11/20260313T021216-llm-git-staging-unreliability.md create mode 100644 gcloud/t/scripts/verify_tensorflow.py create mode 100644 gcloud/t/scripts/verify_torch.py create mode 100644 gcloud/t/spark-gpu-test.sh create mode 100644 gcloud/work-narrative/2026-W11/20260310T222921-review-gpu-custom-image-prep.md create mode 100644 
gcloud/work-narrative/2026-W11/20260312T035246-script-org-and-audit.md create mode 100644 gcloud/work-narrative/2026-W11/20260313T021216-git-reconstruction-failure-and-handoff.md create mode 100644 gcloud/work-narrative/comprehensive-work-journal.md diff --git a/gcloud/.gitignore b/gcloud/.gitignore index e6a83821..892cebd1 100644 --- a/gcloud/.gitignore +++ b/gcloud/.gitignore @@ -1,5 +1,32 @@ -init/*/ -tls/ -#tmp/ +# Local environment overrides +my-env.json +env.cpan.json + +# Temporary files and logs +/tmp/ +action-update.log +*.log + +# TLS / Cert directories +tls-*/ +tls-*-*/ + +# Emacs backup files +*# *~ -env.json \ No newline at end of file + +# Other +dataproc-repro-combined.txt +hardcopy.2 +ini/ +init/swp_ca.crt +opt/ +spark-bigquery-demo.py +t/pyspark-bigquery-command.sh +bin/#connectivity-test# +github/ +llm-guidance.md +plan-for-continued-work-2026-01-20.md +work-completed-2026-01-20.md +prompts/ +/tls/ diff --git a/gcloud/README.md b/gcloud/README.md index 59ccbbaa..61752b7b 100644 --- a/gcloud/README.md +++ b/gcloud/README.md @@ -20,69 +20,78 @@ limitations under the License. This collection of bash scripts helps create and manage Google Cloud environments to reproduce and test Google Dataproc cluster setups, particularly useful for troubleshooting issues related to startup scripts, initialization actions, and network configurations. +**Core Principles:** +* **State-Driven:** The scripts are driven by a single `state.json` file that acts as the authoritative source of truth for the environment. +* **Idempotent:** The `create-dpgce` script is idempotent. It can be run on a new, partially-built, or complete environment, and it will always safely and efficiently bring the environment to the target configuration, creating only the missing resources. +* **Modular:** Core logic is modularized into files within the `lib/` directory, categorized by function. 
+ +## Supported Scenarios + +These scripts are designed to deploy and manage Dataproc clusters in various configurations: + +* **Standard Dataproc on GCE:** A cluster with default network settings and internet access via Cloud NAT. +* **Private Dataproc on GCE:** A cluster in a private network with no direct internet access. Egress is controlled through a Secure Web Proxy (SWP). +* **GPU-Enabled Clusters:** Configuration and testing scripts for clusters utilizing NVIDIA GPUs, including driver installation and YARN resource management. +* **Secure Boot Clusters:** Deployment of clusters using custom images built with Secure Boot enabled. +* **Dataproc on GKE:** Basic setup for Dataproc on Google Kubernetes Engine. + ## Setup -1. **Clone the repository:** +1. **Prerequisites:** Ensure you have the following tools installed: + * `gcloud` CLI + * `gsutil` (usually part of `gcloud`) + * `jq` + * `perl` + +2. **Clone the repository:** ```bash git clone https://github.com/GoogleCloudDataproc/cloud-dataproc cd cloud-dataproc/gcloud ``` -2. **Configure Environment:** +3. **Configure Environment:** * Copy the sample configuration: `cp env.json.sample env.json` - * Edit `env.json` with your specific Google Cloud project details, region, network ranges, etc. Key fields include: - * `PROJECT_ID` - * `REGION` - * `ZONE` (often derived from REGION, e.g., `us-west4-b`) - * `BUCKET` (for staging) - * `TEMP_BUCKET` - * Other fields as needed for your test case. - -3. **Review Script Libraries:** The core logic is now modularized into files within the `lib/` directory, categorized by function (e.g., `lib/gcp`, `lib/network`, `lib/dataproc`). - -## Scripts - -The main scripts are located in the `bin/` directory: - -* **`bin/create-dpgce`**: Creates a standard Dataproc on GCE cluster environment, including VPC, subnets, NAT, router, and firewall rules. -* **`bin/create-dpgce-private`**: Creates a private Dataproc on GCE cluster environment. 
This setup uses a Secure Web Proxy (SWP) for controlled egress and does *not* include a Cloud NAT or default internet route. -* **`bin/create-dpgke`**: Sets up a Dataproc on GKE environment. -* **`bin/destroy-dpgce`**: Tears down the environment created by `bin/create-dpgce`. -* **`bin/destroy-dpgce-private`**: Tears down the environment created by `bin/create-dpgce-private`. -* **`bin/destroy-dpgke`**: Tears down the DPGKE environment. -* **`bin/recreate-dpgce`**: Quickly deletes and recreates the Dataproc cluster within the existing `dpgce` environment. -* **`bin/recreate-dpgke`**: Quickly deletes and recreates the DPGKE cluster. - -### Common Flags - -* `--no-create-cluster`: Used with `create-*` scripts. Sets up all networking and dependencies but skips the final `gcloud dataproc clusters create` command. Useful for preparing an environment. -* `--force`: Used with `destroy-*` scripts. By default, GCS buckets and versioned SWP Certificate Authority components are not deleted. Use `--force` to remove these as well. -* `--quiet-gcloud`: Used with `create-*` scripts. Suppresses the pretty-printing of the `gcloud dataproc clusters create` command. -* `DEBUG=1`: Set this environment variable before running any script to enable verbose debug output (e.g., `DEBUG=1 bash bin/create-dpgce`). -* `TIMESTAMP=`: Set this to a specific Unix timestamp to attempt to resume a previous `create` operation or to target specific versioned resources for deletion. If not set, a new timestamp is generated for each run. + * Edit `env.json` with your specific Google Cloud project details, region, network ranges, etc. -## Customizing Cluster Creation +## Main Scripts (`bin/`) -The parameters for the `gcloud dataproc clusters create` command are primarily defined within `lib/dataproc/cluster.sh` in the `create_dpgce_cluster` function. You can adjust machine types, accelerators, metadata, properties, and initialization actions in this function. 
+The new workflow centers around three main scripts: -Numerous examples of alternative configurations and common options can be found in `docs/dataproc_cluster_examples.md`. +* **`bin/audit-dpgce`**: The source of truth. This script queries the live cloud environment to discover which resources are actually deployed and writes their status to `state.json`. It is called automatically by the other scripts. +* **`bin/create-dpgce`**: The idempotent creation script. It audits the environment and then creates only the resources that are missing to bring the environment to the desired state. It supports flags like `--custom` and `--private` to control deployment variations. +* **`bin/destroy-dpgce`**: The teardown script. It audits the environment and then de-provisions all discovered resources in the correct dependency order. -## Idempotency and Sentinels +### Example Usage -The `create-*` scripts use sentinel files to track the completion of major steps. These sentinels are stored in `/tmp/dataproc-repro/${RESOURCE_SUFFIX}/sentinels/`. This allows you to re-run a `create-*` script, and it will skip steps that were already completed successfully in a previous run with the same `TIMESTAMP`. - -The `destroy-*` scripts remove the corresponding sentinel files. +* **Create a Standard Dataproc Environment & Cluster:** + ```bash + bash bin/create-dpgce + ``` -## Logging +* **Create a Private & Custom Image Dataproc Environment:** + ```bash + bash bin/create-dpgce --private --custom + ``` -All `gcloud` commands executed via the `run_gcloud` helper function have their stdout and stderr redirected to log files within the `/tmp/dataproc-repro/${RESOURCE_SUFFIX}/` directory. Check these logs for details on any failures. 
+* **Tear Down All Environment Infrastructure:** + ```bash + bash bin/destroy-dpgce + ``` -## Troubleshooting +* **Tear Down Everything, Including GCS Buckets:** + ```bash + bash bin/destroy-dpgce --force + ``` -* **"command not found"**: Ensure the `bin/` script you are running sources the necessary files from the `lib/` subdirectories. -* **Resource Deletion Failures:** Check the logs in `/tmp/dataproc-repro/${RESOURCE_SUFFIX}/` for the specific `gcloud` error. Often, dependencies prevent deletion. Use `--force` with destroy scripts to be more aggressive. -* **Service Account Permissions:** Cluster creation can fail if the service account doesn't have the required roles. The `create_service_account` function attempts to bind these, but errors can occur. Check the `bind_*.log` files. +* **Recreate Just the Cluster (in an existing environment):** + ```bash + bash bin/recreate-cluster.sh + ``` -## Private Cluster Networking +### Common Flags -The `create-dpgce-private` script sets up a VPC with no default internet route. Egress is intended to be handled by the Secure Web Proxy. Nodes in this cluster should not have direct internet access. \ No newline at end of file +* `--custom`: Used with `create-dpgce`. Deploys a cluster using a custom image. +* `--private`: Used with `create-dpgce`. Deploys a private cluster with a Secure Web Proxy (SWP). +* `--no-create-cluster`: Used with `create-dpgce`. Sets up all networking and dependencies but skips the final `gcloud dataproc clusters create` command. +* `--force`: Used with `destroy-dpgce`. By default, GCS buckets are preserved. Use `--force` to delete them as well. +* `DEBUG=1`: Set this environment variable before running any script to enable verbose debug output. 
diff --git a/gcloud/bin/audit-dpgce-create b/gcloud/bin/audit-dpgce-create deleted file mode 100644 index 72a12e6a..00000000 --- a/gcloud/bin/audit-dpgce-create +++ /dev/null @@ -1,135 +0,0 @@ -#!/bin/bash - -# Script to audit resource creation after create-dpgce.sh - -source lib/env.sh -source lib/gcp/misc.sh # Source the file containing configure_gcloud - -if (( DEBUG != 0 )); then - set -x -fi - -configure_gcloud # Set gcloud context - -if [[ -z "${CLUSTER_NAME}" || "${CLUSTER_NAME}" == "null" ]]; then - echo "ERROR: CLUSTER_NAME is not set. Please source lib/env.sh after env.json is configured." - exit 1 -fi - -LOG_DIR="tmp/create_audit_${CLUSTER_NAME}_$(date +%Y%m%d-%H%M%S)" -mkdir -p "${LOG_DIR}" -echo "Detailed logs will be saved in ${LOG_DIR}" - -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -# Function to print status updates -print_status() { - local message="$1" - local first_word=$(echo "${message}" | awk '{print $1}') - local rest_of_message=$(echo "${message}" | cut -d' ' -f2-) - echo -en "${YELLOW}${first_word}${NC} ${rest_of_message}" -} - -# Function to print result -print_result() { - local result="$1" - if [[ "${result}" == "Pass" ]]; then - echo -e " [${GREEN}Pass${NC}]" - elif [[ "${result}" == "Exists" ]]; then - echo -e " [${GREEN}Exists${NC}]" - elif [[ "${result}" == "Fail" ]]; then - echo -e " [${RED}Fail${NC}]" - else - echo -e " [${YELLOW}${result}${NC}]" - fi -} - -# Function to check if a resource exists and log details -function check_exists() { - local test_name="$1" - local command_to_run="$2" - local safe_test_name=$(echo "$test_name" | tr ' /:' '___') - local log_file="${LOG_DIR}/${safe_test_name}.log" - - print_status "Checking: ${test_name}... " - - # Run the command, redirect output to log file - if eval "${command_to_run}" > "${log_file}" 2>&1; then - if [[ $(wc -l < "${log_file}") -eq 0 ]]; then - print_result "Fail" - echo " -> ${test_name} NOT FOUND. 
Check ${log_file}" - return 1 - else - print_result "Exists" - return 0 - fi - else - print_result "Fail" - echo " -> Command failed for ${test_name}. Check ${log_file}" - return 1 - fi -} - -function check_exists_grep() { - local test_name="$1" - local command_to_run="$2" - local grep_pattern="$3" - local log_file="${LOG_DIR}/$(echo "$test_name" | tr ' /:' '___').log" - - print_status "Checking: ${test_name}... " - - # Run the command, redirect output to log file - eval "${command_to_run}" > "${log_file}" 2>&1 - - # Grep the log file quietly - if grep -q "${grep_pattern}" "${log_file}"; then - print_result "Exists" - return 0 - else - print_result "Fail" - echo " -> ${test_name} NOT FOUND or pattern mismatch. Check ${log_file}" - return 1 - fi -} - -# --- Start Audit --- -echo "Starting resource creation audit for cluster: ${CLUSTER_NAME}" - -# 1. VPC Network -check_exists "VPC Network ${NETWORK}" "gcloud compute networks describe '${NETWORK}' --project='${PROJECT_ID}'" - -# 2. Subnet -check_exists "Main Subnet ${SUBNET}" "gcloud compute networks subnets describe '${SUBNET}' --region='${REGION}' --project='${PROJECT_ID}'" - -# 3. Service Account -check_exists "Service Account ${GSA}" "gcloud iam service-accounts describe '${GSA}' --project='${PROJECT_ID}'" - -# 4. GCS Buckets -check_exists "GCS Staging Bucket gs://${BUCKET}" "gsutil ls -b 'gs://${BUCKET}'" -check_exists "GCS Temp Bucket gs://${TEMP_BUCKET}" "gsutil ls -b 'gs://${TEMP_BUCKET}'" - -# 5. Cloud Router -check_exists "Cloud Router ${ROUTER_NAME}" "gcloud compute routers describe '${ROUTER_NAME}' --region='${REGION}' --project='${PROJECT_ID}'" - -# 6. NAT Policy -check_exists "NAT Policy nat-config" "gcloud compute routers nats describe nat-config --router='${ROUTER_NAME}' --region='${REGION}' --project='${PROJECT_ID}'" - -# 7. 
Firewall Rules -check_exists_grep "Firewall Rule ${FIREWALL}-out" "gcloud compute firewall-rules list --project='${PROJECT_ID}' --filter=\"network ~ ${NETWORK}$ AND name='${FIREWALL}-out'\" --format='value(name)'" "${FIREWALL}-out" - -# 8. Autoscaling Policy -check_exists "Autoscaling Policy ${AUTOSCALING_POLICY_NAME}" "gcloud dataproc autoscaling-policies describe '${AUTOSCALING_POLICY_NAME}' --region='${REGION}' --project='${PROJECT_ID}'" - -# 9. Dataproc Cluster (Optional) -print_status "Checking: Dataproc Cluster ${CLUSTER_NAME}... " -cluster_log_file="${LOG_DIR}/Dataproc_Cluster_${CLUSTER_NAME}.log" -if gcloud dataproc clusters describe "${CLUSTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" > "${cluster_log_file}" 2>&1; then - print_result "Exists" -else - print_result "Not Found" -fi - -echo -e "\nAudit complete." diff --git a/gcloud/bin/audit-dpgce-destroy b/gcloud/bin/audit-dpgce-destroy deleted file mode 100644 index bf08ff25..00000000 --- a/gcloud/bin/audit-dpgce-destroy +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -# Script to audit resource cleanup after destroy-dpgce.sh - -source lib/env.sh -source lib/gcp/misc.sh # Source the file containing configure_gcloud - -if (( DEBUG != 0 )); then - set -x -fi - -configure_gcloud # Set gcloud context - -parse_args "$@" # Sets FORCE_AUDIT based on --force - -if [[ -z "${CLUSTER_NAME}" || "${CLUSTER_NAME}" == "null" ]]; then - echo "ERROR: CLUSTER_NAME is not set. Please source lib/env.sh after env.json is configured." - exit 1 -fi - -LOG_DIR="tmp/destroy_audit_${CLUSTER_NAME}_$(date +%Y%m%d-%H%M%S)" -mkdir -p "${LOG_DIR}" -echo "Detailed logs will be saved in ${LOG_DIR}" - -# --- Start Audit --- -echo "Starting resource cleanup audit for cluster: ${CLUSTER_NAME}" -if [[ "${FORCE_AUDIT}" == "true" ]]; then - echo "--force flag detected, expecting ALL resources to be deleted." -fi - -# 1. 
Dataproc Clusters -check_resource_exact "Dataproc Clusters" \ - "gcloud dataproc clusters list --region=\"${REGION}\" --project=\"${PROJECT_ID}\" --filter=\"clusterName = ${CLUSTER_NAME}\" --format=\"value(clusterName)\"" - -# 2. Service Accounts -check_resource_exact "Service Account" \ - "gcloud iam service-accounts list --project=\"${PROJECT_ID}\" --filter=\"email = ${GSA}\" --format=\"value(email)\"" - -# 3. Autoscaling Policies -check_resource "Autoscaling Policies" \ - "gcloud dataproc autoscaling-policies list --region=\"${REGION}\" --project=\"${PROJECT_ID}\" --format=\"value(id)\"" \ - "${AUTOSCALING_POLICY_NAME}" - -# 4. Cloud Routers & NAT -check_resource "Cloud Router ${ROUTER_NAME}" \ - "gcloud compute routers list --regions=\"${REGION}\" --project=\"${PROJECT_ID}\" --filter=\"network ~ ${NETWORK}$\" --format=\"value(name)\"" \ - "${ROUTER_NAME}" -check_resource "NAT on ${ROUTER_NAME}" \ - "gcloud compute routers nats list --router='${ROUTER_NAME}' --region='${REGION}' --project='${PROJECT_ID}' --format='value(name)'" \ - "nat-config" - -# 5. Firewall Rules -check_resource "Cluster Firewall Rules" \ - "gcloud compute firewall-rules list --project=\"${PROJECT_ID}\" --filter='network ~ \"${NETWORK}\" AND name ~ \"${CLUSTER_NAME}\"' --format=\"value(name)\"" \ - "${CLUSTER_NAME}" - -# 6. Subnets -check_resource "Main Subnet" \ - "gcloud compute networks subnets list --network=\"${NETWORK}\" --project=\"${PROJECT_ID}\" --format=\"value(name)\"" \ - "${SUBNET}" - -# 7. VPC Network -check_resource_exact "VPC Network ${NETWORK}" "gcloud compute networks describe \"${NETWORK}\" --project=\"${PROJECT_ID}\"" - -# 8. GCS Buckets (Optional without --force) -check_resource_exact "GCS Staging Bucket gs://${BUCKET}" "gsutil ls -b 'gs://${BUCKET}'" true -check_resource_exact "GCS Temp Bucket gs://${TEMP_BUCKET}" "gsutil ls -b 'gs://${TEMP_BUCKET}'" true - -echo -e "\nAudit complete." 
-echo -e "[${YELLOW}Pass*${NC}] indicates the resource was not found (which is expected after destroy)." -echo -e "[${BLUE}Kept${NC}] indicates the resource was found, which is expected as --force was not used." \ No newline at end of file diff --git a/gcloud/bin/audit-private-create b/gcloud/bin/audit-private-create deleted file mode 100644 index 77284a05..00000000 --- a/gcloud/bin/audit-private-create +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash - -# Script to audit resource creation after create-dpgce-private.sh -# Expected: All resources should be found. - -source lib/env.sh -source lib/gcp/misc.sh - -if (( DEBUG != 0 )); then - set -x -fi - -configure_gcloud # Set gcloud context - -parse_args "$@" # Not really used here, but good practice - -if [[ -z "${CLUSTER_NAME}" || "${CLUSTER_NAME}" == "null" ]]; then - echo "ERROR: CLUSTER_NAME is not set. Please source lib/env.sh after env.json is configured." - exit 1 -fi - -LOG_DIR="tmp/create_audit_${CLUSTER_NAME}_$(date +%Y%m%d-%H%M%S)" -mkdir -p "${LOG_DIR}" -echo "Detailed logs will be saved in ${LOG_DIR}" - -# --- Start Audit --- -echo "Starting resource creation audit for cluster: ${CLUSTER_NAME}" - -# 1. VPC Network -check_exists "VPC Network ${NETWORK}" "gcloud compute networks describe '${NETWORK}' --project='${PROJECT_ID}'" - -# 2. Subnets -check_exists "Main Subnet ${SUBNET}" "gcloud compute networks subnets describe '${SUBNET}' --region='${REGION}' --project='${PROJECT_ID}'" -check_exists "Private Subnet ${PRIVATE_SUBNET}" "gcloud compute networks subnets describe '${PRIVATE_SUBNET}' --region='${REGION}' --project='${PROJECT_ID}'" -check_exists "SWP Subnet ${SWP_SUBNET}" "gcloud compute networks subnets describe '${SWP_SUBNET}' --region='${REGION}' --project='${PROJECT_ID}'" - -# 3. Service Account -check_exists "Service Account ${GSA}" "gcloud iam service-accounts describe '${GSA}' --project='${PROJECT_ID}'" - -# 4. 
GCS Buckets -check_exists "GCS Staging Bucket gs://${BUCKET}" "gsutil ls -b 'gs://${BUCKET}'" -check_exists "GCS Temp Bucket gs://${TEMP_BUCKET}" "gsutil ls -b 'gs://${TEMP_BUCKET}'" - -# 5. SWP Certificate Components -SUFFIX=${RESOURCE_SUFFIX} -CA_POOL_NAME="swp-ca-pool-${CLUSTER_NAME}-${SUFFIX}" -CIC_NAME="swp-cic-${CLUSTER_NAME}-${SUFFIX}" -CA_NAME="swp-root-ca-${CLUSTER_NAME}-${SUFFIX}" -CERT_NAME="swp-cert" - -check_exists "CA Pool ${CA_POOL_NAME}" "gcloud privateca pools describe '${CA_POOL_NAME}' --location='${REGION}' --project='${PROJECT_ID}'" -check_exists "Root CA ${CA_NAME}" "gcloud privateca roots describe '${CA_NAME}' --pool='${CA_POOL_NAME}' --location='${REGION}' --project='${PROJECT_ID}'" -check_exists "CIC ${CIC_NAME}" "gcloud certificate-manager issuance-configs describe '${CIC_NAME}' --location='${REGION}' --project='${PROJECT_ID}'" -check_exists "Static Certificate ${CERT_NAME}" "gcloud certificate-manager certificates describe '${CERT_NAME}' --location='${REGION}' --project='${PROJECT_ID}'" - -# 6. Gateway Security Policy -check_exists "Gateway Security Policy ${SWP_POLICY_NAME}" "gcloud network-security gateway-security-policies list --location='${REGION}' --project='${PROJECT_ID}' --filter='name ~ /${SWP_POLICY_NAME}$' --format='value(name)'" -check_exists "GSP Rule allow-all-rule" "gcloud network-security gateway-security-policies rules list --gateway-security-policy='${SWP_POLICY_NAME}' --location='${REGION}' --project='${PROJECT_ID}' --filter='name ~ /allow-all-rule$' --format='value(name)'" - -# 7. SWP Gateway -check_exists "SWP Gateway ${SWP_INSTANCE_NAME}" "gcloud network-services gateways describe '${SWP_INSTANCE_NAME}' --location='${REGION}' --project='${PROJECT_ID}'" - -# 8. Firewall Rules -check_exists "Firewall Rule allow-swp-ingress-${CLUSTER_NAME}" "gcloud compute firewall-rules describe allow-swp-ingress-${CLUSTER_NAME} --project='${PROJECT_ID}'" - -# 9. 
Autoscaling Policy -check_exists "Autoscaling Policy ${AUTOSCALING_POLICY_NAME}" "gcloud dataproc autoscaling-policies describe '${AUTOSCALING_POLICY_NAME}' --region='${REGION}' --project='${PROJECT_ID}'" - -# 10. Dataproc Cluster (Optional) -print_status "Checking: Dataproc Cluster ${CLUSTER_NAME}... " -cluster_log_file="${LOG_DIR}/Dataproc_Cluster_${CLUSTER_NAME}.log" -if gcloud dataproc clusters describe "${CLUSTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" > "${cluster_log_file}" 2>&1; then - print_result "Exists" -else - print_result "Not Found" -fi - -echo -e "\nAudit complete." \ No newline at end of file diff --git a/gcloud/bin/audit-private-destroy b/gcloud/bin/audit-private-destroy deleted file mode 100644 index ef2a09d7..00000000 --- a/gcloud/bin/audit-private-destroy +++ /dev/null @@ -1,139 +0,0 @@ -#!/bin/bash - -# Script to audit resource cleanup after destroy-dpgce-private.sh -# Expected: No resources matching the patterns should be found. - -source lib/env.sh -source lib/gcp/misc.sh # Source the file containing configure_gcloud - -if (( DEBUG != 0 )); then - set -x -fi - -configure_gcloud # Set gcloud context - -parse_args "$@" # Sets FORCE_AUDIT based on --force - -if [[ -z "${CLUSTER_NAME}" || "${CLUSTER_NAME}" == "null" ]]; then - echo "ERROR: CLUSTER_NAME is not set. Please source lib/env.sh after env.json is configured." - exit 1 -fi - -LOG_DIR="tmp/destroy_audit_${CLUSTER_NAME}_$(date +%Y%m%d-%H%M%S)" -mkdir -p "${LOG_DIR}" -echo "Detailed logs will be saved in ${LOG_DIR}" - -NOT_FOUND_LOGS=() - -# --- Start Audit --- -echo "Starting resource cleanup audit for cluster: ${CLUSTER_NAME}" -if [[ "${FORCE_AUDIT}" == "true" ]]; then - echo "--force flag detected, expecting ALL resources to be deleted." -fi - -# 1. 
Dataproc Clusters -check_resource_exact "Dataproc Clusters" \ - "gcloud dataproc clusters list --region=\"${REGION}\" --project=\"${PROJECT_ID}\" --filter=\"clusterName = ${CLUSTER_NAME}\" --format=\"value(clusterName)\"" - -# 2. Service Accounts -check_resource_exact "Service Account" \ - "gcloud iam service-accounts list --project=\"${PROJECT_ID}\" --filter=\"email = ${GSA}\" --format=\"value(email)\"" - -# 3. Autoscaling Policies -check_resource "Autoscaling Policies" \ - "gcloud dataproc autoscaling-policies list --region=\"${REGION}\" --project=\"${PROJECT_ID}\" --format=\"value(id)\"" \ - "${AUTOSCALING_POLICY_NAME}" - -# 4. Cloud Routers -check_resource "Cloud Routers" \ - "gcloud compute routers list --regions=\"${REGION}\" --project=\"${PROJECT_ID}\" --filter=\"network ~ ${NETWORK}$\" --format=\"value(name)\"" \ - "${ROUTER_NAME}" -check_resource "SWG Autogen Routers" \ - "gcloud compute routers list --regions=\"${REGION}\" --project=\"${PROJECT_ID}\" --filter=\"network ~ ${NETWORK}$\" --format=\"value(name)\"" \ - "swg-autogen-router-" - -# 5. Firewall Rules -check_resource "Cluster Firewall Rules" \ - "gcloud compute firewall-rules list --project=\"${PROJECT_ID}\" --filter=\"network ~ ${NETWORK}$\" --format=\"value(name)\"" \ - "${CLUSTER_NAME}" -check_resource "SWP Ingress Firewall Rule" \ - "gcloud compute firewall-rules list --project=\"${PROJECT_ID}\" --filter=\"network ~ ${NETWORK}$\" --format=\"value(name)\"" \ - "allow-swp-ingress-${CLUSTER_NAME}" -check_resource "S8S Internal Firewall Rule" \ - "gcloud compute firewall-rules list --project=\"${PROJECT_ID}\" --filter=\"network ~ ${NETWORK}$\" --format=\"value(name)\"" \ - "allow-internal-s8s" - -# 6. SWP Gateway -check_resource "SWP Gateway" \ - "gcloud network-services gateways list --location=\"${REGION}\" --project=\"${PROJECT_ID}\" --format=\"value(name)\"" \ - "${SWP_INSTANCE_NAME}" - -# 7. 
Gateway Security Policies -check_resource "Gateway Security Policies" \ - "gcloud network-security gateway-security-policies list --location=\"${REGION}\" --project=\"${PROJECT_ID}\" --filter=\"name ~ /${SWP_POLICY_NAME}$ \" --format=\"value(name)\"" \ - "${SWP_POLICY_NAME}" - -# 8. Certificate Manager Certificate (Static) -check_resource "Static Cert Manager Certificate" \ - "gcloud certificate-manager certificates list --location=\"${REGION}\" --project=\"${PROJECT_ID}\" --format=\"value(name)\"" \ - "${SWP_CERT_NAME}" \ - true # Optional - -# 9. Certificate Issuance Configs (Optional without --force) -check_resource "Cert Issuance Configs" \ - "gcloud certificate-manager issuance-configs list --location=\"${REGION}\" --project=\"${PROJECT_ID}\" --format=\"value(name)\"" \ - "swp-cic-${CLUSTER_NAME}-" \ - true # Optional - -# 10. CA Pools (Optional without --force) -check_resource "CA Pools" \ - "gcloud privateca pools list --location=\"${REGION}\" --project=\"${PROJECT_ID}\" --format=\"value(name)\"" \ - "swp-ca-pool-${CLUSTER_NAME}-" \ - true # Optional - -# 11. Root CAs (Optional without --force) -ca_pool_prefix="swp-ca-pool-${CLUSTER_NAME}-" -pool_log="${LOG_DIR}/CA_Pools_for_Root_CA_check.log" -print_status "Checking: Root CAs in any lingering Pools... " -gcloud privateca pools list --location="${REGION}" --project="${PROJECT_ID}" --format="value(name)" > "${pool_log}" 2>&1 -pool_names=$(grep "${ca_pool_prefix}" "${pool_log}" || true) -if [[ -n "${pool_names}" ]]; then - if [[ "${FORCE_AUDIT}" == "false" ]]; then - report_result "Kept" - echo " -> Found lingering CA Pools (expected without --force)." - else - report_result "Fail" - echo " -> Found lingering CA Pools, CAs might exist. 
Check logs in ${LOG_DIR}" - while read -r pool_full_name; do - short_pool_name=$(basename "${pool_full_name}") - check_resource "Root CAs in ${short_pool_name}" \ - "gcloud privateca roots list --pool=\"${short_pool_name}\" --location=\"${REGION}\" --project=\"${PROJECT_ID}\" --format=\"value(name)\"" \ - "swp-root-ca-${CLUSTER_NAME}-" \ - true # Optional - done <<< "${pool_names}" - fi -else - report_result "Not Found" -fi - -# 12. Subnets -check_resource "SWP Subnet" \ - "gcloud compute networks subnets list --network=\"${NETWORK}\" --project=\"${PROJECT_ID}\" --format=\"value(name)\"" \ - "${SWP_SUBNET}" -check_resource "Private Subnet" \ - "gcloud compute networks subnets list --network=\"${NETWORK}\" --project=\"${PROJECT_ID}\" --format=\"value(name)\"" \ - "${PRIVATE_SUBNET}" -check_resource "Main Subnet" \ - "gcloud compute networks subnets list --network=\"${NETWORK}\" --project=\"${PROJECT_ID}\" --format=\"value(name)\"" \ - "${SUBNET}" - -# 13. VPC Network -check_resource_exact "VPC Network ${NETWORK}" "gcloud compute networks describe \"${NETWORK}\" --project=\"${PROJECT_ID}\"" - -# 14. GCS Buckets (Optional without --force) -check_resource_exact "GCS Staging Bucket gs://${BUCKET}" "gsutil ls -b 'gs://${BUCKET}'" true -check_resource_exact "GCS Temp Bucket gs://${TEMP_BUCKET}" "gsutil ls -b 'gs://${TEMP_BUCKET}'" true - -echo -e "\nAudit complete." -echo -e "[${YELLOW}Pass*${NC}] indicates the resource was not found (which is expected after destroy)." -echo -e "[${BLUE}Kept${NC}] indicates the resource was found, which is expected as --force was not used." 
\ No newline at end of file diff --git a/gcloud/bin/create-dpgce-private b/gcloud/bin/create-dpgce-private deleted file mode 100755 index 373fdf7d..00000000 --- a/gcloud/bin/create-dpgce-private +++ /dev/null @@ -1,120 +0,0 @@ -#!/bin/bash -# -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS-IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Exit on failure -set -e - -source lib/env.sh - -source lib/gcp/project.sh -source lib/gcp/iam.sh -source lib/gcp/gcs.sh -source lib/gcp/misc.sh -source lib/misc.sh # Added this line -source lib/network/network.sh -source lib/network/subnet.sh -source lib/network/router.sh -source lib/network/firewall.sh -source lib/swp/subnet.sh -source lib/swp/certs.sh -source lib/swp/policy.sh -source lib/swp/gateway.sh -source lib/swp/firewall.sh # Added this line -source lib/dataproc/cluster.sh -source lib/dataproc/autoscaling.sh -# source lib/database/mysql.sh -# source lib/database/mssql.sh - -parse_args "$@" - -if (( DEBUG != 0 )); then - set -x -fi - -create_project - -configure_gcloud - -set_cluster_name - -enable_services - -create_service_account - -create_bucket - -create_vpc_network - -#create_ip_allocation - -#create_vpc_peering - -# Create a cloud router - -# create subnet - -create_subnet - -create_private_subnet - -create_swp_subnet - -#create_certificate - -create_managed_certificate - -create_gateway_security_policy - -create_swp_gateway - -create_firewall_rules - -create_allow_swp_ingress_rule - -# Create logging firewall rules - 
-#create_logging_firewall_rules - -#create_bigtable_instance - -#create_mysql_instance -#create_legacy_mssql_instance - -# Create PHS dataproc cluster - -#create_phs_cluster - -# Create normal dataproc cluster - -create_autoscaling_policy - -if [[ "${CREATE_CLUSTER}" = true ]]; then - print_status "Creating Dataproc Cluster ${CLUSTER_NAME}..." - if create_dpgce_cluster; then - print_result "Pass" - else - print_result "Fail" - exit 1 - fi -else - echo -e "${YELLOW}Skipping Cluster Creation due to --no-create-cluster flag.${NC}" -fi - -# Perform some connectivity tests - -#perform_connectivity_tests - diff --git a/gcloud/bin/destroy-dpgce-private b/gcloud/bin/destroy-dpgce-private deleted file mode 100755 index 00073e57..00000000 --- a/gcloud/bin/destroy-dpgce-private +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash -# -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS-IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -set -e - -source lib/env.sh - -source lib/dataproc/cluster.sh -source lib/dataproc/autoscaling.sh -source lib/gcp/iam.sh -source lib/gcp/gcs.sh -source lib/network/network.sh -source lib/network/subnet.sh -source lib/network/router.sh -source lib/network/firewall.sh -source lib/swp/subnet.sh -source lib/swp/certs.sh -source lib/swp/policy.sh -source lib/swp/gateway.sh -source lib/swp/firewall.sh - -parse_args "$@" - -if (( DEBUG != 0 )); then - set -x -fi - -delete_dpgce_cluster - -delete_service_account - -delete_autoscaling_policy - -#delete_phs_cluster() - -#delete_mysql_instance -#delete_legacy_mssql_instance - -delete_nat_configs - -delete_router - -delete_firewall_rules - -delete_allow_swp_ingress_rule - -#delete_logging_firewall_rules - -#delete_ip_allocation - -delete_swp_gateway - -delete_gateway_security_policy - -#delete_certificate - -delete_managed_certificate - -delete_swp_subnet - -delete_private_subnet - -delete_subnet - -delete_vpc_network - -#delete_vpc_peering - -if [[ "${FORCE_DELETE}" = true ]]; then - delete_bucket -else - print_status "Skipping Bucket Deletion. Use --force to delete buckets." - report_result "Skipped" -fi - -set +x diff --git a/gcloud/bin/recreate-dpgce b/gcloud/bin/recreate-dpgce deleted file mode 100755 index afd1c6b0..00000000 --- a/gcloud/bin/recreate-dpgce +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -# -# Copyright 2021 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS-IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -source lib/env.sh -source lib/dataproc/cluster.sh -source lib/gcp/iam.sh - -# Copy latest initialization action scripts -#echo -n "copying actions to gcs bucket..." -#gsutil -m cp \ -# -L action-update.log \ -# -r init/* gs://${BUCKET}/dataproc-initialization-actions -#if [[ $? == 0 ]]; then -# echo "done" -#else -# echo "fail" -# exit 1 -#fi - -# re-create dpgce dataproc cluster -delete_dpgce_cluster -create_service_account # Ensure SA and roles exist -create_dpgce_cluster - -echo "========================================" -echo "General Purpose DPGCE Cluster re-created" -echo "========================================" - diff --git a/gcloud/cloudbuild.yaml b/gcloud/cloudbuild.yaml new file mode 100644 index 00000000..c09a9a4b --- /dev/null +++ b/gcloud/cloudbuild.yaml @@ -0,0 +1,81 @@ +steps: + - name: 'gcr.io/cloud-builders/gcloud' + entrypoint: 'bash' + args: + - '-c' + - | + echo "$TEST_ENV_JSON" > env.json + echo "CI/CD env.json content:" + cat env.json + # --- Test Standard DPGCE --- + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Standard Create' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/create-dpgce && ./bin/audit-dpgce-create'] + env: + - 'PROJECT_ID=$PROJECT_ID' + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Standard Destroy' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/destroy-dpgce --force && ./bin/audit-dpgce-destroy --force'] + env: + - 'PROJECT_ID=$PROJECT_ID' + waitFor: ['Test Standard Create'] + + # --- Test Private DPGCE --- + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Private Create' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/create-dpgce-private && ./bin/audit-private-create'] + env: + - 'PROJECT_ID=$PROJECT_ID' + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Private Destroy' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/destroy-dpgce-private --force && ./bin/audit-private-destroy --force'] + env: + - 'PROJECT_ID=$PROJECT_ID' + waitFor: 
['Test Private Create'] + + # --- Test Custom Standard DPGCE --- + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Custom Std Create' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/create-dpgce-custom && ./bin/audit-dpgce-create-custom'] + env: + - 'PROJECT_ID=$PROJECT_ID' + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Custom Std Destroy' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/destroy-dpgce --force && ./bin/audit-dpgce-destroy --force'] + env: + - 'PROJECT_ID=$PROJECT_ID' + waitFor: ['Test Custom Std Create'] + + # --- Test Custom Private DPGCE --- + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Custom Pvt Create' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/create-dpgce-custom-private && ./bin/audit-dpgce-create-custom-private'] + env: + - 'PROJECT_ID=$PROJECT_ID' + - name: 'gcr.io/cloud-builders/gcloud' + id: 'Test Custom Pvt Destroy' + entrypoint: 'bash' + args: ['-c', 'source lib/env.sh && ./bin/destroy-dpgce-private --force && ./bin/audit-private-destroy --force'] + env: + - 'PROJECT_ID=$PROJECT_ID' + waitFor: ['Test Custom Pvt Create'] + +availableSecrets: + secretManager: + - versionName: projects/$PROJECT_ID/secrets/test-env-json/versions/latest + env: 'TEST_ENV_JSON' + +options: + env: + - 'PROJECT_ID=cjac-2025-01' # Use CI_PROJECT_ID here + - 'CLOUDSDK_CORE_DISABLE_PROMPTS=1' +substitutions: + _CI_PROJECT_ID: "cjac-2025-01" +timeout: 3600s # 60 minutes diff --git a/gcloud/env.json.sample b/gcloud/env.json.sample index 49d1fff9..6e3d0bc0 100644 --- a/gcloud/env.json.sample +++ b/gcloud/env.json.sample @@ -13,6 +13,7 @@ "IDLE_TIMEOUT": "30m", "ASN_NUMBER": "65531", "IMAGE_VERSION": "2.2", + "CUSTOM_IMAGE_URI": "projects/your-project/global/images/your-custom-image", "BUCKET": "myproject-dataproc-repro-bucket", "TEMP_BUCKET": "myproject-dataproc-repro-temp-bucket", "CLUSTER_NAME": "cluster-name-here", @@ -23,5 +24,13 @@ "SWP_RANGE": "10.0.3.0/24", "SWP_HOSTNAME": "swp.internal.local", 
"SWP_IP": "10.0.2.245", - "SWP_PORT": 3128 + "SWP_PORT": 3128, + "CI_PROJECT_ID": "your-ci-test-project-id", + "CI_GCP_CREDENTIALS_PATH": "/path/to/your/ci-service-account.json", + "CI_CSR_REPO_NAME": "your-cloud-source-repo", + "CI_CSR_REGION": "us-central1", + "CI_GITHUB_CONNECTION_NAME": "github-connection", + "CI_TRIGGER_BRANCH": "main", + "CI_REPO_OWNER": "your-github-username", + "CI_BYOSA_EMAIL": "your-byosa-email@your-ci-test-project-id.iam.gserviceaccount.com" } diff --git a/gcloud/init/gce-proxy-setup.sh b/gcloud/init/gce-proxy-setup.sh index 0a49202d..434fa5c1 100644 --- a/gcloud/init/gce-proxy-setup.sh +++ b/gcloud/init/gce-proxy-setup.sh @@ -56,166 +56,285 @@ function is_rocky() { [[ "$(os_id)" == "rocky" ]] ; } # --- End OS Detection Helpers --- function set_proxy(){ - METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" + local meta_http_proxy meta_https_proxy meta_proxy_uri + meta_http_proxy=$(get_metadata_attribute 'http-proxy' '') + meta_https_proxy=$(get_metadata_attribute 'https-proxy' '') + meta_proxy_uri=$(get_metadata_attribute 'proxy-uri' '') - if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi + echo "DEBUG: set_proxy: meta_http_proxy='${meta_http_proxy}'" + echo "DEBUG: set_proxy: meta_https_proxy='${meta_https_proxy}'" + echo "DEBUG: set_proxy: meta_proxy_uri='${meta_proxy_uri}'" -default_no_proxy_list=("localhost" "127.0.0.0/8" "::1" "*.googleapis.com" - "metadata.google.internal" "169.254.169.254") + local http_proxy_val="" + local https_proxy_val="" + # Determine HTTP_PROXY value + if [[ -n "${meta_http_proxy}" ]] && [[ "${meta_http_proxy}" != ":" ]]; then + http_proxy_val="${meta_http_proxy}" + elif [[ -n "${meta_proxy_uri}" ]] && [[ "${meta_proxy_uri}" != ":" ]]; then + http_proxy_val="${meta_proxy_uri}" + fi + + # Determine HTTPS_PROXY value + if [[ -n "${meta_https_proxy}" ]] && [[ "${meta_https_proxy}" != ":" ]]; then + https_proxy_val="${meta_https_proxy}" + elif [[ -n "${meta_proxy_uri}" ]] && [[ 
"${meta_proxy_uri}" != ":" ]]; then + https_proxy_val="${meta_proxy_uri}" + fi + + if [[ -z "${http_proxy_val}" && -z "${https_proxy_val}" ]]; then + echo "DEBUG: set_proxy: No valid proxy metadata found (http-proxy, https-proxy, or proxy-uri). Skipping proxy setup." + return 0 + fi + + local default_no_proxy_list=( + "localhost" + "127.0.0.1" + "::1" + "metadata.google.internal" + "169.254.169.254" + # *** Add Google APIs to NO_PROXY for Private Google Access *** + ".google.com" + ".googleapis.com" + ) + + local user_no_proxy user_no_proxy=$(get_metadata_attribute 'no-proxy' '') - user_no_proxy_list=() + local user_no_proxy_list=() if [[ -n "${user_no_proxy}" ]]; then # Replace spaces with commas, then split by comma IFS=',' read -r -a user_no_proxy_list <<< "${user_no_proxy// /,}" fi - combined_no_proxy_list=("${default_no_proxy_list[@]}" "${user_no_proxy_list[@]}") + local combined_no_proxy_list=( "${default_no_proxy_list[@]}" "${user_no_proxy_list[@]}" ) + local no_proxy no_proxy=$( IFS=',' ; echo "${combined_no_proxy_list[*]}" ) - - export http_proxy="http://${METADATA_HTTP_PROXY}" - export https_proxy="http://${METADATA_HTTP_PROXY}" - export no_proxy - export HTTP_PROXY="http://${METADATA_HTTP_PROXY}" - export HTTPS_PROXY="http://${METADATA_HTTP_PROXY}" export NO_PROXY="${no_proxy}" + export no_proxy="${no_proxy}" + + # Export environment variables + if [[ -n "${http_proxy_val}" ]]; then + export HTTP_PROXY="http://${http_proxy_val}" + export http_proxy="http://${http_proxy_val}" + else + unset HTTP_PROXY + unset http_proxy + fi + echo "DEBUG: set_proxy: Initial HTTP_PROXY='${HTTP_PROXY:-}'" + + if [[ -n "${https_proxy_val}" ]]; then + export HTTPS_PROXY="http://${https_proxy_val}" + export https_proxy="http://${https_proxy_val}" + else + unset HTTPS_PROXY + unset https_proxy + fi + echo "DEBUG: set_proxy: Initial HTTPS_PROXY='${HTTPS_PROXY:-}'" - # configure gcloud - # There is no no_proxy config for gcloud so we cannot use these settings until 
https://github.com/psf/requests/pull/7068 is merged -# gcloud config set proxy/type http -# gcloud config set proxy/address "${METADATA_HTTP_PROXY%:*} " -# gcloud config set proxy/port "${METADATA_HTTP_PROXY#*:}" + # Clear existing proxy settings in /etc/environment + sed -i -e '/^http_proxy=/d' -e '/^https_proxy=/d' -e '/^no_proxy=/d' \ + -e '/^HTTP_PROXY=/d' -e '/^HTTPS_PROXY=/d' -e '/^NO_PROXY=/d' /etc/environment - # add proxy environment variables to /etc/environment - grep http_proxy /etc/environment || echo "http_proxy=${http_proxy}" >> /etc/environment - grep https_proxy /etc/environment || echo "https_proxy=${https_proxy}" >> /etc/environment - grep no_proxy /etc/environment || echo "no_proxy=${no_proxy}" >> /etc/environment - grep HTTP_PROXY /etc/environment || echo "HTTP_PROXY=${HTTP_PROXY}" >> /etc/environment - grep HTTPS_PROXY /etc/environment || echo "HTTPS_PROXY=${HTTPS_PROXY}" >> /etc/environment - grep NO_PROXY /etc/environment || echo "NO_PROXY=${NO_PROXY}" >> /etc/environment + # Add current proxy environment variables to /etc/environment + if [[ -n "${HTTP_PROXY:-}" ]]; then echo "HTTP_PROXY=${HTTP_PROXY}" >> /etc/environment; fi + if [[ -n "${http_proxy:-}" ]]; then echo "http_proxy=${http_proxy}" >> /etc/environment; fi + if [[ -n "${HTTPS_PROXY:-}" ]]; then echo "HTTPS_PROXY=${HTTPS_PROXY}" >> /etc/environment; fi + if [[ -n "${https_proxy:-}" ]]; then echo "https_proxy=${https_proxy}" >> /etc/environment; fi + echo "DEBUG: set_proxy: Effective HTTP_PROXY=${HTTP_PROXY:-}" + echo "DEBUG: set_proxy: Effective HTTPS_PROXY=${HTTPS_PROXY:-}" + echo "DEBUG: set_proxy: Effective NO_PROXY=${NO_PROXY:-}" + # Configure gcloud proxy if version is >= 547.0.0 + local gcloud_version + gcloud_version=$(gcloud version --format="value(google_cloud_sdk)") + if version_ge "${gcloud_version}" "547.0.0"; then + echo "DEBUG: gcloud version ${gcloud_version} >= 547.0.0, configuring gcloud proxy settings." 
+ if [[ -n "${http_proxy_val}" ]]; then + local proxy_host + local proxy_port + proxy_host=$(echo "${http_proxy_val}" | cut -d: -f1) + proxy_port=$(echo "${http_proxy_val}" | cut -d: -f2) + gcloud config set proxy/type http + gcloud config set proxy/address "${proxy_host}" + gcloud config set proxy/port "${proxy_port}" + echo "DEBUG: Configured gcloud proxy: ${proxy_host}:${proxy_port}" + else + echo "DEBUG: No HTTP proxy value to configure gcloud." + gcloud config unset proxy/type + gcloud config unset proxy/address + gcloud config unset proxy/port + fi + else + echo "DEBUG: gcloud version ${gcloud_version} < 547.0.0, skipping gcloud proxy config." + fi + + if [[ -n "${http_proxy_val}" ]]; then + local proxy_host=$(echo "${http_proxy_val}" | cut -d: -f1) + local proxy_port=$(echo "${http_proxy_val}" | cut -d: -f2) + + echo "DEBUG: set_proxy: Testing TCP connection to proxy ${proxy_host}:${proxy_port}..." + if ! nc -zv -w 5 "${proxy_host}" "${proxy_port}"; then + echo "ERROR: Failed to establish TCP connection to proxy ${proxy_host}:${proxy_port}." + exit 1 + else + echo "DEBUG: set_proxy: TCP connection to proxy successful." + fi + + echo "DEBUG: set_proxy: Testing external site access via proxy..." + local test_url="https://www.google.com" + if curl -vL ${curl_retry_args} -o /dev/null "${test_url}"; then + echo "DEBUG: set_proxy: Successfully fetched ${test_url} via proxy." + else + echo "ERROR: Failed to fetch ${test_url} via proxy ${HTTP_PROXY}." + exit 1 + fi + fi + + # Configure package managers local pkg_proxy_conf_file - if is_debuntu ; then - # configure Apt to use the proxy: + local effective_proxy="${http_proxy_val:-${https_proxy_val}}" # Use a single value for apt/dnf + + if [[ -z "${effective_proxy}" ]]; then + echo "DEBUG: set_proxy: No HTTP or HTTPS proxy set for package managers." 
+ elif is_debuntu ; then pkg_proxy_conf_file="/etc/apt/apt.conf.d/99proxy" - cat > "${pkg_proxy_conf_file}" < "${pkg_proxy_conf_file}" + echo "Acquire::https::Proxy \"http://${effective_proxy}\";" >> "${pkg_proxy_conf_file}" + echo "DEBUG: set_proxy: Configured apt proxy: ${pkg_proxy_conf_file}" + elif is_rocky ; then pkg_proxy_conf_file="/etc/dnf/dnf.conf" - touch "${pkg_proxy_conf_file}" - - if grep -q "^proxy=" "${pkg_proxy_conf_file}"; then - sed -i.bak "s@^proxy=.*@proxy=${HTTP_PROXY}@" "${pkg_proxy_conf_file}" - elif grep -q "^\\\[main\\\\]" "${pkg_proxy_conf_file}"; then - sed -i.bak "/^\\\[main\\\\]/a proxy=${HTTP_PROXY}" "${pkg_proxy_conf_file}" + sed -i.bak '/^proxy=/d' "${pkg_proxy_conf_file}" + if grep -q "^\[main\]" "${pkg_proxy_conf_file}"; then + sed -i.bak "/^\\[main\\]/a proxy=http://${effective_proxy}" "${pkg_proxy_conf_file}" else - local TMP_FILE=$(mktemp) - printf "[main]\nproxy=%s\n" "${HTTP_PROXY}" > "${TMP_FILE}" - - cat "${TMP_FILE}" "${pkg_proxy_conf_file}" > "${pkg_proxy_conf_file}".new - mv "${pkg_proxy_conf_file}".new "${pkg_proxy_conf_file}" + echo -e "[main]\nproxy=http://${effective_proxy}" >> "${pkg_proxy_conf_file}" + fi + echo "DEBUG: set_proxy: Configured dnf proxy: ${pkg_proxy_conf_file}" + fi - rm "${TMP_FILE}" + # Configure dirmngr to use the HTTP proxy if set + if is_debuntu ; then + if ! dpkg -l | grep -q dirmngr; then + echo "DEBUG: set_proxy: dirmngr package not found, installing..." + execute_with_retries apt-get install -y -qq dirmngr + fi + elif is_rocky ; then + if ! rpm -q gnupg2-smime; then + echo "DEBUG: set_proxy: gnupg2-smime package not found, installing..." + execute_with_retries dnf install -y -q gnupg2-smime fi - else - echo "unknown OS" - exit 1 fi - # configure gpg to use the proxy: - if ! 
grep 'keyserver-options http-proxy' /etc/gnupg/dirmngr.conf ; then - mkdir -p /etc/gnupg - cat >> /etc/gnupg/dirmngr.conf <> "${dirmngr_conf}" + echo "DEBUG: set_proxy: Configured dirmngr proxy in ${dirmngr_conf}" fi - # Install the HTTPS proxy's certificate in the system and Java trust databases + # Install the HTTPS proxy's certificate METADATA_HTTP_PROXY_PEM_URI="$(get_metadata_attribute http-proxy-pem-uri '')" + if [[ -z "${METADATA_HTTP_PROXY_PEM_URI}" ]] ; then + echo "DEBUG: set_proxy: No http-proxy-pem-uri metadata found. Skipping cert install." + return 0 + fi + if [[ ! "${METADATA_HTTP_PROXY_PEM_URI}" =~ ^gs:// ]] ; then echo "ERROR: http-proxy-pem-uri value must start with gs://" ; exit 1 ; fi - if [[ -z "${METADATA_HTTP_PROXY_PEM_URI}" ]] ; then return ; fi - if [[ ! "${METADATA_HTTP_PROXY_PEM_URI}" =~ ^gs ]] ; then echo "http-proxy-pem-uri value should start with gs://" ; exit 1 ; fi - - local trusted_pem_dir - # Add this certificate to the OS trust database - # When proxy cert is provided, speak to the proxy over https + echo "DEBUG: set_proxy: http-proxy-pem-uri='${METADATA_HTTP_PROXY_PEM_URI}'" + local trusted_pem_dir proxy_ca_pem ca_subject if is_debuntu ; then trusted_pem_dir="/usr/local/share/ca-certificates" - mkdir -p "${trusted_pem_dir}" proxy_ca_pem="${trusted_pem_dir}/proxy_ca.crt" gsutil cp "${METADATA_HTTP_PROXY_PEM_URI}" "${proxy_ca_pem}" update-ca-certificates - trusted_pem_path="/etc/ssl/certs/ca-certificates.crt" - sed -i -e 's|http://|https://|' "${pkg_proxy_conf_file}" -elif is_rocky ; then + export trusted_pem_path="/etc/ssl/certs/ca-certificates.crt" + if [[ -n "${effective_proxy}" ]]; then + sed -i -e 's|http://|https://|' "${pkg_proxy_conf_file}" + fi + elif is_rocky ; then trusted_pem_dir="/etc/pki/ca-trust/source/anchors" - mkdir -p "${trusted_pem_dir}" proxy_ca_pem="${trusted_pem_dir}/proxy_ca.crt" gsutil cp "${METADATA_HTTP_PROXY_PEM_URI}" "${proxy_ca_pem}" update-ca-trust - trusted_pem_path="/etc/ssl/certs/ca-bundle.crt" - 
sed -i -e 's|^proxy=http://|proxy=https://|' "${pkg_proxy_conf_file}" - else - echo "unknown OS" - exit 1 + export trusted_pem_path="/etc/ssl/certs/ca-bundle.crt" + if [[ -n "${effective_proxy}" ]]; then + sed -i -e "s|^proxy=http://|proxy=https://|" "${pkg_proxy_conf_file}" + fi fi + export REQUESTS_CA_BUNDLE="${trusted_pem_path}" + echo "DEBUG: set_proxy: trusted_pem_path='${trusted_pem_path}'" - # configure gcloud to respect proxy ca cert - #gcloud config set core/custom_ca_certs_file "${proxy_ca_pem}" + local proxy_host="${http_proxy_val:-${https_proxy_val}}" + # Update env vars to use https + if [[ -n "${http_proxy_val}" ]]; then + export HTTP_PROXY="https://${http_proxy_val}" + export http_proxy="https://${http_proxy_val}" + fi + if [[ -n "${https_proxy_val}" ]]; then + export HTTPS_PROXY="https://${https_proxy_val}" + export https_proxy="https://${https_proxy_val}" + fi + sed -i -e 's|http://|https://|g' /etc/environment + echo "DEBUG: set_proxy: Final HTTP_PROXY='${HTTP_PROXY:-}'" + echo "DEBUG: set_proxy: Final HTTPS_PROXY='${HTTPS_PROXY:-}'" + + if [[ -n "${http_proxy_val}" ]]; then + sed -i -e "s|^http-proxy http://.*|http-proxy https://${http_proxy_val}|" /etc/gnupg/dirmngr.conf + fi + + # Verification steps from original script... 
ca_subject="$(openssl crl2pkcs7 -nocrl -certfile "${proxy_ca_pem}" | openssl pkcs7 -print_certs -noout | grep ^subject)" - # Verify that the proxy certificate is trusted - local output - output=$(echo | openssl s_client \ - -connect "${METADATA_HTTP_PROXY}" \ - -proxy "${METADATA_HTTP_PROXY}" \ - -CAfile "${proxy_ca_pem}") || { - echo "proxy certificate verification failed" - echo "${output}" - exit 1 - } - output=$(echo | openssl s_client \ - -connect "${METADATA_HTTP_PROXY}" \ - -proxy "${METADATA_HTTP_PROXY}" \ - -CAfile "${trusted_pem_path}") || { - echo "proxy ca certificate not included in system bundle" - echo "${output}" - exit 1 - } - output=$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://google.com" 2>&1)|| { - echo "curl rejects proxy configuration" - echo "${output}" - exit 1 - } - output=$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run" 2>&1)|| { - echo "curl rejects proxy configuration" - echo "${output}" - exit 1 - } + openssl s_client -connect "${proxy_host}" -CAfile "${proxy_ca_pem}" < /dev/null || { echo "ERROR: proxy cert verification failed" ; exit 1 ; } + openssl s_client -connect "${proxy_host}" -CAfile "${trusted_pem_path}" < /dev/null || { echo "ERROR: proxy ca not in system bundle" ; exit 1 ; } + + curl --verbose --cacert "${trusted_pem_path}" -x "${HTTPS_PROXY}" -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://google.com" || { echo "ERROR: curl rejects proxy config for google.com" ; exit 1 ; } + curl --verbose --cacert "${trusted_pem_path}" -x "${HTTPS_PROXY}" -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://developer.download.nvidia.com" || { echo "ERROR: curl rejects proxy config for nvidia.com" ; exit 1 ; } - # Instruct conda to use the system certificate - echo "Attempting to install pip-system-certs using the 
proxy certificate..." - export REQUESTS_CA_BUNDLE="${trusted_pem_path}" pip install pip-system-certs unset REQUESTS_CA_BUNDLE - # For the binaries bundled with conda, append our certificate to the bundle - openssl crl2pkcs7 -nocrl -certfile /opt/conda/default/ssl/cacert.pem | openssl pkcs7 -print_certs -noout | grep -Fx "${ca_subject}" || { - cat "${proxy_ca_pem}" >> /opt/conda/default/ssl/cacert.pem - } - - sed -i -e 's|http://|https://|' /etc/gnupg/dirmngr.conf - export http_proxy="https://${METADATA_HTTP_PROXY}" - export https_proxy="https://${METADATA_HTTP_PROXY}" - export HTTP_PROXY="https://${METADATA_HTTP_PROXY}" - export HTTPS_PROXY="https://${METADATA_HTTP_PROXY}" - sed -i -e 's|proxy=http://|proxy=https://|' -e 's|PROXY=http://|PROXY=https://|' /etc/environment - - # Instruct the JRE to trust the certificate - JAVA_HOME="$(awk -F= '/^JAVA_HOME=/ {print $2}' /etc/environment)" - "${JAVA_HOME}/bin/keytool" -import -cacerts -storepass changeit -noprompt -alias swp_ca -file "${proxy_ca_pem}" + if command -v conda &> /dev/null ; then + local conda_cert_file="/opt/conda/default/ssl/cacert.pem" + if [[ -f "${conda_cert_file}" ]]; then + openssl crl2pkcs7 -nocrl -certfile "${conda_cert_file}" | openssl pkcs7 -print_certs -noout | grep -Fxq "${ca_subject}" || { + cat "${proxy_ca_pem}" >> "${conda_cert_file}" + } + fi + fi + + if [[ -f "/etc/environment" ]]; then + JAVA_HOME="$(awk -F= '/^JAVA_HOME=/ {print $2}' /etc/environment)" + if [[ -n "${JAVA_HOME:-}" && -f "${JAVA_HOME}/bin/keytool" ]]; then + "${JAVA_HOME}/bin/keytool" -import -cacerts -storepass changeit -noprompt -alias swp_ca -file "${proxy_ca_pem}" + fi + fi + + echo "DEBUG: set_proxy: Verifying proxy connectivity..." 
+ + # Test fetching a file through the proxy + local test_url="https://www.gstatic.com/generate_204" +# local test_url="https://raw.githubusercontent.com/GoogleCloudDataproc/initialization-actions/master/README.md" + local test_output="${tmpdir}/proxy_test.md" + + echo "DEBUG: set_proxy: Attempting to download ${test_url} via proxy ${HTTPS_PROXY}" +# if curl --verbose --cacert "${trusted_pem_path}" -x "${HTTPS_PROXY}" -fsSL --retry-connrefused --retry 3 --retry-max-time 10 -o "${test_output}" "${test_url}"; then + if curl -vL ${curl_retry_args} -o /dev/null "${test_url}"; then + echo "DEBUG: set_proxy: Successfully downloaded test file through proxy." + rm -f "${test_output}" + else + echo "ERROR: Proxy test failed. Unable to download ${test_url} via ${HTTPS_PROXY}" + # Optionally print more debug info from curl if needed + exit 1 + fi + + echo "DEBUG: set_proxy: Proxy verification successful." + + echo "DEBUG: set_proxy: Proxy setup complete." } -set_proxy \ No newline at end of file +set_proxy diff --git a/gcloud/project/findings/2026-W11/20260310T222921-gcloud-metadata-and-gpu-config.md b/gcloud/project/findings/2026-W11/20260310T222921-gcloud-metadata-and-gpu-config.md new file mode 100644 index 00000000..6ca55141 --- /dev/null +++ b/gcloud/project/findings/2026-W11/20260310T222921-gcloud-metadata-and-gpu-config.md @@ -0,0 +1,13 @@ +# Findings - 2026-03-10 + +## 1. Gcloud Metadata Custom Separator + +**Observation:** The script `lib/dataproc/cluster.sh` utilizes a custom separator `^|^` for the `--metadata` flag when calling `gcloud dataproc clusters create`. + +**Finding:** This is a technique to supply multiple key-value pairs to the `--metadata` argument without repeating the flag. The format `^|^key1=value1|^key2=value2` allows `gcloud` to parse these correctly. This can be more concise than many `--metadata key=value` lines. + +## 2. 
GPU Configuration Management + +**Observation:** GPU-related settings such as CUDA version, driver version, and download URLs are externalized into environment variables in `lib/env.sh`. These variables are then used to populate metadata values passed to the Dataproc cluster during creation. + +**Finding:** This approach allows for easy modification and testing of different GPU driver and CUDA combinations without hardcoding values within the cluster creation logic. It centralizes GPU configuration parameters. diff --git a/gcloud/project/findings/2026-W11/20260312T035246-audit-script-gaps.md b/gcloud/project/findings/2026-W11/20260312T035246-audit-script-gaps.md new file mode 100644 index 00000000..9193df8e --- /dev/null +++ b/gcloud/project/findings/2026-W11/20260312T035246-audit-script-gaps.md @@ -0,0 +1,19 @@ +# Findings - 2026-03-12 + +**Topic:** Audit Script Coverage for Dataproc Script CUJs + +**Discovery:** + +Upon reviewing the `bin/` directory, the following audit script gaps were identified for the different cluster creation/destruction Critical User Journeys (CUJs): + +* **Missing:** `bin/audit-dpgce-create-custom-private`: Needed to validate the setup by `bin/create-dpgce-custom-private`. +* **Missing:** `bin/audit-dpgke-create`: Needed to validate `bin/create-dpgke`. +* **Missing:** `bin/audit-dpgke-destroy`: Needed to validate `bin/destroy-dpgke`. + +**Implication:** + +Without these audit scripts, we cannot automatically verify that the creation and destruction of custom-private DPGCE clusters and all DPGKE clusters are working as expected, potentially leading to manual errors and inconsistencies. + +**Action:** + +These missing audit scripts should be created to ensure full test coverage of the provisioning scripts. 
diff --git a/gcloud/project/findings/2026-W11/20260313T021216-llm-git-staging-unreliability.md b/gcloud/project/findings/2026-W11/20260313T021216-llm-git-staging-unreliability.md
new file mode 100644
index 00000000..aa7ea4e6
--- /dev/null
+++ b/gcloud/project/findings/2026-W11/20260313T021216-llm-git-staging-unreliability.md
@@ -0,0 +1,27 @@
+# Finding: LLM Unreliability with Complex Git Staging Operations
+
+## 1. Description of Finding
+
+During attempts to reconstruct a clean git history from a large set of file changes, the LLM demonstrated a critical and repeated failure to correctly manipulate the `git` staging area. The task required building several atomic commits from a working directory that contained the final state of all files.
+
+## 2. Evidence of Failure
+
+Multiple attempts were made to construct the commits using a variety of `git` commands:
+* `git add <paths>` followed by `git rm <paths>`
+* `git add .` followed by `git reset HEAD -- <paths>`
+* Complex combinations of the above.
+
+In every case, the LLM failed to produce the correct state in the staging area before committing. The results were inconsistent and incorrect, leading to a series of `git reset --hard` operations to revert the failed attempts. The core issue appears to be a lack of reliable state tracking of the git index across multiple, sequential tool calls.
+
+## 3. Recommended Standard Operating Procedure (SOP)
+
+**For future LLMs:** When constructing a series of commits from a complex working directory, **AVOID** multi-step manipulations of the staging area within a single logical operation.
+
+**The safe and recommended pattern is:**
+1. Ensure the working directory is in the desired state.
+2. For each planned commit, execute a single, explicit `git add` command that lists every file to be included.
+    * **Example:** `git add file1.sh file2.sh dir1/file3.txt`
+3. Immediately follow with the `git commit` command.
+4. Repeat for the next logical chunk of files.
+ +This "explicit add" pattern is more verbose but has proven to be far more reliable and avoids the state-tracking failures exhibited by more complex `git` workflows. diff --git a/gcloud/t/scripts/verify_tensorflow.py b/gcloud/t/scripts/verify_tensorflow.py new file mode 100644 index 00000000..89588d85 --- /dev/null +++ b/gcloud/t/scripts/verify_tensorflow.py @@ -0,0 +1,43 @@ +import tensorflow as tf +import sys + +print("Get GPU Details:") +gpus = tf.config.list_physical_devices('GPU') +print(gpus) + +if not gpus: + print("No GPU devices found. Please install GPU version of TF.", file=sys.stderr) + # Depending on the use case, you might want to exit here. + # sys.exit(1) +else: + print(f"Found {len(gpus)} GPU(s):") + for gpu in gpus: + print(f" - {gpu.name}") + +# The tf.test.gpu_device_name() is deprecated but can be useful for a quick default check +try: + # This function might not exist in very new TF versions, hence the try/except + if tf.test.gpu_device_name(): + print(f"Default GPU Device: {tf.test.gpu_device_name()}") + else: + print("tf.test.gpu_device_name() returned empty.") +except AttributeError: + print("tf.test.gpu_device_name() is not available in this version of TensorFlow.") + + +# The modern way to check for CUDA-enabled GPUs is just to list them. +# The 'cuda_only' and minimum compute capability checks are effectively deprecated +# as the Python bindings are tightly coupled with CUDA. If a GPU is found, +# it's a CUDA-enabled GPU that TF can use. 
+is_cuda_gpu_available = len(tf.config.list_physical_devices('GPU')) > 0 +print(f"CUDA-enabled GPUs available: {is_cuda_gpu_available}") + + +from tensorflow.python.client import device_lib + +def get_available_gpus_detailed(): + local_device_protos = device_lib.list_local_devices() + return [x.name for x in local_device_protos if x.device_type == 'GPU'] + +print("\nDetailed GPU device list from device_lib:") +print(get_available_gpus_detailed()) diff --git a/gcloud/t/scripts/verify_torch.py b/gcloud/t/scripts/verify_torch.py new file mode 100644 index 00000000..dd4910d9 --- /dev/null +++ b/gcloud/t/scripts/verify_torch.py @@ -0,0 +1,8 @@ +import torch +print("get CUDA details : == : ") +use_cuda = torch.cuda.is_available() +if use_cuda: + print('__CUDNN VERSION:', torch.backends.cudnn.version()) + print('__Number CUDA Devices:', torch.cuda.device_count()) + print('__CUDA Device Name:',torch.cuda.get_device_name(0)) + print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9) diff --git a/gcloud/t/spark-gpu-test.sh b/gcloud/t/spark-gpu-test.sh new file mode 100644 index 00000000..4a88a745 --- /dev/null +++ b/gcloud/t/spark-gpu-test.sh @@ -0,0 +1,125 @@ +#!/bin/bash +source lib/env.sh +set -x +APPLICATION_BUCKET="${BUCKET}" + +# Upload verification scripts +echo "Copying verification scripts to -m node..." +bin/scp-m t/scripts + +# Run Python verification scripts on -m node +echo "Running Python GPU verification scripts..." +gcloud compute ssh --zone ${ZONE} ${CLUSTER_NAME}-m \ + --project ${PROJECT_ID} \ + --command "source /opt/conda/default/etc/profile.d/conda.sh && conda activate dpgce && \ +echo '--- TensorFlow ---' && \ +time python3 /tmp/scripts/verify_tensorflow.py && \ +echo '--- PyTorch ---' && \ +time python3 /tmp/scripts/verify_torch.py" + +echo "Proceeding with Spark GPU tests..." 
+ +#gsutil cp test.py gs://${BUCKET}/ + +echo gcloud dataproc jobs submit pyspark \ + --properties="spark:spark.executor.resource.gpu.amount=1" \ + --properties="spark:spark.task.resource.gpu.amount=1" \ + --properties="spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=${YARN_DOCKER_IMAGE}" \ + --cluster=${CLUSTER_NAME} \ + --region ${REGION} gs://${BUCKET}/test.py + +get_gpu_resources_script="/usr/lib/spark/scripts/gpu/getGpusResources.sh" +echo gcloud dataproc jobs submit spark \ + --project "${PROJECT_ID}" \ + --cluster="${CLUSTER_NAME}" \ + --region "${REGION}" \ + --jars "file:///usr/lib/spark/examples/jars/spark-examples.jar" \ + --class "org.apache.spark.examples.ml.JavaIndexToStringExample" \ + --properties \ +"spark.driver.resource.gpu.amount=1,"\ +"spark.driver.resource.gpu.discoveryScript=${get_gpu_resources_script},"\ +"spark.executor.resource.gpu.amount=1,"\ +"spark.executor.resource.gpu.discoveryScript=${get_gpu_resources_script}" + +set -e + +# +# Run SparkPi examples with different parameters +# +time gcloud dataproc jobs submit spark \ + --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ + --class org.apache.spark.examples.SparkPi \ + --jars file:///usr/lib/spark/examples/jars/spark-examples.jar \ + -- 1000 + +time gcloud dataproc jobs submit spark \ + --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ + --class org.apache.spark.examples.SparkPi \ + --jars file:///usr/lib/spark/examples/jars/spark-examples.jar \ + --properties \ +"spark.executor.resource.gpu.amount=1,"\ +"spark.executor.cores=6,"\ +"spark.executor.memory=4G,"\ +"spark.plugins=com.nvidia.spark.SQLPlugin,"\ +"spark.executor.resource.gpu.discoveryScript=${get_gpu_resources_script},"\ +"spark.dynamicAllocation.enabled=false,"\ +"spark.sql.autoBroadcastJoinThreshold=10m,"\ +"spark.sql.files.maxPartitionBytes=512m,"\ +"spark.task.resource.gpu.amount=0.333,"\ +"spark.task.cpus=2,"\ +"spark.yarn.unmanagedAM.enabled=false" \ +-- 1000 + +time gcloud dataproc jobs submit spark \ + 
--cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ + --class org.apache.spark.examples.SparkPi \ + --jars file:///usr/lib/spark/examples/jars/spark-examples.jar \ + --properties \ +"spark.driver.resource.gpu.amount=1,"\ +"spark.driver.resource.gpu.discoveryScript=${get_gpu_resources_script},"\ +"spark.executor.resource.gpu.amount=1,"\ +"spark.executor.resource.gpu.discoveryScript=${get_gpu_resources_script}"\ + -- 1000 + +# +# Run JavaIndexToStringExample with different parameters +# +time gcloud dataproc jobs submit spark \ + --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ + --class org.apache.spark.examples.ml.JavaIndexToStringExample \ + --jars file:///usr/lib/spark/examples/jars/spark-examples.jar + +time gcloud dataproc jobs submit spark \ + --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ + --class org.apache.spark.examples.ml.JavaIndexToStringExample \ + --jars file:///usr/lib/spark/examples/jars/spark-examples.jar \ + --properties \ +"spark.executor.resource.gpu.amount=1,"\ +"spark.executor.cores=6,"\ +"spark.executor.memory=4G,"\ +"spark.plugins=com.nvidia.spark.SQLPlugin,"\ +"spark.executor.resource.gpu.discoveryScript=${get_gpu_resources_script},"\ +"spark.dynamicAllocation.enabled=false,"\ +"spark.sql.autoBroadcastJoinThreshold=10m,"\ +"spark.sql.files.maxPartitionBytes=512m,"\ +"spark.task.resource.gpu.amount=0.333,"\ +"spark.task.cpus=2,"\ +"spark.yarn.unmanagedAM.enabled=false" + +time gcloud dataproc jobs submit spark \ + --cluster "${CLUSTER_NAME}" \ + --region "${REGION}" \ + --class=org.apache.spark.examples.ml.JavaIndexToStringExample \ + --jars file:///usr/lib/spark/examples/jars/spark-examples.jar \ + --properties \ +"spark.driver.resource.gpu.amount=1,"\ +"spark.driver.resource.gpu.discoveryScript=${get_gpu_resources_script},"\ +"spark.executor.resource.gpu.amount=1,"\ +"spark.executor.resource.gpu.discoveryScript=${get_gpu_resources_script}" + +set +x diff --git 
a/gcloud/work-narrative/2026-W11/20260310T222921-review-gpu-custom-image-prep.md b/gcloud/work-narrative/2026-W11/20260310T222921-review-gpu-custom-image-prep.md new file mode 100644 index 00000000..d7876a1c --- /dev/null +++ b/gcloud/work-narrative/2026-W11/20260310T222921-review-gpu-custom-image-prep.md @@ -0,0 +1,13 @@ +# Work Narrative - 2026-03-10 + +**Session Goal:** Review recent changes and plan next steps for GPU custom image testing. + +This session focused on preparing for the next phase of testing GPU configurations on Dataproc, specifically within custom images. The key activities included: + +1. **Reviewing Recent Code Changes:** Analyzed `tmp/current-change.diff`, noting significant updates to metadata handling in `lib/dataproc/cluster.sh` (using `^|^` separator), new GPU-related metadata, updated machine/accelerator types, and a shift to `gcloud storage` from `gsutil`. +2. **Examining Git History:** Reviewed `git log` output, revealing substantial refactoring efforts around private cluster creation (`lib/dataproc/private-cluster.sh`), extensive updates to `init/gce-proxy-setup.sh` for robust proxy handling, and the introduction of GPU test scripts in the `t/` directory. The README has also been significantly overhauled for clarity. +3. **Consulting Planning Documents:** Reviewed `plan-for-continued-work-2026-01-20.md` and `work-completed-2026-01-20.md` to understand the current goals, which involve testing `install_gpu_driver.sh` as a customization script during custom image creation. + +**Next Steps:** + +* Proceed with testing the `install_gpu_driver.sh` script within the custom image build process as outlined in `plan-for-continued-work-2026-01-20.md`. 
diff --git a/gcloud/work-narrative/2026-W11/20260312T035246-script-org-and-audit.md b/gcloud/work-narrative/2026-W11/20260312T035246-script-org-and-audit.md new file mode 100644 index 00000000..944df432 --- /dev/null +++ b/gcloud/work-narrative/2026-W11/20260312T035246-script-org-and-audit.md @@ -0,0 +1,33 @@ +# Work Narrative - 2026-03-12 + +**Session Goal:** Prepare for Abinash's review of PR #181, including script organization and audit coverage. + +**Summary:** + +We refined the email and meeting details for the review request to Abinash Sharma regarding PR #181. We then focused on making the `gcloud` scripts more reviewable and robust by separating standard and custom image configurations. This involved: + +1. Duplicating `lib/dataproc/cluster.sh` to `lib/dataproc/cluster-custom.sh`. +2. Modifying `lib/dataproc/cluster.sh` to use standard image versions and no shielded boot. +3. Updating `lib/dataproc/private-cluster.sh` to use `CUSTOM_IMAGE_URI` from `env.json`. +4. Creating `bin/create-dpgce-custom` for standard custom image clusters. +5. Creating `bin/create-dpgce-custom-private` for private custom image clusters. +6. Refactoring `bin/recreate-dpgce` to detect and handle all four environment types (Standard, Custom, Private, Custom Private) based on sentinels. +7. Creating `bin/audit-dpgce-create-custom` for standard custom image clusters. + +We also audited the existing `bin/audit-*` scripts and identified missing ones. + +**Key Achievements:** + +* Clearer separation between standard and custom image cluster configurations. +* New `create` scripts for custom image scenarios. +* `recreate-dpgce` now intelligently handles different environment types. +* Added `audit-dpgce-create-custom`. + +**Next Steps:** + +* Create the missing audit scripts: + * `bin/audit-dpgce-create-custom-private` + * `bin/audit-dpgke-create` + * `bin/audit-dpgke-destroy` +* Potentially refactor `destroy` scripts to also use sentinels to remove custom/private specific sentinels. 
+ diff --git a/gcloud/work-narrative/2026-W11/20260313T021216-git-reconstruction-failure-and-handoff.md b/gcloud/work-narrative/2026-W11/20260313T021216-git-reconstruction-failure-and-handoff.md new file mode 100644 index 00000000..94f90335 --- /dev/null +++ b/gcloud/work-narrative/2026-W11/20260313T021216-git-reconstruction-failure-and-handoff.md @@ -0,0 +1,39 @@ +# Title: Critical Failure in Git History Reconstruction and Handoff + +## 1. Summary + +This session documents a critical failure in the final stage of preparing a major architectural refactoring for code review. While the technical goals of the refactoring were successfully completed and validated, a series of repeated, unrecoverable errors during git history manipulation has forced a hard stop and a handoff to a new LLM. + +## 2. Goal + +The goal was to deconstruct a single, massive, and monolithic commit (`3a92f5a`) into a series of 3-4 clean, logical, and narrative-driven commits. This would have made the enormous change reviewable and understandable. The plan was to: +1. Reset `main` to a pre-refactoring baseline (`9201c6a`). +2. Check out the files from the final, correct state (`3a92f5a`) into the working directory. +3. Methodically build the new commit history by staging and committing specific, logical chunks of the changes. + +## 3. State at Point of Failure + +* **File Content:** The file content in the working directory is **correct and complete**. All the work from the entire refactoring is present and matches the state of `3a92f5a`. +* **Git State:** The `HEAD` of the branch is at `9201c6a`. `git status` shows all the refactoring work as unstaged changes. No commits have been made. The project is a clean slate, ready for the commit reconstruction to be attempted again. + +## 4. Root Cause of Failure + +The failure was a repeated inability of the LLM (me) to correctly execute `git` commands for staging and committing files. 
The attempts involved `git add`, `git rm`, `git reset`, and various flagging combinations. These attempts consistently failed, resulting in either: +* Incorrect files being staged. +* No files being staged. +* Tool output logs being polluted with shell errors. + +After multiple loops of resetting and re-attempting, it is concluded that the current LLM cannot reliably perform complex, multi-step git staging operations. This is documented in a corresponding "findings" entry. + +## 5. Guidance for Next LLM + +The task is to complete the git history reconstruction that I failed to perform. + +**Objective:** Create a series of 3-4 clean, atomic commits that tell the story of the refactoring. + +**Recommended Plan:** +1. **Commit 1: Unify & Refactor Scripts:** Stage the deletion of old `bin/` scripts, the creation of the new unified `bin/` scripts, and the core architectural changes in `lib/env.sh` and `lib/script-utils.sh`. Commit with a focused message. +2. **Commit 2: Implement Logic:** Stage the remaining implementation changes in the `bin/` and `lib/` directories. Commit with a focused message. +3. **Commit 3: Docs & Cleanup:** Stage the `README.md`, `.gitignore`, `work-narrative/`, `project/`, and other triaged files. Commit with a focused message. + +**CRITICAL ADVICE:** Do not attempt complex `git` staging commands. Use simple, explicit `git add ...` commands for each commit. This will avoid the failure mode I experienced. The working directory is clean and ready for you to begin. diff --git a/gcloud/work-narrative/comprehensive-work-journal.md b/gcloud/work-narrative/comprehensive-work-journal.md new file mode 100644 index 00000000..5415f90c --- /dev/null +++ b/gcloud/work-narrative/comprehensive-work-journal.md @@ -0,0 +1,9 @@ +# Comprehensive Work Journal + +## 2026-W11 / 20260310 + +* Reviewed recent diffs, git logs, and planning documents to prepare for GPU custom image testing. 
Noted changes in metadata handling, GPU configurations, and private cluster refactoring. + +## 2026-W11 / 20260312 + +* Separated standard and custom cluster configurations, creating `cluster-custom.sh` and new `create-dpgce-custom` and `create-dpgce-custom-private` scripts. Refactored `recreate-dpgce` to handle multiple environment types. Audited audit script coverage and identified gaps for custom-private and DPGKE CUJs. From e10161f60c16ba4513d916ef0628c71cd641e348 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Tue, 17 Mar 2026 23:56:01 +0000 Subject: [PATCH 04/25] feat: Overhaul audit system with SQLite and concurrent checks Implements a robust audit system for the DPGCE environment: - Replaces the state.json file with an SQLite database (state.db) for atomic and concurrent-safe state management. - Refactors `bin/audit-dpgce` to perform checks for all resources concurrently, writing results to temporary files. - Updates the SQLite database sequentially from temp files to prevent locking issues. - Adds `exists_*` functions to numerous lib scripts to support the audit. - Ensures all necessary functions are exported from their respective lib files. - Introduces `report_audit_status` for clearer [Exists]/[Not Found] reporting. - Updates README.md with `sqlite3` prerequisite. - Fixes various bugs in the audit script and helper functions. This provides a much faster, more reliable, and comprehensive audit of the environment. 
--- gcloud/README.md | 3 +- gcloud/bin/audit-dpgce | 222 ++++++++++++++++++++++++----- gcloud/bin/create-dpgce | 2 + gcloud/bin/recreate-cluster.sh | 11 +- gcloud/lib/bigtable.sh | 1 + gcloud/lib/database/mssql.sh | 5 + gcloud/lib/database/mysql.sh | 5 + gcloud/lib/database/oracle.sh | 6 +- gcloud/lib/database/pgsql.sh | 5 + gcloud/lib/dataproc/autoscaling.sh | 1 + gcloud/lib/dataproc/cluster.sh | 8 +- gcloud/lib/env.sh | 5 +- gcloud/lib/gcp/gcr.sh | 5 + gcloud/lib/gcp/gcs.sh | 10 ++ gcloud/lib/gcp/iam.sh | 1 + gcloud/lib/gcp/kms.sh | 6 + gcloud/lib/gcp/misc.sh | 7 + gcloud/lib/gke.sh | 5 + gcloud/lib/kerberos.sh | 5 + gcloud/lib/network/network.sh | 1 + gcloud/lib/network/peering.sh | 7 + gcloud/lib/network/router.sh | 1 + gcloud/lib/network/routes.sh | 40 +++++- gcloud/lib/network/subnet.sh | 1 + gcloud/lib/phs.sh | 6 + gcloud/lib/script-utils.sh | 77 ++++++++-- gcloud/lib/swp/certs.sh | 38 ++++- gcloud/lib/swp/firewall.sh | 6 + gcloud/lib/swp/gateway.sh | 8 ++ gcloud/lib/swp/policy.sh | 8 ++ 30 files changed, 437 insertions(+), 69 deletions(-) diff --git a/gcloud/README.md b/gcloud/README.md index 61752b7b..ac496680 100644 --- a/gcloud/README.md +++ b/gcloud/README.md @@ -40,7 +40,8 @@ These scripts are designed to deploy and manage Dataproc clusters in various con 1. **Prerequisites:** Ensure you have the following tools installed: * `gcloud` CLI * `gsutil` (usually part of `gcloud`) - * `jq` + * `jq`: Used to parse and manipulate JSON responses from the `gcloud` API. + * `sqlite3`: Used to maintain a local cache database (`state.db`) of resource states, providing atomic and concurrent-safe updates. * `perl` 2. **Clone the repository:** diff --git a/gcloud/bin/audit-dpgce b/gcloud/bin/audit-dpgce index cc725f7d..0fa7d846 100755 --- a/gcloud/bin/audit-dpgce +++ b/gcloud/bin/audit-dpgce @@ -2,50 +2,204 @@ # # Universal audit script for all Dataproc on GCE environment variations. 
# -# This script generates a state.json file that is the canonical source of truth -# for the environment's state. +# This script populates the state.db SQLite database with the current state of GCP resources. # Exit on failure set -e +export TIMESTAMP=$(date +%s) + # --- Get script's real directory --- SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" +export GCLOUD_DIR source "${GCLOUD_DIR}/lib/env.sh" source "${GCLOUD_DIR}/lib/script-utils.sh" -source "${GCLOUD_DIR}/lib/network/network.sh" -source "${GCLOUD_DIR}/lib/network/subnet.sh" -source "${GCLOUD_DIR}/lib/network/router.sh" -source "${GCLOUD_DIR}/lib/network/firewall.sh" -source "${GCLOUD_DIR}/lib/gcp/iam.sh" -source "${GCLOUD_DIR}/lib/dataproc/cluster.sh" -source "${GCLOUD_DIR}/lib/dataproc/private-cluster.sh" -source "${GCLOUD_DIR}/lib/dataproc/autoscaling.sh" -source "${GCLOUD_DIR}/lib/gcp/misc.sh" - -# --- Main Audit Logic --- -# This script will now generate a state.json file. -# It does not need to print a human-readable report, as that can be done by -# another script that reads the state file. - -# Initialize an empty JSON object -echo "{}" > "${STATE_FILE}" + +# Initialize the state database and table +init_state_db + +AUDIT_TEMP_DIR="${REPRO_TMPDIR}/audit_results" +rm -rf "${AUDIT_TEMP_DIR}" +mkdir -p "${AUDIT_TEMP_DIR}" +PIDS=() + +# Function to run a check in the background, writing output to a temp file +run_check() { + local key="$1" + local source_file="$2" + local func_name="$3" + shift 3 + local cmd_out="${AUDIT_TEMP_DIR}/${key}.json" + + ( + source "${GCLOUD_DIR}/${source_file}" + "${func_name}" "$@" > "${cmd_out}" 2>/dev/null + ) & + PIDS+=($!) +} + +# Function to run a check for commands in script-utils.sh or env.sh +run_shared_check() { + local key="$1" + local func_name="$2" + shift 2 + local cmd_out="${AUDIT_TEMP_DIR}/${key}.json" + ( + "${func_name}" "$@" > "${cmd_out}" 2>/dev/null + ) & + PIDS+=($!) 
+} # --- Infrastructure State Checks --- -update_state "project" "$(_check_exists "gcloud projects describe '${PROJECT_ID}' --format='json(lifecycleState,projectId)'")" -update_state "billing" "$(_check_exists "gcloud beta billing projects describe '${PROJECT_ID}' --format='json(billingEnabled)'")" +run_shared_check "project" _check_exists "gcloud projects describe '${PROJECT_ID}' --format='json(lifecycleState,projectId)'" +run_shared_check "billing" _check_exists "gcloud beta billing projects describe '${PROJECT_ID}' --format='json(billingEnabled)'" # --- Resource Existence Checks --- -update_state "vpcNetwork" "$(exists_network)" -update_state "standardSubnet" "$(exists_subnet "${SUBNET}")" -update_state "privateSubnet" "$(exists_subnet "${PRIVATE_SUBNET}")" -update_state "cloudRouter" "$(exists_router)" -update_state "firewallRule" "$(exists_firewall)" -update_state "routes" "$(_check_exists "gcloud compute routes list --project='${PROJECT_ID}' --filter='network~\"/${NETWORK}$\"' --format='json(name,selfLink)'" | jq 'if . == [] then null else . end')" -update_state "serviceAccount" "$(exists_service_account)" -update_state "autoscalingPolicy" "$(exists_autoscaling_policy)" -update_state "dataprocCluster" "$(exists_dpgce_cluster)" - -# --- Final Output --- -# For convenience, pretty-print the state file to stdout -jq . 
< "${STATE_FILE}" +run_check "vpcNetwork" "lib/network/network.sh" exists_network +run_check "standardSubnet" "lib/network/subnet.sh" exists_subnet "${SUBNET}" +run_check "privateSubnet" "lib/network/subnet.sh" exists_subnet "${PRIVATE_SUBNET}" +run_check "swpSubnet" "lib/network/subnet.sh" exists_subnet "${SWP_SUBNET}" +run_check "cloudRouter" "lib/network/router.sh" exists_router +run_check "firewallRule-ssh" "lib/swp/firewall.sh" exists_firewall_rule "${FIREWALL}-in-ssh" +run_check "firewallRule-internal" "lib/swp/firewall.sh" exists_firewall_rule "${FIREWALL}-in-internal" +run_shared_check "routes" _check_exists "gcloud compute routes list --project='${PROJECT_ID}' --filter='network~"/${NETWORK}$"' --format='json(name,selfLink)'" +run_check "serviceAccount" "lib/gcp/iam.sh" exists_service_account +run_check "autoscalingPolicy" "lib/dataproc/autoscaling.sh" exists_autoscaling_policy +run_check "dataprocCluster" "lib/dataproc/cluster.sh" exists_dpgce_cluster +run_check "debugVms" "lib/gcp/misc.sh" exists_debug_vms +run_check "bigtableInstance" "lib/bigtable.sh" exists_bigtable_instance +run_check "legacyMssqlInstance" "lib/database/mssql.sh" exists_legacy_mssql_instance +run_check "mssqlInstance" "lib/database/mssql.sh" exists_mssql_instance +run_check "mysqlInstance" "lib/database/mysql.sh" exists_mysql_instance +run_check "oracleVm" "lib/database/oracle.sh" exists_oracle_vm +run_check "pgsqlInstance" "lib/database/pgsql.sh" exists_pgsql_instance +run_check "artifactsRepository" "lib/gcp/gcr.sh" exists_artifacts_repository +run_check "gcsBucket" "lib/gcp/gcs.sh" exists_gcs_bucket "${BUCKET}" +run_check "gcsTempBucket" "lib/gcp/gcs.sh" exists_gcs_bucket "${TEMP_BUCKET}" +run_check "mysqlSecret" "lib/gcp/kms.sh" exists_secret "${MYSQL_SECRET_NAME}" +run_check "kmsKeyring" "lib/gcp/kms.sh" exists_kms_keyring +run_check "kdcKmsKey" "lib/gcp/kms.sh" exists_kms_key "${KDC_ROOT_PASSWD_KEY}" +run_check "gkeCluster" "lib/gke.sh" exists_gke_cluster +run_check 
"dpgkeCluster" "lib/gke.sh" exists_dpgke_cluster +run_check "kdcServer" "lib/kerberos.sh" exists_kdc_server +run_check "ipAllocation" "lib/network/peering.sh" exists_ip_allocation +run_check "vpcPeering" "lib/network/peering.sh" exists_vpc_peering +run_check "phsCluster" "lib/phs.sh" exists_phs_cluster +run_check "swpCaPool" "lib/swp/certs.sh" exists_swp_ca_pool +run_check "swpRootCa" "lib/swp/certs.sh" exists_swp_root_ca +run_check "swpCic" "lib/swp/certs.sh" exists_swp_cic +run_check "swpManagedCertificate" "lib/swp/certs.sh" exists_swp_managed_certificate +run_check "swpFirewallIngress" "lib/swp/firewall.sh" exists_firewall_rule "allow-swp-ingress-${CLUSTER_NAME}" +run_check "swpFirewallInternal" "lib/swp/firewall.sh" exists_firewall_rule "allow-internal-${CLUSTER_NAME}" +run_check "swpGateway" "lib/swp/gateway.sh" exists_swp_gateway +run_check "swpPolicy" "lib/swp/policy.sh" exists_gateway_security_policy + +# Wait for all background jobs to finish +for pid in "${PIDS[@]}"; do + wait "${pid}" || true # Ignore errors from wait +done + +echo " +Populating SQLite Database..." 
+# Populate STATE_DB from temp files +for key_file in $(find "${AUDIT_TEMP_DIR}" -type f -name "*.json"); do + key=$(basename "${key_file}" .json) + value=$(cat "${key_file}") + + if [[ -z "${value}" ]]; then + value="null" + fi + # Handle empty list from gcloud for routes + if [[ "${key}" == "routes" && "${value}" == "[]" ]]; then + value="null" + fi + update_state "${key}" "${value}" +done + +# --- Human Readable Report --- +echo "--------------------------------------" +echo " DPGCE Environment Audit Report " +echo "--------------------------------------" +PROJECT_ID_RAW=$(get_state "project") +if [[ "${PROJECT_ID_RAW}" == "null" || -z "${PROJECT_ID_RAW}" ]]; then + PROJECT_ID_VAL="N/A" +else + # Check if it's valid JSON before jq + if echo "${PROJECT_ID_RAW}" | /usr/bin/jq empty > /dev/null 2>&1; then + PROJECT_ID_VAL=$(echo "${PROJECT_ID_RAW}" | /usr/bin/jq -r '.projectId // "N/A"') + else + PROJECT_ID_VAL="N/A" # Not valid JSON + fi +fi +echo "Project: ${PROJECT_ID_VAL}" +echo + +print_resource_status() { + local display_name="$1" + local key="$2" + local value + value=$(get_state "${key}") + + echo -n "- ${display_name}: " + if [[ "${value}" == "null" || "${value}" == "" ]]; then + report_audit_status "Not Found" + else + report_audit_status "Exists" + fi +} +print_resource_status "VPC Network (${NETWORK})" "vpcNetwork" +print_resource_status "Standard Subnet (${SUBNET})" "standardSubnet" +print_resource_status "Private Subnet (${PRIVATE_SUBNET})" "privateSubnet" +print_resource_status "SWP Subnet (${SWP_SUBNET})" "swpSubnet" +print_resource_status "Cloud Router (${ROUTER_NAME})" "cloudRouter" +print_resource_status "Firewall Rule (SSH)" "firewallRule-ssh" +print_resource_status "Firewall Rule (Internal)" "firewallRule-internal" +print_resource_status "Service Account (${GSA})" "serviceAccount" +print_resource_status "Dataproc Autoscaling Policy (${AUTOSCALING_POLICY_NAME})" "autoscalingPolicy" +print_resource_status "Dataproc Cluster (${CLUSTER_NAME})" 
"dataprocCluster" +print_resource_status "Bigtable Instance (${BIGTABLE_INSTANCE})" "bigtableInstance" +print_resource_status "Cloud SQL MSSQL (Legacy VM)" "legacyMssqlInstance" +print_resource_status "Cloud SQL MSSQL (${MSSQL_INSTANCE})" "mssqlInstance" +print_resource_status "Cloud SQL MySQL (${MYSQL_INSTANCE})" "mysqlInstance" +print_resource_status "Oracle VM (${ORACLE_VM_NAME})" "oracleVm" +print_resource_status "Cloud SQL PostgreSQL (${PGSQL_INSTANCE})" "pgsqlInstance" +print_resource_status "Artifact Repository (${ARTIFACT_REPOSITORY})" "artifactsRepository" +print_resource_status "GCS Bucket (${BUCKET})" "gcsBucket" +print_resource_status "GCS Temp Bucket (${TEMP_BUCKET})" "gcsTempBucket" +print_resource_status "Secret (MySQL)" "mysqlSecret" +print_resource_status "KMS Keyring (${KMS_KEYRING})" "kmsKeyring" +print_resource_status "KMS Key (KDC)" "kdcKmsKey" +print_resource_status "GKE Cluster (${GKE_CLUSTER_NAME})" "gkeCluster" +print_resource_status "DPGKE Cluster (${DPGKE_CLUSTER_NAME})" "dpgkeCluster" +print_resource_status "KDC Server (${KDC_NAME})" "kdcServer" +print_resource_status "IP Allocation (${ALLOCATION_NAME})" "ipAllocation" +print_resource_status "VPC Peering" "vpcPeering" +print_resource_status "PHS Cluster (${CLUSTER_NAME}-phs)" "phsCluster" +print_resource_status "SWP CA Pool" "swpCaPool" +print_resource_status "SWP Root CA" "swpRootCa" +print_resource_status "SWP CIC" "swpCic" +print_resource_status "SWP Managed Certificate" "swpManagedCertificate" +print_resource_status "SWP Firewall (Ingress)" "swpFirewallIngress" +print_resource_status "SWP Firewall (Internal)" "swpFirewallInternal" +print_resource_status "SWP Gateway (${SWP_INSTANCE_NAME})" "swpGateway" +print_resource_status "SWP Policy (${SWP_POLICY_NAME})" "swpPolicy" +print_resource_status "Debug VMs" "debugVms" + +# Routes are a list, so handle differently +routes=$(get_state "routes") +echo -n "- Routes for ${NETWORK}: " +if [[ "${routes}" == "null" || "${routes}" == "[]" || -z 
"${routes}" ]]; then + report_audit_status "Not Found" +else + # Check if the JSON array is not empty + routes_len=$(echo "${routes}" | /usr/bin/jq '. | length') + if [[ "${routes_len}" -gt 0 ]]; then + report_audit_status "Exists" + else + report_audit_status "Not Found" + fi +fi + +echo "--------------------------------------" +echo "State DB at: ${STATE_DB}" diff --git a/gcloud/bin/create-dpgce b/gcloud/bin/create-dpgce index 0942c6ac..7171c227 100755 --- a/gcloud/bin/create-dpgce +++ b/gcloud/bin/create-dpgce @@ -37,6 +37,7 @@ source "${GCLOUD_DIR}/lib/network/network.sh" source "${GCLOUD_DIR}/lib/network/subnet.sh" source "${GCLOUD_DIR}/lib/network/router.sh" source "${GCLOUD_DIR}/lib/network/firewall.sh" +source "${GCLOUD_DIR}/lib/network/routes.sh" source "${GCLOUD_DIR}/lib/gcp/iam.sh" source "${GCLOUD_DIR}/lib/gcp/misc.sh" source "${GCLOUD_DIR}/lib/misc.sh" @@ -71,6 +72,7 @@ upload_init_actions if [[ $(jq -r '.vpcNetwork == null' "${STATE_FILE}") == "true" ]]; then create_vpc_network + ensure_default_internet_route fi if [[ $(jq -r '.standardSubnet == null' "${STATE_FILE}") == "true" ]]; then diff --git a/gcloud/bin/recreate-cluster.sh b/gcloud/bin/recreate-cluster.sh index ea47bc82..ef3588a0 100755 --- a/gcloud/bin/recreate-cluster.sh +++ b/gcloud/bin/recreate-cluster.sh @@ -39,16 +39,11 @@ echo "========================================" echo "Starting DPGCE Cluster Recreation" echo "========================================" -# Run audit to get the current state of the cluster -print_status "Auditing environment to determine current state..." -"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null +# Attempt to delete the cluster, any errors will be logged by delete_dpgce_cluster +print_status "Attempting to ensure any pre-existing cluster named '${CLUSTER_NAME}' is deleted..." 
+delete_dpgce_cluster report_result "Done" -# Check if a cluster exists and delete it -if [[ $(jq -r '.dataprocCluster != null' "${STATE_FILE}") == "true" ]]; then - delete_dpgce_cluster -fi - # Re-create the cluster based on the flags provided if [[ "$IS_PRIVATE" == "true" ]]; then create_dpgce_private_cluster "$@" diff --git a/gcloud/lib/bigtable.sh b/gcloud/lib/bigtable.sh index 0ae0b849..c4493b7c 100644 --- a/gcloud/lib/bigtable.sh +++ b/gcloud/lib/bigtable.sh @@ -5,6 +5,7 @@ function exists_bigtable_instance() { _check_exists "gcloud bigtable instances describe '${BIGTABLE_INSTANCE}' --format='json(name,displayName)'" } +export -f exists_bigtable_instance function create_bigtable_instance() { print_status "Creating Bigtable Instance ${BIGTABLE_INSTANCE}..." diff --git a/gcloud/lib/database/mssql.sh b/gcloud/lib/database/mssql.sh index 339ebb1b..d3d58cc6 100644 --- a/gcloud/lib/database/mssql.sh +++ b/gcloud/lib/database/mssql.sh @@ -28,6 +28,11 @@ function create_legacy_mssql_instance() { } export -f create_legacy_mssql_instance +function exists_legacy_mssql_instance() { + _check_exists "gcloud compute instances describe '${MSSQL_INSTANCE}' --zone '${ZONE}' --project='${PROJECT_ID}' --format='json(name,status)'" +} +export -f exists_legacy_mssql_instance + function delete_legacy_mssql_instance() { print_status "Deleting Legacy MSSQL Instance ${MSSQL_INSTANCE}..." 
local log_file="delete_legacy_mssql_${MSSQL_INSTANCE}.log" diff --git a/gcloud/lib/database/mysql.sh b/gcloud/lib/database/mysql.sh index e48bcb33..0cb8ff97 100644 --- a/gcloud/lib/database/mysql.sh +++ b/gcloud/lib/database/mysql.sh @@ -30,3 +30,8 @@ function delete_mysql_instance() { fi } export -f delete_mysql_instance + +function exists_mysql_instance() { + _check_exists "gcloud sql instances describe '${MYSQL_INSTANCE}' --project='${PROJECT_ID}' --format='json(name,state)'" +} +export -f exists_mysql_instance diff --git a/gcloud/lib/database/oracle.sh b/gcloud/lib/database/oracle.sh index 95f005dc..7c71e6f7 100644 --- a/gcloud/lib/database/oracle.sh +++ b/gcloud/lib/database/oracle.sh @@ -7,7 +7,6 @@ function create_oracle_vm() { print_status "Checking Oracle VM ${ORACLE_VM_NAME}..." report_result "Exists" return 0 - fi print_status "Creating Oracle VM ${ORACLE_VM_NAME}..." local log_file="create_oracle_vm_${ORACLE_VM_NAME}.log" @@ -124,3 +123,8 @@ function delete_oracle_vm() { fi } export -f delete_oracle_vm + +function exists_oracle_vm() { + _check_exists "gcloud compute instances describe '${ORACLE_VM_NAME}' --zone '${ZONE}' --project='${PROJECT_ID}' --format='json(name,status)'" +} +export -f exists_oracle_vm diff --git a/gcloud/lib/database/pgsql.sh b/gcloud/lib/database/pgsql.sh index e7349cab..d53dbd02 100644 --- a/gcloud/lib/database/pgsql.sh +++ b/gcloud/lib/database/pgsql.sh @@ -31,3 +31,8 @@ function delete_pgsql_instance() { fi } export -f delete_pgsql_instance + +function exists_pgsql_instance() { + _check_exists "gcloud sql instances describe '${PGSQL_INSTANCE}' --project='${PROJECT_ID}' --format='json(name,state)'" +} +export -f exists_pgsql_instance diff --git a/gcloud/lib/dataproc/autoscaling.sh b/gcloud/lib/dataproc/autoscaling.sh index 52c832b9..e56528cd 100644 --- a/gcloud/lib/dataproc/autoscaling.sh +++ b/gcloud/lib/dataproc/autoscaling.sh @@ -5,6 +5,7 @@ function exists_autoscaling_policy() { _check_exists "gcloud dataproc 
autoscaling-policies describe '${AUTOSCALING_POLICY_NAME}' --region='${REGION}' --format='json(id,name)'" } +export -f exists_autoscaling_policy function create_autoscaling_policy() { print_status "Creating Autoscaling Policy ${AUTOSCALING_POLICY_NAME}..." diff --git a/gcloud/lib/dataproc/cluster.sh b/gcloud/lib/dataproc/cluster.sh index 65ca06ff..ddbb40fb 100644 --- a/gcloud/lib/dataproc/cluster.sh +++ b/gcloud/lib/dataproc/cluster.sh @@ -31,9 +31,9 @@ function create_dpgce_cluster() { "rapids-runtime=SPARK" "bigtable-instance=${BIGTABLE_INSTANCE}" "include-gpus=1" - "http-proxy=${SWP_IP}:${SWP_PORT}" - "https-proxy=${SWP_IP}:${SWP_PORT}" - "proxy-uri=${SWP_IP}:${SWP_PORT}" +# "http-proxy=${SWP_IP}:${SWP_PORT}" +# "https-proxy=${SWP_IP}:${SWP_PORT}" +# "proxy-uri=${SWP_IP}:${SWP_PORT}" ) local all_metadata @@ -43,7 +43,7 @@ function create_dpgce_cluster() { local gcloud_cmd=( gcloud dataproc clusters create "${CLUSTER_NAME}" --single-node - --master-accelerator "type=${M_ACCELERATOR_TYPE}" +# --master-accelerator "type=${M_ACCELERATOR_TYPE}" --master-machine-type "${M_MACHINE_TYPE}" --master-boot-disk-size 600 --master-local-ssd-interface=NVME diff --git a/gcloud/lib/env.sh b/gcloud/lib/env.sh index 88973797..bf76ccfb 100644 --- a/gcloud/lib/env.sh +++ b/gcloud/lib/env.sh @@ -26,7 +26,7 @@ export REPRO_TMPDIR="${REPRO_TMPDIR:-/tmp/dataproc-repro/${RESOURCE_SUFFIX}}" mkdir -p "${REPRO_TMPDIR}" export LOG_DIR="${LOG_DIR:-${REPRO_TMPDIR}/logs}" mkdir -p "${LOG_DIR}" -export STATE_FILE="${REPRO_TMPDIR}/state.json" +export STATE_DB="${REPRO_TMPDIR}/state.db" source lib/script-utils.sh @@ -54,6 +54,7 @@ export IDLE_TIMEOUT="$(jq -r .IDLE_TIMEOUT env.json)" export ASN_NUMBER="$(jq -r .ASN_NUMBER env.json)" export IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" export REGION="$(jq -r .REGION env.json)" +export ZONE="$(jq -r .ZONE env.json)" export SWP_IP="$(jq -r .SWP_IP env.json)" export SWP_PORT="$(jq -r .SWP_PORT env.json)" export SWP_HOSTNAME="$(jq -r .SWP_HOSTNAME 
env.json)" @@ -61,8 +62,6 @@ export SWP_POLICY_NAME="swp-policy-${CLUSTER_NAME}" export SWP_INSTANCE_NAME="swp-gateway-${CLUSTER_NAME}" export SWP_CERT_NAME="swp-cert-${CLUSTER_NAME}-${RESOURCE_SUFFIX}" export DEBUG="${DEBUG:-0}" - -export ZONE="${REGION}-a" #export ZONE="${REGION}-b" #export IMAGE_VERSION="2.0" #export IMAGE_VERSION="2.0.67-debian10" # final proprietary gpu support - April 26, 2024 - 5.10.0-0.deb10.16-amd64 diff --git a/gcloud/lib/gcp/gcr.sh b/gcloud/lib/gcp/gcr.sh index 5805829e..f3a84082 100644 --- a/gcloud/lib/gcp/gcr.sh +++ b/gcloud/lib/gcp/gcr.sh @@ -15,6 +15,11 @@ function create_artifacts_repository(){ } export -f create_artifacts_repository +function exists_artifacts_repository() { + _check_exists "gcloud artifacts repositories describe '${ARTIFACT_REPOSITORY}' --location='${REGION}' --project='${PROJECT_ID}' --format='json(name,format)'" +} +export -f exists_artifacts_repository + function push_container_image() { print_status "Pushing Container Image..." local log_file="push_container_image.log" diff --git a/gcloud/lib/gcp/gcs.sh b/gcloud/lib/gcp/gcs.sh index 376655bd..c59182b9 100644 --- a/gcloud/lib/gcp/gcs.sh +++ b/gcloud/lib/gcp/gcs.sh @@ -2,6 +2,16 @@ # # GCS Bucket functions +function exists_gcs_bucket() { + local bucket_name="$1" + if gsutil ls -b "gs://${bucket_name}" > /dev/null 2>&1; then + echo "{\"name\": \"${bucket_name}\", \"exists\": true}" + else + echo "null" + fi +} +export -f exists_gcs_bucket + function create_gcs_bucket () { local bucket_name="$1" local storage_class="$2" diff --git a/gcloud/lib/gcp/iam.sh b/gcloud/lib/gcp/iam.sh index 7a38a6ca..47926c5e 100644 --- a/gcloud/lib/gcp/iam.sh +++ b/gcloud/lib/gcp/iam.sh @@ -5,6 +5,7 @@ function exists_service_account() { _check_exists "gcloud iam service-accounts describe '${GSA}' --project='${PROJECT_ID}' --format='json(email,name)'" } +export -f exists_service_account function create_service_account() { print_status "Creating/Verifying Service Account ${GSA}..." 
diff --git a/gcloud/lib/gcp/kms.sh b/gcloud/lib/gcp/kms.sh
index 600344c2..101c5560 100644
--- a/gcloud/lib/gcp/kms.sh
+++ b/gcloud/lib/gcp/kms.sh
@@ -57,6 +57,12 @@ function create_kerberos_kdc_key() {
 }
 export -f create_kerberos_kdc_key
 
+function exists_kms_key() {
+  local key_name="$1"
+  _check_exists "gcloud kms keys describe '${key_name}' --keyring='${KMS_KEYRING}' --location=global --project='${PROJECT_ID}' --format='json(name,primary.state)'"
+}
+export -f exists_kms_key
+
 function create_mysql_admin_password() {
   print_status "Creating Encrypted MySQL Admin Password..."
   local log_file="create_mysql_admin_password.log"
diff --git a/gcloud/lib/gcp/misc.sh b/gcloud/lib/gcp/misc.sh
index f896c2ff..0ad77494 100644
--- a/gcloud/lib/gcp/misc.sh
+++ b/gcloud/lib/gcp/misc.sh
@@ -121,3 +121,9 @@ function check_image_exists() {
   local image_name=$(basename "${image_uri}")
   gcloud compute images describe "${image_name}" --project="${PROJECT_ID}" > /dev/null 2>&1
 }
+
+# Check for any debug VMs
+function exists_debug_vms() {
+  _check_exists "gcloud compute instances list --project='${PROJECT_ID}' --filter='name~^debug-' --format='json(name,zone,status)'" | jq 'if . == [] then null else . end'
+}
+export -f exists_debug_vms
diff --git a/gcloud/lib/gke.sh b/gcloud/lib/gke.sh
index 9b65e998..f494e838 100644
--- a/gcloud/lib/gke.sh
+++ b/gcloud/lib/gke.sh
@@ -71,3 +71,8 @@ function delete_dpgke_cluster() {
   fi
 }
 export -f delete_dpgke_cluster
+
+function exists_dpgke_cluster() {
+  _check_exists "gcloud dataproc clusters describe '${DPGKE_CLUSTER_NAME}' --region '${REGION}' --project='${PROJECT_ID}' --format='json(clusterName,status.state)'"
+}
+export -f exists_dpgke_cluster
diff --git a/gcloud/lib/kerberos.sh b/gcloud/lib/kerberos.sh
index 6cfbf3dd..2f5c20b6 100644
--- a/gcloud/lib/kerberos.sh
+++ b/gcloud/lib/kerberos.sh
@@ -28,6 +28,11 @@ function create_kdc_server() {
 }
 export -f create_kdc_server
 
+function exists_kdc_server() {
+  _check_exists "gcloud compute instances describe '${KDC_NAME}' --zone '${ZONE}' --project='${PROJECT_ID}' --format='json(name,status)'"
+}
+export -f exists_kdc_server
+
 function delete_kdc_server() {
   print_status "Deleting KDC Server ${KDC_NAME}..."
   local log_file="delete_kdc_server_${KDC_NAME}.log"
diff --git a/gcloud/lib/network/network.sh b/gcloud/lib/network/network.sh
index 59f3ba51..bf2dd867 100644
--- a/gcloud/lib/network/network.sh
+++ b/gcloud/lib/network/network.sh
@@ -5,6 +5,7 @@
 function exists_network() {
   _check_exists "gcloud compute networks describe '${NETWORK}' --project='${PROJECT_ID}' --format='json(name,selfLink)'"
 }
+export -f exists_network
 
 function create_vpc_network () {
   print_status "Creating VPC Network ${NETWORK}..."
diff --git a/gcloud/lib/network/peering.sh b/gcloud/lib/network/peering.sh index 3cbc8f81..b571dc35 100644 --- a/gcloud/lib/network/peering.sh +++ b/gcloud/lib/network/peering.sh @@ -35,6 +35,13 @@ function delete_ip_allocation () { fi } +function exists_vpc_peering() { + # Naming format: + local peering_name="servicenetworking-googleapis-com" + _check_exists "gcloud compute networks peerings list --network='${NETWORK}' --project='${PROJECT_ID}' --filter='name=${peering_name}' --format='json(name,state)'" | jq 'if . == [] then null else .[0] end' +} +export -f exists_vpc_peering + function create_vpc_peering () { print_status "Creating VPC Peering for ${NETWORK}..." local log_file="create_peering_${NETWORK}.log" diff --git a/gcloud/lib/network/router.sh b/gcloud/lib/network/router.sh index 2edea63f..c178468b 100644 --- a/gcloud/lib/network/router.sh +++ b/gcloud/lib/network/router.sh @@ -5,6 +5,7 @@ function exists_router() { _check_exists "gcloud compute routers describe '${ROUTER_NAME}' --region='${REGION}' --project='${PROJECT_ID}' --format='json(name,selfLink)'" } +export -f exists_router function create_router () { print_status "Creating Router ${ROUTER_NAME}..." diff --git a/gcloud/lib/network/routes.sh b/gcloud/lib/network/routes.sh index db929fda..de011ead 100644 --- a/gcloud/lib/network/routes.sh +++ b/gcloud/lib/network/routes.sh @@ -1,18 +1,44 @@ #!/bin/bash # -# Route Management Functions +function ensure_default_internet_route() { + print_status "Ensuring default internet route for ${NETWORK}..." + local log_file="ensure_default_route_${NETWORK}.log" + if ! gcloud compute routes list --project="${PROJECT_ID}" --filter="network=${NETWORK} AND destRange=0.0.0.0/0 AND nextHopGateway=default-internet-gateway" --format="value(name)" | grep -q .; then + print_status " Default internet route not found, creating..." 
+ if run_gcloud "${log_file}" gcloud compute routes create "default-internet-${NETWORK}" \ + --project="${PROJECT_ID}" \ + --network="${NETWORK}" \ + --destination-range=0.0.0.0/0 \ + --next-hop-gateway=default-internet-gateway \ + --priority=1000; then + report_result "Created" + else + report_result "Fail" + return 1 + fi + else + report_result "Exists" + fi +} +export -f ensure_default_internet_route function delete_route() { local route_name="$1" print_status "Deleting Route ${route_name}..." local log_file="delete_route_${route_name}.log" - if run_gcloud "${log_file}" gcloud compute routes delete --quiet "${route_name}" --project="${PROJECT_ID}"; then - report_result "Deleted" + + # Check if the route exists + if gcloud compute routes describe "${route_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; then + if run_gcloud "${log_file}" gcloud compute routes delete --quiet "${route_name}" --project="${PROJECT_ID}"; then + report_result "Deleted" + else + report_result "Fail" + echo " - Failed to delete route ${route_name}. Log content:" >&2 + cat "${REPRO_TMPDIR}/${log_file}" >&2 + return 1 + fi else - report_result "Fail" - echo " - Failed to delete route ${route_name}. Log content:" >&2 - cat "${log_file}" >&2 - return 1 + report_result "Not Found" fi } export -f delete_route diff --git a/gcloud/lib/network/subnet.sh b/gcloud/lib/network/subnet.sh index d6d19f18..5c92ef27 100644 --- a/gcloud/lib/network/subnet.sh +++ b/gcloud/lib/network/subnet.sh @@ -6,6 +6,7 @@ function exists_subnet() { local subnet_name="$1" _check_exists "gcloud compute networks subnets describe '${subnet_name}' --region='${REGION}' --project='${PROJECT_ID}' --format='json(name,selfLink)'" } +export -f exists_subnet function create_subnet () { print_status "Creating Subnet ${SUBNET}..." 
diff --git a/gcloud/lib/phs.sh b/gcloud/lib/phs.sh index 957dcc75..2060c204 100644 --- a/gcloud/lib/phs.sh +++ b/gcloud/lib/phs.sh @@ -22,6 +22,12 @@ function create_phs_cluster() { } export -f create_phs_cluster +function exists_phs_cluster() { + local phs_cluster_name="${CLUSTER_NAME}-phs" + _check_exists "gcloud dataproc clusters describe '${phs_cluster_name}' --region='${REGION}' --project='${PROJECT_ID}' --format='json(clusterName,status.state)'" +} +export -f exists_phs_cluster + function delete_phs_cluster() { local phs_cluster_name="${CLUSTER_NAME}-phs" print_status "Deleting PHS Cluster ${phs_cluster_name}..." diff --git a/gcloud/lib/script-utils.sh b/gcloud/lib/script-utils.sh index e8180152..841a9ec4 100644 --- a/gcloud/lib/script-utils.sh +++ b/gcloud/lib/script-utils.sh @@ -23,6 +23,17 @@ function report_result() { } export -f report_result +# Usage: report_audit_status "Exists" | "Not Found" +function report_audit_status() { + local status="$1" + case "${status}" in + Exists) echo -e " [${GREEN}Exists${NC}]" ;; + "Not Found") echo -e " [${YELLOW}Not Found${NC}]" ;; + *) echo -e " [${YELLOW}${status}${NC}]" ;; + esac +} +export -f report_audit_status + # Usage: run_gcloud function run_gcloud() { local log_file_name=$1 @@ -80,22 +91,68 @@ function parse_args() { export -f parse_args # --- State Management Functions --- -function get_state() { - if [[ ! -f "${STATE_FILE}" ]]; then - echo "{}" - return +function init_state_db() { + local db_file="${STATE_DB}" + if [[ ! 
-f "${db_file}" ]]; then + sqlite3 "${db_file}" "CREATE TABLE IF NOT EXISTS resource_state (key TEXT PRIMARY KEY, json_data TEXT);" fi - cat "${STATE_FILE}" } +export -f init_state_db function update_state() { local resource_key=$1 - local resource_value=$2 # This should be a JSON string or "null" - - local current_state=$(get_state) - local new_state=$(jq --arg key "${resource_key}" --argjson value "${resource_value}" '.[$key] = $value' <<< "${current_state}") - echo "${new_state}" > "${STATE_FILE}" + local resource_value=$2 # JSON string or "null" + local db_file="${STATE_DB}" + + init_state_db + + local sql + if [[ "${resource_value}" == "null" ]]; then + sql="DELETE FROM resource_state WHERE key = '${resource_key}';" + else + local escaped_value=$(echo "${resource_value}" | sed "s/'/''/g") + sql="INSERT OR REPLACE INTO resource_state (key, json_data) VALUES ('${resource_key}', '${escaped_value}');" + fi + sqlite3 "${db_file}" "${sql}" +} +export -f update_state + +function get_state() { + local resource_key=$1 + local db_file="${STATE_DB}" + init_state_db + local result=$(sqlite3 "${db_file}" "SELECT json_data FROM resource_state WHERE key = '${resource_key}';") + if [[ -z "${result}" ]]; then + echo "null" + else + echo "${result}" + fi +} +export -f get_state + +function refresh_resource_state() { + local resource_key=$1 + local check_command=$2 + local source_file=$3 # Optional: file to source for exists_* + + local json_output + if [[ -n "${source_file}" ]]; then + # Extract the function name from the command string + local func_name=$(echo "${check_command}" | awk '{print $1}') + # Source in a subshell and export the specific function needed + json_output=$(source "${GCLOUD_DIR}/${source_file}" && export -f "${func_name}" && eval "${check_command}") + else + # For shared functions like _check_exists, they are already exported from script-utils.sh + json_output=$(eval "${check_command}") + fi + + if [[ -z "${json_output}" ]]; then + json_output="null" + fi 
+ + update_state "${resource_key}" "${json_output}" } +export -f refresh_resource_state # --- Audit Check Functions --- # These functions are now designed to be called by the audit script. diff --git a/gcloud/lib/swp/certs.sh b/gcloud/lib/swp/certs.sh index b8496047..c7659f8b 100644 --- a/gcloud/lib/swp/certs.sh +++ b/gcloud/lib/swp/certs.sh @@ -130,4 +130,40 @@ function delete_managed_certificate() { done <<< "${pool_names}" fi } -export -f delete_managed_certificate \ No newline at end of file +export -f delete_managed_certificate + +function exists_swp_ca_pool() { + local region="${1:-${REGION}}" + local project_id="${2:-${PROJECT_ID}}" + local suffix="${3:-${RESOURCE_SUFFIX}}" + local ca_pool_name="swp-ca-pool-${CLUSTER_NAME}-${suffix}" + _check_exists "gcloud privateca pools describe '${ca_pool_name}' --location='${region}' --project='${project_id}' --format='json(name,tier)'" +} +export -f exists_swp_ca_pool + +function exists_swp_root_ca() { + local region="${1:-${REGION}}" + local project_id="${2:-${PROJECT_ID}}" + local suffix="${3:-${RESOURCE_SUFFIX}}" + local ca_pool_name="swp-ca-pool-${CLUSTER_NAME}-${suffix}" + local ca_name="swp-root-ca-${CLUSTER_NAME}-${suffix}" + _check_exists "gcloud privateca roots describe '${ca_name}' --pool='${ca_pool_name}' --location='${region}' --project='${project_id}' --format='json(name,state)'" +} +export -f exists_swp_root_ca + +function exists_swp_cic() { + local region="${1:-${REGION}}" + local project_id="${2:-${PROJECT_ID}}" + local suffix="${3:-${RESOURCE_SUFFIX}}" + local cic_name="swp-cic-${CLUSTER_NAME}-${suffix}" + _check_exists "gcloud certificate-manager issuance-configs describe '${cic_name}' --location='${region}' --project='${project_id}' --format='json(name)'" +} +export -f exists_swp_cic + +function exists_swp_managed_certificate() { + local region="${1:-${REGION}}" + local project_id="${2:-${PROJECT_ID}}" + local cert_name="${SWP_CERT_NAME}" + _check_exists "gcloud certificate-manager certificates 
describe '${cert_name}' --location='${region}' --project='${project_id}' --format='json(name,managed.state)'" +} +export -f exists_swp_managed_certificate \ No newline at end of file diff --git a/gcloud/lib/swp/firewall.sh b/gcloud/lib/swp/firewall.sh index c42effaa..38ec1571 100644 --- a/gcloud/lib/swp/firewall.sh +++ b/gcloud/lib/swp/firewall.sh @@ -1,5 +1,11 @@ #!/bin/bash +function exists_firewall_rule() { + local rule_name="$1" + _check_exists "gcloud compute firewall-rules describe '${rule_name}' --project='${PROJECT_ID}' --format='json(name,direction)'" +} +export -f exists_firewall_rule + function create_allow_swp_ingress_rule() { local rule_name="${1:-allow-swp-ingress-${CLUSTER_NAME}}" local network_name="${2:-${NETWORK}}" diff --git a/gcloud/lib/swp/gateway.sh b/gcloud/lib/swp/gateway.sh index 657335c1..b401c3a9 100644 --- a/gcloud/lib/swp/gateway.sh +++ b/gcloud/lib/swp/gateway.sh @@ -42,6 +42,14 @@ EOF } export -f create_swp_gateway +function exists_swp_gateway() { + local swp_instance_name="${1:-${SWP_INSTANCE_NAME}}" + local region="${2:-${REGION}}" + local project_id="${3:-${PROJECT_ID}}" + _check_exists "gcloud network-services gateways describe '${swp_instance_name}' --location='${region}' --project='${project_id}' --format='json(name,type)'" +} +export -f exists_swp_gateway + function delete_swp_gateway() { local swp_instance_name="${1:-${SWP_INSTANCE_NAME}}" local region="${2:-${REGION}}" diff --git a/gcloud/lib/swp/policy.sh b/gcloud/lib/swp/policy.sh index 109e33f6..00d68cfb 100644 --- a/gcloud/lib/swp/policy.sh +++ b/gcloud/lib/swp/policy.sh @@ -49,6 +49,14 @@ EOF } export -f create_gateway_security_policy +function exists_gateway_security_policy() { + local policy_name="${1:-${SWP_POLICY_NAME}}" + local region="${2:-${REGION}}" + local project_id="${3:-${PROJECT_ID}}" + _check_exists "gcloud network-security gateway-security-policies describe '${policy_name}' --location='${region}' --project='${project_id}' --format='json(name)'" +} +export 
-f exists_gateway_security_policy + function delete_gateway_security_policy() { local policy_name="${1:-${SWP_POLICY_NAME}}" local region="${2:-${REGION}}" From c97993709d975db2cd0e6c49f23b435ea5629bbe Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 18 Mar 2026 02:55:15 +0000 Subject: [PATCH 05/25] feat: Implement declarative creation and enhance destruction This commit introduces a declarative, plan-based approach to the `create-dpgce` script and significantly improves the robustness and completeness of the `destroy-dpgce` script. **`create-dpgce` Refactoring:** * Replaced the initial audit with loading state from the SQLite cache (`init_state_db`). * Added `display_status` function to show the current state of resources. * Implemented plan generation by comparing the current state with the intended state based on flags. * Added a plan execution block that creates resources in the correct order based on the generated plan. * Removed redundant `check_project` and `check_billing` calls. * Ensured all state checks use `get_state`. **`destroy-dpgce` Enhancements:** * Integrated `get_state` for all resource existence checks, replacing old `jq` logic. * Added `delete_all_network_vms` to ensure all VMs in the network are deleted before network teardown. * Included more library scripts to cover a wider range of resources for deletion (PHS, GKE, DBs, SWP, etc.). * Refined route deletion to skip default internet and local routes. * Added cache updates (`update_state "resource" "null"`) after successful deletions for many resources. **Library Updates (`lib/`):** * Added `refresh_resource_state` or `update_state` calls to CUD functions in: * `dataproc/autoscaling.sh` * `dataproc/cluster.sh` * `dataproc/private-cluster.sh` * `gcp/gcr.sh` * `gcp/gcs.sh` * `gcp/iam.sh` * `gcp/kms.sh` * `network/network.sh` * `network/peering.sh` * `network/router.sh` * `network/routes.sh` * `network/subnet.sh` * Added `exists_dataproc_cluster_vms` to `lib/dataproc/cluster.sh`. 
* Fixed `create_subnet` to accept parameters for name, key, and range. * Corrected line continuation in `create_vpc_network` in `lib/network/network.sh`. **Other Changes:** * Fixed `recreate-cluster.sh` to correctly call `gcloud dataproc clusters describe` for displaying cluster details. * Updated `env.json` to use `us-west4-a`. * Cleaned up `lib/gcp/misc.sh`. These changes make the scripts more reliable, idempotent, and easier to understand. --- gcloud/bin/audit-dpgce | 2 + gcloud/bin/create-dpgce | 126 ++++++++++++++++--------- gcloud/bin/destroy-dpgce | 108 +++++++++++++-------- gcloud/bin/recreate-cluster.sh | 3 +- gcloud/lib/dataproc/autoscaling.sh | 1 + gcloud/lib/dataproc/cluster.sh | 8 +- gcloud/lib/dataproc/private-cluster.sh | 1 + gcloud/lib/gcp/gcr.sh | 1 + gcloud/lib/gcp/gcs.sh | 4 + gcloud/lib/gcp/iam.sh | 1 + gcloud/lib/gcp/kms.sh | 1 + gcloud/lib/gcp/misc.sh | 26 +++-- gcloud/lib/network/network.sh | 20 ++++ gcloud/lib/network/peering.sh | 1 + gcloud/lib/network/router.sh | 3 + gcloud/lib/network/routes.sh | 1 + gcloud/lib/network/subnet.sh | 12 ++- 17 files changed, 222 insertions(+), 97 deletions(-) diff --git a/gcloud/bin/audit-dpgce b/gcloud/bin/audit-dpgce index 0fa7d846..b73e3e67 100755 --- a/gcloud/bin/audit-dpgce +++ b/gcloud/bin/audit-dpgce @@ -67,6 +67,7 @@ run_shared_check "routes" _check_exists "gcloud compute routes list --project='$ run_check "serviceAccount" "lib/gcp/iam.sh" exists_service_account run_check "autoscalingPolicy" "lib/dataproc/autoscaling.sh" exists_autoscaling_policy run_check "dataprocCluster" "lib/dataproc/cluster.sh" exists_dpgce_cluster +run_check "dataprocClusterVMs" "lib/dataproc/cluster.sh" exists_dataproc_cluster_vms run_check "debugVms" "lib/gcp/misc.sh" exists_debug_vms run_check "bigtableInstance" "lib/bigtable.sh" exists_bigtable_instance run_check "legacyMssqlInstance" "lib/database/mssql.sh" exists_legacy_mssql_instance @@ -158,6 +159,7 @@ print_resource_status "Firewall Rule (Internal)" 
"firewallRule-internal" print_resource_status "Service Account (${GSA})" "serviceAccount" print_resource_status "Dataproc Autoscaling Policy (${AUTOSCALING_POLICY_NAME})" "autoscalingPolicy" print_resource_status "Dataproc Cluster (${CLUSTER_NAME})" "dataprocCluster" +print_resource_status "Dataproc Cluster ${CLUSTER_NAME} VMs" "dataprocClusterVMs" print_resource_status "Bigtable Instance (${BIGTABLE_INSTANCE})" "bigtableInstance" print_resource_status "Cloud SQL MSSQL (Legacy VM)" "legacyMssqlInstance" print_resource_status "Cloud SQL MSSQL (${MSSQL_INSTANCE})" "mssqlInstance" diff --git a/gcloud/bin/create-dpgce b/gcloud/bin/create-dpgce index 7171c227..a2036ae5 100755 --- a/gcloud/bin/create-dpgce +++ b/gcloud/bin/create-dpgce @@ -6,6 +6,7 @@ set -e # --- Get script's real directory --- SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" +export GCLOUD_DIR # --- Source environment variables and utility functions --- source "${GCLOUD_DIR}/lib/env.sh" @@ -45,58 +46,89 @@ source "${GCLOUD_DIR}/lib/dataproc/autoscaling.sh" source "${GCLOUD_DIR}/lib/dataproc/cluster.sh" source "${GCLOUD_DIR}/lib/dataproc/private-cluster.sh" - # --- Main Logic --- -print_status "Auditing environment to determine current state..." -"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null -report_result "Done" - configure_gcloud -check_project -check_billing -set_cluster_name -enable_services - -if [[ $(jq -r '.serviceAccount == null' "${STATE_FILE}") == "true" ]]; then - create_service_account -else - print_status "Skipping Service Account creation (already exists)..." 
- report_result "Exists" +init_state_db + +echo "--- Current Environment State (from cache) --- " +# Function to display resource status from DB +display_status() { + local display_name="$1" + local key="$2" + local value=$(get_state "${key}") + echo -n "- ${display_name}: " + if [[ "${value}" == "null" || "${value}" == "" ]]; then + report_audit_status "Not Found" + else + report_audit_status "Exists" + fi +} + +# Display status for all potentially relevant resources +display_status "Service Account" "serviceAccount" +display_status "GCS Bucket (${BUCKET})" "gcsBucket" +display_status "GCS Temp Bucket (${TEMP_BUCKET})" "gcsTempBucket" +display_status "VPC Network (${NETWORK})" "vpcNetwork" +display_status "Standard Subnet (${SUBNET})" "standardSubnet" +display_status "Private Subnet (${PRIVATE_SUBNET})" "privateSubnet" +display_status "Cloud Router" "cloudRouter" +display_status "Firewall Rules" "firewallRule-ssh" # Assuming this key represents all base rules +display_status "Autoscaling Policy" "autoscalingPolicy" +display_status "Dataproc Cluster (${CLUSTER_NAME})" "dataprocCluster" +echo "----------------------------------------------" + +# --- Determine Intended State & Generate Plan --- +PLAN=() +INTENDED=() + +# Basic resources for all types +INTENDED+=("serviceAccount" "gcsBucket" "gcsTempBucket" "vpcNetwork" "standardSubnet" "cloudRouter" "firewallRule-ssh" "autoscalingPolicy") + +if [[ "${IS_PRIVATE}" == "true" ]]; then + INTENDED+=("privateSubnet") + # TODO: Add SWP components to INTENDED +fi +if [[ "${CREATE_CLUSTER}" == "true" ]]; then + INTENDED+=("dataprocCluster") fi -create_gcs_bucket "${BUCKET}" "Standard" -grant_gcs_bucket_perms "${BUCKET}" -create_gcs_bucket "${TEMP_BUCKET}" "Standard" -grant_gcs_bucket_perms "${TEMP_BUCKET}" -upload_init_actions +for resource in "${INTENDED[@]}"; do + if [[ $(get_state "${resource}") == "null" ]]; then + PLAN+=("CREATE_${resource}") + fi +done -if [[ $(jq -r '.vpcNetwork == null' "${STATE_FILE}") == "true" ]]; 
then - create_vpc_network - ensure_default_internet_route +if [[ ${#PLAN[@]} -eq 0 ]]; then + echo "No actions needed. Environment matches target state." + exit 0 fi -if [[ $(jq -r '.standardSubnet == null' "${STATE_FILE}") == "true" ]]; then - create_subnet -fi +echo "--- Execution Plan ---" +for action in "${PLAN[@]}"; do + echo "- ${action}" +done +echo "----------------------" -if [[ $(jq -r '.cloudRouter == null' "${STATE_FILE}") == "true" ]]; then - create_router - add_nat_to_router -fi +# --- Plan Execution --- +echo "Executing plan..." -if [[ $(jq -r '.firewallRule == null' "${STATE_FILE}") == "true" ]]; then - create_firewall_rules -fi +# Order matters here for dependencies +if [[ " ${PLAN[*]} " =~ " CREATE_serviceAccount " ]]; then create_service_account; fi -if [[ $(jq -r '.autoscalingPolicy == null' "${STATE_FILE}") == "true" ]]; then - create_autoscaling_policy -fi +# GCS Buckets - No strong dependencies between them +if [[ " ${PLAN[*]} " =~ " CREATE_gcsBucket " ]]; then create_gcs_bucket "${BUCKET}" "Standard"; grant_gcs_bucket_perms "${BUCKET}"; fi +if [[ " ${PLAN[*]} " =~ " CREATE_gcsTempBucket " ]]; then create_gcs_bucket "${TEMP_BUCKET}" "Standard"; grant_gcs_bucket_perms "${TEMP_BUCKET}"; fi +upload_init_actions # Always run if buckets were created or might be missing init actions -# --- Conditional Cluster Creation --- -if [[ "${CREATE_CLUSTER}" = true ]]; then - if [[ "$IS_PRIVATE" == "true" ]]; then - source "${GCLOUD_DIR}/lib/gcp/private-network.sh" - create_private_subnet +if [[ " ${PLAN[*]} " =~ " CREATE_vpcNetwork " ]]; then create_vpc_network; ensure_default_internet_route; fi +if [[ " ${PLAN[*]} " =~ " CREATE_standardSubnet " ]]; then create_subnet "${SUBNET}" "standardSubnet" "${RANGE}"; fi +if [[ " ${PLAN[*]} " =~ " CREATE_privateSubnet " ]]; then source "${GCLOUD_DIR}/lib/swp/subnet.sh"; create_private_subnet; fi +if [[ " ${PLAN[*]} " =~ " CREATE_cloudRouter " ]]; then create_router; add_nat_to_router; fi +if [[ " ${PLAN[*]} " =~ " 
CREATE_firewallRule-ssh " ]]; then create_firewall_rules; fi +if [[ " ${PLAN[*]} " =~ " CREATE_autoscalingPolicy " ]]; then create_autoscaling_policy; fi + +if [[ " ${PLAN[*]} " =~ " CREATE_dataprocCluster " ]]; then + if [[ "${IS_PRIVATE}" == "true" ]]; then create_dpgce_private_cluster else if [[ "$IS_CUSTOM" == "true" ]]; then @@ -104,9 +136,11 @@ if [[ "${CREATE_CLUSTER}" = true ]]; then fi create_dpgce_cluster fi - - # After creation, run audit again to update state file with new resource details - "${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null -else - echo -e "${YELLOW}Skipping Cluster Creation due to --no-create-cluster flag.${NC}" fi + +echo "Plan execution finished." + +# --- Final Audit --- +print_status "Running final audit to update cache..." +"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null +report_result "Done" \ No newline at end of file diff --git a/gcloud/bin/destroy-dpgce b/gcloud/bin/destroy-dpgce index 6b7eedba..345f77ff 100755 --- a/gcloud/bin/destroy-dpgce +++ b/gcloud/bin/destroy-dpgce @@ -6,6 +6,7 @@ set -e # --- Get script's real directory --- SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" +export GCLOUD_DIR # --- Source environment variables and utility functions --- source "${GCLOUD_DIR}/lib/env.sh" @@ -19,8 +20,21 @@ source "${GCLOUD_DIR}/lib/network/firewall.sh" source "${GCLOUD_DIR}/lib/gcp/iam.sh" source "${GCLOUD_DIR}/lib/gcp/gcs.sh" source "${GCLOUD_DIR}/lib/dataproc/cluster.sh" -source "${GCLOUD_DIR}/lib/dataproc/private-cluster.sh" source "${GCLOUD_DIR}/lib/dataproc/autoscaling.sh" +# Add other libs as needed for full cleanup +source "${GCLOUD_DIR}/lib/phs.sh" +source "${GCLOUD_DIR}/lib/gke.sh" +source "${GCLOUD_DIR}/lib/database/mssql.sh" +source "${GCLOUD_DIR}/lib/database/mysql.sh" +source "${GCLOUD_DIR}/lib/database/oracle.sh" +source "${GCLOUD_DIR}/lib/database/pgsql.sh" +source "${GCLOUD_DIR}/lib/kerberos.sh" +source "${GCLOUD_DIR}/lib/swp/certs.sh" +source 
"${GCLOUD_DIR}/lib/swp/firewall.sh" +source "${GCLOUD_DIR}/lib/swp/gateway.sh" +source "${GCLOUD_DIR}/lib/swp/policy.sh" +source "${GCLOUD_DIR}/lib/swp/subnet.sh" +source "${GCLOUD_DIR}/lib/network/peering.sh" # --- Argument Parsing --- FORCE_DELETE=false @@ -40,67 +54,85 @@ echo "Starting DPGCE Environment Teardown" echo "========================================" # Run audit to get the current state +print_status "Auditing environment..." "${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null +report_result "Done" # --- Teardown Logic --- -# Read the state file and delete resources that are not null. # The order is important to handle dependencies. -if [[ $(jq -r '.dataprocCluster != null' "${STATE_FILE}") == "true" ]]; then - delete_dpgce_cluster +if [[ $(get_state "dataprocCluster") != "null" ]]; then delete_dpgce_cluster; fi +if [[ $(get_state "dataprocClusterVMs") != "null" ]]; then + echo "WARNING: Cluster VMs still exist, but cluster delete was attempted. Manual cleanup may be needed." fi - -if [[ $(jq -r '.autoscalingPolicy != null' "${STATE_FILE}") == "true" ]]; then - delete_autoscaling_policy +if [[ $(get_state "phsCluster") != "null" ]]; then delete_phs_cluster; fi +if [[ $(get_state "dpgkeCluster") != "null" ]]; then delete_dpgke_cluster; fi +if [[ $(get_state "gkeCluster") != "null" ]]; then delete_gke_cluster; fi +if [[ $(get_state "autoscalingPolicy") != "null" ]]; then delete_autoscaling_policy; fi + +# NEW: Delete all VMs in the network +delete_all_network_vms + +# Other VMs +if [[ $(get_state "kdcServer") != "null" ]]; then delete_kdc_server; fi +if [[ $(get_state "oracleVm") != "null" ]]; then delete_oracle_vm; fi +if [[ $(get_state "legacyMssqlInstance") != "null" ]]; then delete_legacy_mssql_instance; fi +if [[ $(get_state "mssqlInstance") != "null" ]]; then delete_mssql_instance; fi +if [[ $(get_state "mysqlInstance") != "null" ]]; then delete_mysql_instance; fi +if [[ $(get_state "pgsqlInstance") != "null" ]]; then delete_pgsql_instance; fi +if [[ 
$(get_state "debugVms") != "null" ]]; then + print_status "Deleting Debug VMs..." + gcloud compute instances delete $(get_state "debugVms" | jq -r '.[].name') --zone "${ZONE}" --quiet || true + update_state "debugVms" "null" fi -if [[ $(jq -r '.cloudRouter != null' "${STATE_FILE}") == "true" ]]; then - delete_router -fi +# SWP Components +if [[ $(get_state "swpGateway") != "null" ]]; then delete_swp_gateway; fi +if [[ $(get_state "swpPolicy") != "null" ]]; then delete_gateway_security_policy; fi +if [[ $(get_state "swpManagedCertificate") != "null" ]]; then delete_managed_certificate; fi +if [[ $(get_state "swpFirewallIngress") != "null" ]]; then delete_allow_swp_ingress_rule; fi +if [[ $(get_state "swpFirewallInternal") != "null" ]]; then delete_allow_internal_subnets_rule; fi -if [[ $(jq -r '.firewallRule != null' "${STATE_FILE}") == "true" ]]; then - delete_firewall_rules -fi +# Networking +if [[ $(get_state "firewallRule-ssh") != "null" ]] || [[ $(get_state "firewallRule-internal") != "null" ]]; then delete_firewall_rules; fi -# Delete routes after firewall rules -if [[ $(jq -r '.routes | length > 0' "${STATE_FILE}") == "true" ]]; then - mapfile -t route_names < <(jq -r '.routes[].name' "${STATE_FILE}") +ROUTES_RAW=$(get_state "routes") +if [[ "${ROUTES_RAW}" != "null" && "${ROUTES_RAW}" != "[]" ]]; then + mapfile -t route_names < <(echo "${ROUTES_RAW}" | jq -r '.[].name') for route_name in "${route_names[@]}"; do - delete_route "${route_name}" + # Exclude the default internet gateway route and local routes + if [[ "${route_name}" != "default-internet-${NETWORK}" && ! 
"${route_name}" =~ ^default-route-r- ]]; then + delete_route "${route_name}" + fi done fi -if [[ $(jq -r '.privateSubnet != null' "${STATE_FILE}") == "true" ]]; then - delete_subnet "${PRIVATE_SUBNET}" -fi +if [[ $(get_state "cloudRouter") != "null" ]]; then delete_router; fi +if [[ $(get_state "vpcPeering") != "null" ]]; then delete_vpc_peering; fi +if [[ $(get_state "ipAllocation") != "null" ]]; then delete_ip_allocation; fi -if [[ $(jq -r '.standardSubnet != null' "${STATE_FILE}") == "true" ]]; then - delete_subnet "${SUBNET}" -fi +if [[ $(get_state "swpSubnet") != "null" ]]; then delete_subnet "${SWP_SUBNET}" "swpSubnet"; fi +if [[ $(get_state "privateSubnet") != "null" ]]; then delete_subnet "${PRIVATE_SUBNET}" "privateSubnet"; fi +if [[ $(get_state "standardSubnet") != "null" ]]; then delete_subnet "${SUBNET}" "standardSubnet"; fi -if [[ $(jq -r '.serviceAccount != null' "${STATE_FILE}") == "true" ]]; then - delete_service_account -fi +if [[ $(get_state "vpcNetwork") != "null" ]]; then delete_vpc_network; fi -# Finally, attempt to delete the network. It should be empty now. 
-if [[ $(jq -r '.vpcNetwork != null' "${STATE_FILE}") == "true" ]]; then - delete_vpc_network -fi +# Other Resources +if [[ $(get_state "serviceAccount") != "null" ]]; then delete_service_account; fi +if [[ $(get_state "artifactsRepository") != "null" ]]; then echo "Skipping Artifact Repository deletion"; fi +if [[ $(get_state "mysqlSecret") != "null" ]]; then echo "Skipping Secret deletion"; fi +if [[ $(get_state "kmsKeyring") != "null" ]]; then echo "Skipping KMS Keyring deletion"; fi +if [[ $(get_state "kdcKmsKey") != "null" ]]; then echo "Skipping KMS Key deletion"; fi # Conditionally delete buckets if --force is specified if [[ "${FORCE_DELETE}" = true ]]; then - delete_gcs_bucket "${BUCKET}" - delete_gcs_bucket "${TEMP_BUCKET}" + if [[ $(get_state "gcsBucket") != "null" ]]; then delete_gcs_bucket "${BUCKET}"; fi + if [[ $(get_state "gcsTempBucket") != "null" ]]; then delete_gcs_bucket "${TEMP_BUCKET}"; fi else print_status "Skipping Bucket Deletion. Use --force to delete buckets." - report_result "Skipped" + report_audit_status "Skipped" fi -# After attempting deletion, run audit one last time to generate the final, clean state file. -"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null - echo "========================================" echo "DPGCE Environment teardown complete" -echo "Final state written to ${STATE_FILE}" echo "========================================" -jq . 
< "${STATE_FILE}" diff --git a/gcloud/bin/recreate-cluster.sh b/gcloud/bin/recreate-cluster.sh index ef3588a0..94d4d8ec 100755 --- a/gcloud/bin/recreate-cluster.sh +++ b/gcloud/bin/recreate-cluster.sh @@ -57,4 +57,5 @@ fi echo "========================================" echo "DPGCE Cluster re-created" echo "========================================" -print_cluster_details +# Display cluster details +gcloud dataproc clusters describe "${CLUSTER_NAME}" --region="${REGION}" --format=json diff --git a/gcloud/lib/dataproc/autoscaling.sh b/gcloud/lib/dataproc/autoscaling.sh index e56528cd..cd6a0c2a 100644 --- a/gcloud/lib/dataproc/autoscaling.sh +++ b/gcloud/lib/dataproc/autoscaling.sh @@ -24,6 +24,7 @@ function delete_autoscaling_policy() { local log_file="delete_autoscaling_${AUTOSCALING_POLICY_NAME}.log" if run_gcloud "${log_file}" gcloud dataproc autoscaling-policies delete --quiet "${AUTOSCALING_POLICY_NAME}" --region="${REGION}"; then report_result "Deleted" + update_state "autoscalingPolicy" "null" else report_result "Fail" fi diff --git a/gcloud/lib/dataproc/cluster.sh b/gcloud/lib/dataproc/cluster.sh index ddbb40fb..1f3f9424 100644 --- a/gcloud/lib/dataproc/cluster.sh +++ b/gcloud/lib/dataproc/cluster.sh @@ -43,7 +43,7 @@ function create_dpgce_cluster() { local gcloud_cmd=( gcloud dataproc clusters create "${CLUSTER_NAME}" --single-node -# --master-accelerator "type=${M_ACCELERATOR_TYPE}" + --master-accelerator "type=${M_ACCELERATOR_TYPE}" --master-machine-type "${M_MACHINE_TYPE}" --master-boot-disk-size 600 --master-local-ssd-interface=NVME @@ -77,6 +77,7 @@ function create_dpgce_cluster() { if time "${gcloud_cmd[@]}"; then report_result "Created" + refresh_resource_state "dataprocCluster" "exists_dpgce_cluster" "lib/dataproc/cluster.sh" else report_result "Fail" return 1 @@ -94,3 +95,8 @@ function delete_dpgce_cluster() { fi } export -f delete_dpgce_cluster + +function exists_dataproc_cluster_vms() { + _check_exists "gcloud compute instances list 
--project='${PROJECT_ID}' --filter='labels.goog-dataproc-cluster-name=${CLUSTER_NAME}' --format='json(name,zone,status)'" | jq 'if . == [] then null else . end' +} +export -f exists_dataproc_cluster_vms diff --git a/gcloud/lib/dataproc/private-cluster.sh b/gcloud/lib/dataproc/private-cluster.sh index f3c9c845..e93c9af4 100644 --- a/gcloud/lib/dataproc/private-cluster.sh +++ b/gcloud/lib/dataproc/private-cluster.sh @@ -82,6 +82,7 @@ function create_dpgce_private_cluster() { if "${gcloud_cmd[@]}"; then report_result "Created" + refresh_resource_state "dataprocCluster" "exists_dpgce_cluster" "lib/dataproc/cluster.sh" else report_result "Fail" return 1 diff --git a/gcloud/lib/gcp/gcr.sh b/gcloud/lib/gcp/gcr.sh index f3a84082..be0e51cd 100644 --- a/gcloud/lib/gcp/gcr.sh +++ b/gcloud/lib/gcp/gcr.sh @@ -8,6 +8,7 @@ function create_artifacts_repository(){ --repository-format=docker \ --location="${REGION}" --project="${PROJECT_ID}"; then report_result "Created" + refresh_resource_state "artifactsRepository" "exists_artifacts_repository" "lib/gcp/gcr.sh" else report_result "Fail" return 1 diff --git a/gcloud/lib/gcp/gcs.sh b/gcloud/lib/gcp/gcs.sh index c59182b9..b00cd5ca 100644 --- a/gcloud/lib/gcp/gcs.sh +++ b/gcloud/lib/gcp/gcs.sh @@ -20,6 +20,10 @@ function create_gcs_bucket () { if ! 
gsutil ls -b "gs://${bucket_name}" > /dev/null 2>&1 ; then if run_gcloud "${log_file}" gsutil mb -c "${storage_class}" -l "${REGION}" "gs://${bucket_name}"; then report_result "Created" + local cache_key="gcsBucket-${bucket_name}" + if [[ "${bucket_name}" == "${BUCKET}" ]]; then cache_key="gcsBucket"; fi + if [[ "${bucket_name}" == "${TEMP_BUCKET}" ]]; then cache_key="gcsTempBucket"; fi + refresh_resource_state "${cache_key}" "exists_gcs_bucket ${bucket_name}" "lib/gcp/gcs.sh" else report_result "Fail" return 1 diff --git a/gcloud/lib/gcp/iam.sh b/gcloud/lib/gcp/iam.sh index 47926c5e..a37e2d9e 100644 --- a/gcloud/lib/gcp/iam.sh +++ b/gcloud/lib/gcp/iam.sh @@ -99,6 +99,7 @@ function delete_service_account() { if run_gcloud "${log_file}" gcloud iam service-accounts delete --quiet "${GSA}"; then report_result "Deleted" + update_state "serviceAccount" "null" else report_result "Fail" echo " - Failed to delete service account ${GSA}. Log content:" >&2 diff --git a/gcloud/lib/gcp/kms.sh b/gcloud/lib/gcp/kms.sh index 101c5560..62f7abc4 100644 --- a/gcloud/lib/gcp/kms.sh +++ b/gcloud/lib/gcp/kms.sh @@ -35,6 +35,7 @@ function create_kms_keyring() { local log_file="create_kms_keyring_${KMS_KEYRING}.log" if run_gcloud "${log_file}" gcloud kms keyrings create "${KMS_KEYRING}" --location=global --project="${PROJECT_ID}"; then report_result "Created" + refresh_resource_state "kmsKeyring" "exists_kms_keyring" "lib/gcp/kms.sh" else report_result "Fail" return 1 diff --git a/gcloud/lib/gcp/misc.sh b/gcloud/lib/gcp/misc.sh index 0ad77494..f0b5ba40 100644 --- a/gcloud/lib/gcp/misc.sh +++ b/gcloud/lib/gcp/misc.sh @@ -37,31 +37,44 @@ function configure_gcloud() { report_result "Pass" fi } +export -f configure_gcloud function check_project() { print_status "Verifying project ${PROJECT_ID}..." 
- local project_state - project_state=$(jq -r '.project.lifecycleState // "NOT_FOUND"' "${STATE_FILE}") + local project_raw + project_raw=$(get_state "project") + if [[ "${project_raw}" == "null" || -z "${project_raw}" ]]; then + print_status "Project not found in state DB" >&2 + report_result "Fail" + exit 1 + fi + local project_state=$(echo "${project_raw}" | jq -r '.lifecycleState // "NOT_FOUND"') if [[ "${project_state}" == "ACTIVE" ]]; then report_result "Pass" else report_result "Fail" - echo " - Project ${PROJECT_ID} is not ACTIVE or does not exist (state: ${project_state})." >&2 + echo " - Project ${PROJECT_ID} is not ACTIVE (state: ${project_state})." >&2 exit 1 fi } function check_billing() { print_status "Verifying billing for ${PROJECT_ID}..." - local billing_enabled - billing_enabled=$(jq -r '.billing.billingEnabled // false' "${STATE_FILE}") + local billing_raw + billing_raw=$(get_state "billing") + if [[ "${billing_raw}" == "null" || -z "${billing_raw}" ]]; then + print_status "Billing info not found in state DB" >&2 + report_result "Fail" + exit 1 + fi + local billing_enabled=$(echo "${billing_raw}" | jq -r '.billingEnabled // false') if [[ "${billing_enabled}" == "true" ]]; then report_result "Pass" else report_result "Fail" - echo " - Billing is not enabled for project ${PROJECT_ID} according to state file." >&2 + echo " - Billing is not enabled for project ${PROJECT_ID}." >&2 echo " - Please run: gcloud beta billing projects link ${PROJECT_ID} --billing-account " >&2 exit 1 fi @@ -127,4 +140,3 @@ function exists_debug_vms() { _check_exists "gcloud compute instances list --project='${PROJECT_ID}' --filter='name~^debug-' --format='json(name,zone,status)'" | jq 'if . == [] then null else . 
end' } export -f exists_debug_vms -export -f exists_debug_vms diff --git a/gcloud/lib/network/network.sh b/gcloud/lib/network/network.sh index bf2dd867..a66ce8e2 100644 --- a/gcloud/lib/network/network.sh +++ b/gcloud/lib/network/network.sh @@ -16,18 +16,38 @@ function create_vpc_network () { --bgp-routing-mode="regional" \ --description="network for use with Dataproc cluster ${CLUSTER_NAME}"; then report_result "Created" + refresh_resource_state "vpcNetwork" "exists_network" "lib/network/network.sh" else report_result "Fail" return 1 fi } +export -f create_vpc_network function delete_vpc_network () { print_status "Deleting VPC Network ${NETWORK}..." local log_file="delete_vpc_${NETWORK}.log" if run_gcloud "${log_file}" gcloud compute networks delete --quiet "${NETWORK}" --project="${PROJECT_ID}"; then report_result "Deleted" + update_state "vpcNetwork" "null" else report_result "Fail" fi } +export -f delete_vpc_network + +function delete_all_network_vms() { + print_status "Deleting all remaining VMs in network ${NETWORK}..." + local log_file="delete_all_network_vms_${NETWORK}.log" + local vms=$(gcloud compute instances list --project="${PROJECT_ID}" --filter="networkInterfaces.network ~ /${NETWORK}$" --format="value(NAME,ZONE)" 2>/dev/null) + if [[ -n "${vms}" ]]; then + echo "${vms}" | while read -r name zone; do + print_status " Deleting VM ${name} in ${zone}..." 
+ run_gcloud "delete_vm_${name}.log" gcloud compute instances delete "${name}" --zone "${zone}" --quiet || true + done + report_result "Done" + else + report_result "None Found" + fi +} +export -f delete_all_network_vms diff --git a/gcloud/lib/network/peering.sh b/gcloud/lib/network/peering.sh index b571dc35..866fa77f 100644 --- a/gcloud/lib/network/peering.sh +++ b/gcloud/lib/network/peering.sh @@ -27,6 +27,7 @@ function delete_ip_allocation () { if gcloud compute addresses describe ${ALLOCATION_NAME} --global --project="${PROJECT_ID}" > /dev/null 2>&1; then if run_gcloud "${log_file}" gcloud compute addresses delete --quiet --global ${ALLOCATION_NAME}; then report_result "Deleted" + update_state "ipAllocation" "null" else report_result "Fail" fi diff --git a/gcloud/lib/network/router.sh b/gcloud/lib/network/router.sh index c178468b..f42138d0 100644 --- a/gcloud/lib/network/router.sh +++ b/gcloud/lib/network/router.sh @@ -16,6 +16,7 @@ function create_router () { --asn="${ASN_NUMBER}" \ --region="${REGION}"; then report_result "Created" + refresh_resource_state "cloudRouter" "exists_router" "lib/network/router.sh" else report_result "Fail" return 1 @@ -33,6 +34,7 @@ function add_nat_to_router () { --nat-custom-subnet-ip-ranges "${SUBNET}" \ --auto-allocate-nat-external-ips; then report_result "Created" + refresh_resource_state "cloudRouter" "exists_router" "lib/network/router.sh" else report_result "Fail" return 1 @@ -55,6 +57,7 @@ function delete_router () { --region="${REGION}" \ --project="${PROJECT_ID}"; then report_result "Deleted" + update_state "cloudRouter" "null" else report_result "Fail" fi diff --git a/gcloud/lib/network/routes.sh b/gcloud/lib/network/routes.sh index de011ead..66714ec0 100644 --- a/gcloud/lib/network/routes.sh +++ b/gcloud/lib/network/routes.sh @@ -31,6 +31,7 @@ function delete_route() { if gcloud compute routes describe "${route_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; then if run_gcloud "${log_file}" gcloud compute routes 
delete --quiet "${route_name}" --project="${PROJECT_ID}"; then report_result "Deleted" + refresh_resource_state "routes" "_check_exists \"gcloud compute routes list --project='${PROJECT_ID}' --filter='network~\\"/${NETWORK}$\\"' --format='json(name,selfLink)'\"" else report_result "Fail" echo " - Failed to delete route ${route_name}. Log content:" >&2 diff --git a/gcloud/lib/network/subnet.sh b/gcloud/lib/network/subnet.sh index 5c92ef27..ac0b01c7 100644 --- a/gcloud/lib/network/subnet.sh +++ b/gcloud/lib/network/subnet.sh @@ -9,16 +9,20 @@ function exists_subnet() { export -f exists_subnet function create_subnet () { - print_status "Creating Subnet ${SUBNET}..." - local log_file="create_subnet_${SUBNET}.log" - if run_gcloud "${log_file}" gcloud compute networks subnets create "${SUBNET}" \ + local subnet_name="$1" + local subnet_key="$2" + local range="$3" + print_status "Creating Subnet ${subnet_name}..." + local log_file="create_subnet_${subnet_name}.log" + if run_gcloud "${log_file}" gcloud compute networks subnets create "${subnet_name}" \ --project="${PROJECT_ID}" \ --network="${NETWORK}" \ - --range="${RANGE}" \ + --range="${range}" \ --enable-private-ip-google-access \ --region="${REGION}" \ --description="subnet for use with Dataproc cluster ${CLUSTER_NAME}"; then report_result "Created" + refresh_resource_state "${subnet_key}" "exists_subnet ${subnet_name}" "lib/network/subnet.sh" else report_result "Fail" return 1 From cce11e0b8f8da94fb3db34233b0e59c14cd5df35 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Wed, 18 Mar 2026 21:38:28 +0000 Subject: [PATCH 06/25] fix: Correct _check_exists calls and address code review - Ensure all calls to _check_exists pass command and arguments as separate array elements, not a single string. This fixes the audit not detecting some existing resources. - Refactor refresh_resource_state to handle array arguments. - Add exit 1 to init/gce-proxy-setup.sh if proxy connection test fails. 
- Improve stderr logging in bin/audit-dpgce background checks. - Fail script in lib/swp/policy.sh if security rule deletion fails. - Add clarifying comments in bin/destroy-dpgce. - Fix relative paths in lib/env.sh and lib/dataproc/private-cluster.sh. - Run audit at the start of create-dpgce. - Add NAT pre-delete back to lib/network/router.sh. --- gcloud/bin/audit-dpgce | 10 +-- gcloud/bin/create-dpgce | 97 ++++++++++++++++++++++---- gcloud/bin/destroy-dpgce | 5 +- gcloud/init/gce-proxy-setup.sh | 9 +-- gcloud/lib/bigtable.sh | 2 +- gcloud/lib/database/mssql.sh | 2 +- gcloud/lib/database/mysql.sh | 2 +- gcloud/lib/database/oracle.sh | 2 +- gcloud/lib/database/pgsql.sh | 2 +- gcloud/lib/dataproc/autoscaling.sh | 5 +- gcloud/lib/dataproc/cluster.sh | 6 +- gcloud/lib/dataproc/private-cluster.sh | 4 +- gcloud/lib/env.sh | 70 +++++++++---------- gcloud/lib/gcp/gcr.sh | 4 +- gcloud/lib/gcp/iam.sh | 2 +- gcloud/lib/gcp/kms.sh | 4 +- gcloud/lib/gcp/misc.sh | 2 +- gcloud/lib/gke.sh | 2 +- gcloud/lib/kerberos.sh | 2 +- gcloud/lib/network/network.sh | 4 +- gcloud/lib/network/peering.sh | 2 +- gcloud/lib/network/router.sh | 14 +++- gcloud/lib/network/routes.sh | 2 +- gcloud/lib/network/subnet.sh | 4 +- gcloud/lib/phs.sh | 2 +- gcloud/lib/script-utils.sh | 73 +++++++++++-------- gcloud/lib/swp/certs.sh | 2 +- gcloud/lib/swp/firewall.sh | 2 +- gcloud/lib/swp/gateway.sh | 2 +- gcloud/lib/swp/policy.sh | 4 +- 30 files changed, 219 insertions(+), 124 deletions(-) diff --git a/gcloud/bin/audit-dpgce b/gcloud/bin/audit-dpgce index b73e3e67..c938b3dd 100755 --- a/gcloud/bin/audit-dpgce +++ b/gcloud/bin/audit-dpgce @@ -34,7 +34,7 @@ run_check() { ( source "${GCLOUD_DIR}/${source_file}" - "${func_name}" "$@" > "${cmd_out}" 2>/dev/null + "${func_name}" "$@" > "${cmd_out}" 2> "${cmd_out%.json}.err" ) & PIDS+=($!) 
} @@ -46,14 +46,14 @@ run_shared_check() { shift 2 local cmd_out="${AUDIT_TEMP_DIR}/${key}.json" ( - "${func_name}" "$@" > "${cmd_out}" 2>/dev/null + "${func_name}" "$@" > "${cmd_out}" 2> "${cmd_out%.json}.err" ) & PIDS+=($!) } # --- Infrastructure State Checks --- -run_shared_check "project" _check_exists "gcloud projects describe '${PROJECT_ID}' --format='json(lifecycleState,projectId)'" -run_shared_check "billing" _check_exists "gcloud beta billing projects describe '${PROJECT_ID}' --format='json(billingEnabled)'" +run_shared_check "project" _check_exists gcloud projects describe "${PROJECT_ID}" --format="json(lifecycleState,projectId)" +run_shared_check "billing" _check_exists gcloud beta billing projects describe "${PROJECT_ID}" --format="json(billingEnabled)" # --- Resource Existence Checks --- run_check "vpcNetwork" "lib/network/network.sh" exists_network @@ -63,7 +63,7 @@ run_check "swpSubnet" "lib/network/subnet.sh" exists_subnet "${SWP_SUBNET}" run_check "cloudRouter" "lib/network/router.sh" exists_router run_check "firewallRule-ssh" "lib/swp/firewall.sh" exists_firewall_rule "${FIREWALL}-in-ssh" run_check "firewallRule-internal" "lib/swp/firewall.sh" exists_firewall_rule "${FIREWALL}-in-internal" -run_shared_check "routes" _check_exists "gcloud compute routes list --project='${PROJECT_ID}' --filter='network~"/${NETWORK}$"' --format='json(name,selfLink)'" +run_shared_check "routes" _check_exists gcloud compute routes list --project="${PROJECT_ID}" --filter="network~/${NETWORK}$" --format="json(name,selfLink)" run_check "serviceAccount" "lib/gcp/iam.sh" exists_service_account run_check "autoscalingPolicy" "lib/dataproc/autoscaling.sh" exists_autoscaling_policy run_check "dataprocCluster" "lib/dataproc/cluster.sh" exists_dpgce_cluster diff --git a/gcloud/bin/create-dpgce b/gcloud/bin/create-dpgce index a2036ae5..0f1f5bde 100755 --- a/gcloud/bin/create-dpgce +++ b/gcloud/bin/create-dpgce @@ -1,5 +1,4 @@ #!/bin/bash - # Exit on failure set -e @@ -15,17 +14,29 
@@ source "${GCLOUD_DIR}/lib/env.sh" IS_CUSTOM=false IS_PRIVATE=false CREATE_CLUSTER=true +SWP_EGRESS=false +NAT_EGRESS=false while [[ "$#" -gt 0 ]]; do case $1 in - --custom) IS_CUSTOM=true ;; - --private) IS_PRIVATE=true ;; - --no-create-cluster) CREATE_CLUSTER=false ;; + --custom) IS_CUSTOM=true; shift ;; + --no-custom) IS_CUSTOM=false; shift ;; + --private) IS_PRIVATE=true; shift ;; + --swp-egress) SWP_EGRESS=true; shift ;; + --no-swp-egress) SWP_EGRESS=false; shift ;; + --nat-egress) NAT_EGRESS=true; shift ;; + --no-nat-egress) NAT_EGRESS=false; shift ;; + --no-create-cluster) CREATE_CLUSTER=false; shift ;; *) echo "Unknown parameter passed: $1"; exit 1 ;; esac - shift done +# Export boolean flags for use in functions +export IS_CUSTOM +export IS_PRIVATE +export SWP_EGRESS +export NAT_EGRESS + if (( DEBUG != 0 )); then set -x fi @@ -50,6 +61,11 @@ source "${GCLOUD_DIR}/lib/dataproc/private-cluster.sh" configure_gcloud init_state_db +print_status "Auditing environment to ensure cache is fresh..." 
+"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null +report_result "Done" + + echo "--- Current Environment State (from cache) --- " # Function to display resource status from DB display_status() { @@ -92,12 +108,45 @@ if [[ "${CREATE_CLUSTER}" == "true" ]]; then INTENDED+=("dataprocCluster") fi +# Helper to check if a resource needs creation +needs_create() { + local key="$1" + [[ $(get_state "${key}") == "null" ]] +} + +# Add to plan if resource is missing for resource in "${INTENDED[@]}"; do - if [[ $(get_state "${resource}") == "null" ]]; then - PLAN+=("CREATE_${resource}") + if [[ "${resource}" != "dataprocCluster" ]]; then + if needs_create "${resource}"; then + PLAN+=("CREATE_${resource}") + fi fi done +# Special handling for dataprocCluster to check for recreation needs +if [[ " ${INTENDED[*]} " =~ " dataprocCluster " ]]; then + cluster_state=$(get_state "dataprocCluster") + if [[ "${cluster_state}" == "null" ]]; then + PLAN+=("CREATE_dataprocCluster") + else + # Cluster exists, check if it matches the image intent + existing_image_uri=$(echo "${cluster_state}" | jq -r '.config.masterConfig.imageUri // empty') + existing_image_version=$(echo "${cluster_state}" | jq -r '.config.softwareConfig.imageVersion // empty') + + if [[ "${IS_CUSTOM}" == "true" ]]; then + if [[ "${existing_image_uri}" != "${CUSTOM_IMAGE_URI}" ]]; then + echo "INFO: Plan to recreate cluster: --custom specified, but existing image is not ${CUSTOM_IMAGE_URI}" >&2 + PLAN+=("RECREATE_dataprocCluster") + fi + else # Not custom + if [[ -n "${existing_image_uri}" || "${existing_image_version}" != "${IMAGE_VERSION}" ]]; then + echo "INFO: Plan to recreate cluster: --no-custom specified, but existing cluster seems to use a custom image" >&2 + PLAN+=("RECREATE_dataprocCluster") + fi + fi + fi +fi + if [[ ${#PLAN[@]} -eq 0 ]]; then echo "No actions needed. Environment matches target state." 
exit 0 @@ -118,29 +167,49 @@ if [[ " ${PLAN[*]} " =~ " CREATE_serviceAccount " ]]; then create_service_accoun # GCS Buckets - No strong dependencies between them if [[ " ${PLAN[*]} " =~ " CREATE_gcsBucket " ]]; then create_gcs_bucket "${BUCKET}" "Standard"; grant_gcs_bucket_perms "${BUCKET}"; fi if [[ " ${PLAN[*]} " =~ " CREATE_gcsTempBucket " ]]; then create_gcs_bucket "${TEMP_BUCKET}" "Standard"; grant_gcs_bucket_perms "${TEMP_BUCKET}"; fi -upload_init_actions # Always run if buckets were created or might be missing init actions +if [[ " ${PLAN[*]} " =~ " CREATE_gcsBucket " || " ${PLAN[*]} " =~ " CREATE_gcsTempBucket " ]]; then upload_init_actions; fi if [[ " ${PLAN[*]} " =~ " CREATE_vpcNetwork " ]]; then create_vpc_network; ensure_default_internet_route; fi if [[ " ${PLAN[*]} " =~ " CREATE_standardSubnet " ]]; then create_subnet "${SUBNET}" "standardSubnet" "${RANGE}"; fi if [[ " ${PLAN[*]} " =~ " CREATE_privateSubnet " ]]; then source "${GCLOUD_DIR}/lib/swp/subnet.sh"; create_private_subnet; fi -if [[ " ${PLAN[*]} " =~ " CREATE_cloudRouter " ]]; then create_router; add_nat_to_router; fi + +if [[ " ${PLAN[*]} " =~ " CREATE_cloudRouter " ]]; then + create_router + if [[ "${NAT_EGRESS}" == "true" ]]; then add_nat_to_router; fi +elif [[ "${NAT_EGRESS}" == "true" && $(get_state "cloudRouter") != "null" ]]; then + # TODO: Check if nat-config is already on the router, beyond just router existence. + echo "INFO: NAT Egress enabled, ensuring NAT on existing router..." + add_nat_to_router +fi + if [[ " ${PLAN[*]} " =~ " CREATE_firewallRule-ssh " ]]; then create_firewall_rules; fi if [[ " ${PLAN[*]} " =~ " CREATE_autoscalingPolicy " ]]; then create_autoscaling_policy; fi -if [[ " ${PLAN[*]} " =~ " CREATE_dataprocCluster " ]]; then +# Cluster Deletion if RECREATE is in plan +if [[ " ${PLAN[*]} " =~ " RECREATE_dataprocCluster " ]]; then + echo "Recreating cluster due to configuration mismatch..." 
+ delete_dpgce_cluster + update_state "dataprocCluster" "null" # Ensure state is updated after delete +fi + +# Cluster Creation if CREATE or RECREATE is in plan +if [[ " ${PLAN[*]} " =~ " CREATE_dataprocCluster " || " ${PLAN[*]} " =~ " RECREATE_dataprocCluster " ]]; then if [[ "${IS_PRIVATE}" == "true" ]]; then create_dpgce_private_cluster else - if [[ "$IS_CUSTOM" == "true" ]]; then - source "${GCLOUD_DIR}/lib/dataproc/cluster-custom.sh" - fi + # create_dpgce_cluster is already sourced create_dpgce_cluster fi fi +# TODO: Implement SWP Egress setup if SWP_EGRESS is true +if [[ "${SWP_EGRESS}" == "true" ]]; then + echo "WARNING: SWP Egress setup is not yet implemented." >&2 +fi + echo "Plan execution finished." # --- Final Audit --- print_status "Running final audit to update cache..." "${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null -report_result "Done" \ No newline at end of file +report_result "Done" diff --git a/gcloud/bin/destroy-dpgce b/gcloud/bin/destroy-dpgce index 345f77ff..5a503e29 100755 --- a/gcloud/bin/destroy-dpgce +++ b/gcloud/bin/destroy-dpgce @@ -100,8 +100,9 @@ ROUTES_RAW=$(get_state "routes") if [[ "${ROUTES_RAW}" != "null" && "${ROUTES_RAW}" != "[]" ]]; then mapfile -t route_names < <(echo "${ROUTES_RAW}" | jq -r '.[].name') for route_name in "${route_names[@]}"; do - # Exclude the default internet gateway route and local routes - if [[ "${route_name}" != "default-internet-${NETWORK}" && ! "${route_name}" =~ ^default-route-r- ]]; then + # Exclude only local routes (default-route-r-.*), allow default internet route to be deleted. + # The delete_route function is called for each route name found in the state cache. + if [[ ! 
"${route_name}" =~ ^default-route-r- ]]; then delete_route "${route_name}" fi done diff --git a/gcloud/init/gce-proxy-setup.sh b/gcloud/init/gce-proxy-setup.sh index 434fa5c1..9bf79dc9 100644 --- a/gcloud/init/gce-proxy-setup.sh +++ b/gcloud/init/gce-proxy-setup.sh @@ -173,10 +173,11 @@ function set_proxy(){ local proxy_port=$(echo "${http_proxy_val}" | cut -d: -f2) echo "DEBUG: set_proxy: Testing TCP connection to proxy ${proxy_host}:${proxy_port}..." - if ! nc -zv -w 5 "${proxy_host}" "${proxy_port}"; then - echo "ERROR: Failed to establish TCP connection to proxy ${proxy_host}:${proxy_port}." - exit 1 - else + + if ! nc -zv -w 5 "${proxy_host}" "${proxy_port}"; then + echo "ERROR: Failed to establish TCP connection to proxy ${proxy_host}:${proxy_port}." + exit 1 + else echo "DEBUG: set_proxy: TCP connection to proxy successful." fi diff --git a/gcloud/lib/bigtable.sh b/gcloud/lib/bigtable.sh index c4493b7c..4f6015ef 100644 --- a/gcloud/lib/bigtable.sh +++ b/gcloud/lib/bigtable.sh @@ -3,7 +3,7 @@ # Bigtable functions function exists_bigtable_instance() { - _check_exists "gcloud bigtable instances describe '${BIGTABLE_INSTANCE}' --format='json(name,displayName)'" + _check_exists gcloud bigtable instances describe "${BIGTABLE_INSTANCE}" --format="json(name,displayName)" } export -f exists_bigtable_instance diff --git a/gcloud/lib/database/mssql.sh b/gcloud/lib/database/mssql.sh index d3d58cc6..a8637107 100644 --- a/gcloud/lib/database/mssql.sh +++ b/gcloud/lib/database/mssql.sh @@ -29,7 +29,7 @@ function create_legacy_mssql_instance() { export -f create_legacy_mssql_instance function exists_legacy_mssql_instance() { - _check_exists "gcloud compute instances describe '${MSSQL_INSTANCE}' --zone '${ZONE}' --project='${PROJECT_ID}' --format='json(name,status)'" + _check_exists gcloud compute instances describe "${MSSQL_INSTANCE}" --zone "${ZONE}" --project="${PROJECT_ID}" --format="json(name,status)" } export -f exists_legacy_mssql_instance diff --git 
a/gcloud/lib/database/mysql.sh b/gcloud/lib/database/mysql.sh index 0cb8ff97..01ea79bd 100644 --- a/gcloud/lib/database/mysql.sh +++ b/gcloud/lib/database/mysql.sh @@ -32,6 +32,6 @@ function delete_mysql_instance() { export -f delete_mysql_instance function exists_mysql_instance() { - _check_exists "gcloud sql instances describe '${MYSQL_INSTANCE}' --project='${PROJECT_ID}' --format='json(name,state)'" + _check_exists gcloud sql instances describe "${MYSQL_INSTANCE}" --project="${PROJECT_ID}" --format="json(name,state)" } export -f exists_mysql_instance diff --git a/gcloud/lib/database/oracle.sh b/gcloud/lib/database/oracle.sh index 7c71e6f7..0065033e 100644 --- a/gcloud/lib/database/oracle.sh +++ b/gcloud/lib/database/oracle.sh @@ -125,6 +125,6 @@ function delete_oracle_vm() { export -f delete_oracle_vm function exists_oracle_vm() { - _check_exists "gcloud compute instances describe '${ORACLE_VM_NAME}' --zone '${ZONE}' --project='${PROJECT_ID}' --format='json(name,status)'" + _check_exists gcloud compute instances describe "${ORACLE_VM_NAME}" --zone "${ZONE}" --project="${PROJECT_ID}" --format="json(name,status)" } export -f exists_oracle_vm diff --git a/gcloud/lib/database/pgsql.sh b/gcloud/lib/database/pgsql.sh index d53dbd02..c0adfd53 100644 --- a/gcloud/lib/database/pgsql.sh +++ b/gcloud/lib/database/pgsql.sh @@ -33,6 +33,6 @@ function delete_pgsql_instance() { export -f delete_pgsql_instance function exists_pgsql_instance() { - _check_exists "gcloud sql instances describe '${PGSQL_INSTANCE}' --project='${PROJECT_ID}' --format='json(name,state)'" + _check_exists gcloud sql instances describe "${PGSQL_INSTANCE}" --project="${PROJECT_ID}" --format="json(name,state)" } export -f exists_pgsql_instance diff --git a/gcloud/lib/dataproc/autoscaling.sh b/gcloud/lib/dataproc/autoscaling.sh index cd6a0c2a..a84cd666 100644 --- a/gcloud/lib/dataproc/autoscaling.sh +++ b/gcloud/lib/dataproc/autoscaling.sh @@ -3,15 +3,16 @@ # Dataproc Autoscaling Policy functions function 
exists_autoscaling_policy() { - _check_exists "gcloud dataproc autoscaling-policies describe '${AUTOSCALING_POLICY_NAME}' --region='${REGION}' --format='json(id,name)'" + _check_exists gcloud dataproc autoscaling-policies describe "${AUTOSCALING_POLICY_NAME}" --region="${REGION}" --format="json(id,name)" } export -f exists_autoscaling_policy function create_autoscaling_policy() { print_status "Creating Autoscaling Policy ${AUTOSCALING_POLICY_NAME}..." local log_file="create_autoscaling_${AUTOSCALING_POLICY_NAME}.log" - if run_gcloud "${log_file}" gcloud dataproc autoscaling-policies import "${AUTOSCALING_POLICY_NAME}" --region="${REGION}" --source=autoscaling-policy.yaml; then + if run_gcloud "${log_file}" gcloud dataproc autoscaling-policies import "${AUTOSCALING_POLICY_NAME}" --region="${REGION}" --source="${GCLOUD_DIR}/autoscaling-policy.yaml" --quiet; then report_result "Created" + refresh_resource_state "autoscalingPolicy" "lib/dataproc/autoscaling.sh" exists_autoscaling_policy else report_result "Fail" return 1 diff --git a/gcloud/lib/dataproc/cluster.sh b/gcloud/lib/dataproc/cluster.sh index 1f3f9424..542bcdf0 100644 --- a/gcloud/lib/dataproc/cluster.sh +++ b/gcloud/lib/dataproc/cluster.sh @@ -3,7 +3,7 @@ # Dataproc Cluster Management Functions function exists_dpgce_cluster() { - _check_exists "gcloud dataproc clusters describe '${CLUSTER_NAME}' --region='${REGION}' --project='${PROJECT_ID}' --format='json(clusterName,clusterUuid,status.selfLink)'" + _check_exists gcloud dataproc clusters describe "${CLUSTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" --format="json(clusterName,clusterUuid,status.selfLink,config.softwareConfig.imageVersion,config.masterConfig.imageUri)" } export -f exists_dpgce_cluster @@ -77,7 +77,7 @@ function create_dpgce_cluster() { if time "${gcloud_cmd[@]}"; then report_result "Created" - refresh_resource_state "dataprocCluster" "exists_dpgce_cluster" "lib/dataproc/cluster.sh" + refresh_resource_state "dataprocCluster" 
"lib/dataproc/cluster.sh" exists_dpgce_cluster else report_result "Fail" return 1 @@ -97,6 +97,6 @@ function delete_dpgce_cluster() { export -f delete_dpgce_cluster function exists_dataproc_cluster_vms() { - _check_exists "gcloud compute instances list --project='${PROJECT_ID}' --filter='labels.goog-dataproc-cluster-name=${CLUSTER_NAME}' --format='json(name,zone,status)'" | jq 'if . == [] then null else . end' + _check_exists gcloud compute instances list --project="${PROJECT_ID}" --filter="labels.goog-dataproc-cluster-name=${CLUSTER_NAME}" --format="json(name,zone,status)" | jq 'if . == [] then null else . end' } export -f exists_dataproc_cluster_vms diff --git a/gcloud/lib/dataproc/private-cluster.sh b/gcloud/lib/dataproc/private-cluster.sh index e93c9af4..9078ba50 100644 --- a/gcloud/lib/dataproc/private-cluster.sh +++ b/gcloud/lib/dataproc/private-cluster.sh @@ -2,7 +2,7 @@ # # Dataproc Private Cluster Management Functions -source lib/dataproc/cluster.sh # Source the base cluster functions to reuse exists_dpgce_cluster +source "${GCLOUD_DIR}/lib/dataproc/cluster.sh" # Source the base cluster functions to reuse exists_dpgce_cluster function create_dpgce_private_cluster() { print_status "Creating Private Dataproc Cluster ${CLUSTER_NAME}..." 
@@ -82,7 +82,7 @@ function create_dpgce_private_cluster() { if "${gcloud_cmd[@]}"; then report_result "Created" - refresh_resource_state "dataprocCluster" "exists_dpgce_cluster" "lib/dataproc/cluster.sh" + refresh_resource_state "dataprocCluster" "lib/dataproc/cluster.sh" exists_dpgce_cluster else report_result "Fail" return 1 diff --git a/gcloud/lib/env.sh b/gcloud/lib/env.sh index bf76ccfb..49136f46 100644 --- a/gcloud/lib/env.sh +++ b/gcloud/lib/env.sh @@ -29,35 +29,35 @@ mkdir -p "${LOG_DIR}" export STATE_DB="${REPRO_TMPDIR}/state.db" -source lib/script-utils.sh +source "${GCLOUD_DIR}/lib/script-utils.sh" export PATH_SEPARATOR=";" -export FOLDER_NUMBER="$(jq -r .FOLDER_NUMBER env.json)" -export DOMAIN="$(jq -r .DOMAIN env.json)" -export USER="$(jq -r .USER env.json)" -export PRIV_DOMAIN="$(jq -r .PRIV_DOMAIN env.json)" -export PRIV_USER="$(jq -r .PRIV_USER env.json)" -export PROJECT_ID="$(jq -r .PROJECT_ID env.json)" +export FOLDER_NUMBER="$(jq -r .FOLDER_NUMBER "${GCLOUD_DIR}/env.json")" +export DOMAIN="$(jq -r .DOMAIN "${GCLOUD_DIR}/env.json")" +export USER="$(jq -r .USER "${GCLOUD_DIR}/env.json")" +export PRIV_DOMAIN="$(jq -r .PRIV_DOMAIN "${GCLOUD_DIR}/env.json")" +export PRIV_USER="$(jq -r .PRIV_USER "${GCLOUD_DIR}/env.json")" +export PROJECT_ID="$(jq -r .PROJECT_ID "${GCLOUD_DIR}/env.json")" if [[ "${PROJECT_ID}" == "ldap-example-yyyy-nn" ]]; then export PROJECT_ID="${USER}-example-$(date +%Y-%U)" fi -export BILLING_ACCOUNT="$(jq -r .BILLING_ACCOUNT env.json)" -export CLUSTER_NAME="$(jq -r .CLUSTER_NAME env.json)" -export BUCKET="$(jq -r .BUCKET env.json)" -export TEMP_BUCKET="$(jq -r .TEMP_BUCKET env.json)" -export RANGE="$(jq -r .RANGE env.json)" -export PRIVATE_RANGE="$(jq -r .PRIVATE_RANGE env.json)" +export BILLING_ACCOUNT="$(jq -r .BILLING_ACCOUNT "${GCLOUD_DIR}/env.json")" +export CLUSTER_NAME="$(jq -r .CLUSTER_NAME "${GCLOUD_DIR}/env.json")" +export BUCKET="$(jq -r .BUCKET "${GCLOUD_DIR}/env.json")" +export TEMP_BUCKET="$(jq -r .TEMP_BUCKET 
"${GCLOUD_DIR}/env.json")" +export RANGE="$(jq -r .RANGE "${GCLOUD_DIR}/env.json")" +export PRIVATE_RANGE="$(jq -r .PRIVATE_RANGE "${GCLOUD_DIR}/env.json")" export PRIVATE_SUBNET="private-subnet-${CLUSTER_NAME}" -export SWP_RANGE="$(jq -r .SWP_RANGE env.json)" +export SWP_RANGE="$(jq -r .SWP_RANGE "${GCLOUD_DIR}/env.json")" export SWP_SUBNET="swp-subnet-${CLUSTER_NAME}" -export IDLE_TIMEOUT="$(jq -r .IDLE_TIMEOUT env.json)" -export ASN_NUMBER="$(jq -r .ASN_NUMBER env.json)" -export IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" -export REGION="$(jq -r .REGION env.json)" -export ZONE="$(jq -r .ZONE env.json)" -export SWP_IP="$(jq -r .SWP_IP env.json)" -export SWP_PORT="$(jq -r .SWP_PORT env.json)" -export SWP_HOSTNAME="$(jq -r .SWP_HOSTNAME env.json)" +export IDLE_TIMEOUT="$(jq -r .IDLE_TIMEOUT "${GCLOUD_DIR}/env.json")" +export ASN_NUMBER="$(jq -r .ASN_NUMBER "${GCLOUD_DIR}/env.json")" +export IMAGE_VERSION="$(jq -r .IMAGE_VERSION "${GCLOUD_DIR}/env.json")" +export REGION="$(jq -r .REGION "${GCLOUD_DIR}/env.json")" +export ZONE="$(jq -r .ZONE "${GCLOUD_DIR}/env.json")" +export SWP_IP="$(jq -r .SWP_IP "${GCLOUD_DIR}/env.json")" +export SWP_PORT="$(jq -r .SWP_PORT "${GCLOUD_DIR}/env.json")" +export SWP_HOSTNAME="$(jq -r .SWP_HOSTNAME "${GCLOUD_DIR}/env.json")" export SWP_POLICY_NAME="swp-policy-${CLUSTER_NAME}" export SWP_INSTANCE_NAME="swp-gateway-${CLUSTER_NAME}" export SWP_CERT_NAME="swp-cert-${CLUSTER_NAME}-${RESOURCE_SUFFIX}" @@ -120,15 +120,15 @@ export PRINCIPAL="${USER}@${DOMAIN}" export ARTIFACT_REPOSITORY="${PROJECT_ID}-dataproc-repro" # BigTable -export BIGTABLE_INSTANCE="$(jq -r .BIGTABLE_INSTANCE env.json)" +export BIGTABLE_INSTANCE="$(jq -r .BIGTABLE_INSTANCE "${GCLOUD_DIR}/env.json")" if [[ "${BIGTABLE_INSTANCE}" == "null" ]]; then BIGTABLE_INSTANCE="${USER}-bigtable0" fi -export BIGTABLE_DISPLAY_NAME="$(jq -r .BIGTABLE_DISPLAY_NAME env.json)" +export BIGTABLE_DISPLAY_NAME="$(jq -r .BIGTABLE_DISPLAY_NAME "${GCLOUD_DIR}/env.json")" if [[ 
"${BIGTABLE_DISPLAY_NAME}" == "null" ]]; then BIGTABLE_DISPLAY_NAME="bigtable-${CLUSTER_NAME}" fi -export BIGTABLE_CLUSTER_CONFIG="$(jq -r .BIGTABLE_CLUSTER_CONFIG env.json)" +export BIGTABLE_CLUSTER_CONFIG="$(jq -r .BIGTABLE_CLUSTER_CONFIG "${GCLOUD_DIR}/env.json")" if [[ "${BIGTABLE_CLUSTER_CONFIG}" == "null" ]]; then BIGTABLE_CLUSTER_CONFIG="id=${BIGTABLE_DISPLAY_NAME},zone=${ZONE},nodes=3" fi @@ -301,15 +301,15 @@ export WAREHOUSE_BUCKET="gs://${HIVE_DATA_BUCKET}" export HIVE_METASTORE_WAREHOUSE_DIR="${WAREHOUSE_BUCKET}/datasets" # CI/CD Variables -export CI_PROJECT_ID="$(jq -r .CI_PROJECT_ID env.json)" -export CI_GCP_CREDENTIALS_PATH="$(jq -r .CI_GCP_CREDENTIALS_PATH env.json)" -export CI_CSR_REPO_NAME="$(jq -r .CI_CSR_REPO_NAME env.json)" -export CI_CSR_REGION="$(jq -r .CI_CSR_REGION env.json)" -export CI_GITHUB_CONNECTION_NAME="$(jq -r .CI_GITHUB_CONNECTION_NAME env.json)" -export CI_TRIGGER_BRANCH="$(jq -r .CI_TRIGGER_BRANCH env.json)" -export CUSTOM_IMAGE_URI="$(jq -r .CUSTOM_IMAGE_URI env.json)" -export CI_REPO_OWNER="$(jq -r .CI_REPO_OWNER env.json)" -export CI_BYOSA_EMAIL="$(jq -r .CI_BYOSA_EMAIL env.json)" +export CI_PROJECT_ID="$(jq -r .CI_PROJECT_ID "${GCLOUD_DIR}/env.json")" +export CI_GCP_CREDENTIALS_PATH="$(jq -r .CI_GCP_CREDENTIALS_PATH "${GCLOUD_DIR}/env.json")" +export CI_CSR_REPO_NAME="$(jq -r .CI_CSR_REPO_NAME "${GCLOUD_DIR}/env.json")" +export CI_CSR_REGION="$(jq -r .CI_CSR_REGION "${GCLOUD_DIR}/env.json")" +export CI_GITHUB_CONNECTION_NAME="$(jq -r .CI_GITHUB_CONNECTION_NAME "${GCLOUD_DIR}/env.json")" +export CI_TRIGGER_BRANCH="$(jq -r .CI_TRIGGER_BRANCH "${GCLOUD_DIR}/env.json")" +export CUSTOM_IMAGE_URI="$(jq -r .CUSTOM_IMAGE_URI "${GCLOUD_DIR}/env.json")" +export CI_REPO_OWNER="$(jq -r .CI_REPO_OWNER "${GCLOUD_DIR}/env.json")" +export CI_BYOSA_EMAIL="$(jq -r .CI_BYOSA_EMAIL "${GCLOUD_DIR}/env.json")" function configure_environment() { dataproc_repro_configure_environment=1 @@ -355,7 +355,7 @@ function configure_environment() { # # MOK 
config for secure boot # - eval "$(bash lib/secure-boot/create-key-pair.sh)" + eval "$(bash "${GCLOUD_DIR}/lib/secure-boot/create-key-pair.sh")" #modulus_md5sum=cd2bd1bdd9f9e4c43c12aecf6c338d6f #private_secret_name=efi-db-priv-key-042 #public_secret_name=efi-db-pub-key-042 diff --git a/gcloud/lib/gcp/gcr.sh b/gcloud/lib/gcp/gcr.sh index be0e51cd..cbd61f57 100644 --- a/gcloud/lib/gcp/gcr.sh +++ b/gcloud/lib/gcp/gcr.sh @@ -8,7 +8,7 @@ function create_artifacts_repository(){ --repository-format=docker \ --location="${REGION}" --project="${PROJECT_ID}"; then report_result "Created" - refresh_resource_state "artifactsRepository" "exists_artifacts_repository" "lib/gcp/gcr.sh" + refresh_resource_state "artifactsRepository" "lib/gcp/gcr.sh" exists_artifacts_repository else report_result "Fail" return 1 @@ -17,7 +17,7 @@ function create_artifacts_repository(){ export -f create_artifacts_repository function exists_artifacts_repository() { - _check_exists "gcloud artifacts repositories describe '${ARTIFACT_REPOSITORY}' --location='${REGION}' --project='${PROJECT_ID}' --format='json(name,format)'" + _check_exists gcloud artifacts repositories describe "${ARTIFACT_REPOSITORY}" --location="${REGION}" --project="${PROJECT_ID}" --format="json(name,format)" } export -f exists_artifacts_repository diff --git a/gcloud/lib/gcp/iam.sh b/gcloud/lib/gcp/iam.sh index a37e2d9e..ab2d773a 100644 --- a/gcloud/lib/gcp/iam.sh +++ b/gcloud/lib/gcp/iam.sh @@ -3,7 +3,7 @@ # IAM related functions function exists_service_account() { - _check_exists "gcloud iam service-accounts describe '${GSA}' --project='${PROJECT_ID}' --format='json(email,name)'" + _check_exists gcloud iam service-accounts describe "${GSA}" --project="${PROJECT_ID}" --format="json(email,name)" } export -f exists_service_account diff --git a/gcloud/lib/gcp/kms.sh b/gcloud/lib/gcp/kms.sh index 62f7abc4..17b27582 100644 --- a/gcloud/lib/gcp/kms.sh +++ b/gcloud/lib/gcp/kms.sh @@ -35,7 +35,7 @@ function create_kms_keyring() { local 
log_file="create_kms_keyring_${KMS_KEYRING}.log" if run_gcloud "${log_file}" gcloud kms keyrings create "${KMS_KEYRING}" --location=global --project="${PROJECT_ID}"; then report_result "Created" - refresh_resource_state "kmsKeyring" "exists_kms_keyring" "lib/gcp/kms.sh" + refresh_resource_state "kmsKeyring" "lib/gcp/kms.sh" exists_kms_keyring else report_result "Fail" return 1 @@ -60,7 +60,7 @@ export -f create_kerberos_kdc_key function exists_kms_key() { local key_name="$1" - _check_exists "gcloud kms keys describe '${key_name}' --keyring='${KMS_KEYRING}' --location=global --project='${PROJECT_ID}' --format='json(name,primary.state)'" + _check_exists gcloud kms keys describe "${key_name}" --keyring="${KMS_KEYRING}" --location=global --project="${PROJECT_ID}" --format="json(name,primary.state)" } export -f exists_kms_key diff --git a/gcloud/lib/gcp/misc.sh b/gcloud/lib/gcp/misc.sh index f0b5ba40..1aa4ad80 100644 --- a/gcloud/lib/gcp/misc.sh +++ b/gcloud/lib/gcp/misc.sh @@ -137,6 +137,6 @@ function check_image_exists() { # Check for any debug VMs function exists_debug_vms() { - _check_exists "gcloud compute instances list --project='${PROJECT_ID}' --filter='name~^debug-' --format='json(name,zone,status)'" | jq 'if . == [] then null else . end' + _check_exists gcloud compute instances list --project="${PROJECT_ID}" --filter='name~^debug-' --format="json(name,zone,status)" | jq 'if . == [] then null else . 
end' } export -f exists_debug_vms diff --git a/gcloud/lib/gke.sh b/gcloud/lib/gke.sh index f494e838..3c95594c 100644 --- a/gcloud/lib/gke.sh +++ b/gcloud/lib/gke.sh @@ -73,6 +73,6 @@ function delete_dpgke_cluster() { export -f delete_dpgke_cluster function exists_dpgke_cluster() { - _check_exists "gcloud dataproc clusters describe '${DPGKE_CLUSTER_NAME}' --region '${REGION}' --project='${PROJECT_ID}' --format='json(clusterName,status.state)'" + _check_exists gcloud dataproc clusters describe "${DPGKE_CLUSTER_NAME}" --region "${REGION}" --project="${PROJECT_ID}" --format="json(clusterName,status.state)" } export -f exists_dpgke_cluster diff --git a/gcloud/lib/kerberos.sh b/gcloud/lib/kerberos.sh index 2f5c20b6..1c8b5b38 100644 --- a/gcloud/lib/kerberos.sh +++ b/gcloud/lib/kerberos.sh @@ -29,7 +29,7 @@ function create_kdc_server() { export -f create_kdc_server function exists_kdc_server() { - _check_exists "gcloud compute instances describe '${KDC_NAME}' --zone '${ZONE}' --project='${PROJECT_ID}' --format='json(name,status)'" + _check_exists gcloud compute instances describe "${KDC_NAME}" --zone "${ZONE}" --project="${PROJECT_ID}" --format="json(name,status)" } export -f exists_kdc_server diff --git a/gcloud/lib/network/network.sh b/gcloud/lib/network/network.sh index a66ce8e2..18edc0c2 100644 --- a/gcloud/lib/network/network.sh +++ b/gcloud/lib/network/network.sh @@ -3,7 +3,7 @@ # VPC Network functions function exists_network() { - _check_exists "gcloud compute networks describe '${NETWORK}' --project='${PROJECT_ID}' --format='json(name,selfLink)'" + _check_exists gcloud compute networks describe "${NETWORK}" --project="${PROJECT_ID}" --format="json(name,selfLink)" } export -f exists_network @@ -16,7 +16,7 @@ function create_vpc_network () { --bgp-routing-mode="regional" \ --description="network for use with Dataproc cluster ${CLUSTER_NAME}"; then report_result "Created" - refresh_resource_state "vpcNetwork" "exists_network" "lib/network/network.sh" + 
refresh_resource_state "vpcNetwork" "lib/network/network.sh" exists_network else report_result "Fail" return 1 diff --git a/gcloud/lib/network/peering.sh b/gcloud/lib/network/peering.sh index 866fa77f..bdae311c 100644 --- a/gcloud/lib/network/peering.sh +++ b/gcloud/lib/network/peering.sh @@ -39,7 +39,7 @@ function delete_ip_allocation () { function exists_vpc_peering() { # Naming format: local peering_name="servicenetworking-googleapis-com" - _check_exists "gcloud compute networks peerings list --network='${NETWORK}' --project='${PROJECT_ID}' --filter='name=${peering_name}' --format='json(name,state)'" | jq 'if . == [] then null else .[0] end' + _check_exists gcloud compute networks peerings list --network="${NETWORK}" --project="${PROJECT_ID}" --filter="name=${peering_name}" --format="json(name,state)" | jq 'if . == [] then null else .[0] end' } export -f exists_vpc_peering diff --git a/gcloud/lib/network/router.sh b/gcloud/lib/network/router.sh index f42138d0..9c83d5b4 100644 --- a/gcloud/lib/network/router.sh +++ b/gcloud/lib/network/router.sh @@ -3,7 +3,7 @@ # Router and NAT functions function exists_router() { - _check_exists "gcloud compute routers describe '${ROUTER_NAME}' --region='${REGION}' --project='${PROJECT_ID}' --format='json(name,selfLink)'" + _check_exists gcloud compute routers describe "${ROUTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" --format="json(name,selfLink)" } export -f exists_router @@ -16,7 +16,7 @@ function create_router () { --asn="${ASN_NUMBER}" \ --region="${REGION}"; then report_result "Created" - refresh_resource_state "cloudRouter" "exists_router" "lib/network/router.sh" + refresh_resource_state "cloudRouter" "lib/network/router.sh" exists_router else report_result "Fail" return 1 @@ -27,6 +27,14 @@ export -f create_router function add_nat_to_router () { print_status "Adding NAT to Router ${ROUTER_NAME}..." 
local log_file="add_nat_${ROUTER_NAME}.log" + + # Attempt to delete nat-config first, ignore errors + gcloud compute routers nats delete "nat-config" \ + --router-region "${REGION}" \ + --router "${ROUTER_NAME}" \ + --project="${PROJECT_ID}" --quiet > /dev/null 2>&1 || true + sleep 5 # Brief pause to allow delete to propagate + if run_gcloud "${log_file}" gcloud compute routers nats create "nat-config" \ --router-region "${REGION}" \ --router "${ROUTER_NAME}" \ @@ -34,7 +42,7 @@ function add_nat_to_router () { --nat-custom-subnet-ip-ranges "${SUBNET}" \ --auto-allocate-nat-external-ips; then report_result "Created" - refresh_resource_state "cloudRouter" "exists_router" "lib/network/router.sh" + refresh_resource_state "cloudRouter" "lib/network/router.sh" exists_router else report_result "Fail" return 1 diff --git a/gcloud/lib/network/routes.sh b/gcloud/lib/network/routes.sh index 66714ec0..3b775cc1 100644 --- a/gcloud/lib/network/routes.sh +++ b/gcloud/lib/network/routes.sh @@ -31,7 +31,7 @@ function delete_route() { if gcloud compute routes describe "${route_name}" --project="${PROJECT_ID}" > /dev/null 2>&1; then if run_gcloud "${log_file}" gcloud compute routes delete --quiet "${route_name}" --project="${PROJECT_ID}"; then report_result "Deleted" - refresh_resource_state "routes" "_check_exists \"gcloud compute routes list --project='${PROJECT_ID}' --filter='network~\\"/${NETWORK}$\\"' --format='json(name,selfLink)'\"" + refresh_resource_state "routes" "" _check_exists gcloud compute routes list --project="${PROJECT_ID}" --filter="network~/${NETWORK}$" --format="json(name,selfLink)" else report_result "Fail" echo " - Failed to delete route ${route_name}. 
Log content:" >&2 diff --git a/gcloud/lib/network/subnet.sh b/gcloud/lib/network/subnet.sh index ac0b01c7..7752991e 100644 --- a/gcloud/lib/network/subnet.sh +++ b/gcloud/lib/network/subnet.sh @@ -4,7 +4,7 @@ function exists_subnet() { local subnet_name="$1" - _check_exists "gcloud compute networks subnets describe '${subnet_name}' --region='${REGION}' --project='${PROJECT_ID}' --format='json(name,selfLink)'" + _check_exists gcloud compute networks subnets describe "${subnet_name}" --region="${REGION}" --project="${PROJECT_ID}" --format="json(name,selfLink)" } export -f exists_subnet @@ -22,7 +22,7 @@ function create_subnet () { --region="${REGION}" \ --description="subnet for use with Dataproc cluster ${CLUSTER_NAME}"; then report_result "Created" - refresh_resource_state "${subnet_key}" "exists_subnet ${subnet_name}" "lib/network/subnet.sh" + refresh_resource_state "${subnet_key}" "lib/network/subnet.sh" exists_subnet "${subnet_name}" else report_result "Fail" return 1 diff --git a/gcloud/lib/phs.sh b/gcloud/lib/phs.sh index 2060c204..6f0bdc90 100644 --- a/gcloud/lib/phs.sh +++ b/gcloud/lib/phs.sh @@ -24,7 +24,7 @@ export -f create_phs_cluster function exists_phs_cluster() { local phs_cluster_name="${CLUSTER_NAME}-phs" - _check_exists "gcloud dataproc clusters describe '${phs_cluster_name}' --region='${REGION}' --project='${PROJECT_ID}' --format='json(clusterName,status.state)'" + _check_exists gcloud dataproc clusters describe "${phs_cluster_name}" --region="${REGION}" --project="${PROJECT_ID}" --format="json(clusterName,status.state)" } export -f exists_phs_cluster diff --git a/gcloud/lib/script-utils.sh b/gcloud/lib/script-utils.sh index 841a9ec4..7deda2e3 100644 --- a/gcloud/lib/script-utils.sh +++ b/gcloud/lib/script-utils.sh @@ -38,26 +38,26 @@ export -f report_audit_status function run_gcloud() { local log_file_name=$1 shift - local log_file="${REPRO_TMPDIR}/${log_file_name}" + local log_file="${LOG_DIR}/${log_file_name}" local log_dir=$(dirname 
"${log_file}") # Get the directory part mkdir -p "${log_dir}" # Create the directory if it doesn't exist if (( DEBUG != 0 )); then - echo " RUNNING: $*" >&2 + echo " RUNNING: ${@}" >&2 fi - "$@" > "${log_file}" 2>&1 + "${@}" > "${log_file}" 2>&1 local exit_code=$? if [[ ${exit_code} -ne 0 ]]; then if grep -q -e "Reauthentication failed" -e "gcloud auth login" -e "gcloud config set account" "${log_file}"; then echo -e "\n ${RED}GCLOUD AUTHENTICATION ERROR:${NC}" echo -e " Please run ${YELLOW}gcloud auth login${NC} and ${YELLOW}gcloud auth application-default login${NC} to re-authenticate." >&2 - elif (( DEBUG != 0 )); then - cat "${log_file}" >&2 - else - : + exit 1 fi + echo -e "${RED}ERROR: ${NC}Command failed with exit code ${exit_code}. Log: ${log_file}" >&2 + cat "${log_file}" >&2 + return ${exit_code} fi - return ${exit_code} + return 0 } export -f run_gcloud @@ -132,21 +132,28 @@ export -f get_state function refresh_resource_state() { local resource_key=$1 - local check_command=$2 - local source_file=$3 # Optional: file to source for exists_* + local source_file=$2 # e.g., lib/dataproc/cluster.sh or "" + shift 2 + local check_command=("$@") # Remaining arguments form the command local json_output + local func_name="${check_command[0]}" + if [[ -n "${source_file}" ]]; then - # Extract the function name from the command string - local func_name=$(echo "${check_command}" | awk '{print $1}') - # Source in a subshell and export the specific function needed - json_output=$(source "${GCLOUD_DIR}/${source_file}" && export -f "${func_name}" && eval "${check_command}") + # Source in a subshell, export the function, then run the command + if ! 
json_output=$(source "${GCLOUD_DIR}/${source_file}" && export -f "${func_name}" && "${check_command[@]}"); then + echo "ERROR: Failed to execute check_command in refresh_resource_state for key ${resource_key} from ${source_file}" >&2 + json_output="null" + fi else - # For shared functions like _check_exists, they are already exported from script-utils.sh - json_output=$(eval "${check_command}") + # Function should already be in the environment (e.g., _check_exists) + if ! json_output=$("${check_command[@]}"); then + echo "ERROR: Failed to execute check_command in refresh_resource_state for key ${resource_key}" >&2 + json_output="null" + fi fi - if [[ -z "${json_output}" ]]; then + if [[ -z "${json_output}" || "${json_output}" == "[]" ]]; then json_output="null" fi @@ -157,18 +164,26 @@ export -f refresh_resource_state # --- Audit Check Functions --- # These functions are now designed to be called by the audit script. # They return a JSON object with details if a resource is found, or the string "null". +# Call this with command and arguments as separate words, not a single string. function _check_exists() { - local command_to_run="$1" - local json_output - - # The command_to_run should be a gcloud command with --format=json - # that returns a JSON object if the resource exists and fails otherwise. - json_output=$(eval "${command_to_run}" 2>/dev/null) - - if [[ -n "${json_output}" ]]; then - echo "${json_output}" - else - echo "null" - fi + echo "DEBUG _check_exists called with: $@" >&2 + local json_output + # Execute the command, capturing stdout. Stderr is suppressed. + json_output=$("$@" 2> /dev/null) + local exit_code=$? + echo "DEBUG INSIDE _check_exists:" >&2 + echo "DEBUG CMD: $@" >&2 + echo "DEBUG EXIT CODE: ${exit_code}" >&2 + echo "DEBUG JSON OUTPUT: ${json_output}" >&2 + + if [[ ${exit_code} -eq 0 && -n "${json_output}" && "${json_output}" != "[]" ]]; then + # If the command was successful and output is not empty or an empty JSON array, resource exists. 
+ echo "DEBUG _check_exists: Returning JSON" >&2 + echo "${json_output}" + else + # Otherwise, resource does not exist or an error occurred. + echo "DEBUG _check_exists: Returning null" >&2 + echo "null" + fi } export -f _check_exists diff --git a/gcloud/lib/swp/certs.sh b/gcloud/lib/swp/certs.sh index c7659f8b..43c19ea6 100644 --- a/gcloud/lib/swp/certs.sh +++ b/gcloud/lib/swp/certs.sh @@ -164,6 +164,6 @@ function exists_swp_managed_certificate() { local region="${1:-${REGION}}" local project_id="${2:-${PROJECT_ID}}" local cert_name="${SWP_CERT_NAME}" - _check_exists "gcloud certificate-manager certificates describe '${cert_name}' --location='${region}' --project='${project_id}' --format='json(name,managed.state)'" + _check_exists gcloud certificate-manager certificates describe "${cert_name}" --location="${region}" --project="${project_id}" --format="json(name,managed.state)" } export -f exists_swp_managed_certificate \ No newline at end of file diff --git a/gcloud/lib/swp/firewall.sh b/gcloud/lib/swp/firewall.sh index 38ec1571..f1a43b05 100644 --- a/gcloud/lib/swp/firewall.sh +++ b/gcloud/lib/swp/firewall.sh @@ -2,7 +2,7 @@ function exists_firewall_rule() { local rule_name="$1" - _check_exists "gcloud compute firewall-rules describe '${rule_name}' --project='${PROJECT_ID}' --format='json(name,direction)'" + _check_exists gcloud compute firewall-rules describe "${rule_name}" --project="${PROJECT_ID}" --format="json(name,direction)" } export -f exists_firewall_rule diff --git a/gcloud/lib/swp/gateway.sh b/gcloud/lib/swp/gateway.sh index b401c3a9..49bf6799 100644 --- a/gcloud/lib/swp/gateway.sh +++ b/gcloud/lib/swp/gateway.sh @@ -46,7 +46,7 @@ function exists_swp_gateway() { local swp_instance_name="${1:-${SWP_INSTANCE_NAME}}" local region="${2:-${REGION}}" local project_id="${3:-${PROJECT_ID}}" - _check_exists "gcloud network-services gateways describe '${swp_instance_name}' --location='${region}' --project='${project_id}' --format='json(name,type)'" + 
_check_exists gcloud network-services gateways describe "${swp_instance_name}" --location="${region}" --project="${project_id}" --format="json(name,type)" } export -f exists_swp_gateway diff --git a/gcloud/lib/swp/policy.sh b/gcloud/lib/swp/policy.sh index 00d68cfb..84ff2a49 100644 --- a/gcloud/lib/swp/policy.sh +++ b/gcloud/lib/swp/policy.sh @@ -53,7 +53,7 @@ function exists_gateway_security_policy() { local policy_name="${1:-${SWP_POLICY_NAME}}" local region="${2:-${REGION}}" local project_id="${3:-${PROJECT_ID}}" - _check_exists "gcloud network-security gateway-security-policies describe '${policy_name}' --location='${region}' --project='${project_id}' --format='json(name)'" + _check_exists gcloud network-security gateway-security-policies describe "${policy_name}" --location="${region}" --project="${project_id}" --format="json(name)" } export -f exists_gateway_security_policy @@ -71,7 +71,7 @@ function delete_gateway_security_policy() { --gateway-security-policy="${policy_name}" \ --location="${region}" \ --project="${project_id}" \ - --quiet || true + --quiet print_status " Deleting policy ${policy_name}..." if run_gcloud "${policy_log}" gcloud network-security gateway-security-policies delete "${policy_name}" \ From 8ce83a781167b06de5780d2cabb1aab638e63f41 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 19 Mar 2026 02:43:09 +0000 Subject: [PATCH 07/25] fix: Stabilize audit, network, and create scripts - Switched to client-side jq filtering for default route checks in lib/network/routes.sh due to gcloud filter unreliability. - Made create-dpgce idempotent for network infrastructure, ensuring components like NAT and default routes are created if missing, even if the Dataproc cluster already exists. - create-dpgce no longer errors out if a cluster exists with a different configuration; it only creates the cluster if missing. - Added Cloud NAT checks to the audit-dpgce script and report. 
- Corrected GCLOUD_DIR path resolution in lib/env.sh to be self-contained and robust. - Fixed bin/ssh-m to correctly set GCLOUD_DIR. - Simplified SCRIPT_DIR assignment in all bin/ scripts. - Added gcloud/.gitignore to exclude env.json. - Removed env.json from git history using filter-branch. - Rebased onto upstream/main. --- gcloud/.gitignore | 39 ++++-------- gcloud/bin/audit-dpgce | 77 ++++++++++++++++++++--- gcloud/bin/create-dpgce | 90 ++++++++++++++------------- gcloud/bin/destroy-dpgce | 2 +- gcloud/bin/recreate-cluster.sh | 2 +- gcloud/bin/ssh-m | 7 ++- gcloud/lib/dataproc/cluster-custom.sh | 2 +- gcloud/lib/env.sh | 11 ++-- gcloud/lib/network/router.sh | 8 +++ gcloud/lib/network/routes.sh | 23 ++++++- 10 files changed, 174 insertions(+), 87 deletions(-) diff --git a/gcloud/.gitignore b/gcloud/.gitignore index 892cebd1..43a7e150 100644 --- a/gcloud/.gitignore +++ b/gcloud/.gitignore @@ -1,32 +1,17 @@ -# Local environment overrides -my-env.json -env.cpan.json +# Env files +env.json +*.env -# Temporary files and logs -/tmp/ -action-update.log +# Logs +logs/ *.log -# TLS / Cert directories -tls-*/ -tls-*-*/ - -# Emacs backup files -*# +# Temp files +tmp/ +.DS_Store *~ -# Other -dataproc-repro-combined.txt -hardcopy.2 -ini/ -init/swp_ca.crt -opt/ -spark-bigquery-demo.py -t/pyspark-bigquery-command.sh -bin/#connectivity-test# -github/ -llm-guidance.md -plan-for-continued-work-2026-01-20.md -work-completed-2026-01-20.md -prompts/ -/tls/ +# State +state.db +state.json +*.db-journal diff --git a/gcloud/bin/audit-dpgce b/gcloud/bin/audit-dpgce index c938b3dd..d0cc2079 100755 --- a/gcloud/bin/audit-dpgce +++ b/gcloud/bin/audit-dpgce @@ -7,13 +7,52 @@ # Exit on failure set -e -export TIMESTAMP=$(date +%s) +# --- Argument Parsing --- +ARG_TIMESTAMP="" +while (( "$#" )); do + case "$1" in + --timestamp) + if [[ -n "$2" && "$2" != --* ]]; then + ARG_TIMESTAMP="$2" + shift 2 + else + echo "Error: --timestamp requires an argument" >&2 + exit 1 + fi + ;; + -*) + echo "Error: 
Unsupported flag $1" >&2 + exit 1 + ;; + *) + # Ignore other params + shift + ;; + esac +done + +export ARG_TIMESTAMP # --- Get script's real directory --- -SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" export GCLOUD_DIR + +# --- Source environment variables and utility functions --- source "${GCLOUD_DIR}/lib/env.sh" + +# Override TIMESTAMP if provided as an argument +if [[ -n "${ARG_TIMESTAMP}" ]]; then + export TIMESTAMP="${ARG_TIMESTAMP}" + # Re-set variables derived from TIMESTAMP + export REPRO_TMPDIR="/tmp/dataproc-repro/${TIMESTAMP}" + export LOG_DIR="${REPRO_TMPDIR}/logs" + export STATE_DB="${REPRO_TMPDIR}/state.db" + mkdir -p "${REPRO_TMPDIR}" + mkdir -p "${LOG_DIR}" + echo "Using provided TIMESTAMP for resources: ${TIMESTAMP}" >&2 +fi + source "${GCLOUD_DIR}/lib/script-utils.sh" # Initialize the state database and table @@ -33,7 +72,16 @@ run_check() { local cmd_out="${AUDIT_TEMP_DIR}/${key}.json" ( - source "${GCLOUD_DIR}/${source_file}" + cd "${GCLOUD_DIR}" + source "./lib/env.sh" # Ensure env vars are set in subshell + if [[ -n "${ARG_TIMESTAMP}" ]]; then # Re-apply override if needed + export TIMESTAMP="${ARG_TIMESTAMP}" + export REPRO_TMPDIR="/tmp/dataproc-repro/${TIMESTAMP}" + export LOG_DIR="${REPRO_TMPDIR}/logs" + export STATE_DB="${REPRO_TMPDIR}/state.db" + fi + source "./lib/script-utils.sh" + source "./${source_file}" # Relative to GCLOUD_DIR "${func_name}" "$@" > "${cmd_out}" 2> "${cmd_out%.json}.err" ) & PIDS+=($!) 
@@ -46,6 +94,15 @@ run_shared_check() { shift 2 local cmd_out="${AUDIT_TEMP_DIR}/${key}.json" ( + cd "${GCLOUD_DIR}" + source "./lib/env.sh" # Ensure env vars are set in subshell + if [[ -n "${ARG_TIMESTAMP}" ]]; then # Re-apply override if needed + export TIMESTAMP="${ARG_TIMESTAMP}" + export REPRO_TMPDIR="/tmp/dataproc-repro/${TIMESTAMP}" + export LOG_DIR="${REPRO_TMPDIR}/logs" + export STATE_DB="${REPRO_TMPDIR}/state.db" + fi + source "./lib/script-utils.sh" "${func_name}" "$@" > "${cmd_out}" 2> "${cmd_out%.json}.err" ) & PIDS+=($!) @@ -61,8 +118,9 @@ run_check "standardSubnet" "lib/network/subnet.sh" exists_subnet "${SUBNET}" run_check "privateSubnet" "lib/network/subnet.sh" exists_subnet "${PRIVATE_SUBNET}" run_check "swpSubnet" "lib/network/subnet.sh" exists_subnet "${SWP_SUBNET}" run_check "cloudRouter" "lib/network/router.sh" exists_router -run_check "firewallRule-ssh" "lib/swp/firewall.sh" exists_firewall_rule "${FIREWALL}-in-ssh" -run_check "firewallRule-internal" "lib/swp/firewall.sh" exists_firewall_rule "${FIREWALL}-in-internal" +run_check "cloudRouterNAT" "lib/network/router.sh" exists_router_nat "nat-config" +run_check "firewallRule-ssh" "lib/swp/firewall.sh" exists_firewall_rule "fw-${CLUSTER_NAME}-in-ssh" +run_check "firewallRule-internal" "lib/swp/firewall.sh" exists_firewall_rule "fw-${CLUSTER_NAME}-in-internal" run_shared_check "routes" _check_exists gcloud compute routes list --project="${PROJECT_ID}" --filter="network~/${NETWORK}$" --format="json(name,selfLink)" run_check "serviceAccount" "lib/gcp/iam.sh" exists_service_account run_check "autoscalingPolicy" "lib/dataproc/autoscaling.sh" exists_autoscaling_policy @@ -101,8 +159,7 @@ for pid in "${PIDS[@]}"; do wait "${pid}" || true # Ignore errors from wait done -echo " -Populating SQLite Database..." +echo "Populating SQLite Database..." 
# Populate STATE_DB from temp files for key_file in $(find "${AUDIT_TEMP_DIR}" -type f -name "*.json"); do key=$(basename "${key_file}" .json) @@ -154,6 +211,7 @@ print_resource_status "Standard Subnet (${SUBNET})" "standardSubnet" print_resource_status "Private Subnet (${PRIVATE_SUBNET})" "privateSubnet" print_resource_status "SWP Subnet (${SWP_SUBNET})" "swpSubnet" print_resource_status "Cloud Router (${ROUTER_NAME})" "cloudRouter" +print_resource_status "Cloud NAT (nat-config)" "cloudRouterNAT" print_resource_status "Firewall Rule (SSH)" "firewallRule-ssh" print_resource_status "Firewall Rule (Internal)" "firewallRule-internal" print_resource_status "Service Account (${GSA})" "serviceAccount" @@ -195,7 +253,10 @@ if [[ "${routes}" == "null" || "${routes}" == "[]" || -z "${routes}" ]]; then report_audit_status "Not Found" else # Check if the JSON array is not empty - routes_len=$(echo "${routes}" | /usr/bin/jq '. | length') + tmp_routes_file="${AUDIT_TEMP_DIR}/routes.json.tmp" + echo "${routes}" > "${tmp_routes_file}" + routes_len=$(/usr/bin/jq '. 
| length' "${tmp_routes_file}") + rm "${tmp_routes_file}" if [[ "${routes_len}" -gt 0 ]]; then report_audit_status "Exists" else diff --git a/gcloud/bin/create-dpgce b/gcloud/bin/create-dpgce index 0f1f5bde..f8a37f94 100755 --- a/gcloud/bin/create-dpgce +++ b/gcloud/bin/create-dpgce @@ -3,7 +3,7 @@ set -e # --- Get script's real directory --- -SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" export GCLOUD_DIR @@ -21,7 +21,7 @@ while [[ "$#" -gt 0 ]]; do case $1 in --custom) IS_CUSTOM=true; shift ;; --no-custom) IS_CUSTOM=false; shift ;; - --private) IS_PRIVATE=true; shift ;; + --private) IS_PRIVATE=true; shift ;; --swp-egress) SWP_EGRESS=true; shift ;; --no-swp-egress) SWP_EGRESS=false; shift ;; --nat-egress) NAT_EGRESS=true; shift ;; @@ -55,18 +55,18 @@ source "${GCLOUD_DIR}/lib/gcp/misc.sh" source "${GCLOUD_DIR}/lib/misc.sh" source "${GCLOUD_DIR}/lib/dataproc/autoscaling.sh" source "${GCLOUD_DIR}/lib/dataproc/cluster.sh" -source "${GCLOUD_DIR}/lib/dataproc/private-cluster.sh" +source "${GCLOUD_DIR}/lib/dataproc/private-cluster.sh" # --- Main Logic --- configure_gcloud init_state_db print_status "Auditing environment to ensure cache is fresh..." 
-"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null +"${GCLOUD_DIR}/bin/audit-dpgce" --timestamp "${TIMESTAMP}" &> /dev/null report_result "Done" -echo "--- Current Environment State (from cache) --- " +echo "--- Current Environment State (from Audit) --- " # Function to display resource status from DB display_status() { local display_name="$1" @@ -86,11 +86,18 @@ display_status "GCS Bucket (${BUCKET})" "gcsBucket" display_status "GCS Temp Bucket (${TEMP_BUCKET})" "gcsTempBucket" display_status "VPC Network (${NETWORK})" "vpcNetwork" display_status "Standard Subnet (${SUBNET})" "standardSubnet" -display_status "Private Subnet (${PRIVATE_SUBNET})" "privateSubnet" +if [[ "${IS_PRIVATE}" == "true" ]]; then + display_status "Private Subnet (${PRIVATE_SUBNET})" "privateSubnet" +fi display_status "Cloud Router" "cloudRouter" +if [[ "${NAT_EGRESS}" == "true" ]]; then + display_status "Cloud NAT (nat-config)" "cloudRouterNAT" +fi display_status "Firewall Rules" "firewallRule-ssh" # Assuming this key represents all base rules display_status "Autoscaling Policy" "autoscalingPolicy" -display_status "Dataproc Cluster (${CLUSTER_NAME})" "dataprocCluster" +if [[ "${CREATE_CLUSTER}" == "true" ]]; then + display_status "Dataproc Cluster (${CLUSTER_NAME})" "dataprocCluster" +fi echo "----------------------------------------------" # --- Determine Intended State & Generate Plan --- @@ -98,7 +105,7 @@ PLAN=() INTENDED=() # Basic resources for all types -INTENDED+=("serviceAccount" "gcsBucket" "gcsTempBucket" "vpcNetwork" "standardSubnet" "cloudRouter" "firewallRule-ssh" "autoscalingPolicy") +INTENDED+=("serviceAccount" "gcsBucket" "gcsTempBucket" "vpcNetwork" "standardSubnet" "cloudRouter" "firewallRule-ssh" "autoscalingPolicy" "defaultRoute") if [[ "${IS_PRIVATE}" == "true" ]]; then INTENDED+=("privateSubnet") @@ -116,34 +123,36 @@ needs_create() { # Add to plan if resource is missing for resource in "${INTENDED[@]}"; do - if [[ "${resource}" != "dataprocCluster" ]]; then - if needs_create 
"${resource}"; then - PLAN+=("CREATE_${resource}") + if [[ "${resource}" == "dataprocCluster" ]]; then continue; fi + if [[ "${resource}" == "defaultRoute" ]]; then + if ! check_default_route; then + PLAN+=("ENSURE_DEFAULT_ROUTE") fi + continue + fi + if needs_create "${resource}"; then + PLAN+=("CREATE_${resource}") fi done -# Special handling for dataprocCluster to check for recreation needs +# NAT Egress logic +if [[ "${NAT_EGRESS}" == "true" ]]; then + if ! needs_create "cloudRouter"; then + NAT_STATE=$(get_state "cloudRouterNAT") + echo "DEBUG: NAT_STATE in if: <${NAT_STATE}>" >&2 + if [[ "${NAT_STATE}" == "null" ]]; then + PLAN+=("ADD_NAT_TO_ROUTER") + fi + fi + # If cloudRouter needs creation, NAT is added within that function's flow. +fi + +# Special handling for dataprocCluster if [[ " ${INTENDED[*]} " =~ " dataprocCluster " ]]; then - cluster_state=$(get_state "dataprocCluster") - if [[ "${cluster_state}" == "null" ]]; then + if needs_create "dataprocCluster"; then PLAN+=("CREATE_dataprocCluster") else - # Cluster exists, check if it matches the image intent - existing_image_uri=$(echo "${cluster_state}" | jq -r '.config.masterConfig.imageUri // empty') - existing_image_version=$(echo "${cluster_state}" | jq -r '.config.softwareConfig.imageVersion // empty') - - if [[ "${IS_CUSTOM}" == "true" ]]; then - if [[ "${existing_image_uri}" != "${CUSTOM_IMAGE_URI}" ]]; then - echo "INFO: Plan to recreate cluster: --custom specified, but existing image is not ${CUSTOM_IMAGE_URI}" >&2 - PLAN+=("RECREATE_dataprocCluster") - fi - else # Not custom - if [[ -n "${existing_image_uri}" || "${existing_image_version}" != "${IMAGE_VERSION}" ]]; then - echo "INFO: Plan to recreate cluster: --no-custom specified, but existing cluster seems to use a custom image" >&2 - PLAN+=("RECREATE_dataprocCluster") - fi - fi + echo "INFO: Dataproc Cluster '${CLUSTER_NAME}' already exists. Skipping creation." 
>&2 fi fi @@ -170,14 +179,14 @@ if [[ " ${PLAN[*]} " =~ " CREATE_gcsTempBucket " ]]; then create_gcs_bucket "${T if [[ " ${PLAN[*]} " =~ " CREATE_gcsBucket " || " ${PLAN[*]} " =~ " CREATE_gcsTempBucket " ]]; then upload_init_actions; fi if [[ " ${PLAN[*]} " =~ " CREATE_vpcNetwork " ]]; then create_vpc_network; ensure_default_internet_route; fi +if [[ " ${PLAN[*]} " =~ " ENSURE_DEFAULT_ROUTE " ]]; then ensure_default_internet_route; fi if [[ " ${PLAN[*]} " =~ " CREATE_standardSubnet " ]]; then create_subnet "${SUBNET}" "standardSubnet" "${RANGE}"; fi if [[ " ${PLAN[*]} " =~ " CREATE_privateSubnet " ]]; then source "${GCLOUD_DIR}/lib/swp/subnet.sh"; create_private_subnet; fi -if [[ " ${PLAN[*]} " =~ " CREATE_cloudRouter " ]]; then +if [[ " ${PLAN[*]} " =~ " CREATE_cloudRouter " ]]; then create_router if [[ "${NAT_EGRESS}" == "true" ]]; then add_nat_to_router; fi -elif [[ "${NAT_EGRESS}" == "true" && $(get_state "cloudRouter") != "null" ]]; then - # TODO: Check if nat-config is already on the router, beyond just router existence. +elif [[ " ${PLAN[*]} " =~ " ADD_NAT_TO_ROUTER " ]]; then echo "INFO: NAT Egress enabled, ensuring NAT on existing router..." add_nat_to_router fi @@ -185,21 +194,15 @@ fi if [[ " ${PLAN[*]} " =~ " CREATE_firewallRule-ssh " ]]; then create_firewall_rules; fi if [[ " ${PLAN[*]} " =~ " CREATE_autoscalingPolicy " ]]; then create_autoscaling_policy; fi -# Cluster Deletion if RECREATE is in plan -if [[ " ${PLAN[*]} " =~ " RECREATE_dataprocCluster " ]]; then - echo "Recreating cluster due to configuration mismatch..." 
- delete_dpgce_cluster - update_state "dataprocCluster" "null" # Ensure state is updated after delete -fi - -# Cluster Creation if CREATE or RECREATE is in plan -if [[ " ${PLAN[*]} " =~ " CREATE_dataprocCluster " || " ${PLAN[*]} " =~ " RECREATE_dataprocCluster " ]]; then +# Cluster Creation if CREATE is in plan +if [[ " ${PLAN[*]} " =~ " CREATE_dataprocCluster " ]]; then if [[ "${IS_PRIVATE}" == "true" ]]; then create_dpgce_private_cluster else - # create_dpgce_cluster is already sourced create_dpgce_cluster fi +else + echo "INFO: Skipping cluster creation, already exists or not requested." fi # TODO: Implement SWP Egress setup if SWP_EGRESS is true @@ -211,5 +214,5 @@ echo "Plan execution finished." # --- Final Audit --- print_status "Running final audit to update cache..." -"${GCLOUD_DIR}/bin/audit-dpgce" > /dev/null +"${GCLOUD_DIR}/bin/audit-dpgce" --timestamp "${TIMESTAMP}" &> /dev/null report_result "Done" diff --git a/gcloud/bin/destroy-dpgce b/gcloud/bin/destroy-dpgce index 5a503e29..fc9c3bea 100755 --- a/gcloud/bin/destroy-dpgce +++ b/gcloud/bin/destroy-dpgce @@ -4,7 +4,7 @@ set -e # --- Get script's real directory --- -SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" export GCLOUD_DIR diff --git a/gcloud/bin/recreate-cluster.sh b/gcloud/bin/recreate-cluster.sh index 94d4d8ec..37232ab8 100755 --- a/gcloud/bin/recreate-cluster.sh +++ b/gcloud/bin/recreate-cluster.sh @@ -4,7 +4,7 @@ set -e # --- Get script's real directory --- -SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" # --- Source environment variables and utility functions --- diff --git a/gcloud/bin/ssh-m b/gcloud/bin/ssh-m index 0b847cd5..53bf4979 100755 --- a/gcloud/bin/ssh-m +++ b/gcloud/bin/ssh-m @@ -14,7 +14,12 @@ # See the License for the specific language governing permissions and # 
limitations under the License. # -source lib/env.sh +# --- Get script's real directory --- +SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") +GCLOUD_DIR="$(realpath "${SCRIPT_DIR}/..")" +export GCLOUD_DIR + +source "${GCLOUD_DIR}/lib/env.sh" M_HOSTNAME="${CLUSTER_NAME}-m" # If the first argument is a number, treat it as the master index for HA diff --git a/gcloud/lib/dataproc/cluster-custom.sh b/gcloud/lib/dataproc/cluster-custom.sh index fb399a1b..2f4d510b 100644 --- a/gcloud/lib/dataproc/cluster-custom.sh +++ b/gcloud/lib/dataproc/cluster-custom.sh @@ -117,7 +117,7 @@ function create_dpgce_cluster() { # --num-masters=1 # --num-workers=2 --master-accelerator "type=${M_ACCELERATOR_TYPE}" - # --worker-accelerator "type=${PRIMARY_ACCELERATOR_TYPE}" +# --worker-accelerator "type=${PRIMARY_ACCELERATOR_TYPE}" # --secondary-worker-accelerator "type=${SECONDARY_ACCELERATOR_TYPE}" --master-machine-type "${M_MACHINE_TYPE}" # --worker-machine-type "${PRIMARY_MACHINE_TYPE}" diff --git a/gcloud/lib/env.sh b/gcloud/lib/env.sh index 49136f46..d8c8762b 100644 --- a/gcloud/lib/env.sh +++ b/gcloud/lib/env.sh @@ -14,13 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -# Set RESOURCE_SUFFIX based on TIMESTAMP env var or generate new +# --- Get lib's real directory --- +LIB_DIR=$(realpath "$(dirname "${BASH_SOURCE[0]}")") +GCLOUD_DIR="$(realpath "${LIB_DIR}/..")" +export GCLOUD_DIR if [[ -n "${TIMESTAMP}" ]]; then + export TIMESTAMP export RESOURCE_SUFFIX="${TIMESTAMP}" - echo "Using provided TIMESTAMP for resources: ${RESOURCE_SUFFIX}" >&2 else - export RESOURCE_SUFFIX="$(date +%s)" - echo "Generated new TIMESTAMP for resources: ${RESOURCE_SUFFIX}" >&2 + export TIMESTAMP="$(date +%s)" + export RESOURCE_SUFFIX="${TIMESTAMP}" fi export REPRO_TMPDIR="${REPRO_TMPDIR:-/tmp/dataproc-repro/${RESOURCE_SUFFIX}}" mkdir -p "${REPRO_TMPDIR}" diff --git a/gcloud/lib/network/router.sh b/gcloud/lib/network/router.sh index 9c83d5b4..b3c77eb2 100644 --- a/gcloud/lib/network/router.sh +++ b/gcloud/lib/network/router.sh @@ -43,6 +43,7 @@ function add_nat_to_router () { --auto-allocate-nat-external-ips; then report_result "Created" refresh_resource_state "cloudRouter" "lib/network/router.sh" exists_router + refresh_resource_state "cloudRouterNAT" "lib/network/router.sh" exists_router_nat "nat-config" else report_result "Fail" return 1 @@ -71,3 +72,10 @@ function delete_router () { fi } export -f delete_router + +function exists_router_nat() { + local nat_name="$1" + # gcloud compute routers nats describe returns non-zero if not found + _check_exists gcloud compute routers nats describe "${nat_name}" --router="${ROUTER_NAME}" --region="${REGION}" --project="${PROJECT_ID}" +} +export -f exists_router_nat diff --git a/gcloud/lib/network/routes.sh b/gcloud/lib/network/routes.sh index 3b775cc1..a19bdb94 100644 --- a/gcloud/lib/network/routes.sh +++ b/gcloud/lib/network/routes.sh @@ -3,7 +3,7 @@ function ensure_default_internet_route() { print_status "Ensuring default internet route for ${NETWORK}..." local log_file="ensure_default_route_${NETWORK}.log" - if ! 
gcloud compute routes list --project="${PROJECT_ID}" --filter="network=${NETWORK} AND destRange=0.0.0.0/0 AND nextHopGateway=default-internet-gateway" --format="value(name)" | grep -q .; then + if ! check_default_route; then print_status " Default internet route not found, creating..." if run_gcloud "${log_file}" gcloud compute routes create "default-internet-${NETWORK}" \ --project="${PROJECT_ID}" \ @@ -22,6 +22,27 @@ function ensure_default_internet_route() { } export -f ensure_default_internet_route +function check_default_route() { + local project_id="${PROJECT_ID}" + local network_uri="https://www.googleapis.com/compute/v1/projects/${project_id}/global/networks/${NETWORK}" + local gateway_uri="https://www.googleapis.com/compute/v1/projects/${project_id}/global/gateways/default-internet-gateway" + local dest_range="0.0.0.0/0" + + # Fetch all routes in JSON format + if gcloud compute routes list --project="${project_id}" --format=json | \ + jq -e --arg network_uri "$network_uri" \ + --arg gateway_uri "$gateway_uri" \ + --arg dest_range "$dest_range" \ + '.[] | select(.network == $network_uri and .destRange == $dest_range and .nextHopGateway == $gateway_uri)' > /dev/null; then + # Match found, jq exits with 0 + return 0 + else + # No match, jq exits with non-zero + return 1 + fi +} +export -f check_default_route + function delete_route() { local route_name="$1" print_status "Deleting Route ${route_name}..." From 9a92c6d53d602007834099a56b1fbd2c73dc2c85 Mon Sep 17 00:00:00 2001 From: "C.J. Collier" Date: Thu, 19 Mar 2026 04:01:29 +0000 Subject: [PATCH 08/25] feat: Refactor destroy-dpgce to be plan-based and audit-driven - Reworked `bin/destroy-dpgce` to run an audit first, then generate a teardown plan based on discovered resources. - Added a context-aware status report to `destroy-dpgce` to display the state of resources found by the audit, respecting intent flags like --nat-egress, --custom, etc. 
- Updated `destroy-dpgce` to accept the same intent flags as `create-dpgce` to tailor the status report. - Made plan generation for network VMs and custom routes conditional on their detection in the audit. - Added `get_any_network_vms_state` to `lib/gcp/misc.sh` and an `anyNetworkVms` check to `bin/audit-dpgce` to support more accurate teardown planning for any VM in the network. - Ensured `destroy-dpgce` uses a unique TIMESTAMP for each run to isolate state. - Updated `README.md` to reflect these significant changes. --- gcloud/README.md | 104 +++++++++++------ gcloud/bin/audit-dpgce | 1 + gcloud/bin/destroy-dpgce | 240 +++++++++++++++++++++++++++++---------- gcloud/lib/gcp/misc.sh | 17 +++ 4 files changed, 267 insertions(+), 95 deletions(-) diff --git a/gcloud/README.md b/gcloud/README.md index ac496680..45051a57 100644 --- a/gcloud/README.md +++ b/gcloud/README.md @@ -1,6 +1,6 @@