diff --git a/data-tool/Makefile b/data-tool/Makefile index 60d8af0668..12de531aa0 100644 --- a/data-tool/Makefile +++ b/data-tool/Makefile @@ -113,14 +113,36 @@ run-colin-freeze: ## Run colin freeze flow . $(VENV_DIR)/bin/activate && \ python flows/colin_freeze_flow.py OUT ?= -RESET_EXTRACT_POSTGRES ?= +ARTIFACT_DIR ?= +CORP_FILE ?= ../data-tool/scripts/generated/delta_ctst.txt +SOURCE_CONNECTION ?= cprd +TARGET_CONNECTION ?= ctst_pg +TARGET_SCHEMA ?= public +MIG_BATCH_ID ?= 1 +LOOKBACK_HOURS ?= 5 +EXTRACT_OPTIONAL_ARGS = $(if $(OUT),--out "$(OUT)",) $(if $(ARTIFACT_DIR),--artifact-dir "$(ARTIFACT_DIR)",) +# Preserve historical Makefile behavior after CLI boolean cleanup: generate and run DbSchemaCLI, +# and keep cars* refresh enabled for local targets. Scheduled jobs should pass explicit flags. run-extract-load: . $(VENV_DIR)/bin/activate && \ - python flows/refresh_extract_subset_flow.py --mode load + python flows/refresh_extract_subset_flow.py --mode load \ + --corp-file "$(CORP_FILE)" \ + --source-connection "$(SOURCE_CONNECTION)" \ + --target-connection "$(TARGET_CONNECTION)" \ + --target-schema "$(TARGET_SCHEMA)" \ + --include-cars \ + --run-dbschemacli $(EXTRACT_OPTIONAL_ARGS) run-extract-refresh: . $(VENV_DIR)/bin/activate && \ - python flows/refresh_extract_subset_flow.py --mode refresh + python flows/refresh_extract_subset_flow.py --mode refresh \ + --source-connection "$(SOURCE_CONNECTION)" \ + --target-connection "$(TARGET_CONNECTION)" \ + --target-schema "$(TARGET_SCHEMA)" \ + --mig-batch-id "$(MIG_BATCH_ID)" \ + --lookback-hours "$(LOOKBACK_HOURS)" \ + --include-cars \ + --run-dbschemacli $(EXTRACT_OPTIONAL_ARGS) run-update-colin-ar-ind: ## Run update COLIN AR indicator flow . 
$(VENV_DIR)/bin/activate && \ diff --git a/data-tool/flows/common/colin_queries.py b/data-tool/flows/common/colin_queries.py index 466bcdcef5..828c1f3318 100644 --- a/data-tool/flows/common/colin_queries.py +++ b/data-tool/flows/common/colin_queries.py @@ -1,13 +1,24 @@ +from __future__ import annotations from math import ceil import re -from sqlalchemy import text + +def _positive_int(value: int | str, name: str) -> int: + try: + parsed = int(value) + except (TypeError, ValueError) as exc: + raise ValueError(f'{name} must be a positive integer') from exc + if parsed <= 0: + raise ValueError(f'{name} must be a positive integer') + return parsed def build_corp_list(corp_list: str, chunksize: int) -> str: if not str(corp_list).strip(): raise ValueError('empty corp_list') corp_nums = re.findall(r"'([^']*)'", corp_list) + if not corp_nums: + raise ValueError('corp_list must contain SQL-quoted corp identifiers') batch_size = min(chunksize, 999) num_batches = ceil(len(corp_nums) / batch_size) @@ -27,9 +38,11 @@ def build_corp_list(corp_list: str, chunksize: int) -> str: corp_list_cte = 'corp_list AS (\n'+ '\n'.join(union_lines) + '\n)' return ',\n'.join([*batch_ctes, corp_list_cte]) -def get_updated_identifiers(timestamp: str, corp_list: str, chunk_size: int) -> str: +def get_updated_identifiers(timestamp: str, corp_list: str, chunk_size: int, lookback_hours: int = 5) -> str: if not str(corp_list).strip(): raise ValueError('empty corp_list') + chunk_size = _positive_int(chunk_size, 'chunk_size') + lookback_hours = _positive_int(lookback_hours, 'lookback_hours') corp_list_ctes = build_corp_list(corp_list, chunk_size) query = f""" WITH {corp_list_ctes}, @@ -46,7 +59,7 @@ def get_updated_identifiers(timestamp: str, corp_list: str, chunk_size: int) -> FROM event e JOIN corp_list c ON c.corp_num = e.corp_num - WHERE e.event_timestmp > TIMESTAMP '{timestamp}' - INTERVAL '2' HOUR + WHERE e.event_timestmp > TIMESTAMP '{timestamp}' - INTERVAL '{lookback_hours}' HOUR AND NOT ( 
def get_updated_identifiers_for_batch(
    timestamp: str,
    corp_list: str,
    chunk_size: int = 999,
    lookback_hours: int = 5,
) -> str:
    """Build the updated-identifiers query for a single batch.

    Thin wrapper around ``get_updated_identifiers`` that supplies batch
    defaults for ``chunk_size`` and ``lookback_hours`` and forwards every
    argument unchanged.

    Returns:
        The SQL text produced by ``get_updated_identifiers``.
    """
    query = get_updated_identifiers(
        timestamp,
        corp_list,
        chunk_size,
        lookback_hours=lookback_hours,
    )
    return query
def _build_postgres_uri(
    *,
    username: str,
    password: str,
    host: str,
    port: str | int,
    database: str,
    exact_uri: str | None = None,
    cloudsql_connection_name: str | None = None,
) -> str:
    """Return the SQLAlchemy connection string for a Postgres database.

    The URL is assembled with ``URL.create`` instead of string formatting so
    that URL-reserved characters in the password do not break it.

    Resolution order:
      1. ``exact_uri`` - when truthy it is returned unchanged.
      2. ``cloudsql_connection_name`` - host/port are left out and a
         ``host=/cloudsql/<name>`` query parameter is used instead.
      3. Otherwise a plain ``host``/``port`` URL (``port`` is passed through
         ``int()``).

    The password is rendered in clear text (``hide_password=False``).
    """
    if exact_uri:
        return exact_uri

    if cloudsql_connection_name:
        # port is intentionally not touched on this path.
        location = {'query': {'host': f'/cloudsql/{cloudsql_connection_name}'}}
    else:
        location = {'host': host, 'port': int(port)}

    url = URL.create(
        'postgresql+psycopg2',
        username=username,
        password=password,
        database=database,
        **location,
    )
    return url.render_as_string(hide_password=False)


def _build_oracle_uri(*, username: str, password: str, host: str, port: str | int, database: str) -> str:
    """Return the SQLAlchemy connection string for an Oracle database.

    Built with ``URL.create`` (driver ``oracle+oracledb``) so URL-reserved
    characters in the password do not break the URL; ``port`` is passed
    through ``int()`` and the password is rendered in clear text.
    """
    url = URL.create(
        'oracle+oracledb',
        username=username,
        password=password,
        host=host,
        port=int(port),
        database=database,
    )
    return url.render_as_string(hide_password=False)
@@ -105,12 +151,14 @@ class _Config(): # pylint: disable=too-few-public-methods DB_NAME_COLIN_MIGR = os.getenv('DATABASE_NAME_COLIN_MIGR', '') DB_HOST_COLIN_MIGR = os.getenv('DATABASE_HOST_COLIN_MIGR', '') DB_PORT_COLIN_MIGR = os.getenv('DATABASE_PORT_COLIN_MIGR', '5432') - SQLALCHEMY_DATABASE_URI_COLIN_MIGR = 'postgresql://{user}:{password}@{host}:{port}/{name}'.format( - user=DB_USER_COLIN_MIGR, + SQLALCHEMY_DATABASE_URI_COLIN_MIGR = _build_postgres_uri( + username=DB_USER_COLIN_MIGR, password=DB_PASSWORD_COLIN_MIGR, host=DB_HOST_COLIN_MIGR, - port=int(DB_PORT_COLIN_MIGR), - name=DB_NAME_COLIN_MIGR, + port=DB_PORT_COLIN_MIGR, + database=DB_NAME_COLIN_MIGR, + exact_uri=os.getenv('DATABASE_URI_COLIN_MIGR'), + cloudsql_connection_name=os.getenv('CLOUDSQL_INSTANCE_CONNECTION_NAME_COLIN_MIGR'), ) SQLALCHEMY_TRACK_MODIFICATIONS = os.getenv('SQLALCHEMY_TRACK_MODIFICATIONS', False) @@ -120,12 +168,14 @@ class _Config(): # pylint: disable=too-few-public-methods DB_NAME = os.getenv('DATABASE_NAME', '') DB_HOST = os.getenv('DATABASE_HOST', '') DB_PORT = os.getenv('DATABASE_PORT', '5432') - SQLALCHEMY_DATABASE_URI = 'postgresql://{user}:{password}@{host}:{port}/{name}'.format( - user=DB_USER, + SQLALCHEMY_DATABASE_URI = _build_postgres_uri( + username=DB_USER, password=DB_PASSWORD, host=DB_HOST, - port=int(DB_PORT), - name=DB_NAME, + port=DB_PORT, + database=DB_NAME, + exact_uri=os.getenv('DATABASE_URI_LEAR'), + cloudsql_connection_name=os.getenv('CLOUDSQL_INSTANCE_CONNECTION_NAME'), ) DATABASE_POOL_PRE_PING = os.getenv('DATABASE_POOL_PRE_PING', 'True') == 'True' @@ -144,12 +194,14 @@ class _Config(): # pylint: disable=too-few-public-methods DB_NAME_AUTH = os.getenv('DATABASE_NAME_AUTH', '') DB_HOST_AUTH = os.getenv('DATABASE_HOST_AUTH', '') DB_PORT_AUTH = os.getenv('DATABASE_PORT_AUTH', '5432') - SQLALCHEMY_DATABASE_URI_AUTH = 'postgresql://{user}:{password}@{host}:{port}/{name}'.format( - user=DB_USER_AUTH, + SQLALCHEMY_DATABASE_URI_AUTH = _build_postgres_uri( + 
username=DB_USER_AUTH, password=DB_PASSWORD_AUTH, host=DB_HOST_AUTH, - port=int(DB_PORT_AUTH), - name=DB_NAME_AUTH, + port=DB_PORT_AUTH, + database=DB_NAME_AUTH, + exact_uri=os.getenv('DATABASE_URI_AUTH'), + cloudsql_connection_name=os.getenv('CLOUDSQL_INSTANCE_CONNECTION_NAME_AUTH'), ) # service accounts @@ -206,12 +258,12 @@ class _Config(): # pylint: disable=too-few-public-methods DB_NAME_COLIN_ORACLE = os.getenv('DATABASE_NAME_COLIN_ORACLE', '') DB_HOST_COLIN_ORACLE = os.getenv('DATABASE_HOST_COLIN_ORACLE', '') DB_PORT_COLIN_ORACLE = os.getenv('DATABASE_PORT_COLIN_ORACLE', '1521') - SQLALCHEMY_DATABASE_URI_COLIN_ORACLE = 'oracle+oracledb://{user}:{password}@{host}:{port}/{name}'.format( - user=DB_USER_COLIN_ORACLE, + SQLALCHEMY_DATABASE_URI_COLIN_ORACLE = _build_oracle_uri( + username=DB_USER_COLIN_ORACLE, password=DB_PASSWORD_COLIN_ORACLE, host=DB_HOST_COLIN_ORACLE, - port=int(DB_PORT_COLIN_ORACLE), - name=DB_NAME_COLIN_ORACLE, + port=DB_PORT_COLIN_ORACLE, + database=DB_NAME_COLIN_ORACLE, ) FREEZE_COLIN_CORPS = os.getenv('FREEZE_COLIN_CORPS', 'False') == 'True' FREEZE_ADD_EARLY_ADOPTER = os.getenv('FREEZE_ADD_EARLY_ADOPTER', 'False') == 'True' diff --git a/data-tool/flows/refresh_extract_subset_flow.py b/data-tool/flows/refresh_extract_subset_flow.py index fc2fba6821..c061274c6b 100644 --- a/data-tool/flows/refresh_extract_subset_flow.py +++ b/data-tool/flows/refresh_extract_subset_flow.py @@ -1,20 +1,28 @@ +from __future__ import annotations + import argparse import os from pathlib import Path -import re import subprocess import sys +from uuid import uuid4 + from prefect import flow, task from prefect.cache_policies import NO_CACHE -from prefect.states import Failed -from flask import current_app from sqlalchemy import create_engine, text from sqlalchemy.engine import Engine -from datetime import datetime, timezone + from config import get_named_config from common.colin_queries import get_identifiers_per_batch, get_updated_identifiers_for_batch from 
common.init_utils import colin_oracle_init, get_config -from common.query_utils import corpnum_to_oracle_ids, get_cutoff_timestamp_query, get_fallout_corp_nums, prune_candidates_from_account, prune_candidates_from_batch, prune_candidates_from_cp +from common.query_utils import ( + corpnum_to_oracle_ids, + get_cutoff_timestamp_query, + get_fallout_corp_nums, + prune_candidates_from_account, + prune_candidates_from_batch, + prune_candidates_from_cp, +) _REPO_ROOT = Path(__file__).resolve().parents[2] _SCRIPT_PATH = _REPO_ROOT / 'data-tool' / 'scripts' / 'generate_cprd_subset_extract.py' @@ -25,6 +33,16 @@ _BUILD_VIEWS_SCRIPT = _REPO_ROOT / 'data-tool' / 'scripts' / 'colin_corps_extract_postgres_views_ddl' +def _positive_int(value: str) -> int: + try: + parsed = int(value) + except (TypeError, ValueError) as exc: + raise argparse.ArgumentTypeError('must be a positive integer') from exc + if parsed <= 0: + raise argparse.ArgumentTypeError('must be a positive integer') + return parsed + + def _resolve_master_script_path(out: str | None) -> Path: if not out: return _SUBSET.resolve() @@ -33,11 +51,60 @@ def _resolve_master_script_path(out: str | None) -> Path: return p.resolve() return (_REPO_ROOT / p).resolve() + +def _resolve_artifact_dir(artifact_dir: str | None) -> Path | None: + if not artifact_dir: + return None + path = Path(artifact_dir).expanduser() + if not path.is_absolute(): + path = _REPO_ROOT / path + return path.resolve() + + +def _default_out_for_artifact(artifact_dir: str | None, mode: str, out: str | None) -> str | None: + if out: + return out + artifact_path = _resolve_artifact_dir(artifact_dir) + if artifact_path is None: + return None + return str(artifact_path / f'subset_{mode}.sql') + + +def _normalize_target_corp_num(value: object) -> str | None: + corp_num = str(value).strip().upper() + if not corp_num: + return None + if corp_num.isdigit(): + return f'BC{corp_num}' + return corp_num + + +def _extract_refresh_corp_nums(updated_rows: list[dict]) -> 
tuple[list[str], list[str]]: + feed_lines: list[str] = [] + updated_corp_nums: list[str] = [] + seen: set[str] = set() + for row in updated_rows: + for key, value in row.items(): + if key is None or value is None: + continue + if str(key).lower() != 'corp_num': + continue + feed_value = str(value).strip().upper() + target_value = _normalize_target_corp_num(feed_value) + if feed_value and target_value and target_value not in seen: + seen.add(target_value) + feed_lines.append(feed_value) + updated_corp_nums.append(target_value) + break + return feed_lines, updated_corp_nums + + def _run_cmd(argv: list[str], env: dict[str, str] | None = None) -> None: r = subprocess.run(argv, cwd=str(_REPO_ROOT), capture_output=False, text=True, env=env) if r.returncode != 0: raise RuntimeError(f'command failed ({r.returncode}): {" ".join(argv)}') - + + def require_file(path: str | Path, description: str) -> Path: """File Not Found Error""" resolved = Path(path).expanduser().resolve() @@ -53,7 +120,7 @@ def _reset_extract_postgres_db() -> None: port = str(cfg.DB_PORT_COLIN_MIGR) user = cfg.DB_USER_COLIN_MIGR password = cfg.DB_PASSWORD_COLIN_MIGR - + require_file(_DEFAULT_DDL, 'Extract DDL File') pg_flags = ['-h', host, '-p', str(port), '-U', user] @@ -66,14 +133,15 @@ def _reset_extract_postgres_db() -> None: "FROM pg_stat_activity " f"WHERE datname = '{safe_db}' AND pid <> pg_backend_pid();" ) - _run_cmd(['psql', *pg_flags, '-d', 'postgres', '-c', terminate_sql ], env=run_env) - _run_cmd(['dropdb', *pg_flags, '--maintenance-db=postgres', '--if-exists', dbname ], env=run_env) - _run_cmd(['createdb', *pg_flags, '--maintenance-db=postgres', '-T', 'template0', dbname ], env=run_env) - _run_cmd(['psql', *pg_flags, '-d', dbname, '-v', 'ON_ERROR_STOP=1', '-f', str(_DEFAULT_DDL) ], env=run_env) - _run_cmd(['psql', *pg_flags, '-d', dbname, '-v', 'ON_ERROR_STOP=1', '-f', str(_BUILD_VIEWS_SCRIPT) ], env=run_env) + _run_cmd(['psql', *pg_flags, '-d', 'postgres', '-c', terminate_sql], env=run_env) 
+ _run_cmd(['dropdb', *pg_flags, '--maintenance-db=postgres', '--if-exists', dbname], env=run_env) + _run_cmd(['createdb', *pg_flags, '--maintenance-db=postgres', '-T', 'template0', dbname], env=run_env) + _run_cmd(['psql', *pg_flags, '-d', dbname, '-v', 'ON_ERROR_STOP=1', '-f', str(_DEFAULT_DDL)], env=run_env) + _run_cmd(['psql', *pg_flags, '-d', dbname, '-v', 'ON_ERROR_STOP=1', '-f', str(_BUILD_VIEWS_SCRIPT)], env=run_env) + @task(name='Get-Fallen-Out-Identifiers', cache_policy=NO_CACHE) -def get_fallen_identifiers(updated_corp_nums: list) -> list[dict]: +def get_fallen_identifiers(updated_corp_nums: list) -> list[str]: """ Get updated corp nums from colin with cutoff timestamp """ @@ -86,16 +154,16 @@ def get_fallen_identifiers(updated_corp_nums: list) -> list[dict]: rows = [str(row).strip() for row in result] return rows + @task(name='Prune-Fallen-Out-Identifiers', cache_policy=NO_CACHE) -def prune_fallen_identifiers(fallenout_corp_nums: list) -> list[dict]: +def prune_fallen_identifiers(fallen_out_identifiers_list: list) -> None: """ - Get updated corp nums from colin with cutoff timestamp + Prune fallen-out corp nums from migration candidate tables. 
""" - if not fallenout_corp_nums: - print(f"No fallout corps to prune") + if not fallen_out_identifiers_list: + print('No fallout corps to prune') return cfg = get_named_config() - fallen_out_identifiers_list = get_fallen_identifiers(fallenout_corp_nums) cp_query = prune_candidates_from_cp(fallen_out_identifiers_list) batch_query = prune_candidates_from_batch(fallen_out_identifiers_list) account_query = prune_candidates_from_account(fallen_out_identifiers_list) @@ -103,24 +171,31 @@ def prune_fallen_identifiers(fallenout_corp_nums: list) -> list[dict]: prune_cp = conn.execute(text(cp_query)) prune_batch = conn.execute(text(batch_query)) prune_account = conn.execute(text(account_query)) - print(f"Pruned corp_processing={prune_cp.rowcount}, mig_corp_batch={prune_batch.rowcount}, mig_corp_account={prune_account.rowcount}") + print(f'Pruned corp_processing={prune_cp.rowcount}, mig_corp_batch={prune_batch.rowcount}, mig_corp_account={prune_account.rowcount}') -def get_cuttoff_timestamp() -> datetime: +def get_cuttoff_timestamp(): cfg = get_named_config() cuttoff_timestamp = get_cutoff_timestamp_query() with create_engine(cfg.SQLALCHEMY_DATABASE_URI_COLIN_MIGR).begin() as conn: cuttoff_timestamp_result = conn.execute(text(cuttoff_timestamp)).scalar() - print(f"cuttoff timestamp is {cuttoff_timestamp_result}") + print(f'cuttoff timestamp is {cuttoff_timestamp_result}') return cuttoff_timestamp_result - + @task(name='Cleanup-Extract-Postgres', cache_policy=NO_CACHE) def cleanup_extract_postgres_db() -> None: _reset_extract_postgres_db() + @task(name='Get-Updated-Identifiers-Colin', cache_policy=NO_CACHE) -def get_updated_identifiers_colin(cutoff_timestamp: str, mig_batch_id: int, colin_oracle_engine: Engine, chunk_size: int) -> list[dict]: +def get_updated_identifiers_colin( + cutoff_timestamp: str, + mig_batch_id: int, + colin_oracle_engine: Engine, + chunk_size: int = 900, + lookback_hours: int = 5, +) -> list[dict]: """ Get updated corp nums from colin with cutoff 
timestamp """ @@ -128,16 +203,24 @@ def get_updated_identifiers_colin(cutoff_timestamp: str, mig_batch_id: int, coli mig_sql = get_identifiers_per_batch(mig_batch_id) with create_engine(cfg.SQLALCHEMY_DATABASE_URI_COLIN_MIGR).connect() as conn: row = conn.execute(text(mig_sql)).fetchone() - - corp_list = corpnum_to_oracle_ids(row[0]) if row else None - colin_sql = get_updated_identifiers_for_batch(cutoff_timestamp, str(corp_list), chunk_size) + + corp_list = corpnum_to_oracle_ids(row[0]) if row and row[0] else None + if not corp_list: + raise ValueError(f'no corp identifiers found for mig_batch_id={mig_batch_id}') + + colin_sql = get_updated_identifiers_for_batch( + cutoff_timestamp, + corp_list, + chunk_size=chunk_size, + lookback_hours=lookback_hours, + ) with colin_oracle_engine.connect() as conn: result = conn.execute(text(colin_sql)) rows = [dict(row) for row in result.mappings()] return rows - + @task(name='Run-CPRD-Subset-Generator', cache_policy=NO_CACHE) def run_cprd_subset_extract_generator( corp_file: str, @@ -148,15 +231,18 @@ def run_cprd_subset_extract_generator( pg_disable_method: str, out: str | None, include_cp: bool = False, + include_cars: bool = False, + source_connection: str = 'cprd', target_connection: str = 'ctst_pg', + target_schema: str = 'public', prefix_numeric_bc: bool = False, ) -> subprocess.CompletedProcess: """ Generate Commands """ require_file(_SCRIPT_PATH, 'Generated script') - corp_path =require_file(corp_file, 'Corp list file') - + corp_path = require_file(corp_file, 'Corp list file') + argv = [ sys.executable, str(_SCRIPT_PATH), @@ -170,8 +256,14 @@ def run_cprd_subset_extract_generator( str(threads), '--pg-disable-method', pg_disable_method, + '--source-connection', + source_connection, + '--target-connection', + target_connection, + '--target-schema', + target_schema, ] - argv.extend(['--target-connection', target_connection]) + argv.append('--include-cars' if include_cars else '--no-cars') if pg_fastload: 
argv.append('--pg-fastload') if include_cp: @@ -181,7 +273,7 @@ def run_cprd_subset_extract_generator( out_path = _resolve_master_script_path(out) out_path.parent.mkdir(parents=True, exist_ok=True) argv.extend(['--out', str(out_path)]) - + return subprocess.run( argv, cwd=str(_REPO_ROOT), @@ -189,6 +281,7 @@ def run_cprd_subset_extract_generator( text=True, ) + @task(name='DBSchemaCLI', cache_policy=NO_CACHE) def run_dbschemacli_task(master_script: str, dbschemacli_cmd: str = 'dbschemacli') -> subprocess.CompletedProcess: master_script_path = Path(master_script) @@ -202,8 +295,9 @@ def run_dbschemacli_task(master_script: str, dbschemacli_cmd: str = 'dbschemacli text=True, ) + @task(name='Refresh-Views', cache_policy=NO_CACHE) -def run_refresh_views(mode: str = 'refresh', targets: str = 'all') -> subprocess.CompletedProcess: +def run_refresh_views(mode: str = 'refresh', targets: str = 'all') -> subprocess.CompletedProcess: cfg = get_named_config() script = require_file(_REFRESH_VIEWS_SCRIPT, 'refresh_colin_extract_views.sh') argv = [ @@ -213,74 +307,96 @@ def run_refresh_views(mode: str = 'refresh', targets: str = 'all') -> subproces '--db', cfg.DB_NAME_COLIN_MIGR, '--host', cfg.DB_HOST_COLIN_MIGR, '--port', str(cfg.DB_PORT_COLIN_MIGR), - '--user', cfg.DB_USER_COLIN_MIGR + '--user', cfg.DB_USER_COLIN_MIGR, ] run_env = dict(os.environ) if cfg.DB_PASSWORD_COLIN_MIGR and 'PGPASSWORD' not in run_env: run_env['PGPASSWORD'] = cfg.DB_PASSWORD_COLIN_MIGR print(f'Running: {" ".join(argv)}') - return subprocess.run(argv, - cwd=str(_REPO_ROOT), - capture_output=False, - text=True, - env=run_env - ) + return subprocess.run( + argv, + cwd=str(_REPO_ROOT), + capture_output=False, + text=True, + env=run_env, + ) @flow(name='Extract-Subset-Flow', log_prints=True, persist_result=False) def extract_pull_flow( - corp_file: str, - mode: str = 'load', + corp_file: str | None = None, + mode: str = 'refresh', chunk_size: int = 900, threads: int = 4, pg_fastload: bool = False, 
pg_disable_method: str = 'table_triggers', - out: str | None=None, + out: str | None = None, run_dbschemacli: bool = False, dbschemacli_cmd: str = 'dbschemacli', - refresh_views: bool = True, + refresh_views: bool = False, reset_extract_postgres: bool = True, include_cp: bool = False, + include_cars: bool = False, + source_connection: str = 'cprd', target_connection: str = 'ctst_pg', + target_schema: str = 'public', + mig_batch_id: int = 1, + lookback_hours: int = 5, + artifact_dir: str | None = None, ) -> None: """ Generate files """ + if mode == 'load' and not corp_file: + raise ValueError('load mode requires --corp-file') if mode == 'refresh': reset_extract_postgres = False print('Running in refresh mode: skipping Postgres DB reset') if reset_extract_postgres: cleanup_extract_postgres_db() - - cutoff = get_cuttoff_timestamp() - config = get_config() - colin_oracle_engine = colin_oracle_init(config) - # Get Identifiers + artifact_path = _resolve_artifact_dir(artifact_dir) + if artifact_path is not None: + artifact_path.mkdir(parents=True, exist_ok=True) + out = _default_out_for_artifact(artifact_dir, mode, out) + feed_path: Path | None = None - if mode == 'refresh': - updated_rows = get_updated_identifiers_colin(cutoff_timestamp=cutoff, mig_batch_id=1, colin_oracle_engine=colin_oracle_engine, chunk_size=chunk_size) + delete_feed = False + updated_corp_nums: list[str] = [] + if mode == 'refresh' and not corp_file: + cutoff = get_cuttoff_timestamp() + config = get_config() + colin_oracle_engine = colin_oracle_init(config) + updated_rows = get_updated_identifiers_colin( + cutoff_timestamp=cutoff, + mig_batch_id=mig_batch_id, + colin_oracle_engine=colin_oracle_engine, + chunk_size=chunk_size, + lookback_hours=lookback_hours, + ) print(f'Colin updated identifiers : {len(updated_rows)} rows') - _GENERATED_DIR.mkdir(parents=True, exist_ok=True) - feed_path = _GENERATED_DIR / f'refresh_corp_feed_{os.getpid()}.tmp' - seen = set() - lines = [] - updated_corp_nums = [] - 
for row in updated_rows: - for k, v in row.items(): - if k is None or v is None: - continue - if str(k).lower() == 'corp_num': - c = str(v).strip() - if c and c not in seen: - seen.add(c) - lines.append(c) - updated_corp_nums.append('BC'+c) - break - if not lines: - raise ValueError('refresh: no corp_num in updated_rows') - feed_path.write_text('\n'.join(lines) + '\n', encoding='utf-8') + feed_lines, updated_corp_nums = _extract_refresh_corp_nums(updated_rows) + if not feed_lines: + print( + 'Refresh found no updated corp identifiers to extract; ' + 'skipping generator, DbSchemaCLI, pruning, and materialized-view refresh.' + ) + return + if artifact_path is not None: + feed_path = artifact_path / f'refresh_corp_feed_{os.getpid()}_{uuid4().hex}.txt' + else: + _GENERATED_DIR.mkdir(parents=True, exist_ok=True) + feed_path = _GENERATED_DIR / f'refresh_corp_feed_{os.getpid()}.tmp' + delete_feed = True + feed_path.write_text('\n'.join(feed_lines) + '\n', encoding='utf-8') corp_file = str(feed_path) + elif mode == 'refresh' and corp_file: + updated_corp_nums = [ + normalized + for line in Path(corp_file).read_text(encoding='utf-8').splitlines() + if (normalized := _normalize_target_corp_num(line)) + ] + result: subprocess.CompletedProcess | None = None print(f'Running CPRD subset extract generator {corp_file}') try: @@ -291,18 +407,21 @@ def extract_pull_flow( threads=threads, pg_fastload=pg_fastload, include_cp=include_cp, + include_cars=include_cars, pg_disable_method=pg_disable_method, out=out, + source_connection=source_connection, target_connection=target_connection, - prefix_numeric_bc=(mode=='refresh'), + target_schema=target_schema, + prefix_numeric_bc=(mode == 'refresh'), ) finally: - if feed_path is not None: + if feed_path is not None and delete_feed: feed_path.unlink(missing_ok=True) - if result.returncode != 0 and result is not None: + if result is not None and result.returncode != 0: raise RuntimeError(f'Generator exited with code {result.returncode}') - 
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the extract flow.

    Every option's destination is passed by ``main`` as a keyword argument
    to ``extract_pull_flow``.  Boolean switches come in explicit on/off
    pairs with their defaults set via ``set_defaults``.
    """
    parser = argparse.ArgumentParser(description='Run Extract-Pull flow....')
    add = parser.add_argument

    # Input selection and sizing.
    add('--corp-file', '--corp_file', dest='corp_file', default=None,
        help='Path to newline-delimited corp identifiers')
    add('--mode', default='refresh', choices=('refresh', 'load'))
    add('--chunk-size', type=_positive_int, default=900, help='Max items per IN list.')
    add('--threads', type=_positive_int, default=4, help='DBSchemaCLI transfer threads')

    # Generator behaviour.
    add('--pg-fastload', action='store_true', help='Enable Postgres fast-load')
    add('--include-cp', action='store_true', help='Include corp type CP in subset extract queries')
    add('--include-cars', dest='include_cars', action='store_true', help='Include global cars* refresh')
    add('--no-cars', dest='include_cars', action='store_false', help='Skip global cars* refresh')
    parser.set_defaults(include_cars=False)
    add('--pg-disable-method', default='table_triggers', choices=('table_triggers', 'replica_role'))

    # Output locations.
    add('--out', default=None, help='Output path for generated master script.')
    add('--artifact-dir', default=None,
        help='Directory for retained run artifacts/replay feed and default generated master script')

    # Post-generation steps.
    add('--run-dbschemacli', action='store_true', default=False,
        help='Run DbSchemaCLI after generating the master script')
    add('--refresh-views', dest='refresh_views', action='store_true',
        help='Refresh materialized views from this flow (normally handled by the OCP wrapper)')
    add('--no-refresh-views', dest='refresh_views', action='store_false',
        help='Do not refresh materialized views from this flow')
    parser.set_defaults(refresh_views=False)
    add('--dbschemacli-cmd', default='dbschemacli')
    add('--reset-extract-postgres', dest='reset_extract_postgres', action='store_true',
        help='Reset/rebuild extract Postgres before load mode')
    add('--no-reset-extract-postgres', dest='reset_extract_postgres', action='store_false',
        help='Do not reset/rebuild extract Postgres')
    parser.set_defaults(reset_extract_postgres=True)

    # Connections and refresh window.
    add('--source-connection', default='cprd')
    add('--target-connection', default='ctst_pg')
    add('--target-schema', default='public')
    add('--mig-batch-id', type=_positive_int, default=1)
    add('--lookback-hours', type=_positive_int, default=5)
    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the extract flow.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` once the flow call returns.
    """
    namespace = build_arg_parser().parse_args(argv)
    flow_kwargs = vars(namespace)
    extract_pull_flow(**flow_kwargs)
    return 0
+- The OCP wrapper/flow requires explicit source alias, target alias, and target schema values and passes them explicitly to the generator. + Generated scripts (gitignored by default): - **Main script**: runs templates + the per-chunk scripts, e.g. `data-tool/scripts/_generated/subset_refresh.sql` - **Chunk scripts**: generated per chunk in `data-tool/scripts/_generated/chunks/` @@ -263,8 +273,11 @@ Generated scripts (gitignored by default): --mode refresh \ --chunk-size 500 \ --threads 4 \ - --pg-fastload \ - --pg-disable-method table_triggers \ + --pg-fastload \ + --pg-disable-method table_triggers \ + --source-connection cprd \ + --target-connection cprd_pg_subset \ + --target-schema public \ --out /data-tool/scripts/_generated/subset_refresh.sql ``` @@ -276,8 +289,11 @@ Generated scripts (gitignored by default): --chunk-size 500 \ --threads 4 \ --include-cp \ - --pg-fastload \ - --pg-disable-method table_triggers \ + --pg-fastload \ + --pg-disable-method table_triggers \ + --source-connection cprd \ + --target-connection cprd_pg_subset \ + --target-schema public \ --out /data-tool/scripts/_generated/subset_refresh.sql ``` @@ -288,25 +304,43 @@ Generated scripts (gitignored by default): --mode load \ --chunk-size 500 \ --threads 4 \ - --pg-fastload \ - --pg-disable-method table_triggers \ + --pg-fastload \ + --pg-disable-method table_triggers \ + --source-connection cprd \ + --target-connection cprd_pg_subset \ + --target-schema public \ --out /data-tool/scripts/_generated/subset_load.sql ``` + Required/connection flags to set deliberately: + - Add `--source-connection <alias>` if your source Oracle DbSchema alias is not the direct generator default `cprd`. + - Add `--target-connection <alias>` if your target Postgres DbSchema alias is not the direct generator default `cprd_pg_subset`. + - Add `--target-schema <schema>` if your target Postgres schema is not the direct generator default `public`. The schema must already exist and contain the expected extract/helper tables.
+ + Local Makefile shortcuts for the refresh flow now expose the same connection/schema controls: + ```bash + make -C /data-tool run-extract-refresh \ + SOURCE_CONNECTION=cprd \ + TARGET_CONNECTION=ctst_pg \ + TARGET_SCHEMA=public + ``` + Override these values when your DbSchema `init.sql` aliases or target Postgres schema differ. The direct generator default target alias is `cprd_pg_subset`; the local Makefile keeps its historical `TARGET_CONNECTION=ctst_pg` default for compatibility. + Optional flags: - Add `--include-cp` to opt in corp type `CP` for the subset transfer queries. - `--include-cp` affects the **subset workflow only**. Full refresh (`transfer_cprd_corps.sql`) and downstream reservation flows still use the historical corp-type cohort unless updated separately. + - Add `--no-cars` to skip the global cars* refresh, or `--include-cars` to make the default cars* refresh explicit. Optional performance flags: - Add `--pg-fastload` to enable Postgres session settings for faster bulk writes (templates `subset_pg_fastload_begin.sql` / `subset_pg_fastload_end.sql`). - `--pg-disable-method` currently accepts only `table_triggers` and `replica_role`. - The actual generator default is `table_triggers`, not `replica_role`. - - In refresh mode, preserved rows in `corp_processing`, `auth_processing`, `affiliation_processing`, and `colin_tracking` still reference `corporation` / `event`, so FK enforcement must stay suppressed across delete/reload. The generator now adds refresh-only trigger suppression for those preserved FK-owning tables when `--pg-disable-method table_triggers` is used. + - In refresh mode, preserved rows in `corp_processing`, `auth_processing`, `affiliation_processing`, and `colin_tracking` still reference `corporation` / `event`, so FK enforcement must stay suppressed across delete/reload. Current generator table-trigger mode suppresses `corp_processing` and `colin_tracking`. 
`auth_processing` exists in current DDL and was introduced in the generator as a commented-out trigger statement with no discoverable repo-history rationale; `affiliation_processing` also exists in current DDL/docs but is absent from generator trigger handling. Extending table-trigger suppression to those two tables is deferred until Cloud SQL privilege/runtime validation confirms the intended preserved-table set. - Subset load/refresh scripts now acquire a session-level Postgres advisory lock at the start and release it at the end so overlapping subset runs on the same target DB serialize instead of interleaving. The same lock key is also used by the full refresh script. - `table_triggers` changes table trigger state globally while the refresh runs, so use it against a quiesced/disposable extract DB and with a role that can disable the relevant triggers. - If you use `replica_role`, remember it is session-local. If FK errors still occur, verify `current_setting('session_replication_role')` inside the nested delete/purge scripts being executed by DbSchemaCLI. - - `address` is treated as a shared/global table during subset refresh/load. The generator now reuses the predeclared helper staging table `public.subset_address_stage`, transfers incoming Oracle addresses into it, and merges them into `public.address` by `addr_id` instead of deleting/reinserting address rows directly. The address extract also includes `notification_resend` references. - - BCOMPS purge keysets also use predeclared helper tables in the extract schema: `public.subset_excluded_corps`, `public.subset_excluded_events`, and `public.subset_excluded_corp_parties`. + - `address` is treated as a shared/global table during subset refresh/load. The generator now reuses the predeclared helper staging table `<target_schema>.subset_address_stage`, transfers incoming Oracle addresses into it, and merges them into `<target_schema>.address` by `addr_id` instead of deleting/reinserting address rows directly.
The address extract also includes `notification_resend` references. + - BCOMPS purge keysets also use predeclared helper tables in the extract schema: `<target_schema>.subset_excluded_corps`, `<target_schema>.subset_excluded_events`, and `<target_schema>.subset_excluded_corp_parties`. - Do not overlap subset runs against the same target DB, and ensure the runtime role can truncate/read/write those helper tables. - Existing extract DBs created from older DDL must be refreshed or updated with the latest `colin_corps_extract_postgres_ddl` before running the subset workflow, otherwise the first helper-table `TRUNCATE` will fail. - Refresh mode now pre-cleans orphan event/corp-party child rows that can survive the parent-keyed delete phase from earlier failed/interleaved runs (for example a stale `filing` row whose parent `event` row is missing in target). @@ -330,7 +364,9 @@ Generated scripts (gitignored by default): - Subset scripts automatically install a Postgres helper cast to allow DbSchemaCLI inserts of `t/f` into boolean columns. - DbSchemaCLI build issue: If you get errors about `"bsh: for: No collection"` ensure you are using DbSchemaCLI 9.4.3+. - If `--no-cars` is used, cars* tables are skipped entirely. -- Chunk templates are rendered at generation time (inline mode), so the resulting SQL is self-contained. +- The direct refresh flow CLI uses a rolling lookback (`--lookback-hours`, default 5) scoped by `--mig-batch-id` (default 1). This is not durable high-watermark processing; retained artifact corp feeds should be used for recovery/reruns of failed runs. +- Chunk templates are rendered at generation time (inline mode), so the resulting SQL is self-contained for rendered predicates and chunk SQL. +- Generated SQL connects by DbSchema alias and renders the selected target schema at generation time.
The source alias defaults to `cprd`, the target alias defaults to `cprd_pg_subset`, and the target schema defaults to `public` for direct generator use unless `--source-connection`, `--target-connection`, or `--target-schema` are passed. Ensure the selected aliases exist in `~/.DbSchema/cli/init.sql` and the selected schema exists in the target Postgres DB before running DbSchemaCLI. - If you need legacy runtime substitution behavior, generate with `--render-mode vset` (uses DbSchemaCLI `vset` + &placeholders). ### Oracle IN-list strategy (`--oracle-in-strategy`) diff --git a/data-tool/scripts/generate_cprd_subset_extract.py b/data-tool/scripts/generate_cprd_subset_extract.py index 479aba2f58..e834edfb43 100644 --- a/data-tool/scripts/generate_cprd_subset_extract.py +++ b/data-tool/scripts/generate_cprd_subset_extract.py @@ -31,7 +31,7 @@ import argparse import re -from dataclasses import dataclass +from dataclasses import dataclass, replace from enum import Enum from pathlib import Path from typing import Dict, Iterable, List, Sequence @@ -86,6 +86,7 @@ class cfg_GenerationConfig: out_master: Path out_chunks_dir: Path + source_connection: str target_connection: str target_schema: str @@ -98,6 +99,10 @@ class cfg_GenerationConfig: TMPL_TOKEN_TARGET_PRED = "&target_corp_num_predicate" # used by transfer template (Oracle-side) TMPL_TOKEN_ORACLE_PRED = "&oracle_corp_num_predicate" # used by transfer template (Oracle-side) TMPL_TOKEN_ORACLE_CORP_TYPE_PRED = "&oracle_corp_type_predicate" # used by transfer template (Oracle-side) +TMPL_TOKEN_SOURCE_CONNECTION = "__DBSCHEMA_SOURCE_CONNECTION__" # rendered generator-time source DbSchema alias +TMPL_TOKEN_TARGET_SCHEMA = "__DBSCHEMA_TARGET_SCHEMA__" # rendered generator-time target Postgres schema +TMPL_CONNECTION_TOKENS = (TMPL_TOKEN_SOURCE_CONNECTION, TMPL_TOKEN_TARGET_SCHEMA) +DBSCHEMA_IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") @dataclass(frozen=True) @@ -306,26 +311,32 @@ def tmpl_default_bundle(repo_root: 
Path) -> tmpl_TemplateBundle: pg_prepare_address_stage = tmpl_TemplateSpec( name="subset_pg_prepare_address_stage", path=subset_dir / "subset_pg_prepare_address_stage.sql", + required_tokens=(TMPL_TOKEN_TARGET_SCHEMA,), ) pg_cleanup_address_stage = tmpl_TemplateSpec( name="subset_pg_cleanup_address_stage", path=subset_dir / "subset_pg_cleanup_address_stage.sql", + required_tokens=(TMPL_TOKEN_TARGET_SCHEMA,), ) pg_cleanup_orphan_children = tmpl_TemplateSpec( name="subset_pg_cleanup_orphan_children", path=subset_dir / "subset_pg_cleanup_orphan_children.sql", + required_tokens=(TMPL_TOKEN_TARGET_SCHEMA,), ) disable_triggers = tmpl_TemplateSpec( name="subset_disable_triggers", path=subset_dir / "subset_disable_triggers.sql", + required_tokens=(TMPL_TOKEN_TARGET_SCHEMA,), ) enable_triggers = tmpl_TemplateSpec( name="subset_enable_triggers", path=subset_dir / "subset_enable_triggers.sql", + required_tokens=(TMPL_TOKEN_TARGET_SCHEMA,), ) pg_boolean_casts = tmpl_TemplateSpec( name="subset_pg_boolean_casts", path=subset_dir / "subset_pg_boolean_casts.sql", + required_tokens=(TMPL_TOKEN_TARGET_SCHEMA,), ) pg_fastload_begin = tmpl_TemplateSpec( name="subset_pg_fastload_begin", @@ -338,24 +349,33 @@ def tmpl_default_bundle(repo_root: Path) -> tmpl_TemplateBundle: pg_purge_bcomps_excluded = tmpl_TemplateSpec( name="subset_pg_purge_bcomps_excluded", path=subset_dir / "subset_pg_purge_bcomps_excluded.sql", + required_tokens=(TMPL_TOKEN_TARGET_SCHEMA,), ) delete_chunk = tmpl_TemplateSpec( name="subset_delete_chunk", path=subset_dir / "subset_delete_chunk.sql", - required_tokens=(TMPL_TOKEN_CORP_IDS,), + required_tokens=(TMPL_TOKEN_CORP_IDS, TMPL_TOKEN_TARGET_SCHEMA), ) transfer_chunk = tmpl_TemplateSpec( name="subset_transfer_chunk", path=subset_dir / "subset_transfer_chunk.sql", - required_tokens=(TMPL_TOKEN_TARGET_PRED, TMPL_TOKEN_ORACLE_PRED, TMPL_TOKEN_ORACLE_CORP_TYPE_PRED), + required_tokens=( + TMPL_TOKEN_TARGET_PRED, + TMPL_TOKEN_ORACLE_PRED, + 
TMPL_TOKEN_ORACLE_CORP_TYPE_PRED, + TMPL_TOKEN_SOURCE_CONNECTION, + TMPL_TOKEN_TARGET_SCHEMA, + ), ) delete_cars = tmpl_TemplateSpec( name="subset_delete_cars", path=subset_dir / "subset_delete_cars.sql", + required_tokens=(TMPL_TOKEN_TARGET_SCHEMA,), ) transfer_cars = tmpl_TemplateSpec( name="subset_transfer_cars", path=subset_dir / "subset_transfer_cars.sql", + required_tokens=(TMPL_TOKEN_SOURCE_CONNECTION, TMPL_TOKEN_TARGET_SCHEMA), ) return tmpl_TemplateBundle( @@ -405,6 +425,45 @@ def tmpl_render(template_text: str, *, replacements: Dict[str, str]) -> str: return out +def cfg_validate_dbschema_identifier(name: str, value: str) -> str: + if not value: + raise SystemExit(f"{name} must not be empty") + if not DBSCHEMA_IDENTIFIER_RE.match(value): + raise SystemExit( + f"{name} must be a conservative DbSchema/Postgres identifier using letters, digits, " + "and underscore, and must not start with a digit" + ) + return value + + +def cfg_validate_pg_schema_identifier(name: str, value: str) -> str: + value = cfg_validate_dbschema_identifier(name, value) + if value != value.lower(): + raise SystemExit( + f"{name} must be lowercase. PostgreSQL folds unquoted uppercase identifiers to lowercase, " + "so uppercase schema values are rejected to avoid mismatches." 
+ ) + return value + + +def tmpl_connection_replacements(cfg: cfg_GenerationConfig) -> Dict[str, str]: + return { + TMPL_TOKEN_SOURCE_CONNECTION: cfg.source_connection, + TMPL_TOKEN_TARGET_SCHEMA: cfg.target_schema, + } + + +def tmpl_assert_no_connection_tokens(spec: tmpl_TemplateSpec, rendered_text: str) -> None: + remaining = [token for token in TMPL_CONNECTION_TOKENS if token in rendered_text] + if remaining: + raise SystemExit( + "Template source/schema token rendering failed.\n" + f"Template: {spec.name}\n" + f"Path: {spec.path}\n" + f"Remaining token(s): {', '.join(remaining)}\n" + ) + + # ========================= # chunk_* (chunk planning) # ========================= @@ -446,9 +505,94 @@ def chunk_plan_chunks( # ========================= def gen_write_text(path: Path, text: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) path.write_text(text, encoding="utf-8") +def gen_write_rendered_connection_template( + *, + cfg: cfg_GenerationConfig, + spec: tmpl_TemplateSpec, + output_dir: Path, +) -> tmpl_TemplateSpec: + template_text = tmpl_load_text(spec) + rendered_text = tmpl_render(template_text, replacements=tmpl_connection_replacements(cfg)) + tmpl_assert_no_connection_tokens(spec, rendered_text) + rendered_path = output_dir / spec.path.name + gen_write_text(rendered_path, rendered_text) + return replace( + spec, + path=rendered_path, + required_tokens=tuple(token for token in spec.required_tokens if token not in TMPL_CONNECTION_TOKENS), + ) + + +def gen_write_rendered_connection_templates( + *, + cfg: cfg_GenerationConfig, + templates: tmpl_TemplateBundle, +) -> tmpl_TemplateBundle: + support_dir = cfg.out_chunks_dir / "support" + return replace( + templates, + pg_prepare_address_stage=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.pg_prepare_address_stage, + output_dir=support_dir, + ), + pg_cleanup_address_stage=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.pg_cleanup_address_stage, + 
output_dir=support_dir, + ), + pg_cleanup_orphan_children=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.pg_cleanup_orphan_children, + output_dir=support_dir, + ), + disable_triggers=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.disable_triggers, + output_dir=support_dir, + ), + enable_triggers=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.enable_triggers, + output_dir=support_dir, + ), + pg_boolean_casts=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.pg_boolean_casts, + output_dir=support_dir, + ), + pg_purge_bcomps_excluded=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.pg_purge_bcomps_excluded, + output_dir=support_dir, + ), + delete_chunk=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.delete_chunk, + output_dir=support_dir, + ), + transfer_chunk=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.transfer_chunk, + output_dir=support_dir, + ), + delete_cars=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.delete_cars, + output_dir=support_dir, + ), + transfer_cars=gen_write_rendered_connection_template( + cfg=cfg, + spec=templates.transfer_cars, + output_dir=support_dir, + ), + ) + + def gen_build_chunk_sql( *, chunk: chunk_ChunkSpec, @@ -497,7 +641,9 @@ def gen_build_chunk_sql( rendered_transfer = tmpl_render(transfer_template_text, replacements=replacements) if (TMPL_TOKEN_TARGET_PRED in rendered_transfer or TMPL_TOKEN_ORACLE_PRED in rendered_transfer or - TMPL_TOKEN_ORACLE_CORP_TYPE_PRED in rendered_transfer): + TMPL_TOKEN_ORACLE_CORP_TYPE_PRED in rendered_transfer or + TMPL_TOKEN_SOURCE_CONNECTION in rendered_transfer or + TMPL_TOKEN_TARGET_SCHEMA in rendered_transfer): raise SystemExit( f"Internal error: token(s) remained after rendering transfer template for chunk {chunk.index:03d}." 
) @@ -561,9 +707,12 @@ def _gen_emit_pg_disable_begin(lines: List[str], *, cfg: cfg_GenerationConfig, t lines.append(f"execute {templates.disable_triggers.path.as_posix()}") if cfg.mode == cfg_GenerationMode.REFRESH: lines.append("-- Refresh-only: preserved processing/tracking tables still reference corporation/event rows.") - lines.append("ALTER TABLE corp_processing DISABLE TRIGGER ALL;") - # lines.append("ALTER TABLE auth_processing DISABLE TRIGGER ALL;") - lines.append("ALTER TABLE colin_tracking DISABLE TRIGGER ALL;") + lines.append(f"ALTER TABLE {cfg.target_schema}.corp_processing DISABLE TRIGGER ALL;") + lines.append("-- Deferred: DDL/docs identify auth_processing and affiliation_processing as preserved FK-owning") + lines.append("-- tables too, but auth_processing was introduced here as commented-out with no repo-history") + lines.append("-- rationale. Do not extend table-trigger suppression until Cloud SQL privilege/runtime") + lines.append("-- validation confirms the intended preserved-table set.") + lines.append(f"ALTER TABLE {cfg.target_schema}.colin_tracking DISABLE TRIGGER ALL;") lines.append("") return @@ -581,9 +730,9 @@ def _gen_emit_pg_disable_end(lines: List[str], *, cfg: cfg_GenerationConfig, tem lines.append(f"execute {templates.enable_triggers.path.as_posix()}") if cfg.mode == cfg_GenerationMode.REFRESH: lines.append("-- Refresh-only: restore preserved processing/tracking table triggers too.") - lines.append("ALTER TABLE corp_processing ENABLE TRIGGER ALL;") - # lines.append("ALTER TABLE auth_processing ENABLE TRIGGER ALL;") - lines.append("ALTER TABLE colin_tracking ENABLE TRIGGER ALL;") + lines.append(f"ALTER TABLE {cfg.target_schema}.corp_processing ENABLE TRIGGER ALL;") + lines.append("-- Deferred: see matching disable-side note for auth_processing/affiliation_processing.") + lines.append(f"ALTER TABLE {cfg.target_schema}.colin_tracking ENABLE TRIGGER ALL;") lines.append("") return @@ -646,8 +795,8 @@ def gen_build_master_script_inline( 
lines.append(f"learn schema {cfg.target_schema};") lines.append("") - lines.append("truncate table public.colin_extract_version; " - "insert into public.colin_extract_version (extracted_at) values (current_timestamp); " + lines.append(f"truncate table {cfg.target_schema}.colin_extract_version; " + f"insert into {cfg.target_schema}.colin_extract_version (extracted_at) values (current_timestamp); " ) lines.append("") @@ -750,8 +899,8 @@ def gen_build_master_script_vset( lines.append(f"learn schema {cfg.target_schema};") lines.append("") - lines.append("truncate table public.colin_extract_version; " - "insert into public.colin_extract_version (extracted_at) values (current_timestamp); " + lines.append(f"truncate table {cfg.target_schema}.colin_extract_version; " + f"insert into {cfg.target_schema}.colin_extract_version (extracted_at) values (current_timestamp); " ) lines.append("") @@ -773,10 +922,6 @@ def gen_build_master_script_vset( _gen_emit_pg_disable_begin(lines, cfg=cfg, templates=templates) _gen_emit_refresh_fk_note(lines, cfg=cfg) - if cfg.mode == cfg_GenerationMode.REFRESH: - lines.append("-- Cleanup stale orphan child rows before chunked refresh deletes.") - lines.append(f"execute {templates.pg_cleanup_orphan_children.path.as_posix()}") - lines.append("") if cfg.include_cars: lines.append("-- global cars* refresh (not corp-scoped; full dataset truncate + reload)") @@ -909,6 +1054,12 @@ def cli_parse_args(argv: List[str] | None = None) -> argparse.Namespace: action="store_true", help="If set, any all-numeric corp id lines will be normalized to BC for the TARGET/Postgres corp_num.", ) + parser.add_argument( + "--include-cars", + dest="include_cars", + action="store_true", + help="Include global cars* refresh step (carsfile/carsbox/carsrept/carindiv; generator default).", + ) parser.add_argument( "--no-cars", dest="include_cars", @@ -951,6 +1102,11 @@ def cli_parse_args(argv: List[str] | None = None) -> argparse.Namespace: "session_replication_role in the 
master script and nested execute files.", ) + parser.add_argument( + "--source-connection", + default="cprd", + help="DbSchemaCLI connection name for the SOURCE Oracle DB (default: cprd).", + ) parser.add_argument( "--target-connection", default="cprd_pg_subset", @@ -994,16 +1150,19 @@ def cfg_build_config(args: argparse.Namespace) -> cfg_GenerationConfig: ) out_master.parent.mkdir(parents=True, exist_ok=True) - # Chunk scripts dir is always derived from master output stem for determinism. + # Chunk/support scripts dir is always derived from master output stem for determinism. out_chunks_dir = out_master.parent / f"{out_master.stem}_chunks" - if render_mode == cfg_RenderMode.INLINE: - out_chunks_dir.mkdir(parents=True, exist_ok=True) + out_chunks_dir.mkdir(parents=True, exist_ok=True) if args.chunk_size <= 0: raise SystemExit("--chunk-size must be > 0") if args.threads <= 0: raise SystemExit("--threads must be > 0") + source_connection = cfg_validate_dbschema_identifier("--source-connection", str(args.source_connection)) + target_connection = cfg_validate_dbschema_identifier("--target-connection", str(args.target_connection)) + target_schema = cfg_validate_pg_schema_identifier("--target-schema", str(args.target_schema)) + return cfg_GenerationConfig( repo_root=repo_root, corp_file=corp_file, @@ -1021,8 +1180,9 @@ def cfg_build_config(args: argparse.Namespace) -> cfg_GenerationConfig: or_of_in_max_ids=or_of_in_max_ids, out_master=out_master, out_chunks_dir=out_chunks_dir, - target_connection=str(args.target_connection), - target_schema=str(args.target_schema), + source_connection=source_connection, + target_connection=target_connection, + target_schema=target_schema, ) @@ -1034,6 +1194,7 @@ def _effective_oracle_strategy(cfg: cfg_GenerationConfig, total_ids: int) -> cfg def run(cfg: cfg_GenerationConfig) -> int: templates = tmpl_default_bundle(cfg.repo_root) + templates = gen_write_rendered_connection_templates(cfg=cfg, templates=templates) if 
cfg.pg_debug_session_probes and cfg.render_mode != cfg_RenderMode.INLINE: raise SystemExit("--pg-debug-session-probes currently supports only --render-mode inline.") @@ -1157,6 +1318,9 @@ def run(cfg: cfg_GenerationConfig) -> int: print(" - Corp ids in the file should match the TARGET Postgres extract corp_num format (e.g. BC0460007).") print(" - If you have numeric-only corp ids, consider --prefix-numeric-bc.") print(f" - corp ids: {n_ids} => ceil({n_ids}/{cfg.chunk_size}) = {in_groups} chunk(s)") + print(f" - Source DbSchema connection: {cfg.source_connection}") + print(f" - Target DbSchema connection: {cfg.target_connection}") + print(f" - Target schema: {cfg.target_schema}") print(f" - Oracle IN-list handling: {effective_strategy.value} (configured: {cfg.oracle_in_strategy.value})") print(f" - chunk-size (max items per IN list): {cfg.chunk_size}") if effective_strategy == cfg_OracleInStrategy.CHUNK_FILES: @@ -1176,7 +1340,7 @@ def run(cfg: cfg_GenerationConfig) -> int: print(f" - Postgres fast-load session settings: {'ENABLED' if cfg.pg_fastload else 'disabled'} (--pg-fastload)") print(f" - Postgres trigger suppression: {cfg.pg_disable_method.value} (--pg-disable-method)") print(" - subset runs acquire a session-level advisory lock on the target DB to prevent overlap.") - print(" - Address loads use the predeclared helper table public.subset_address_stage and merge into public.address by addr_id.") + print(f" - Address loads use the predeclared helper table {cfg.target_schema}.subset_address_stage and merge into {cfg.target_schema}.address by addr_id.") print(" - BCOMPS purge keysets also use predeclared helper tables in the extract schema (subset_excluded_*).") print(" - subset runs should not overlap on the same target DB, and the runtime role must be able to truncate/read/write those helper tables.") if cfg.pg_debug_session_probes: @@ -1215,6 +1379,9 @@ def run(cfg: cfg_GenerationConfig) -> int: print("Notes:") print(" - This script relies on DbSchemaCLI 
vset variables and runtime substitution.") print(f" - corp ids: {n_ids} => ceil({n_ids}/{cfg.chunk_size}) = {in_groups} chunk(s)") + print(f" - Source DbSchema connection: {cfg.source_connection}") + print(f" - Target DbSchema connection: {cfg.target_connection}") + print(f" - Target schema: {cfg.target_schema}") print(f" - Oracle IN-list handling: {effective_strategy.value} (configured: {cfg.oracle_in_strategy.value})") print(f" - chunk-size (max items per IN list): {cfg.chunk_size}") if effective_strategy == cfg_OracleInStrategy.CHUNK_FILES: @@ -1230,7 +1397,7 @@ def run(cfg: cfg_GenerationConfig) -> int: print(f" - Postgres fast-load session settings: {'ENABLED' if cfg.pg_fastload else 'disabled'} (--pg-fastload)") print(f" - Postgres trigger suppression: {cfg.pg_disable_method.value} (--pg-disable-method)") print(" - subset runs acquire a session-level advisory lock on the target DB to prevent overlap.") - print(" - Address loads use the predeclared helper table public.subset_address_stage and merge into public.address by addr_id.") + print(f" - Address loads use the predeclared helper table {cfg.target_schema}.subset_address_stage and merge into {cfg.target_schema}.address by addr_id.") print(" - BCOMPS purge keysets also use predeclared helper tables in the extract schema (subset_excluded_*).") print(" - subset runs should not overlap on the same target DB, and the runtime role must be able to truncate/read/write those helper tables.") if cfg.pg_debug_session_probes: diff --git a/data-tool/scripts/subset/subset_delete_cars.sql b/data-tool/scripts/subset/subset_delete_cars.sql index ebcbec0ac5..c7b41a6c87 100644 --- a/data-tool/scripts/subset/subset_delete_cars.sql +++ b/data-tool/scripts/subset/subset_delete_cars.sql @@ -4,7 +4,7 @@ -- These tables are NOT corp-scoped, so we truncate the entire dataset and reload from Oracle. -- Volume is low enough that a full refresh is appropriate. 
-TRUNCATE TABLE carindiv; -TRUNCATE TABLE carsrept; -TRUNCATE TABLE carsbox; -TRUNCATE TABLE carsfile; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.carindiv; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.carsrept; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.carsbox; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.carsfile; diff --git a/data-tool/scripts/subset/subset_delete_chunk.sql b/data-tool/scripts/subset/subset_delete_chunk.sql index 45ed54a76d..89dbb71ddf 100644 --- a/data-tool/scripts/subset/subset_delete_chunk.sql +++ b/data-tool/scripts/subset/subset_delete_chunk.sql @@ -4,7 +4,7 @@ -- corp_ids_in : comma-separated SQL string literals for target corp_num values (NO parentheses), -- e.g. 'BC0460007','A1234567' -- --- Intended to be executed from a master DbSchemaCLI script connected to the target Postgres DB (cprd_pg). +-- Intended to be executed from a master DbSchemaCLI script connected to the target Postgres DB (__DBSCHEMA_TARGET_SCHEMA__ schema). -- -- Note: This script intentionally does NOT delete internal migration/processing tables (mig_*, corp_processing, -- colin_tracking, affiliation_processing, etc). It only deletes the corp-scoped COLIN extract tables that are @@ -16,104 +16,104 @@ -- Address rows are treated as shared/global during subset refresh. -- Do not delete them here: subset_transfer_chunk.sql stages incoming Oracle address rows and --- merges them into public.address by addr_id. +-- merges them into the configured target address table by addr_id. -- Delete child tables first (event-scoped). 
-DELETE FROM notification_resend -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.notification_resend +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM notification -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.notification +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM filing_user -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.filing_user +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM payment -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.payment +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM ledger_text -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.ledger_text +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM conv_ledger -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.conv_ledger +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM conv_event -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.conv_event +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM completing_party -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); 
+DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.completing_party +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM submitting_party -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.submitting_party +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM corp_involved_amalgamating -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_involved_amalgamating +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM corp_involved_cont_in -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_involved_cont_in +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM correction -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.correction +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -DELETE FROM filing -WHERE event_id IN (SELECT event_id FROM event WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.filing +WHERE event_id IN (SELECT event_id FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in)); -- Delete corp-party related tables. 
-DELETE FROM party_notification -WHERE party_id IN (SELECT corp_party_id FROM corp_party WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.party_notification +WHERE party_id IN (SELECT corp_party_id FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party WHERE corp_num IN (&corp_ids_in)); -DELETE FROM offices_held -WHERE corp_party_id IN (SELECT corp_party_id FROM corp_party WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.offices_held +WHERE corp_party_id IN (SELECT corp_party_id FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party WHERE corp_num IN (&corp_ids_in)); -DELETE FROM corp_party_relationship -WHERE corp_party_id IN (SELECT corp_party_id FROM corp_party WHERE corp_num IN (&corp_ids_in)); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party_relationship +WHERE corp_party_id IN (SELECT corp_party_id FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party WHERE corp_num IN (&corp_ids_in)); -DELETE FROM corp_party +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party WHERE corp_num IN (&corp_ids_in); -- Delete corp-scoped tables. 
-DELETE FROM office +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.office WHERE corp_num IN (&corp_ids_in); -DELETE FROM corp_name +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_name WHERE corp_num IN (&corp_ids_in); -DELETE FROM corp_state +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_state WHERE corp_num IN (&corp_ids_in); -DELETE FROM corp_comments +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_comments WHERE corp_num IN (&corp_ids_in); -DELETE FROM corp_flag +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_flag WHERE corp_num IN (&corp_ids_in); -DELETE FROM cont_out +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.cont_out WHERE corp_num IN (&corp_ids_in); -DELETE FROM corp_restriction +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_restriction WHERE corp_num IN (&corp_ids_in); -DELETE FROM jurisdiction +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.jurisdiction WHERE corp_num IN (&corp_ids_in); -DELETE FROM resolution +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.resolution WHERE corp_num IN (&corp_ids_in); -DELETE FROM share_series +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.share_series WHERE corp_num IN (&corp_ids_in); -DELETE FROM share_struct_cls +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.share_struct_cls WHERE corp_num IN (&corp_ids_in); -DELETE FROM share_struct +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.share_struct WHERE corp_num IN (&corp_ids_in); -- Delete events last (many tables reference event_id). -DELETE FROM event +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.event WHERE corp_num IN (&corp_ids_in); -- Delete the corp rows last. -DELETE FROM corporation +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corporation WHERE corp_num IN (&corp_ids_in); -- Address rows are refreshed via stage+merge in subset_transfer_chunk.sql. 
diff --git a/data-tool/scripts/subset/subset_disable_triggers.sql b/data-tool/scripts/subset/subset_disable_triggers.sql index 6089f524c8..fd2c945fb3 100644 --- a/data-tool/scripts/subset/subset_disable_triggers.sql +++ b/data-tool/scripts/subset/subset_disable_triggers.sql @@ -1,38 +1,38 @@ -- Disable triggers for corp-scoped tables (subset refresh/load). -- Intended to be executed from a master DbSchemaCLI script while connected to the target Postgres DB. -ALTER TABLE corporation DISABLE TRIGGER ALL; -ALTER TABLE corp_name DISABLE TRIGGER ALL; -ALTER TABLE corp_state DISABLE TRIGGER ALL; -ALTER TABLE event DISABLE TRIGGER ALL; -ALTER TABLE filing DISABLE TRIGGER ALL; -ALTER TABLE filing_user DISABLE TRIGGER ALL; -ALTER TABLE office DISABLE TRIGGER ALL; -ALTER TABLE corp_comments DISABLE TRIGGER ALL; -ALTER TABLE ledger_text DISABLE TRIGGER ALL; -ALTER TABLE corp_party DISABLE TRIGGER ALL; -ALTER TABLE corp_party_relationship DISABLE TRIGGER ALL; -ALTER TABLE offices_held DISABLE TRIGGER ALL; -ALTER TABLE completing_party DISABLE TRIGGER ALL; -ALTER TABLE submitting_party DISABLE TRIGGER ALL; -ALTER TABLE corp_flag DISABLE TRIGGER ALL; -ALTER TABLE cont_out DISABLE TRIGGER ALL; -ALTER TABLE conv_event DISABLE TRIGGER ALL; -ALTER TABLE conv_ledger DISABLE TRIGGER ALL; -ALTER TABLE corp_involved_amalgamating DISABLE TRIGGER ALL; -ALTER TABLE corp_involved_cont_in DISABLE TRIGGER ALL; -ALTER TABLE corp_restriction DISABLE TRIGGER ALL; -ALTER TABLE correction DISABLE TRIGGER ALL; -ALTER TABLE jurisdiction DISABLE TRIGGER ALL; -ALTER TABLE resolution DISABLE TRIGGER ALL; -ALTER TABLE share_series DISABLE TRIGGER ALL; -ALTER TABLE share_struct DISABLE TRIGGER ALL; -ALTER TABLE share_struct_cls DISABLE TRIGGER ALL; -ALTER TABLE notification DISABLE TRIGGER ALL; -ALTER TABLE notification_resend DISABLE TRIGGER ALL; -ALTER TABLE party_notification DISABLE TRIGGER ALL; -ALTER TABLE payment DISABLE TRIGGER ALL; -ALTER TABLE carsfile DISABLE TRIGGER ALL; -ALTER TABLE carsbox 
DISABLE TRIGGER ALL; -ALTER TABLE carsrept DISABLE TRIGGER ALL; -ALTER TABLE carindiv DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corporation DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_name DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_state DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.event DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.filing DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.filing_user DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.office DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_comments DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.ledger_text DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_party DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_party_relationship DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.offices_held DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.completing_party DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.submitting_party DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_flag DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.cont_out DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.conv_event DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.conv_ledger DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_involved_amalgamating DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_involved_cont_in DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_restriction DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.correction DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.jurisdiction DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.resolution DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.share_series DISABLE TRIGGER ALL; +ALTER TABLE 
__DBSCHEMA_TARGET_SCHEMA__.share_struct DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.share_struct_cls DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.notification DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.notification_resend DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.party_notification DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.payment DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.carsfile DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.carsbox DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.carsrept DISABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.carindiv DISABLE TRIGGER ALL; diff --git a/data-tool/scripts/subset/subset_enable_triggers.sql b/data-tool/scripts/subset/subset_enable_triggers.sql index 071691976d..b7ef1327ac 100644 --- a/data-tool/scripts/subset/subset_enable_triggers.sql +++ b/data-tool/scripts/subset/subset_enable_triggers.sql @@ -1,38 +1,38 @@ -- Enable triggers for corp-scoped tables (subset refresh/load). -- Intended to be executed from a master DbSchemaCLI script while connected to the target Postgres DB. 
-ALTER TABLE corporation ENABLE TRIGGER ALL; -ALTER TABLE corp_name ENABLE TRIGGER ALL; -ALTER TABLE corp_state ENABLE TRIGGER ALL; -ALTER TABLE event ENABLE TRIGGER ALL; -ALTER TABLE filing ENABLE TRIGGER ALL; -ALTER TABLE filing_user ENABLE TRIGGER ALL; -ALTER TABLE office ENABLE TRIGGER ALL; -ALTER TABLE corp_comments ENABLE TRIGGER ALL; -ALTER TABLE ledger_text ENABLE TRIGGER ALL; -ALTER TABLE corp_party ENABLE TRIGGER ALL; -ALTER TABLE corp_party_relationship ENABLE TRIGGER ALL; -ALTER TABLE offices_held ENABLE TRIGGER ALL; -ALTER TABLE completing_party ENABLE TRIGGER ALL; -ALTER TABLE submitting_party ENABLE TRIGGER ALL; -ALTER TABLE corp_flag ENABLE TRIGGER ALL; -ALTER TABLE cont_out ENABLE TRIGGER ALL; -ALTER TABLE conv_event ENABLE TRIGGER ALL; -ALTER TABLE conv_ledger ENABLE TRIGGER ALL; -ALTER TABLE corp_involved_amalgamating ENABLE TRIGGER ALL; -ALTER TABLE corp_involved_cont_in ENABLE TRIGGER ALL; -ALTER TABLE corp_restriction ENABLE TRIGGER ALL; -ALTER TABLE correction ENABLE TRIGGER ALL; -ALTER TABLE jurisdiction ENABLE TRIGGER ALL; -ALTER TABLE resolution ENABLE TRIGGER ALL; -ALTER TABLE share_series ENABLE TRIGGER ALL; -ALTER TABLE share_struct ENABLE TRIGGER ALL; -ALTER TABLE share_struct_cls ENABLE TRIGGER ALL; -ALTER TABLE notification ENABLE TRIGGER ALL; -ALTER TABLE notification_resend ENABLE TRIGGER ALL; -ALTER TABLE party_notification ENABLE TRIGGER ALL; -ALTER TABLE payment ENABLE TRIGGER ALL; -ALTER TABLE carsfile ENABLE TRIGGER ALL; -ALTER TABLE carsbox ENABLE TRIGGER ALL; -ALTER TABLE carsrept ENABLE TRIGGER ALL; -ALTER TABLE carindiv ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corporation ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_name ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_state ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.event ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.filing ENABLE TRIGGER ALL; +ALTER TABLE 
__DBSCHEMA_TARGET_SCHEMA__.filing_user ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.office ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_comments ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.ledger_text ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_party ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_party_relationship ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.offices_held ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.completing_party ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.submitting_party ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_flag ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.cont_out ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.conv_event ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.conv_ledger ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_involved_amalgamating ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_involved_cont_in ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.corp_restriction ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.correction ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.jurisdiction ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.resolution ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.share_series ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.share_struct ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.share_struct_cls ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.notification ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.notification_resend ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.party_notification ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.payment ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.carsfile ENABLE TRIGGER ALL; +ALTER 
TABLE __DBSCHEMA_TARGET_SCHEMA__.carsbox ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.carsrept ENABLE TRIGGER ALL; +ALTER TABLE __DBSCHEMA_TARGET_SCHEMA__.carindiv ENABLE TRIGGER ALL; diff --git a/data-tool/scripts/subset/subset_pg_boolean_casts.sql b/data-tool/scripts/subset/subset_pg_boolean_casts.sql index b97a72568d..b141cd5e00 100644 --- a/data-tool/scripts/subset/subset_pg_boolean_casts.sql +++ b/data-tool/scripts/subset/subset_pg_boolean_casts.sql @@ -23,7 +23,7 @@ -- DbSchemaCLI splits statements on semicolons and does not reliably handle semicolons inside dollar-quoted -- bodies. Keep dollar-quoted bodies free of internal semicolons and avoid DO $$ blocks. -CREATE OR REPLACE FUNCTION public.dbcli_varchar_to_boolean(val varchar) +CREATE OR REPLACE FUNCTION __DBSCHEMA_TARGET_SCHEMA__.dbcli_varchar_to_boolean(val varchar) RETURNS boolean LANGUAGE sql IMMUTABLE @@ -32,7 +32,7 @@ AS $$ SELECT (val::text)::boolean $$; -CREATE OR REPLACE FUNCTION public.dbcli_bpchar_to_boolean(val bpchar) +CREATE OR REPLACE FUNCTION __DBSCHEMA_TARGET_SCHEMA__.dbcli_bpchar_to_boolean(val bpchar) RETURNS boolean LANGUAGE sql IMMUTABLE @@ -44,12 +44,12 @@ $$; -- Recreate casts in an idempotent way (Postgres has no CREATE CAST IF NOT EXISTS). 
DROP CAST IF EXISTS (varchar AS boolean); CREATE CAST (varchar AS boolean) - WITH FUNCTION public.dbcli_varchar_to_boolean(varchar) + WITH FUNCTION __DBSCHEMA_TARGET_SCHEMA__.dbcli_varchar_to_boolean(varchar) AS IMPLICIT -- DbSchemaCLI workaround: avoid keyword being last token ; DROP CAST IF EXISTS (bpchar AS boolean); CREATE CAST (bpchar AS boolean) - WITH FUNCTION public.dbcli_bpchar_to_boolean(bpchar) + WITH FUNCTION __DBSCHEMA_TARGET_SCHEMA__.dbcli_bpchar_to_boolean(bpchar) AS IMPLICIT -- DbSchemaCLI workaround: avoid keyword being last token ; diff --git a/data-tool/scripts/subset/subset_pg_cleanup_address_stage.sql b/data-tool/scripts/subset/subset_pg_cleanup_address_stage.sql index a702de8e9c..0382b48de6 100644 --- a/data-tool/scripts/subset/subset_pg_cleanup_address_stage.sql +++ b/data-tool/scripts/subset/subset_pg_cleanup_address_stage.sql @@ -1,2 +1,4 @@ -- Cleanup the shared address staging table used by subset_transfer_chunk.sql. --- No-op: the helper table is predeclared in the COLIN extract DDL and is truncated during prepare/chunk execution. +-- This is schema-qualified because generated subset runs may target non-public schemas. + +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.subset_address_stage; diff --git a/data-tool/scripts/subset/subset_pg_cleanup_orphan_children.sql b/data-tool/scripts/subset/subset_pg_cleanup_orphan_children.sql index 345a6e55e1..33199b4f71 100644 --- a/data-tool/scripts/subset/subset_pg_cleanup_orphan_children.sql +++ b/data-tool/scripts/subset/subset_pg_cleanup_orphan_children.sql @@ -11,52 +11,52 @@ -- - corp-scoped rows deleted directly by corp_num are left to the regular chunk deletes -- Event-scoped children whose parent event row is missing. 
-DELETE FROM notification_resend t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.notification_resend t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM notification t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.notification t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM filing_user t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.filing_user t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM payment t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.payment t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM ledger_text t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.ledger_text t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM conv_ledger t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.conv_ledger t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM conv_event t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.conv_event t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM completing_party t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.completing_party t +WHERE NOT EXISTS (SELECT 1 FROM 
__DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM submitting_party t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.submitting_party t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM corp_involved_amalgamating t +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_involved_amalgamating t WHERE t.event_id IS NOT NULL - AND NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); + AND NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM corp_involved_cont_in t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_involved_cont_in t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM correction t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.correction t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -DELETE FROM filing t -WHERE NOT EXISTS (SELECT 1 FROM event e WHERE e.event_id = t.event_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.filing t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.event e WHERE e.event_id = t.event_id); -- Corp-party children whose parent corp_party row is missing. 
-DELETE FROM party_notification t -WHERE NOT EXISTS (SELECT 1 FROM corp_party cp WHERE cp.corp_party_id = t.party_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.party_notification t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party cp WHERE cp.corp_party_id = t.party_id); -DELETE FROM offices_held t -WHERE NOT EXISTS (SELECT 1 FROM corp_party cp WHERE cp.corp_party_id = t.corp_party_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.offices_held t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party cp WHERE cp.corp_party_id = t.corp_party_id); -DELETE FROM corp_party_relationship t -WHERE NOT EXISTS (SELECT 1 FROM corp_party cp WHERE cp.corp_party_id = t.corp_party_id); +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party_relationship t +WHERE NOT EXISTS (SELECT 1 FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party cp WHERE cp.corp_party_id = t.corp_party_id); diff --git a/data-tool/scripts/subset/subset_pg_prepare_address_stage.sql b/data-tool/scripts/subset/subset_pg_prepare_address_stage.sql index 12c00c30d3..0305556ec2 100644 --- a/data-tool/scripts/subset/subset_pg_prepare_address_stage.sql +++ b/data-tool/scripts/subset/subset_pg_prepare_address_stage.sql @@ -1,4 +1,4 @@ -- Prepare the shared address staging table used by subset_transfer_chunk.sql. -- This is a predeclared regular table (not TEMP) because DbSchemaCLI transfer work may use separate sessions. 
-TRUNCATE TABLE public.subset_address_stage; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.subset_address_stage; diff --git a/data-tool/scripts/subset/subset_pg_purge_bcomps_excluded.sql b/data-tool/scripts/subset/subset_pg_purge_bcomps_excluded.sql index 7c48f0e562..e7313ed0dd 100644 --- a/data-tool/scripts/subset/subset_pg_purge_bcomps_excluded.sql +++ b/data-tool/scripts/subset/subset_pg_purge_bcomps_excluded.sql @@ -15,167 +15,167 @@ -- 1) Build keysets -TRUNCATE TABLE public.subset_excluded_corp_parties; -TRUNCATE TABLE public.subset_excluded_events; -TRUNCATE TABLE public.subset_excluded_corps; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corp_parties; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps; -INSERT INTO public.subset_excluded_corps (corp_num) +INSERT INTO __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps (corp_num) SELECT DISTINCT e.corp_num -FROM event e -JOIN filing f ON f.event_id = e.event_id -JOIN filing_user u ON u.event_id = e.event_id +FROM __DBSCHEMA_TARGET_SCHEMA__.event e +JOIN __DBSCHEMA_TARGET_SCHEMA__.filing f ON f.event_id = e.event_id +JOIN __DBSCHEMA_TARGET_SCHEMA__.filing_user u ON u.event_id = e.event_id WHERE e.corp_num IS NOT NULL AND u.user_id = 'BCOMPS' AND f.filing_type_cd IN ('BEINC', 'ICORP', 'ICORU', 'ICORC', 'CONTB', 'CONTI', 'CONTU', 'CONTC'); -INSERT INTO public.subset_excluded_events (event_id) +INSERT INTO __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events (event_id) SELECT DISTINCT e.event_id -FROM event e -JOIN public.subset_excluded_corps x ON x.corp_num = e.corp_num +FROM __DBSCHEMA_TARGET_SCHEMA__.event e +JOIN __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x ON x.corp_num = e.corp_num WHERE e.event_id IS NOT NULL; -INSERT INTO public.subset_excluded_corp_parties (corp_party_id) +INSERT INTO __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corp_parties (corp_party_id) SELECT DISTINCT cp.corp_party_id -FROM corp_party cp -JOIN 
public.subset_excluded_corps x ON x.corp_num = cp.corp_num +FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party cp +JOIN __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x ON x.corp_num = cp.corp_num WHERE cp.corp_party_id IS NOT NULL; -- 2) Purge (delete child tables first) -- Event-scoped children -DELETE FROM notification_resend t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.notification_resend t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE FROM notification t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.notification t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE FROM filing_user t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.filing_user t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE FROM payment t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.payment t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE FROM ledger_text t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.ledger_text t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE FROM conv_ledger t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.conv_ledger t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE FROM conv_event t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.conv_event t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE FROM completing_party t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.completing_party t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE 
FROM submitting_party t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.submitting_party t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE FROM corp_involved_cont_in t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_involved_cont_in t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE FROM correction t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.correction t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -DELETE FROM filing t -USING public.subset_excluded_events x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.filing t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -- corp_involved_amalgamating can reference corp_num via ted_corp_num/ting_corp_num as well as event_id. -- Delete any rows where either side is excluded (covers non-event-owned references too). 
-DELETE FROM corp_involved_amalgamating t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_involved_amalgamating t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.ted_corp_num = x.corp_num OR t.ting_corp_num = x.corp_num; -- Corp-party related -DELETE FROM party_notification t -USING public.subset_excluded_corp_parties x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.party_notification t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corp_parties x WHERE t.party_id = x.corp_party_id; -DELETE FROM offices_held t -USING public.subset_excluded_corp_parties x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.offices_held t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corp_parties x WHERE t.corp_party_id = x.corp_party_id; -DELETE FROM corp_party_relationship t -USING public.subset_excluded_corp_parties x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party_relationship t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corp_parties x WHERE t.corp_party_id = x.corp_party_id; -DELETE FROM corp_party t -USING public.subset_excluded_corp_parties x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_party t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corp_parties x WHERE t.corp_party_id = x.corp_party_id; -- Corp-scoped tables -DELETE FROM office t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.office t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -DELETE FROM corp_name t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_name t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -DELETE FROM corp_state t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_state t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -DELETE FROM corp_comments t -USING public.subset_excluded_corps x +DELETE FROM 
__DBSCHEMA_TARGET_SCHEMA__.corp_comments t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -DELETE FROM corp_flag t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_flag t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -DELETE FROM cont_out t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.cont_out t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -DELETE FROM corp_restriction t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corp_restriction t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -DELETE FROM jurisdiction t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.jurisdiction t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -DELETE FROM resolution t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.resolution t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -- Share tables (delete deepest first) -DELETE FROM share_series t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.share_series t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -DELETE FROM share_struct_cls t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.share_struct_cls t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -DELETE FROM share_struct t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.share_struct t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -- Events last (many tables reference event_id) -DELETE FROM event t -USING public.subset_excluded_events x +DELETE FROM 
__DBSCHEMA_TARGET_SCHEMA__.event t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events x WHERE t.event_id = x.event_id; -- Corporation last -DELETE FROM corporation t -USING public.subset_excluded_corps x +DELETE FROM __DBSCHEMA_TARGET_SCHEMA__.corporation t +USING __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps x WHERE t.corp_num = x.corp_num; -- 3) Cleanup helper tables -TRUNCATE TABLE public.subset_excluded_corp_parties; -TRUNCATE TABLE public.subset_excluded_events; -TRUNCATE TABLE public.subset_excluded_corps; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corp_parties; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_events; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.subset_excluded_corps; diff --git a/data-tool/scripts/subset/subset_transfer_cars.sql b/data-tool/scripts/subset/subset_transfer_cars.sql index f41876127a..665db97810 100644 --- a/data-tool/scripts/subset/subset_transfer_cars.sql +++ b/data-tool/scripts/subset/subset_transfer_cars.sql @@ -1,17 +1,17 @@ --- Global transfer of cars* tables from SOURCE Oracle DB (cprd) into TARGET Postgres extract DB (cprd_pg). --- Intended to be executed from a master DbSchemaCLI script connected to the target Postgres DB (cprd_pg). +-- Global transfer of cars* tables from SOURCE Oracle DB (__DBSCHEMA_SOURCE_CONNECTION__) into TARGET Postgres extract DB (__DBSCHEMA_TARGET_SCHEMA__ schema). +-- Intended to be executed from a master DbSchemaCLI script connected to the target Postgres DB (__DBSCHEMA_TARGET_SCHEMA__ schema). -- -- These tables are NOT corp-scoped. The full dataset is transferred without filtering. -- Volume is low enough that a full refresh is appropriate. 
-transfer public.carsfile from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.carsfile from __DBSCHEMA_SOURCE_CONNECTION__ using select documtid, filedate, regiracf from carsfile; -transfer public.carsbox from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.carsbox from __DBSCHEMA_SOURCE_CONNECTION__ using select documtid, accesnum, @@ -19,14 +19,14 @@ select boxrracf from carsbox; -transfer public.carsrept from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.carsrept from __DBSCHEMA_SOURCE_CONNECTION__ using select documtid, docutype, compnumb from carsrept; -transfer public.carindiv from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.carindiv from __DBSCHEMA_SOURCE_CONNECTION__ using select documtid, replace(surname, CHR(0), '') as surname, diff --git a/data-tool/scripts/subset/subset_transfer_chunk.sql b/data-tool/scripts/subset/subset_transfer_chunk.sql index 50b4e91757..f76703338c 100644 --- a/data-tool/scripts/subset/subset_transfer_chunk.sql +++ b/data-tool/scripts/subset/subset_transfer_chunk.sql @@ -1,4 +1,4 @@ --- Transfer a chunk (or a whole subset) of corps from the SOURCE Oracle DB (cprd) into the TARGET Postgres extract DB (cprd_pg). +-- Transfer a chunk (or a whole subset) of corps from the SOURCE Oracle DB (__DBSCHEMA_SOURCE_CONNECTION__) into the TARGET Postgres extract DB (__DBSCHEMA_TARGET_SCHEMA__ schema). -- -- REQUIRED DbSchemaCLI variables (replace_variables=true): -- target_corp_num_predicate : SQL predicate restricting the computed target_corp_num (NO trailing semicolon). @@ -14,7 +14,7 @@ -- c.CORP_TYP_CD in ('BC','C','ULC','CUL','CC','CCC','QA','QB','QC','QD','QE') -- c.CORP_TYP_CD in ('BC','C','ULC','CUL','CC','CCC','QA','QB','QC','QD','QE','CP') -- --- Intended to be executed from a master DbSchemaCLI script connected to the target Postgres DB (cprd_pg). +-- Intended to be executed from a master DbSchemaCLI script connected to the target Postgres DB (__DBSCHEMA_TARGET_SCHEMA__ schema). 
-- -- IMPORTANT: -- - This template intentionally avoids the boolean<->integer ALTER COLUMN hacks used in the full refresh script. @@ -33,7 +33,7 @@ -- vset oracle_corp_num_predicate=c.CORP_NUM in ('1111585','1226175'); -- corporation -transfer public.corporation from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.corporation from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -88,7 +88,7 @@ left join last_ar la on la.corp_num = c.corp_num; -- event -transfer public.event from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.event from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -121,7 +121,7 @@ where e.event_typ_cd not in ('BNUPD', 'ADDLEDGR'); -- corp_name -transfer public.corp_name from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.corp_name from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -152,7 +152,7 @@ join CORP_NAME cn on cn.corp_num = c.corp_num; -- corp_state -transfer public.corp_state from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.corp_state from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -184,7 +184,7 @@ join corp_op_state cos on cos.state_typ_cd = cs.state_typ_cd; -- filing -transfer public.filing from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.filing from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -231,7 +231,7 @@ join filing f on f.event_id = e.event_id; -- filing_user -transfer public.filing_user from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.filing_user from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -266,9 +266,9 @@ join filing_user u on u.event_id = e.event_id; -- address (shared/global table; stage 
then merge before loading dependents) -TRUNCATE TABLE public.subset_address_stage; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.subset_address_stage; -transfer public.subset_address_stage from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.subset_address_stage from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -346,7 +346,7 @@ from ( join address a on x.mailing_addr_id = a.addr_id ); -INSERT INTO public.address ( +INSERT INTO __DBSCHEMA_TARGET_SCHEMA__.address ( addr_id, province, country_typ_cd, @@ -374,7 +374,7 @@ FROM ( addr_line_2, addr_line_3, city - FROM public.subset_address_stage + FROM __DBSCHEMA_TARGET_SCHEMA__.subset_address_stage WHERE addr_id IS NOT NULL ORDER BY addr_id ) s @@ -387,11 +387,11 @@ SET province = EXCLUDED.province, addr_line_3 = EXCLUDED.addr_line_3, city = EXCLUDED.city; -TRUNCATE TABLE public.subset_address_stage; +TRUNCATE TABLE __DBSCHEMA_TARGET_SCHEMA__.subset_address_stage; -- office -transfer public.office from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.office from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -423,7 +423,7 @@ join office o on o.corp_num = c.corp_num; -- corp_comments -transfer public.corp_comments from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.corp_comments from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -457,7 +457,7 @@ join corp_comments cc on cc.corp_num = c.corp_num; -- ledger_text -transfer public.ledger_text from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.ledger_text from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -488,7 +488,7 @@ join ledger_text l on l.event_id = e.event_id; -- corp_party -transfer public.corp_party from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.corp_party from 
__DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -531,7 +531,7 @@ join corp_party p on p.corp_num = c.corp_num; -- corp_party_relationship -transfer public.corp_party_relationship from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.corp_party_relationship from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -560,7 +560,7 @@ join CORP_PARTY_RELATIONSHIP cpr on cpr.corp_party_id = p.corp_party_id; -- offices_held -transfer public.offices_held from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.offices_held from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -589,7 +589,7 @@ join OFFICES_HELD oh on oh.corp_party_id = p.corp_party_id; -- completing_party -transfer public.completing_party from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.completing_party from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -622,7 +622,7 @@ join completing_party cp on cp.event_id = e.event_id; -- submitting_party -transfer public.submitting_party from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.submitting_party from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -663,7 +663,7 @@ join SUBMITTING_PARTY sp on sp.event_id = e.event_id; -- corp_flag -transfer public.corp_flag from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.corp_flag from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -693,7 +693,7 @@ join corp_flag cf on cf.corp_num = c.corp_num; -- cont_out -transfer public.cont_out from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.cont_out from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -726,7 +726,7 
@@ join CONT_OUT co on co.corp_num = c.corp_num; -- conv_event -transfer public.conv_event from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.conv_event from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -765,7 +765,7 @@ join CONV_EVENT ce on ce.event_id = e.event_id; -- conv_ledger -transfer public.conv_ledger from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.conv_ledger from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -796,7 +796,7 @@ join CONV_LEDGER cl on cl.event_id = e.event_id; -- corp_involved - amalgamaTING_businesses -transfer public.corp_involved_amalgamating from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.corp_involved_amalgamating from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -864,7 +864,7 @@ where f.filing_typ_cd in ('AMALH', 'AMALV', 'AMALR', 'AMLHU', 'AMLVU', 'AMLRU', -- corp_involved - continue_in_historical_xpro -transfer public.corp_involved_cont_in from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.corp_involved_cont_in from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -895,7 +895,7 @@ where f.filing_typ_cd in ('CONTI', 'CONTU', 'CONTC') -- corp_restriction -transfer public.corp_restriction from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.corp_restriction from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -929,7 +929,7 @@ join CORP_RESTRICTION cr on cr.corp_num = c.corp_num; -- correction -transfer public.correction from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.correction from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -959,7 +959,7 @@ join CORRECTION corr on corr.event_id = e.event_id; -- 
continued_in_from_jurisdiction -transfer public.jurisdiction from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.jurisdiction from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -994,7 +994,7 @@ join JURISDICTION j on j.corp_num = c.corp_num; -- resolution -transfer public.resolution from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.resolution from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -1025,7 +1025,7 @@ join RESOLUTION r on r.corp_num = c.corp_num; -- share_struct -transfer public.share_struct from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.share_struct from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -1054,7 +1054,7 @@ join SHARE_STRUCT ss on ss.corp_num = c.corp_num; -- share_struct_cls -transfer public.share_struct_cls from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.share_struct_cls from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -1103,7 +1103,7 @@ join SHARE_STRUCT_CLS ssc on ssc.corp_num = c.corp_num; -- share_series -transfer public.share_series from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.share_series from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -1145,7 +1145,7 @@ join SHARE_SERIES ss on ss.corp_num = c.corp_num; -- notification -transfer public.notification from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.notification from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -1181,7 +1181,7 @@ join NOTIFICATION n on n.event_id = e.event_id; -- notification_resend -transfer public.notification_resend from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.notification_resend from __DBSCHEMA_SOURCE_CONNECTION__ 
using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -1217,7 +1217,7 @@ join NOTIFICATION_RESEND nr on nr.event_id = e.event_id; -- party_notification -transfer public.party_notification from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.party_notification from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c @@ -1254,7 +1254,7 @@ join PARTY_NOTIFICATION pn on pn.party_id = cp.corp_party_id; -- payment -transfer public.payment from cprd using +transfer __DBSCHEMA_TARGET_SCHEMA__.payment from __DBSCHEMA_SOURCE_CONNECTION__ using with corp_list as ( select /*+ materialize */ c.corp_num from corporation c diff --git a/jobs/colin-extract-delta/Dockerfile b/jobs/colin-extract-delta/Dockerfile new file mode 100644 index 0000000000..63eb548e58 --- /dev/null +++ b/jobs/colin-extract-delta/Dockerfile @@ -0,0 +1,128 @@ +# Build from the repository root so this image can copy data-tool/ and the job wrapper: +# docker build --platform linux/amd64 -f jobs/colin-extract-delta/Dockerfile -t colin-extract-delta . 
+FROM --platform=linux/amd64 python:3.11-slim-bullseye + +ARG VCS_REF="missing" +ARG BUILD_DATE="missing" +ARG DBSCHEMA_VERSION="9_7_1" +ARG DBSCHEMA_URL="https://dbschema.com/download/dbschema_unix_9_7_1.tar.gz" +ARG CLOUD_SQL_PROXY_VERSION="2.20.0" +ARG ORACLE_IC_VERSION="21_1" +ARG ORACLE_IC_ZIP="instantclient-basiclite-linux.x64-21.1.0.0.0.zip" +ARG ORACLE_IC_URL="https://download.oracle.com/otn_software/linux/instantclient/211000/instantclient-basiclite-linux.x64-21.1.0.0.0.zip" +ARG POSTGRES_JDBC_VERSION="42.7.3" +ARG ORACLE_JDBC_VERSION="23.4.0.24.05" +ARG DBSCHEMA_SHA256="" +ARG ORACLE_IC_SHA256="" +ARG POSTGRES_JDBC_SHA256="" +ARG ORACLE_JDBC_SHA256="" +ARG CLOUD_SQL_PROXY_SHA256="" + +ENV VCS_REF=${VCS_REF} \ + BUILD_DATE=${BUILD_DATE} \ + DBSCHEMA_VERSION=${DBSCHEMA_VERSION} \ + APP_HOME=/opt/app-root \ + APP_DATA=/opt/app-root/data \ + HOME=/opt/app-root \ + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_NO_CACHE_DIR=1 \ + ORACLE_HOME=/opt/oracle/instantclient_21_1 \ + LD_LIBRARY_PATH=/opt/oracle/instantclient_21_1 \ + PATH=/opt/dbschema:/opt/oracle/instantclient_21_1:/opt/app-root/.local/bin:${PATH} \ + DBSCHEMA_DRIVER_DIR=/opt/jdbc \ + PYTHONPATH=/opt/app-root/data-tool/flows:/opt/app-root/data-tool + +LABEL org.label-schema.vcs-ref=${VCS_REF} \ + org.label-schema.build-date=${BUILD_DATE} \ + org.opencontainers.image.title="colin-extract-delta" \ + org.opencontainers.image.description="OCP runtime for COLIN extract delta refresh using data-tool and DbSchemaCLI" + +USER root + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + bash \ + ca-certificates \ + curl \ + build-essential \ + git \ + libaio1 \ + libnsl2 \ + libpq-dev \ + openjdk-17-jre-headless \ + postgresql-client \ + unzip \ + && rm -rf /var/lib/apt/lists/* + +# Fail closed: external binary/JAR downloads must be checksum-pinned by CI/local builders. 
+RUN for name in DBSCHEMA_SHA256 ORACLE_IC_SHA256 POSTGRES_JDBC_SHA256 ORACLE_JDBC_SHA256 CLOUD_SQL_PROXY_SHA256; do \ + value="${!name:-}"; \ + if [[ -z "${value}" ]]; then echo "missing required build arg: ${name}" >&2; exit 1; fi; \ + done + +# Oracle Instant Client is needed because data-tool enables python-oracledb thick mode. +RUN mkdir -p /opt/oracle \ + && curl -fsSL "${ORACLE_IC_URL}" -o "/tmp/${ORACLE_IC_ZIP}" \ + && echo "${ORACLE_IC_SHA256} /tmp/${ORACLE_IC_ZIP}" | sha256sum -c - \ + && unzip -q "/tmp/${ORACLE_IC_ZIP}" -d /opt/oracle \ + && rm -f "/tmp/${ORACLE_IC_ZIP}" \ + && test -d "${ORACLE_HOME}" + +# DbSchemaCLI is included in the standard DbSchema kit and is used through the free CLI path. +# The OCP job does not require or consume a DbSchema license key. +RUN mkdir -p /opt/dbschema \ + && curl -fsSL "${DBSCHEMA_URL}" -o /tmp/dbschema.tar.gz \ + && echo "${DBSCHEMA_SHA256} /tmp/dbschema.tar.gz" | sha256sum -c - \ + && tar -xzf /tmp/dbschema.tar.gz -C /opt/dbschema --strip-components=1 \ + && rm -f /tmp/dbschema.tar.gz \ + && dbschema_cli="$(find /opt/dbschema -type f -name dbschemacli | head -n 1)" \ + && test -n "${dbschema_cli}" \ + && chmod +x "${dbschema_cli}" \ + && ln -sf "${dbschema_cli}" /usr/local/bin/dbschemacli + +# JDBC drivers are copied into ~/.DbSchema/drivers at runtime so DbSchemaCLI can resolve them +# even when it cannot download drivers in-cluster. 
+RUN mkdir -p /opt/jdbc/PostgreSql /opt/jdbc/Oracle \ + && curl -fsSL "https://jdbc.postgresql.org/download/postgresql-${POSTGRES_JDBC_VERSION}.jar" \ + -o "/opt/jdbc/PostgreSql/postgresql-${POSTGRES_JDBC_VERSION}.jar" \ + && echo "${POSTGRES_JDBC_SHA256} /opt/jdbc/PostgreSql/postgresql-${POSTGRES_JDBC_VERSION}.jar" | sha256sum -c - \ + && curl -fsSL "https://repo1.maven.org/maven2/com/oracle/database/jdbc/ojdbc11/${ORACLE_JDBC_VERSION}/ojdbc11-${ORACLE_JDBC_VERSION}.jar" \ + -o "/opt/jdbc/Oracle/ojdbc11-${ORACLE_JDBC_VERSION}.jar" \ + && echo "${ORACLE_JDBC_SHA256} /opt/jdbc/Oracle/ojdbc11-${ORACLE_JDBC_VERSION}.jar" | sha256sum -c - + +RUN curl -fsSL "https://storage.googleapis.com/cloud-sql-connectors/cloud-sql-proxy/v${CLOUD_SQL_PROXY_VERSION}/cloud-sql-proxy.linux.amd64" \ + -o /usr/local/bin/cloud-sql-proxy \ + && echo "${CLOUD_SQL_PROXY_SHA256} /usr/local/bin/cloud-sql-proxy" | sha256sum -c - \ + && chmod +x /usr/local/bin/cloud-sql-proxy + +WORKDIR /opt/app-root + +COPY data-tool/requirements.txt /tmp/data-tool-requirements.txt +RUN python -m pip install --upgrade pip \ + && python -m pip install --no-cache-dir -r /tmp/data-tool-requirements.txt \ + && rm -f /tmp/data-tool-requirements.txt + +# Match data-tool/Makefile install runtime imports: requirements.txt plus legal_api, +# registry_schemas, and sql-versioning. Install local LEAR packages without their +# transitive deps because requirements.txt owns the compatible runtime set. 
+COPY legal-api/ /opt/app-root/legal-api/ +COPY python/common/sql-versioning/ /opt/app-root/python/common/sql-versioning/ +RUN python -m pip install --no-cache-dir --no-deps --ignore-requires-python /opt/app-root/legal-api \ + && python -m pip install --no-cache-dir --no-deps /opt/app-root/python/common/sql-versioning \ + && python -m pip install --no-cache-dir --no-deps "git+https://github.com/bcgov/business-schemas.git@2.16.1#egg=registry_schemas" + +COPY data-tool/ /opt/app-root/data-tool/ +COPY jobs/colin-extract-delta/run.sh /opt/app-root/run.sh +COPY jobs/colin-extract-delta/README.md /opt/app-root/job-README.md + +RUN mkdir -p /opt/app-root/data /opt/app-root/.DbSchema/cli \ + && chmod +x /opt/app-root/run.sh \ + && chgrp -R 0 /opt/app-root /opt/dbschema /opt/jdbc /opt/oracle \ + && chmod -R g=u /opt/app-root /opt/dbschema /opt/jdbc /opt/oracle + +USER 1001 + +CMD ["/opt/app-root/run.sh"] diff --git a/jobs/colin-extract-delta/Makefile b/jobs/colin-extract-delta/Makefile new file mode 100644 index 0000000000..cdf0201ef7 --- /dev/null +++ b/jobs/colin-extract-delta/Makefile @@ -0,0 +1,64 @@ +.PHONY: help build build-nc smoke-local validate shellcheck yaml-parse template-required-params contract-validate + +MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +CURRENT_ABS_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) +REPO_ROOT := $(abspath $(CURRENT_ABS_DIR)/../..) 
+PROJECT_NAME := colin-extract-delta +DOCKER_NAME := colin-extract-delta +TAG_NAME ?= dev +CHECKSUM_BUILD_ARGS = \ + --build-arg DBSCHEMA_SHA256="$(DBSCHEMA_SHA256)" \ + --build-arg ORACLE_IC_SHA256="$(ORACLE_IC_SHA256)" \ + --build-arg POSTGRES_JDBC_SHA256="$(POSTGRES_JDBC_SHA256)" \ + --build-arg ORACLE_JDBC_SHA256="$(ORACLE_JDBC_SHA256)" \ + --build-arg CLOUD_SQL_PROXY_SHA256="$(CLOUD_SQL_PROXY_SHA256)" + +build: ## Build the runtime image from the repo root context; requires checksum build args + docker build --platform linux/amd64 -f $(CURRENT_ABS_DIR)/Dockerfile -t $(DOCKER_NAME):$(TAG_NAME) $(REPO_ROOT) \ + --build-arg VCS_REF=$$(git -C $(REPO_ROOT) rev-parse --short HEAD 2>/dev/null || echo missing) \ + --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ + $(CHECKSUM_BUILD_ARGS) + +build-nc: ## Build the runtime image without cache; requires checksum build args + docker build --no-cache --platform linux/amd64 -f $(CURRENT_ABS_DIR)/Dockerfile -t $(DOCKER_NAME):$(TAG_NAME) $(REPO_ROOT) \ + $(CHECKSUM_BUILD_ARGS) + +smoke-local: ## Run wrapper smoke mode in an already-built image; requires DB/Cloud SQL env vars + docker run --rm --env-file .env \ + -e SMOKE_ONLY=true \ + -e APP_DATA=/opt/app-root/data \ + $(DOCKER_NAME):$(TAG_NAME) + +shellcheck: ## Shell lint if shellcheck is installed + @if command -v shellcheck >/dev/null 2>&1; then shellcheck $(CURRENT_ABS_DIR)/run.sh; else echo "shellcheck not installed; skipping"; fi + +yaml-parse: ## Parse OpenShift templates after replacing template placeholders with simple values + python -c 'from pathlib import Path; import importlib.util, re, sys; \ +spec=importlib.util.find_spec("yaml"); \ +(spec is not None) or (print("PyYAML is not installed; skipping yaml parse") or sys.exit(0)); \ +import yaml; base=Path("$(CURRENT_ABS_DIR)")/"openshift"/"templates"; \ +paths=[base/"bc.yaml", base/"cronjob.yaml"]; \ +[print(f"parsed {p}") for p in paths for _ in [yaml.safe_load(re.sub(r"\$$\{[A-Z0-9_]+\}", "placeholder", 
re.sub(r"\$$\{\{[A-Z0-9_]+\}\}", "1", p.read_text())))] ]' + +template-required-params: ## Ensure rollout-specific CronJob params are required and have no concrete defaults + python -c 'from pathlib import Path; import re, sys; \ +text=Path("$(CURRENT_ABS_DIR)/openshift/templates/cronjob.yaml").read_text(); \ +required={"TAG","IMAGE_NAMESPACE","IMAGE_TAG","DBSCHEMA_SOURCE_CONNECTION","DBSCHEMA_TARGET_CONNECTION","DBSCHEMA_TARGET_SCHEMA","GCP_SA_SECRET_NAME","GCP_SA_SECRET_KEY"}; \ +params={m.group("name"):m.group("body") for m in re.finditer(r"(?ms)^ - name: (?P<name>[A-Z0-9_]+)\n(?P<body>.*?)(?=^ - name: |\Z)", text)}; \ +errors=[]; \ +[errors.append(f"missing parameter {name}") for name in sorted(required) if name not in params]; \ +[errors.append(f"{name} must be required") for name in sorted(required) if name in params and not re.search(r"^ required: true$$", params[name], re.M)]; \ +[errors.append(f"{name} must not define a concrete value/default") for name in sorted(required) if name in params and re.search(r"^ value:\s*", params[name], re.M)]; \ +print("cronjob required rollout params are explicit") if not errors else (print("\n".join(errors), file=sys.stderr), sys.exit(1))' + +contract-validate: ## Validate OCP wrapper/data-tool flow and generator contracts without requiring pytest + python $(CURRENT_ABS_DIR)/validate_contract.py + +validate: shellcheck yaml-parse template-required-params contract-validate ## Run lightweight local validation + bash -n $(CURRENT_ABS_DIR)/run.sh + python -m py_compile $(REPO_ROOT)/data-tool/flows/refresh_extract_subset_flow.py + +help: ## Show this help + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' + +.DEFAULT_GOAL := help diff --git a/jobs/colin-extract-delta/README.md b/jobs/colin-extract-delta/README.md new file mode 100644 index 0000000000..9e31731712 --- /dev/null +++ b/jobs/colin-extract-delta/README.md @@ -0,0 +1,201 @@ +# COLIN Extract
Delta Job + +OCP CronJob wrapper for the existing `data-tool/flows/refresh_extract_subset_flow.py` refresh/subset workflow. This package does **not** replace the data-tool generator; it packages the runtime, starts optional Cloud SQL Auth Proxy, generates DbSchemaCLI connections, runs the flow with explicit CLI arguments, and retains run artifacts for recovery. + +## What this job does + +1. Creates `/opt/app-root/data/runs/<run_id>` for non-secret artifacts. +2. Optionally starts Cloud SQL Auth Proxy on `127.0.0.1:$CLOUDSQL_PROXY_PORT`. +3. Exports the Postgres env used by Python, `psql`, and DbSchema target connection. +4. Generates credential-bearing `~/.DbSchema/cli/init.sql` outside retained run artifacts with explicit rollout inputs for: + - source Oracle connection `${DBSCHEMA_SOURCE_CONNECTION}`, + - target Postgres connection `${DBSCHEMA_TARGET_CONNECTION}`, and + - target Postgres schema `${DBSCHEMA_TARGET_SCHEMA}`. +5. Runs smoke/preflight checks, including non-mutating DbSchemaCLI connect/select checks for both generated aliases and `learn schema ${DBSCHEMA_TARGET_SCHEMA}`. +6. Invokes `refresh_extract_subset_flow.py` as a standalone script with explicit source alias, target alias, and target schema args. +7. Optionally refreshes materialized views using `data-tool/refresh_colin_extract_views.sh --schema ${DBSCHEMA_TARGET_SCHEMA}`. +8. Runs postflight checks for trigger state and helper table cleanup in `${DBSCHEMA_TARGET_SCHEMA}`. + +## Build + +Builds must use the repository root as Docker context: + +```bash +docker build --platform linux/amd64 -f jobs/colin-extract-delta/Dockerfile -t colin-extract-delta:dev .
\ + --build-arg DBSCHEMA_SHA256=<sha256> \ + --build-arg ORACLE_IC_SHA256=<sha256> \ + --build-arg POSTGRES_JDBC_SHA256=<sha256> \ + --build-arg ORACLE_JDBC_SHA256=<sha256> \ + --build-arg CLOUD_SQL_PROXY_SHA256=<sha256> +# or export those checksum variables first, then: +make -C jobs/colin-extract-delta build +``` + +Runtime image assumptions: + +- Python dependencies come from `data-tool/requirements.txt`, plus the data-tool install-contract packages `legal_api`, `registry_schemas`, and `sql-versioning`. +- Oracle Instant Client Basic Lite is installed because `data-tool` enables `python-oracledb` thick mode. +- Java, PostgreSQL client tools, DbSchemaCLI 9.7.1, Oracle/Postgres JDBC jars, and Cloud SQL Auth Proxy v2 are installed in the image. +- DbSchemaCLI is installed from the standard DbSchema archive and used as a command-line client through the free CLI path. No DbSchema license key is required or wired into the OCP job. +- CI/local builds must pass SHA256 build args (`DBSCHEMA_SHA256`, `ORACLE_IC_SHA256`, `POSTGRES_JDBC_SHA256`, `ORACLE_JDBC_SHA256`, `CLOUD_SQL_PROXY_SHA256`). The Dockerfile fails closed if any external binary/JAR checksum is missing.
+ +## Required secrets/env + +Secret `${NAME}-${TAG}-secret` should provide: + +| Key | Purpose | +|---|---| +| `DATABASE_USERNAME_COLIN_MIGR` | Cloud SQL/Postgres target user | +| `DATABASE_PASSWORD_COLIN_MIGR` | Cloud SQL/Postgres target password | +| `DATABASE_NAME_COLIN_MIGR` | Cloud SQL/Postgres target DB | +| `DATABASE_USERNAME_COLIN_ORACLE` | Oracle source user | +| `DATABASE_PASSWORD_COLIN_ORACLE` | Oracle source password | +| `DATABASE_HOST_COLIN_ORACLE` | Oracle host | +| `DATABASE_PORT_COLIN_ORACLE` | Oracle port, usually `1521` | +| `DATABASE_NAME_COLIN_ORACLE` | Oracle service/database name | +| `CLOUDSQL_INSTANCE_CONNECTION_NAME` | Required for `CLOUDSQL_PROXY_MODE=wrapper` | +| `DATABASE_HOST_COLIN_MIGR` | Optional; required only for `CLOUDSQL_PROXY_MODE=disabled` direct networking | +| `DATABASE_PORT_COLIN_MIGR` | Optional; required only for `CLOUDSQL_PROXY_MODE=disabled` direct networking | + +Optional JSON-key proxy mode mounts secret `${GCP_SA_SECRET_NAME}` key `${GCP_SA_SECRET_KEY}` at `/var/secrets/google/cloudsql-service-account.json` and sets `GOOGLE_APPLICATION_CREDENTIALS`. The OpenShift template requires these values to be supplied explicitly; if the optional file is absent, `run.sh` unsets `GOOGLE_APPLICATION_CREDENTIALS` so Workload Identity/ambient ADC can be used if validated. + +Required OpenShift rollout parameters with no concrete defaults: + +| Parameter | Purpose | +|---|---| +| `TAG` | Environment/resource suffix, supplied per deployment. | +| `IMAGE_NAMESPACE` | Namespace containing the externally built image. | +| `IMAGE_TAG` | Image tag to deploy. | +| `DBSCHEMA_SOURCE_CONNECTION` | Source Oracle alias generated in `init.sql` and rendered into generated transfer SQL. | +| `DBSCHEMA_TARGET_CONNECTION` | Target Postgres alias generated in `init.sql` and passed through the flow/generator. | +| `DBSCHEMA_TARGET_SCHEMA` | Target Postgres schema rendered into generated helper/transfer SQL and used as `PGSCHEMA`. 
| +| `GCP_SA_SECRET_NAME` / `GCP_SA_SECRET_KEY` | JSON-key secret values when JSON-key Cloud SQL proxy mode is used; supply explicit deployment values even if ambient ADC is validated and the optional volume is unused. | + +DbSchema init generation writes the direct OCP secrets into runtime `~/.DbSchema/cli/init.sql` using DbSchemaCLI connection text syntax. This file is credential-bearing, is created under a restrictive umask, must remain `0600`, and must stay outside retained run artifacts. Because this job does not implement reliable DbSchema init escaping, connection values reject whitespace/line breaks, semicolons, comment-like `#`/`--`, leading `-`, and shell/DbSchema metacharacters such as quotes, backslashes, `$`, `` ` ``, `&`, `|`, redirection, brackets/braces/parentheses, globs, and `!`; database passwords with those characters must be rotated/updated before this job can run. Alias names are restricted to conservative DbSchema identifiers. The target schema is restricted to a lowercase conservative PostgreSQL identifier to avoid unquoted uppercase/lowercase folding mismatches. The configured source alias is rendered into transfer SQL, and the configured target schema is rendered into helper/transfer SQL. + +## Important runtime knobs + +| Env | Default | Notes | +|---|---:|---| +| `SMOKE_ONLY` | `false` | `true` runs preflight only and exits before data movement. | +| `CLOUDSQL_PROXY_MODE` | `wrapper` | `wrapper`, `external`, `sidecar`, or `disabled`. Wrapper mode starts/cleans up proxy in `run.sh` and forces target DB host/port to the local proxy address/port. | +| `FLOW_MODE` | `refresh` | Scheduled job should use `refresh`. `load` requires `FLOW_CORP_FILE`. | +| `FLOW_CORP_FILE` | unset | Optional in refresh mode to replay a retained/curated feed; required in load mode. | +| `FLOW_RESET_EXTRACT_POSTGRES` | `false` | Explicit destructive DB reset for load mode only when approved. | +| `FLOW_MIG_BATCH_ID` | `1` | Passed to data-tool CLI. 
| +| `FLOW_LOOKBACK_HOURS` | `5` | Rolling v1 lookback. Not a durable high-watermark. | +| `FLOW_INCLUDE_CARS` | `false` | Keep false unless the global cars* truncate/reload is explicitly approved. | +| `REFRESH_COLIN_EXTRACT_VIEWS` | `false` | Optional whole-MV refresh after DbSchema transfer. | +| `MV_REFRESH_TARGETS` | `legacy` | First supported MV profile. | +| `TERMINATION_GRACE_SECONDS` | `20` | Seconds to wait for the active DbSchema/flow/MV child after `TERM`/`INT` before sending `KILL`. | +| `CLOUDSQL_PROXY_STOP_SECONDS` | `10` | Seconds to wait for the wrapper-started Cloud SQL Auth Proxy during cleanup before sending `KILL`. | +| `PGCONNECT_TIMEOUT` | `10` | libpq connect timeout used by wrapper `psql` preflight/postflight/diagnostic queries. | +| `PG_STATEMENT_TIMEOUT_MS` | `60000` | Postgres statement timeout used by wrapper `psql` preflight/postflight/diagnostic queries. | +| `DBSCHEMA_SOURCE_CONNECTION` | required in OCP/run.sh | Must match the source alias generated in runtime `init.sql` and rendered into transfer SQL. Direct/local data-tool generator defaults are compatibility defaults only and must not be relied on by OCP. | +| `DBSCHEMA_TARGET_CONNECTION` | required in OCP/run.sh | Must match the target alias generated in runtime `init.sql` and emitted by the flow/generator. Direct/local data-tool flow/generator defaults are compatibility defaults only and must not be relied on by OCP. | +| `DBSCHEMA_TARGET_SCHEMA` | required in OCP/run.sh | Lowercase target Postgres schema rendered into helper/transfer SQL and used to derive `PGSCHEMA`. Direct/local generator defaults are compatibility defaults only and must not be relied on by OCP. 
| + +## Local smoke + +With an env file containing the required values: + +```bash +make -C jobs/colin-extract-delta build +docker run --rm --env-file jobs/colin-extract-delta/.env \ + -e SMOKE_ONLY=true \ + colin-extract-delta:dev +``` + +This validates tools, proxy, target Postgres connection/helper tables in `${DBSCHEMA_TARGET_SCHEMA}`, Oracle connection, DbSchemaCLI startup, and both generated DbSchema aliases by running `connect`, `learn schema ${DBSCHEMA_TARGET_SCHEMA}`, and trivial `select` statements against target `${DBSCHEMA_TARGET_CONNECTION}` and source `${DBSCHEMA_SOURCE_CONNECTION}`. It does not run transfers or move data. + +## OCP smoke + +1. Process/apply the ImageStream template in tools. +2. Push the externally built image to `${IMAGE_NAMESPACE}/${NAME}:${IMAGE_TAG}`. +3. Process/apply the CronJob template with `SUSPEND=true` and `SMOKE_ONLY=true`. +4. Create a one-off job from the CronJob and inspect logs/artifacts. Smoke must prove DbSchemaCLI can connect/select using both generated aliases without running transfers. +5. Only after smoke passes, run one-corp/small-batch data movement with the CronJob still suspended. + +## Retained artifacts + +Artifacts live under `/opt/app-root/data/runs/` for the pod lifetime. The CronJob uses `emptyDir`, so artifacts are retained only while the pod exists and while failed/successful job history keeps the pod around. + +Expected files include: + +- `refresh_corp_feed_.txt` — retained rolling-lookback corp feed in refresh mode. +- `subset_refresh.sql` — generated DbSchema master script. +- `subset_refresh_chunks/` — generated chunk scripts. +- `logs/refresh-flow.log`, `logs/cloud-sql-proxy.log`, `logs/dbschema-smoke.log`. +- `dbschema-smoke.sql` — non-secret smoke script containing only aliases and trivial select statements. +- `mv-refresh-plan.preflight.sql` when MV refresh is enabled. +- `postflight-helper-counts.txt` and, on trigger issues, `postflight-disabled-triggers.txt`. 
+- On failed runs, `failure-helper-counts.txt` and `failure-disabled-triggers.txt` when the target DB is reachable during diagnostics. + +Logs and docs must not print passwords or full credential-bearing URLs. `init.sql` contains direct OCP secrets, is written with `0600`, and lives under `~/.DbSchema/cli/` outside `${RUN_DIR}`; do not copy it into tickets, retained artifacts, or persistent storage. `dbschema-smoke.sql` is safe to retain because it contains no credentials, transfer statements, or generated chunk SQL. + +## Recovery after partial failures + +The subset generator owns advisory locking, trigger suppression, helper tables, BCOMPS purge, deletes, and reloads. A failure can leave durable target-side state. The wrapper emits diagnostics and retained-artifact locations on failure, but intentionally does **not** automatically re-enable triggers or truncate helper tables; recovery must be a deliberate operator action. Recommended recovery: + +1. Identify the failed pod and run id from logs: `run_id=... artifacts=/opt/app-root/data/runs/`. +2. Copy non-secret artifacts before the pod is deleted, especially the retained corp feed and generated SQL. +3. Verify whether DbSchema is still running. If a stuck session holds the advisory lock, terminate the backend only after confirming no legitimate run is active. +4. Re-enable table triggers if needed using the retained rendered support script from the failed run, for example `${RUN_DIR}/subset_refresh_chunks/support/subset_enable_triggers.sql`, through `psql` or DbSchemaCLI against the target DB. Do **not** run source templates from `data-tool/scripts/subset/` directly because they contain generator tokens such as `__DBSCHEMA_TARGET_SCHEMA__`. If retained generated support scripts are unavailable, render the source template first with the exact validated schema, for example: `sed "s/__DBSCHEMA_TARGET_SCHEMA__/${DBSCHEMA_TARGET_SCHEMA}/g" data-tool/scripts/subset/subset_enable_triggers.sql | psql ...`. +5. 
Truncate helper tables if needed, using the configured target schema: + ```sql + \set target_schema '<target_schema>' + TRUNCATE :"target_schema".subset_address_stage; + TRUNCATE :"target_schema".subset_excluded_corp_parties; + TRUNCATE :"target_schema".subset_excluded_events; + TRUNCATE :"target_schema".subset_excluded_corps; + ``` +6. If failure happened before or during DbSchema transfer, rerun with the retained `refresh_corp_feed_.txt` as `FLOW_CORP_FILE` and `FLOW_MODE=refresh` to replay the same corp set instead of relying on a new rolling lookback. Keep `FLOW_RESET_EXTRACT_POSTGRES=false` for replay/delta runs. +7. If failure happened during MV refresh only, do not rerun the transfer just for MVs. Run `data-tool/refresh_colin_extract_views.sh --mode refresh --schema <schema> --targets <targets>` once target table state is confirmed healthy. +8. Re-run smoke/preflight before resuming scheduled execution. + +## Rolling lookback limitation + +Scheduled v1 uses the data-tool CLI rolling lookback (`--lookback-hours`, default 5) scoped by `--mig-batch-id`. It is **not** durable high-watermark processing. Downtime longer than the lookback can miss changes unless an operator reruns with a wider lookback or a retained/curated corp feed. Durable watermark/run-ledger support is follow-up work. + +## Target schema and reset limitation + +Source alias and target schema are configurable for the generated subset transfer path: `DBSCHEMA_SOURCE_CONNECTION` is rendered into transfer `from ` clauses, and `DBSCHEMA_TARGET_SCHEMA` is rendered into generated helper/transfer SQL and used for `PGSCHEMA`/MV refresh. + +The direct generator/flow defaults remain only for local compatibility. OCP deployments must pass explicit values for source alias, target alias, and target schema. + +`refresh_colin_extract_views.sh --mode refresh --schema <schema>` is schema-aware for MV refresh.
The separate reset/reapply path remains `public`-only unless the view DDL is separately schema-qualified; do not use reset/reapply for a non-`public` target schema without a separate validation/fix. + +## Side-effect policy + +- `cars*`: disabled by default via `FLOW_INCLUDE_CARS=false`. Enabling it runs a global cars* truncate/reload, not a corp-local delta. +- BCOMPS purge: enabled by the generator and target-wide for excluded corps computed from target data after load. +- Shared address upsert: current generator behavior merges Oracle addresses into the shared target `address` table by `addr_id`; this is not purely corp-local. +- Materialized views: disabled by default. When enabled, `legacy` refreshes whole selected MVs, not only changed corps. + +## Production gate checklist + +Do not set `SUSPEND=false` outside an approved rollout until all are complete: + +- Cloud SQL role can disable/enable required triggers, truncate/write helper tables, create helper boolean casts, purge BCOMPS, and refresh/analyze selected MVs. +- DbSchemaCLI 9.7.1 starts headlessly through the free CLI path and can connect/select using both generated aliases. +- Oracle connectivity is proven from OCP. +- Target DB has current `colin_corps_extract_postgres_ddl` helper tables. +- `SMOKE_ONLY=true` OCP job succeeds and proves DbSchema target `${DBSCHEMA_TARGET_CONNECTION}`, `learn schema ${DBSCHEMA_TARGET_SCHEMA}`, and source `${DBSCHEMA_SOURCE_CONNECTION}` connect/select checks before any transfer. +- Generated SQL inspection with non-default source alias/schema confirms transfer statements use the configured source alias and helper/transfer objects use the configured target schema, with no unexpected `from cprd` or `public.` references. +- One-corp refresh succeeds. +- Small-batch refresh succeeds with retained artifacts verified. +- Recovery drill re-enables triggers and cleans helper tables. +- MV runtime is acceptable if MV refresh is enabled. 
+- Side-effect policy (`cars*`, BCOMPS, shared address, MVs) is approved. + +## Validation commands + +```bash +bash -n jobs/colin-extract-delta/run.sh +make -C jobs/colin-extract-delta validate +make -C jobs/colin-extract-delta template-required-params +# If pytest and local data-tool test dependencies are already available: +python -m pytest data-tool/tests/unit/flows/test_refresh_extract_subset_flow.py +``` + +`make validate` runs shell syntax, optional shellcheck, OpenShift YAML parsing if PyYAML is installed, rollout parameter checks, standalone OCP/data-tool contract checks, and a syntax compile of the current data-tool CLI entrypoint. The contract check validates generator rendering with non-default aliases/schema without pytest, and validates flow parser compatibility with the run.sh flag set when the flow's Python dependencies are installed. Do not add shell mocks for DbSchemaCLI smoke behavior; the meaningful gate requires DbSchemaCLI 9.7.1, staged JDBC drivers, and real endpoints. diff --git a/jobs/colin-extract-delta/openshift/Readme.md b/jobs/colin-extract-delta/openshift/Readme.md new file mode 100644 index 0000000000..5d70a460e3 --- /dev/null +++ b/jobs/colin-extract-delta/openshift/Readme.md @@ -0,0 +1,69 @@ +# OpenShift deployment notes: COLIN extract delta + +This package follows the external-CI image consumption pattern used by the newer SFTP job templates: `bc.yaml` creates only an ImageStream, while CI builds the repo-root Dockerfile and pushes `${NAME}:${IMAGE_TAG}`. + +## Apply ImageStream + +```bash +oc process -f jobs/colin-extract-delta/openshift/templates/bc.yaml \ + -p NAME=colin-extract-delta | oc apply -f - +``` + +## Deploy suspended CronJob + +Keep schedules suspended by default. 
The CronJob template intentionally has no concrete defaults for rollout-specific values, so pass them explicitly: + +```bash +oc process -f jobs/colin-extract-delta/openshift/templates/cronjob.yaml \ + -p NAME=colin-extract-delta \ + -p TAG=<tag> \ + -p IMAGE_NAMESPACE=<image_namespace> \ + -p IMAGE_TAG=<image_tag> \ + -p DBSCHEMA_SOURCE_CONNECTION=<source_alias> \ + -p DBSCHEMA_TARGET_CONNECTION=<target_alias> \ + -p DBSCHEMA_TARGET_SCHEMA=<target_schema> \ + -p GCP_SA_SECRET_NAME=<gcp_sa_secret_name> \ + -p GCP_SA_SECRET_KEY=<gcp_sa_secret_key> \ + -p SUSPEND=true \ + -p SMOKE_ONLY=true | oc apply -f - +``` + +Create the main job secret separately. Minimum keys are documented in `../README.md`. For JSON-key Cloud SQL proxy mode, create `${GCP_SA_SECRET_NAME}` with key `${GCP_SA_SECRET_KEY}`; Workload Identity/ambient ADC deployments may use an explicitly supplied unused optional secret only after validation. + +## Smoke-only run + +```bash +oc create job --from=cronjob/colin-extract-delta-<tag> colin-extract-delta-<tag>-smoke-$(date +%s) +oc logs -l job-name=<job_name> --tail=-1 +``` + +A smoke run starts or validates the Cloud SQL proxy endpoint, checks Postgres helper tables in `${DBSCHEMA_TARGET_SCHEMA}`, checks Oracle connectivity, verifies DbSchemaCLI startup, validates the generated target `${DBSCHEMA_TARGET_CONNECTION}`, `learn schema ${DBSCHEMA_TARGET_SCHEMA}`, and source `${DBSCHEMA_SOURCE_CONNECTION}` with non-mutating connect/select checks, and optionally validates MV targets. It exits before generator/DbSchema transfer and does not move data. + +## Cloud SQL proxy modes + +- `wrapper` (default): `run.sh` starts `cloud-sql-proxy`, waits for localhost TCP, and cleans it up with a trap. Use this unless native sidecar support is confirmed for the target OCP/Kubernetes version. +- `external` / `sidecar`: another container/process provides `127.0.0.1:${CLOUDSQL_PROXY_PORT}`. Be careful with regular sidecars in Jobs because they can keep pods running after the main container exits. +- `disabled`: use only for direct Postgres networking where the target host/port are injected separately.
+ +JSON-key mode mounts `${GCP_SA_SECRET_NAME}` key `${GCP_SA_SECRET_KEY}` at `/var/secrets/google/cloudsql-service-account.json`. Workload Identity or ambient ADC may rely on the optional volume being absent only if validated; still pass explicit template parameter values so rollout inputs are deliberate. + +## Rollout sequence + +1. Apply ImageStream and push image. +2. Apply CronJob with `SUSPEND=true`, `SMOKE_ONLY=true`. +3. Run one-off smoke job. +4. Before enabling data movement, inspect generated SQL from a non-default source alias/schema run and confirm transfer statements use `from <source_alias>` and helper/transfer objects use `<target_schema>.`, with no unexpected `from cprd` or `public.` references. +5. Switch `SMOKE_ONLY=false`, keep `SUSPEND=true`, run one-corp refresh using a curated `FLOW_CORP_FILE` if needed (`FLOW_MODE=refresh`, `FLOW_RESET_EXTRACT_POSTGRES=false`). +6. Run small-batch refresh with default `FLOW_INCLUDE_CARS=false` and `REFRESH_COLIN_EXTRACT_VIEWS=false`. +7. If approved, test `REFRESH_COLIN_EXTRACT_VIEWS=true` with `MV_REFRESH_TARGETS=legacy` and the configured `DBSCHEMA_TARGET_SCHEMA`. +8. Record runtime/memory/CPU/lock duration and tune resources/deadline. +9. Complete production checklist in `../README.md` before setting `SUSPEND=false`. + +## Operational reminders + +- Artifacts are in the pod `emptyDir` under `/opt/app-root/data/runs/` and disappear when the pod is deleted. +- Failed/successful job history limits determine how long pods remain discoverable. +- The rolling lookback is not a durable watermark; use retained corp feeds for exact reruns after partial failures. +- Keep `FLOW_INCLUDE_CARS=false` unless the global cars* side effect is explicitly approved. +- OCP must supply explicit source alias, target alias, and target schema values; local data-tool defaults are compatibility defaults only. +- The MV refresh path accepts a schema, but reset/reapply remains `public`-only unless the views DDL is separately schema-qualified.
Do not use reset/reapply for a non-`public` target schema without a separate fix/validation. diff --git a/jobs/colin-extract-delta/openshift/templates/bc.yaml b/jobs/colin-extract-delta/openshift/templates/bc.yaml new file mode 100644 index 0000000000..a22ccddfd7 --- /dev/null +++ b/jobs/colin-extract-delta/openshift/templates/bc.yaml @@ -0,0 +1,21 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + labels: + app: ${NAME} + name: ${NAME}-build +objects: + # External CI builds the repo-root Dockerfile and pushes into this ImageStream/tag. + # This template intentionally does not create an OpenShift BuildConfig. + - apiVersion: v1 + kind: ImageStream + metadata: + name: ${NAME} + labels: + app: ${NAME} +parameters: + - name: NAME + displayName: Name + description: The name assigned to the ImageStream consumed by the CronJob. + required: true + value: colin-extract-delta diff --git a/jobs/colin-extract-delta/openshift/templates/cronjob.yaml b/jobs/colin-extract-delta/openshift/templates/cronjob.yaml new file mode 100644 index 0000000000..ceba6b232d --- /dev/null +++ b/jobs/colin-extract-delta/openshift/templates/cronjob.yaml @@ -0,0 +1,344 @@ +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + labels: + name: ${NAME} + name: ${NAME}-cronjob +objects: + - apiVersion: v1 + kind: ConfigMap + metadata: + name: ${NAME}-${TAG}-config + labels: + name: ${NAME} + environment: ${TAG} + role: ${ROLE} + data: + APP_DATA: /opt/app-root/data + CLOUDSQL_PROXY_MODE: "${CLOUDSQL_PROXY_MODE}" + CLOUDSQL_PROXY_ADDRESS: "127.0.0.1" + CLOUDSQL_PROXY_PORT: "${CLOUDSQL_PROXY_PORT}" + CLOUDSQL_PRIVATE_IP: "${CLOUDSQL_PRIVATE_IP}" + FLOW_MODE: "${FLOW_MODE}" + FLOW_CORP_FILE: "${FLOW_CORP_FILE}" + FLOW_RESET_EXTRACT_POSTGRES: "${FLOW_RESET_EXTRACT_POSTGRES}" + FLOW_CHUNK_SIZE: "${FLOW_CHUNK_SIZE}" + FLOW_THREADS: "${FLOW_THREADS}" + FLOW_MIG_BATCH_ID: "${FLOW_MIG_BATCH_ID}" + FLOW_LOOKBACK_HOURS: "${FLOW_LOOKBACK_HOURS}" + FLOW_PG_DISABLE_METHOD: 
"table_triggers" + FLOW_PG_FASTLOAD: "${FLOW_PG_FASTLOAD}" + FLOW_INCLUDE_CP: "${FLOW_INCLUDE_CP}" + # Scheduled v1 default: do not run the global cars* truncate/reload unless explicitly approved. + FLOW_INCLUDE_CARS: "${FLOW_INCLUDE_CARS}" + RUN_PREFLIGHT: "true" + RUN_POSTFLIGHT: "true" + SMOKE_ONLY: "${SMOKE_ONLY}" + REFRESH_COLIN_EXTRACT_VIEWS: "${REFRESH_COLIN_EXTRACT_VIEWS}" + MV_REFRESH_TARGETS: "${MV_REFRESH_TARGETS}" + MV_REFRESH_SKIP_ANALYZE: "${MV_REFRESH_SKIP_ANALYZE}" + PGSCHEMA: "${DBSCHEMA_TARGET_SCHEMA}" + DBSCHEMA_SOURCE_CONNECTION: "${DBSCHEMA_SOURCE_CONNECTION}" + DBSCHEMA_TARGET_CONNECTION: "${DBSCHEMA_TARGET_CONNECTION}" + DBSCHEMA_TARGET_SCHEMA: "${DBSCHEMA_TARGET_SCHEMA}" + - kind: CronJob + apiVersion: batch/v1 + metadata: + name: ${NAME}-${TAG} + labels: + name: ${NAME} + environment: ${TAG} + role: ${ROLE} + spec: + # Keep suspended until Cloud SQL privileges, DbSchema free CLI runtime, OCP networking, + # one-corp refresh, small-batch refresh, and recovery drill have passed for the environment. 
+ suspend: ${{SUSPEND}} + schedule: "${SCHEDULE}" + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: ${{SUCCESS_JOBS_HISTORY_LIMIT}} + failedJobsHistoryLimit: ${{FAILED_JOBS_HISTORY_LIMIT}} + jobTemplate: + metadata: + labels: + name: ${NAME} + environment: ${TAG} + role: ${ROLE} + spec: + backoffLimit: ${{JOB_BACKOFF_LIMIT}} + activeDeadlineSeconds: ${{ACTIVE_DEADLINE_SECONDS}} + template: + metadata: + labels: + name: ${NAME} + environment: ${TAG} + role: ${ROLE} + spec: + restartPolicy: Never + terminationGracePeriodSeconds: 60 + dnsPolicy: ClusterFirst + containers: + - name: ${NAME}-${TAG} + image: ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/${NAME}:${IMAGE_TAG} + imagePullPolicy: Always + command: + - /opt/app-root/run.sh + envFrom: + - configMapRef: + name: ${NAME}-${TAG}-config + env: + # Cloud SQL Auth Proxy: + # - wrapper mode uses CLOUDSQL_INSTANCE_CONNECTION_NAME and either mounted JSON ADC + # credentials or workload identity/ambient ADC if the cluster supports it. + # - external/sidecar mode should still expose Postgres on 127.0.0.1:${CLOUDSQL_PROXY_PORT}. + - name: CLOUDSQL_INSTANCE_CONNECTION_NAME + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: CLOUDSQL_INSTANCE_CONNECTION_NAME + optional: true + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /var/secrets/google/cloudsql-service-account.json + - name: DATABASE_USERNAME_COLIN_MIGR + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: DATABASE_USERNAME_COLIN_MIGR + - name: DATABASE_PASSWORD_COLIN_MIGR + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: DATABASE_PASSWORD_COLIN_MIGR + - name: DATABASE_NAME_COLIN_MIGR + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: DATABASE_NAME_COLIN_MIGR + # Optional direct Postgres networking. Required only when CLOUDSQL_PROXY_MODE=disabled. 
+ - name: DATABASE_HOST_COLIN_MIGR + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: DATABASE_HOST_COLIN_MIGR + optional: true + - name: DATABASE_PORT_COLIN_MIGR + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: DATABASE_PORT_COLIN_MIGR + optional: true + - name: DATABASE_USERNAME_COLIN_ORACLE + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: DATABASE_USERNAME_COLIN_ORACLE + - name: DATABASE_PASSWORD_COLIN_ORACLE + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: DATABASE_PASSWORD_COLIN_ORACLE + - name: DATABASE_HOST_COLIN_ORACLE + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: DATABASE_HOST_COLIN_ORACLE + - name: DATABASE_PORT_COLIN_ORACLE + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: DATABASE_PORT_COLIN_ORACLE + - name: DATABASE_NAME_COLIN_ORACLE + valueFrom: + secretKeyRef: + name: ${NAME}-${TAG}-secret + key: DATABASE_NAME_COLIN_ORACLE + resources: + # Conservative starting point for Python + Java/DbSchemaCLI. Tune after dev timing data. + requests: + cpu: "${CPU_REQUEST}" + memory: "${MEMORY_REQUEST}" + limits: + cpu: "${CPU_LIMIT}" + memory: "${MEMORY_LIMIT}" + volumeMounts: + - name: data-volume + mountPath: /opt/app-root/data + - name: gcp-sa-volume + readOnly: true + mountPath: /var/secrets/google + volumes: + - name: data-volume + emptyDir: {} + - name: gcp-sa-volume + secret: + # Optional for Workload Identity/ambient ADC deployments; required for JSON-key mode. + secretName: ${GCP_SA_SECRET_NAME} + optional: true + items: + - key: ${GCP_SA_SECRET_KEY} + path: cloudsql-service-account.json +parameters: + - name: NAME + displayName: Name + description: The name assigned to OpenShift resources for this job. + required: true + value: colin-extract-delta + - name: TAG + displayName: Environment tag + description: Required environment tag, for example dev, test, or prod. Supply explicitly per deployment. 
+ required: true + - name: ROLE + displayName: Role + required: true + value: job + - name: IMAGE_NAMESPACE + displayName: Image Namespace + required: true + description: Required namespace containing the externally built ImageStreamTag. Supply explicitly per deployment. + - name: IMAGE_REGISTRY + displayName: Image Registry + required: true + value: image-registry.openshift-image-registry.svc:5000 + - name: IMAGE_TAG + displayName: Image tag + description: Required image tag for the externally built runtime image. Supply explicitly per deployment. + required: true + - name: SCHEDULE + displayName: Cron Schedule + description: Cron schedule using the cluster timezone. Keep suspended by default until rollout gates pass. + required: true + value: "30 9 * * TUE-SAT" + - name: SUSPEND + displayName: Suspend schedule + description: true keeps the CronJob disabled; set false only after production gate approval. + required: true + value: "true" + - name: SUCCESS_JOBS_HISTORY_LIMIT + displayName: Successful Job History Limit + required: true + value: "3" + - name: FAILED_JOBS_HISTORY_LIMIT + displayName: Failed Job History Limit + required: true + value: "3" + - name: JOB_BACKOFF_LIMIT + displayName: Job Backoff Limit + required: true + value: "0" + - name: ACTIVE_DEADLINE_SECONDS + displayName: Active Deadline Seconds + description: Tune after dev measurements; must cover DbSchema transfer plus optional MV refresh. + required: true + value: "21600" + - name: CPU_REQUEST + displayName: CPU Request + required: true + value: 500m + - name: CPU_LIMIT + displayName: CPU Limit + required: true + value: "2" + - name: MEMORY_REQUEST + displayName: Memory Request + required: true + value: 2Gi + - name: MEMORY_LIMIT + displayName: Memory Limit + required: true + value: 6Gi + - name: CLOUDSQL_PROXY_MODE + displayName: Cloud SQL Proxy Mode + description: wrapper starts cloud-sql-proxy in run.sh; external/sidecar assumes localhost is already available. 
+ required: true + value: wrapper + - name: CLOUDSQL_PROXY_PORT + displayName: Cloud SQL Proxy Port + required: true + value: "5432" + - name: CLOUDSQL_PRIVATE_IP + displayName: Cloud SQL Private IP + description: Set true to pass --private-ip to the wrapper-managed Cloud SQL Auth Proxy. + required: true + value: "false" + - name: GCP_SA_SECRET_NAME + displayName: GCP Service Account Secret + description: Secret containing the Cloud SQL proxy JSON key for JSON-key mode. The template requires an explicit value; Workload Identity/ambient ADC deployments may provide an explicitly agreed unused secret name because the volume is optional. + required: true + - name: GCP_SA_SECRET_KEY + displayName: GCP Service Account Secret Key + description: Secret key containing the Cloud SQL proxy JSON credentials for JSON-key mode. Supply explicitly per deployment. + required: true + - name: FLOW_MODE + displayName: Flow Mode + description: refresh deletes/reloads the rolling-lookback or FLOW_CORP_FILE corp set; load loads FLOW_CORP_FILE and only resets the DB if FLOW_RESET_EXTRACT_POSTGRES=true. + required: true + value: refresh + - name: FLOW_CORP_FILE + displayName: Flow Corp File + description: Optional in refresh mode for curated/retained feed replay; required when FLOW_MODE=load. + required: false + value: "" + - name: FLOW_RESET_EXTRACT_POSTGRES + displayName: Reset Extract Postgres + description: Explicit destructive load-mode DB reset. Keep false for OCP delta/replay runs. + required: true + value: "false" + - name: FLOW_CHUNK_SIZE + displayName: Flow Chunk Size + required: true + value: "900" + - name: FLOW_THREADS + displayName: DbSchema Transfer Threads + required: true + value: "4" + - name: FLOW_MIG_BATCH_ID + displayName: Migration Batch Id + required: true + value: "1" + - name: FLOW_LOOKBACK_HOURS + displayName: Rolling Lookback Hours + description: Scheduled v1 rolling lookback; not a durable high-watermark. 
+ required: true + value: "5" + - name: FLOW_PG_FASTLOAD + displayName: Enable Postgres Fastload + required: true + value: "false" + - name: FLOW_INCLUDE_CP + displayName: Include CP corps + required: true + value: "false" + - name: FLOW_INCLUDE_CARS + displayName: Include global cars refresh + description: false by default; true truncates/reloads cars* globally. + required: true + value: "false" + - name: DBSCHEMA_SOURCE_CONNECTION + displayName: DbSchema Source Connection + description: Required source Oracle alias generated in runtime init.sql and rendered into generated transfer SQL. Aliases are validated by run.sh and must use letters, digits, and underscore, and must not start with a digit. + required: true + - name: DBSCHEMA_TARGET_CONNECTION + displayName: DbSchema Target Connection + description: Required target Postgres alias generated in runtime init.sql and passed to the flow/generator. Aliases are validated by run.sh and must use letters, digits, and underscore, and must not start with a digit. + required: true + - name: DBSCHEMA_TARGET_SCHEMA + displayName: DbSchema Target Schema + description: Required Postgres target schema rendered into generated helper/transfer SQL and used for PGSCHEMA/MV refresh. Schema names are validated by run.sh and must use letters, digits, and underscore, and must not start with a digit. + required: true + - name: SMOKE_ONLY + displayName: Smoke Only + description: true runs preflight only and exits before data movement. + required: true + value: "false" + - name: REFRESH_COLIN_EXTRACT_VIEWS + displayName: Refresh Materialized Views + description: Optional post-transfer MV refresh. Leave false until dev runtime is measured. 
+ required: true + value: "false" + - name: MV_REFRESH_TARGETS + displayName: MV Refresh Targets + required: true + value: legacy + - name: MV_REFRESH_SKIP_ANALYZE + displayName: Skip Analyze After MV Refresh + required: true + value: "false" diff --git a/jobs/colin-extract-delta/run.sh b/jobs/colin-extract-delta/run.sh new file mode 100755 index 0000000000..a5ea94f3d4 --- /dev/null +++ b/jobs/colin-extract-delta/run.sh @@ -0,0 +1,673 @@ +#!/usr/bin/env bash +# OCP entrypoint for the COLIN extract delta refresh job. + +set -Eeuo pipefail +IFS=$'\n\t' + +APP_HOME="${APP_HOME:-/opt/app-root}" +DATA_TOOL_DIR="${DATA_TOOL_DIR:-${APP_HOME}/data-tool}" +APP_DATA="${APP_DATA:-${APP_HOME}/data}" +RUN_ID="${RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)-${HOSTNAME:-local}-$$}" +RUN_DIR="${RUN_DIR:-${APP_DATA}/runs/${RUN_ID}}" +LOG_DIR="${RUN_DIR}/logs" +DBSCHEMA_HOME="${DBSCHEMA_HOME:-${HOME:-${APP_HOME}}/.DbSchema}" +DBSCHEMA_CLI_DIR="${DBSCHEMA_HOME}/cli" +DBSCHEMA_INIT_SQL="${DBSCHEMA_CLI_DIR}/init.sql" +DBSCHEMACLI_CMD="${DBSCHEMACLI_CMD:-dbschemacli}" +DBSCHEMA_SOURCE_CONNECTION="${DBSCHEMA_SOURCE_CONNECTION:-}" +DBSCHEMA_TARGET_CONNECTION="${DBSCHEMA_TARGET_CONNECTION:-}" +DBSCHEMA_TARGET_SCHEMA="${DBSCHEMA_TARGET_SCHEMA:-}" +CLOUDSQL_PROXY_MODE="${CLOUDSQL_PROXY_MODE:-wrapper}" +CLOUDSQL_PROXY_ADDRESS="${CLOUDSQL_PROXY_ADDRESS:-127.0.0.1}" +CLOUDSQL_PROXY_PORT="${CLOUDSQL_PROXY_PORT:-5432}" +CLOUDSQL_PRIVATE_IP="${CLOUDSQL_PRIVATE_IP:-false}" +CLOUDSQL_PROXY_WAIT_SECONDS="${CLOUDSQL_PROXY_WAIT_SECONDS:-60}" +FLOW_MODE="${FLOW_MODE:-refresh}" +FLOW_CHUNK_SIZE="${FLOW_CHUNK_SIZE:-900}" +FLOW_THREADS="${FLOW_THREADS:-4}" +FLOW_MIG_BATCH_ID="${FLOW_MIG_BATCH_ID:-1}" +FLOW_LOOKBACK_HOURS="${FLOW_LOOKBACK_HOURS:-5}" +FLOW_PG_DISABLE_METHOD="${FLOW_PG_DISABLE_METHOD:-table_triggers}" +FLOW_PG_FASTLOAD="${FLOW_PG_FASTLOAD:-false}" +FLOW_INCLUDE_CP="${FLOW_INCLUDE_CP:-false}" +FLOW_INCLUDE_CARS="${FLOW_INCLUDE_CARS:-false}" +FLOW_CORP_FILE="${FLOW_CORP_FILE:-}" 
# Wait for a process to disappear, polling once per second.
#
# Arguments:
#   $1 - PID to watch.
#   $2 - maximum number of seconds to keep polling.
# Returns:
#   0 as soon as `kill -0` no longer succeeds for the PID,
#   1 if the PID can still be signalled once the time budget is used up.
_wait_for_pid_exit() {
  local pid="$1"
  local max_wait_seconds="$2"
  # SECONDS is bash's built-in elapsed-time counter, so no external clock call is needed.
  local give_up_at=$((SECONDS + max_wait_seconds))

  while true; do
    # `kill -0` sends no signal; it only reports whether the PID can be signalled.
    kill -0 "${pid}" >/dev/null 2>&1 || return 0
    (( SECONDS < give_up_at )) || return 1
    sleep 1
  done
}
# Run a command in the background from a given working directory, mirror its
# combined stdout/stderr through `tee` into a log file, and wait for it.
#
# While the command runs, ACTIVE_CHILD_LABEL / ACTIVE_CHILD_PID /
# ACTIVE_CHILD_GROUP_PID describe it so the signal handlers can forward a
# termination request; all three are cleared again before returning.
#
# Arguments:
#   $1 - human-readable label used in termination log messages.
#   $2 - log file that receives a copy of the command output.
#   $3 - working directory for the command.
#   $4… - the command and its arguments.
# Returns:
#   The exit status reported by `wait` for the command.
# Side effects:
#   Leaves errexit (`set -e`) enabled on return.
run_active_command() {
  local label="$1" log_file="$2" workdir="$3"
  shift 3
  local status=0

  ACTIVE_CHILD_LABEL="${label}"
  if ! command -v setsid >/dev/null 2>&1; then
    # No setsid available: run in a plain subshell and record no process group.
    (cd "${workdir}" && exec "$@") > >(tee "${log_file}") 2>&1 &
    ACTIVE_CHILD_PID=$!
    ACTIVE_CHILD_GROUP_PID=""
  else
    # With setsid the child PID is also recorded as the process-group id, so a
    # later signal can be sent to the whole group rather than one process.
    setsid bash -c 'cd "$1" || exit; shift; exec "$@"' bash "${workdir}" "$@" > >(tee "${log_file}") 2>&1 &
    ACTIVE_CHILD_PID=$!
    ACTIVE_CHILD_GROUP_PID="${ACTIVE_CHILD_PID}"
  fi

  # errexit is suspended so a failing child is reported through the return
  # status instead of aborting the script inside this function.
  set +e
  wait "${ACTIVE_CHILD_PID}"
  status=$?
  set -e

  ACTIVE_CHILD_LABEL=""
  ACTIVE_CHILD_GROUP_PID=""
  ACTIVE_CHILD_PID=""
  return "${status}"
}
# Report whether a configuration value is one of the accepted "true" spellings.
#
# Arguments:
#   $1 - value to test (optional; a missing value is treated as empty).
# Returns:
#   0 when the value is exactly one of: true TRUE True 1 yes YES y Y
#   1 for anything else, including the empty string.
bool_true() {
  local candidate="${1:-}"
  local accepted

  # Exact, case-sensitive comparison against the fixed list; no pattern matching.
  for accepted in true TRUE True 1 yes YES y Y; do
    if [[ "${candidate}" == "${accepted}" ]]; then
      return 0
    fi
  done
  return 1
}
# Create the per-run working directories and export the environment the
# rest of the job relies on.
#
# Reads:   RUN_DIR, LOG_DIR, APP_DATA, DBSCHEMA_HOME, DBSCHEMA_CLI_DIR,
#          DATA_TOOL_DIR, APP_HOME, and the current HOME / PYTHONPATH.
# Exports: HOME (kept if already set, otherwise APP_HOME),
#          PYTHONPATH (data-tool flows and root prepended to the existing value),
#          TMPDIR (a `tmp` directory inside RUN_DIR, created here).
prepare_runtime() {
  local -a runtime_dirs=("${RUN_DIR}" "${LOG_DIR}" "${APP_DATA}" "${DBSCHEMA_CLI_DIR}")
  local -a private_dirs=("${RUN_DIR}" "${DBSCHEMA_HOME}" "${DBSCHEMA_CLI_DIR}")

  mkdir -p "${runtime_dirs[@]}"
  # Best effort: a chmod failure is ignored rather than aborting the run.
  chmod 700 "${private_dirs[@]}" 2>/dev/null || true

  HOME="${HOME:-${APP_HOME}}"
  PYTHONPATH="${DATA_TOOL_DIR}/flows:${DATA_TOOL_DIR}:${PYTHONPATH:-}"
  TMPDIR="${RUN_DIR}/tmp"
  export HOME PYTHONPATH TMPDIR

  mkdir -p "${TMPDIR}"
}
# Resolve the Postgres endpoint for this run and export it, both under the
# job's own DATABASE_*_COLIN_MIGR names and as the libpq PG* variables.
#
# In `wrapper` proxy mode the host/port are always the proxy address/port.
# In every other mode an already-set DATABASE_HOST_COLIN_MIGR /
# DATABASE_PORT_COLIN_MIGR wins, and the proxy address/port are only the
# fallback when that value is unset or empty.
#
# Exports: DATABASE_HOST_COLIN_MIGR, DATABASE_PORT_COLIN_MIGR, PGHOST, PGPORT,
#          PGDATABASE, PGUSER, PGPASSWORD, PGSCHEMA.
export_database_env() {
  local target_host target_port

  case "${CLOUDSQL_PROXY_MODE}" in
    wrapper)
      target_host="${CLOUDSQL_PROXY_ADDRESS}"
      target_port="${CLOUDSQL_PROXY_PORT}"
      ;;
    *)
      target_host="${DATABASE_HOST_COLIN_MIGR:-${CLOUDSQL_PROXY_ADDRESS}}"
      target_port="${DATABASE_PORT_COLIN_MIGR:-${CLOUDSQL_PROXY_PORT}}"
      ;;
  esac

  DATABASE_HOST_COLIN_MIGR="${target_host}"
  DATABASE_PORT_COLIN_MIGR="${target_port}"
  PGHOST="${DATABASE_HOST_COLIN_MIGR}"
  PGPORT="${DATABASE_PORT_COLIN_MIGR}"
  PGDATABASE="${DATABASE_NAME_COLIN_MIGR}"
  PGUSER="${DATABASE_USERNAME_COLIN_MIGR}"
  PGPASSWORD="${DATABASE_PASSWORD_COLIN_MIGR}"
  # PGSCHEMA keeps whatever value it already has; it is only marked for export.
  export DATABASE_HOST_COLIN_MIGR DATABASE_PORT_COLIN_MIGR
  export PGHOST PGPORT PGDATABASE PGUSER PGPASSWORD PGSCHEMA
}
\t\r\n;#&|<>$`"'\''\\(){}[]*?!' + local char reason + + if [[ "${value}" == -* ]]; then + reason="start with '-' because DbSchemaCLI may parse it as another option" + elif [[ "${value}" == *--* ]]; then + reason="contain '--' because unquoted DbSchemaCLI init.sql values must not contain comment-like tokens" + else + for (( i=0; i<${#unsafe_chars}; i++ )); do + char="${unsafe_chars:i:1}" + if [[ "${value}" == *"${char}"* ]]; then + case "${char}" in + $' ' | $'\t' | $'\r' | $'\n') reason="contain whitespace or line breaks" ;; + *) reason="contain reserved character '${char}'" ;; + esac + break + fi + done + fi + + if [[ -n "${reason:-}" ]]; then + if [[ "${name}" == *PASSWORD* ]]; then + die "${name} cannot be used by this job because DbSchemaCLI init.sql connection lines are unquoted and the value must not ${reason}. Rotate/update the secret to avoid shell/DbSchema metacharacters before running this job." + fi + die "${name} cannot ${reason}; DbSchemaCLI init.sql connection values are written unquoted by this job." + fi +} + +validate_dbschema_alias() { + local name="$1" + local value="$2" + if [[ ! "${value}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then + die "${name} must be a conservative DbSchema identifier using letters, digits, and underscore, and must not start with a digit" + fi +} + +validate_dbschema_schema() { + local name="$1" + local value="$2" + if [[ ! 
"${value}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then + die "${name} must be a conservative Postgres schema identifier using letters, digits, and underscore, and must not start with a digit" + fi + if [[ "${value}" =~ [A-Z] ]]; then + die "${name} must be lowercase; PostgreSQL folds unquoted uppercase identifiers to lowercase, so uppercase schema values are rejected to avoid mismatches" + fi +} + +generate_dbschema_init() { + log "Generating DbSchemaCLI init at ${DBSCHEMA_INIT_SQL}" + validate_dbschema_value DATABASE_USERNAME_COLIN_ORACLE "${DATABASE_USERNAME_COLIN_ORACLE}" + validate_dbschema_value DATABASE_PASSWORD_COLIN_ORACLE "${DATABASE_PASSWORD_COLIN_ORACLE}" + validate_dbschema_value DATABASE_HOST_COLIN_ORACLE "${DATABASE_HOST_COLIN_ORACLE}" + validate_dbschema_value DATABASE_PORT_COLIN_ORACLE "${DATABASE_PORT_COLIN_ORACLE}" + validate_dbschema_value DATABASE_NAME_COLIN_ORACLE "${DATABASE_NAME_COLIN_ORACLE}" + validate_dbschema_value DATABASE_USERNAME_COLIN_MIGR "${DATABASE_USERNAME_COLIN_MIGR}" + validate_dbschema_value DATABASE_PASSWORD_COLIN_MIGR "${DATABASE_PASSWORD_COLIN_MIGR}" + validate_dbschema_value DATABASE_HOST_COLIN_MIGR "${DATABASE_HOST_COLIN_MIGR}" + validate_dbschema_value DATABASE_PORT_COLIN_MIGR "${DATABASE_PORT_COLIN_MIGR}" + validate_dbschema_value DATABASE_NAME_COLIN_MIGR "${DATABASE_NAME_COLIN_MIGR}" + validate_dbschema_value DBSCHEMA_SOURCE_CONNECTION "${DBSCHEMA_SOURCE_CONNECTION}" + validate_dbschema_value DBSCHEMA_TARGET_CONNECTION "${DBSCHEMA_TARGET_CONNECTION}" + validate_dbschema_value DBSCHEMA_TARGET_SCHEMA "${DBSCHEMA_TARGET_SCHEMA}" + validate_dbschema_alias DBSCHEMA_SOURCE_CONNECTION "${DBSCHEMA_SOURCE_CONNECTION}" + validate_dbschema_alias DBSCHEMA_TARGET_CONNECTION "${DBSCHEMA_TARGET_CONNECTION}" + validate_dbschema_schema DBSCHEMA_TARGET_SCHEMA "${DBSCHEMA_TARGET_SCHEMA}" + mkdir -p "${DBSCHEMA_HOME}/drivers/PostgreSql" "${DBSCHEMA_HOME}/drivers/Oracle" + if [[ -d "${DBSCHEMA_DRIVER_DIR:-}" ]]; then + cp -R 
"${DBSCHEMA_DRIVER_DIR}/." "${DBSCHEMA_HOME}/drivers/" 2>/dev/null || true + fi + + umask 077 + cat >"${DBSCHEMA_INIT_SQL}" <&1)" + log "java=$(java -version 2>&1 | head -n 1)" + log "psql=$(psql --version 2>&1)" + log "dbschemacli=$(command -v "${DBSCHEMACLI_CMD}")" + python - <<'PY' +import oracledb +print(f"oracledb={oracledb.__version__}") +PY + if command -v cloud-sql-proxy >/dev/null 2>&1; then + log "cloud-sql-proxy=$(cloud-sql-proxy --version 2>&1 | head -n 1)" + fi + log "options mode=${FLOW_MODE} chunk_size=${FLOW_CHUNK_SIZE} threads=${FLOW_THREADS} mig_batch_id=${FLOW_MIG_BATCH_ID} lookback_hours=${FLOW_LOOKBACK_HOURS} include_cars=${FLOW_INCLUDE_CARS} reset_extract_postgres=${FLOW_RESET_EXTRACT_POSTGRES} refresh_mvs=${REFRESH_COLIN_EXTRACT_VIEWS} mv_targets=${MV_REFRESH_TARGETS} source_connection=${DBSCHEMA_SOURCE_CONNECTION} target_connection=${DBSCHEMA_TARGET_CONNECTION} target_schema=${DBSCHEMA_TARGET_SCHEMA}" +} + +run_dbschema_smoke() { + local smoke_sql="${RUN_DIR}/dbschema-smoke.sql" + local smoke_log="${LOG_DIR}/dbschema-smoke.log" + + cat >"${smoke_sql}" </dev/null + local helper_table_count + helper_table_count="$(psql_cmd -v target_schema="${DBSCHEMA_TARGET_SCHEMA}" -qAt <<'SQL' +WITH required(relname) AS ( + VALUES + ('subset_address_stage'), + ('subset_excluded_corps'), + ('subset_excluded_events'), + ('subset_excluded_corp_parties'), + ('colin_extract_version') +) +SELECT count(*) +FROM required +WHERE to_regclass(format('%I.%I', :'target_schema', relname)) IS NOT NULL; +SQL +)" + [[ "${helper_table_count}" == "5" ]] || die "missing required COLIN extract helper table(s); apply latest colin_corps_extract_postgres_ddl" + + if bool_true "${REFRESH_COLIN_EXTRACT_VIEWS}"; then + "${DATA_TOOL_DIR}/refresh_colin_extract_views.sh" \ + --mode plan \ + --targets "${MV_REFRESH_TARGETS}" \ + --db "${DATABASE_NAME_COLIN_MIGR}" \ + --host "${DATABASE_HOST_COLIN_MIGR}" \ + --port "${DATABASE_PORT_COLIN_MIGR}" \ + --user "${DATABASE_USERNAME_COLIN_MIGR}" 
\ + --schema "${PGSCHEMA}" \ + >"${RUN_DIR}/mv-refresh-plan.preflight.sql" + fi + + if [[ "${SKIP_ORACLE_PREFLIGHT}" != "true" ]]; then + (cd "${DATA_TOOL_DIR}/flows" && python - <<'PY') +from common.init_utils import colin_oracle_init, get_config +engine = colin_oracle_init.fn(get_config.fn()) +engine.dispose() +PY + fi + run_dbschema_smoke + test -w "${RUN_DIR}" || die "artifact directory is not writable: ${RUN_DIR}" + log "Preflight checks passed" +} + +run_refresh_flow() { + local master_script="${RUN_DIR}/subset_${FLOW_MODE}.sql" + local argv=( + python + "${DATA_TOOL_DIR}/flows/refresh_extract_subset_flow.py" + --mode "${FLOW_MODE}" + --chunk-size "${FLOW_CHUNK_SIZE}" + --threads "${FLOW_THREADS}" + --pg-disable-method "${FLOW_PG_DISABLE_METHOD}" + --artifact-dir "${RUN_DIR}" + --out "${master_script}" + --run-dbschemacli + --dbschemacli-cmd "${DBSCHEMACLI_CMD}" + --source-connection "${DBSCHEMA_SOURCE_CONNECTION}" + --target-connection "${DBSCHEMA_TARGET_CONNECTION}" + --target-schema "${DBSCHEMA_TARGET_SCHEMA}" + --mig-batch-id "${FLOW_MIG_BATCH_ID}" + --lookback-hours "${FLOW_LOOKBACK_HOURS}" + ) + + if [[ -n "${FLOW_CORP_FILE}" ]]; then + argv+=(--corp-file "${FLOW_CORP_FILE}") + fi + if bool_true "${FLOW_PG_FASTLOAD}"; then + argv+=(--pg-fastload) + fi + if bool_true "${FLOW_INCLUDE_CP}"; then + argv+=(--include-cp) + fi + if bool_true "${FLOW_INCLUDE_CARS}"; then + argv+=(--include-cars) + else + argv+=(--no-cars) + fi + if ! 
bool_true "${FLOW_RESET_EXTRACT_POSTGRES}"; then + argv+=(--no-reset-extract-postgres) + fi + + log "Running refresh_extract_subset_flow.py; generated master=${master_script}" + run_active_command "refresh_extract_subset_flow.py" "${LOG_DIR}/refresh-flow.log" "${APP_HOME}" "${argv[@]}" +} + +refresh_materialized_views() { + bool_true "${REFRESH_COLIN_EXTRACT_VIEWS}" || return 0 + local argv=( + "${DATA_TOOL_DIR}/refresh_colin_extract_views.sh" + --mode refresh + --targets "${MV_REFRESH_TARGETS}" + --db "${DATABASE_NAME_COLIN_MIGR}" + --host "${DATABASE_HOST_COLIN_MIGR}" + --port "${DATABASE_PORT_COLIN_MIGR}" + --user "${DATABASE_USERNAME_COLIN_MIGR}" + --schema "${PGSCHEMA}" + ) + if bool_true "${MV_REFRESH_SKIP_ANALYZE}"; then + argv+=(--skip-analyze) + fi + log "Refreshing COLIN extract materialized views targets=${MV_REFRESH_TARGETS}" + run_active_command "refresh_colin_extract_views.sh" "${LOG_DIR}/mv-refresh.log" "${APP_HOME}" "${argv[@]}" +} + +postflight_checks() { + bool_true "${RUN_POSTFLIGHT}" || return 0 + log "Running postflight checks" + local disabled_triggers + disabled_triggers="$(psql_cmd -v target_schema="${DBSCHEMA_TARGET_SCHEMA}" -qAt <<'SQL' +SELECT c.relname || ':' || t.tgname || ':' || t.tgenabled +FROM pg_trigger t +JOIN pg_class c ON c.oid = t.tgrelid +JOIN pg_namespace n ON n.oid = c.relnamespace +WHERE n.nspname = :'target_schema' + AND NOT t.tgisinternal + AND c.relname = ANY (ARRAY[ + 'corporation','corp_name','corp_state','event','filing','filing_user','office','corp_comments', + 'ledger_text','corp_party','corp_party_relationship','offices_held','completing_party','submitting_party', + 'corp_flag','cont_out','conv_event','conv_ledger','corp_involved_amalgamating','corp_involved_cont_in', + 'corp_restriction','correction','jurisdiction','resolution','share_series','share_struct','share_struct_cls', + 'notification','notification_resend','party_notification','payment','carsfile','carsbox','carsrept','carindiv', + 
'corp_processing','auth_processing','affiliation_processing','colin_tracking' + ]) + AND t.tgenabled <> 'O'; +SQL +)" + if [[ -n "${disabled_triggers}" ]]; then + printf '%s\n' "${disabled_triggers}" >"${RUN_DIR}/postflight-disabled-triggers.txt" + die "postflight found non-origin trigger state; see ${RUN_DIR}/postflight-disabled-triggers.txt" + fi + + local helper_counts + helper_counts="$(psql_cmd -v target_schema="${DBSCHEMA_TARGET_SCHEMA}" -qAt <<'SQL' +SELECT 'subset_address_stage=' || count(*) FROM :"target_schema".subset_address_stage +UNION ALL SELECT 'subset_excluded_corps=' || count(*) FROM :"target_schema".subset_excluded_corps +UNION ALL SELECT 'subset_excluded_events=' || count(*) FROM :"target_schema".subset_excluded_events +UNION ALL SELECT 'subset_excluded_corp_parties=' || count(*) FROM :"target_schema".subset_excluded_corp_parties; +SQL +)" + printf '%s\n' "${helper_counts}" >"${RUN_DIR}/postflight-helper-counts.txt" + if printf '%s\n' "${helper_counts}" | grep -v '=0$' >/dev/null 2>&1; then + die "postflight found non-empty helper tables; see ${RUN_DIR}/postflight-helper-counts.txt" + fi + log "Postflight checks passed" +} + +main() { + prepare_runtime + validate_env + export_database_env + start_cloudsql_proxy + generate_dbschema_init + print_versions + preflight_checks + + if bool_true "${SMOKE_ONLY}"; then + log "SMOKE_ONLY=true; exiting after successful preflight" + return 0 + fi + + run_refresh_flow + refresh_materialized_views + postflight_checks + log "COLIN extract delta run completed successfully" +} + +main "$@" diff --git a/jobs/colin-extract-delta/validate_contract.py b/jobs/colin-extract-delta/validate_contract.py new file mode 100644 index 0000000000..79d1c3dae0 --- /dev/null +++ b/jobs/colin-extract-delta/validate_contract.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +"""Lightweight OCP job contract checks that do not require pytest.""" + +from __future__ import annotations + +import importlib.util +import sys +import tempfile 
from pathlib import Path


# Repository layout anchors, all derived from this file's own location.
JOB_DIR = Path(__file__).resolve().parent
REPO_ROOT = JOB_DIR.parents[1]
DATA_TOOL_DIR = REPO_ROOT / "data-tool"
FLOW_PATH = DATA_TOOL_DIR / "flows" / "refresh_extract_subset_flow.py"
GENERATOR_PATH = DATA_TOOL_DIR / "scripts" / "generate_cprd_subset_extract.py"


def _require(condition: bool, message: str) -> None:
    """Raise ``AssertionError(message)`` unless *condition* is true.

    Used instead of the ``assert`` statement so that contract checks still
    run when the interpreter is started with ``-O`` (which strips asserts).
    """
    if not condition:
        raise AssertionError(message)


def _load_module(name: str, path: Path):
    """Import the Python source file at *path* under module name *name*.

    The module is registered in ``sys.modules`` before it is executed (the
    order used by the importlib "import a source file directly" recipe).
    If execution fails, the partially initialised entry is removed again so
    a failed load does not leave a broken module importable by name.

    Raises:
        RuntimeError: if no import spec/loader can be built for *path*.
        Exception: whatever executing the module raises (e.g. ImportError).
    """
    spec = importlib.util.spec_from_file_location(name, path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"could not load module spec for {path}")
    module = importlib.util.module_from_spec(spec)
    sys.modules[name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(name, None)
        raise
    return module


def validate_flow_parser_contract() -> None:
    """Verify the OCP run.sh flags still parse when flow deps are available.

    Loads the flow module, parses a representative argv through its
    ``build_arg_parser()`` and checks the parsed values. If loading the flow
    module raises ImportError the check is reported as skipped, not failed.

    Raises:
        AssertionError: if a parsed option does not have the expected value.
    """
    # Presumably needed so the flow module's sibling imports resolve; the
    # entry is intentionally left on sys.path, as before.
    sys.path.insert(0, str(DATA_TOOL_DIR / "flows"))
    try:
        flow_module = _load_module("refresh_extract_subset_flow_contract", FLOW_PATH)
    except ImportError as err:
        print(f"flow parser contract: skipped (missing import dependency: {err})")
        return

    with tempfile.TemporaryDirectory(prefix="colin-extract-contract-") as tmp:
        artifact_dir = Path(tmp)
        out = artifact_dir / "subset_refresh.sql"
        argv = [
            "--mode", "refresh",
            "--chunk-size", "900",
            "--threads", "4",
            "--pg-disable-method", "table_triggers",
            "--artifact-dir", str(artifact_dir),
            "--out", str(out),
            "--run-dbschemacli",
            "--dbschemacli-cmd", "dbschemacli",
            "--source-connection", "cpqa",
            "--target-connection", "ocp_pg",
            "--target-schema", "extract_delta",
            "--mig-batch-id", "1",
            "--lookback-hours", "5",
            "--no-cars",
            "--no-reset-extract-postgres",
        ]
        args = flow_module.build_arg_parser().parse_args(argv)

        expected = {
            "mode": "refresh",
            "artifact_dir": str(artifact_dir),
            "out": str(out),
            "run_dbschemacli": True,
            "include_cars": False,
            "reset_extract_postgres": False,
            "source_connection": "cpqa",
            "target_connection": "ocp_pg",
            "target_schema": "extract_delta",
        }
        for attr, want in expected.items():
            got = getattr(args, attr)
            # Booleans are compared by identity so that e.g. 1 is not
            # accepted where True is required.
            matches = (got is want) if isinstance(want, bool) else (got == want)
            _require(
                matches,
                f"flow parser contract: args.{attr} is {got!r}, expected {want!r}",
            )
        print("flow parser contract: ok")


def validate_generator_rendering_contract() -> None:
    """Run the subset-extract generator and check the SQL it renders.

    Generates refresh-mode SQL for a single corp into a temporary directory
    and verifies that the configured connection aliases and target schema
    appear in the output, and that no placeholder token or hardcoded
    ``cprd`` / ``public.`` reference remains.

    Raises:
        AssertionError: if the generator fails or any rendering check fails.
    """
    generator = _load_module("generate_cprd_subset_extract_contract", GENERATOR_PATH)
    with tempfile.TemporaryDirectory(prefix="colin-extract-generator-") as tmp:
        tmp_path = Path(tmp)
        corp_file = tmp_path / "corps.txt"
        out = tmp_path / "subset_refresh.sql"
        corp_file.write_text("BC1234567\n", encoding="utf-8")

        args = generator.cli_parse_args(
            [
                "--corp-file", str(corp_file),
                "--mode", "refresh",
                "--oracle-in-strategy", "or_of_in_lists",
                "--source-connection", "cpqa",
                "--target-connection", "ocp_pg",
                "--target-schema", "extract_delta",
                "--out", str(out),
            ]
        )
        cfg = generator.cfg_build_config(args)
        rc = generator.run(cfg)
        if rc != 0:
            raise AssertionError(f"generator returned {rc}")

        sql_files = sorted(tmp_path.rglob("*.sql"))
        if not sql_files:
            raise AssertionError("generator did not produce SQL files")
        if not out.is_file():
            raise AssertionError(f"generator did not write master SQL file {out}")
        combined_sql = "\n".join(path.read_text(encoding="utf-8") for path in sql_files)
        lower_sql = combined_sql.lower()
        master_sql = out.read_text(encoding="utf-8")

        # (text to search, required substring, failure message). The master
        # script is searched case-sensitively; the transfer statement is
        # searched in the lower-cased text of every generated file.
        required = (
            (master_sql, "connect ocp_pg;",
             "target alias was not rendered in master SQL"),
            (master_sql, "learn schema extract_delta;",
             "target schema learn command was not rendered"),
            (lower_sql, "transfer extract_delta.corporation from cpqa using",
             "source alias/target schema transfer was not rendered"),
        )
        for haystack, needle, message in required:
            if needle not in haystack:
                raise AssertionError(message)

        if "__dbschema_" in lower_sql:
            raise AssertionError("generated SQL contains unresolved __DBSCHEMA_*__ token")
        if " from cprd" in lower_sql:
            raise AssertionError("generated SQL contains unexpected hardcoded source alias 'cprd'")
        if "public." in lower_sql:
            raise AssertionError("generated SQL contains unexpected hardcoded target schema 'public.'")
        print(f"generator rendering contract: ok ({len(sql_files)} SQL files)")


def main() -> int:
    """Run every contract check; return 0 when none of them raised."""
    validate_flow_parser_contract()
    validate_generator_rendering_contract()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())