diff --git a/.github/actions/collect-diagnostics/action.yml b/.github/actions/collect-diagnostics/action.yml index 28dc3260..1d596cb6 100644 --- a/.github/actions/collect-diagnostics/action.yml +++ b/.github/actions/collect-diagnostics/action.yml @@ -28,9 +28,16 @@ runs: -i "$SSH_KEY" "root@${INCUS_HOST}" "$@" 2>/dev/null || true } - # Discover running Incus containers managed by molecule - containers=$(ssh_host \ + # Discover running Incus containers managed by molecule. + # Filter by MOLECULE_RUN_SUFFIX so concurrent matrix jobs on the same + # Incus host don't cross-contaminate this job's diagnostics. + all_containers=$(ssh_host \ "incus list --format csv -c n,s | grep ',RUNNING' | cut -d, -f1" || true) + if [ -n "${MOLECULE_RUN_SUFFIX:-}" ]; then + containers=$(echo "$all_containers" | grep -- "${MOLECULE_RUN_SUFFIX}$" || true) + else + containers="$all_containers" + fi for name in $containers; do dir="$diag/service-logs/$name" @@ -105,13 +112,16 @@ runs: } > "$dir/container-resources.txt" 2>&1 || true done - # Container memory report with cgroup OOM events (from Incus host) - ssh_host bash -s << 'MEMSCRIPT' \ + # Container memory report with cgroup OOM events (from Incus host). + # Filter by run suffix to skip concurrent matrix jobs' containers. 
+ ssh_host bash -s "${MOLECULE_RUN_SUFFIX:-}" << 'MEMSCRIPT' \ > "$diag/container-memory-report.txt" 2>&1 || true + suffix="$1" printf "%-50s %10s %10s %10s %10s %10s %s\n" \ "CONTAINER" "CURRENT" "PEAK" "ANON" "FILE" "LIMIT" "OOM_EVENTS" echo "---" for c in $(incus list -f csv -c ns 2>/dev/null | grep ",RUNNING" | cut -d, -f1 | sort); do + [ -n "$suffix" ] && [[ "$c" != *"$suffix" ]] && continue cgdir="/sys/fs/cgroup/lxc.payload.${c}" [ -f "$cgdir/memory.current" ] || continue current=$(($(cat "$cgdir/memory.current") / 1048576)) @@ -147,6 +157,12 @@ runs: report="$diag/OOM-REPORT.txt" found_oom=false + # LXC containers share the host kernel ring buffer and some log + # paths, so dmesg/ES log OOM entries can predate this job by days. + # Ignore anything older than this cutoff to avoid false positives. + since_epoch=$(date -u -d '3 hours ago' +%s) + since_iso=$(date -u -d '3 hours ago' +%Y-%m-%dT%H:%M:%S) + { echo "========================================" echo " OOM / Memory Pressure Detection Report" @@ -178,26 +194,47 @@ runs: done < "$diag/container-memory-report.txt" fi - # Check ES logs for OutOfMemoryError + # Check ES logs for OutOfMemoryError (recent entries only). + # ES log prefix is [YYYY-MM-DDTHH:MM:SS,mmm]; ISO timestamps compare + # lexicographically so string >= works. POSIX awk (mawk-compatible). 
for eslog in "$diag"/service-logs/*/elasticsearch.log; do [ -f "$eslog" ] || continue container=$(basename "$(dirname "$eslog")") - if grep -q "OutOfMemoryError" "$eslog" 2>/dev/null; then + recent=$(awk -v since="$since_iso" ' + /OutOfMemoryError/ { + if (match($0, /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]:[0-9][0-9]:[0-9][0-9]/)) { + ts = substr($0, RSTART+1, 19) + if (ts >= since) print + } + }' "$eslog") + if [ -n "$recent" ]; then echo echo "[ES OOM] $container — OutOfMemoryError in elasticsearch.log:" - grep -A2 "OutOfMemoryError" "$eslog" | head -10 + echo "$recent" | head -10 found_oom=true fi done - # Check dmesg for kernel OOM killer + # Check dmesg for kernel OOM killer (recent entries only). + # dmesg -T prefixes lines with a ctime-like bracketed timestamp; + # parse it and compare to the cutoff to drop host-level history. for dlog in "$diag"/service-logs/*/dmesg.log; do [ -f "$dlog" ] || continue container=$(basename "$(dirname "$dlog")") - if grep -qi "oom-killer\|out of memory\|killed process" "$dlog" 2>/dev/null; then + recent=$(awk -v cutoff="$since_epoch" ' + { line = tolower($0) } + line ~ /oom-killer|out of memory|killed process/ { + if (match($0, /^\[[^]]+\]/)) { + ts = substr($0, RSTART+1, RLENGTH-2) + cmd = "date -d \"" ts "\" +%s 2>/dev/null" + if ((cmd | getline epoch) > 0 && (epoch+0) >= (cutoff+0)) print + close(cmd) + } + }' "$dlog") + if [ -n "$recent" ]; then echo echo "[KERNEL OOM] $container — OOM killer in dmesg:" - grep -i "oom-killer\|out of memory\|killed process" "$dlog" | head -10 + echo "$recent" | head -10 found_oom=true fi done @@ -228,7 +265,7 @@ runs: echo "::endgroup::" - name: Upload diagnostics - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: ${{ inputs.artifact-name }} path: /tmp/molecule-diagnostics/ diff --git a/.github/workflows/molecule.yml b/.github/workflows/molecule.yml index 99851936..11fbb743 100644 --- a/.github/workflows/molecule.yml +++ 
b/.github/workflows/molecule.yml @@ -81,10 +81,22 @@ jobs: mkdir -p "$RUNNER_TEMP/ssh-cp" - name: Install collection + env: + CACHE_HOST: ${{ secrets.INCUS_HOST }} run: | mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME - ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz + if [ -n "$CACHE_HOST" ]; then + ansible-galaxy collection install \ + "http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz" + else + ansible-galaxy collection install \ + community.general:12.3.0 \ + community.crypto:3.1.1 \ + ansible.posix:2.1.0 + fi - name: Set up SSH key for molecule run: | diff --git a/.github/workflows/test_elasticsearch_upgrade.yml b/.github/workflows/test_elasticsearch_upgrade.yml index 8d4679ba..dc78035c 100644 --- a/.github/workflows/test_elasticsearch_upgrade.yml +++ b/.github/workflows/test_elasticsearch_upgrade.yml @@ -86,10 +86,22 @@ jobs: mkdir -p "$RUNNER_TEMP/ssh-cp" - name: Install collection + env: + CACHE_HOST: ${{ secrets.INCUS_HOST }} run: | mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME - ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz + if [ -n "$CACHE_HOST" ]; then + ansible-galaxy 
collection install \ + "http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz" + else + ansible-galaxy collection install \ + community.general:12.3.0 \ + community.crypto:3.1.1 \ + ansible.posix:2.1.0 + fi - name: Set up SSH key for molecule run: | @@ -189,10 +201,22 @@ jobs: mkdir -p "$RUNNER_TEMP/ssh-cp" - name: Install collection + env: + CACHE_HOST: ${{ secrets.INCUS_HOST }} run: | mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME - ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz + if [ -n "$CACHE_HOST" ]; then + ansible-galaxy collection install \ + "http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz" + else + ansible-galaxy collection install \ + community.general:12.3.0 \ + community.crypto:3.1.1 \ + ansible.posix:2.1.0 + fi - name: Set up SSH key for molecule run: | diff --git a/.github/workflows/test_full_stack.yml b/.github/workflows/test_full_stack.yml index f64d1d5b..7665fef5 100644 --- a/.github/workflows/test_full_stack.yml +++ b/.github/workflows/test_full_stack.yml @@ -114,10 +114,22 @@ jobs: mkdir -p "$RUNNER_TEMP/ssh-cp" - name: Install collection + env: + CACHE_HOST: ${{ secrets.INCUS_HOST }} run: | mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE cp -a "$GITHUB_WORKSPACE" 
$ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME - ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz + if [ -n "$CACHE_HOST" ]; then + ansible-galaxy collection install \ + "http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz" + else + ansible-galaxy collection install \ + community.general:12.3.0 \ + community.crypto:3.1.1 \ + ansible.posix:2.1.0 + fi - name: Set up SSH key for molecule run: | diff --git a/.github/workflows/test_linting.yml b/.github/workflows/test_linting.yml index 766e16eb..a5e3b279 100644 --- a/.github/workflows/test_linting.yml +++ b/.github/workflows/test_linting.yml @@ -42,7 +42,22 @@ jobs: SSL_CERT_FILE: /etc/ssl/certs/ca-certificates.crt - name: Install Ansible collection dependencies. - run: ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz + env: + CACHE_HOST: ${{ secrets.INCUS_HOST }} + run: | + # Fall back to upstream Galaxy when the cache host secret is not + # available (e.g. Dependabot PRs, which do not inherit repo secrets). 
+ if [ -n "$CACHE_HOST" ]; then + ansible-galaxy collection install \ + "http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz" + else + ansible-galaxy collection install \ + community.general:12.3.0 \ + community.crypto:3.1.1 \ + ansible.posix:2.1.0 + fi - name: Lint code (yamllint). run: | diff --git a/molecule/default/molecule.yml b/molecule/default/molecule.yml new file mode 100644 index 00000000..fe17cf38 --- /dev/null +++ b/molecule/default/molecule.yml @@ -0,0 +1,12 @@ +--- +# Placeholder scenario. Molecule 26 globs molecule/default/molecule.yml on +# every run to discover shared state, and emits a CRITICAL line when it is +# absent. Every CI invocation passes -s <scenario>, so this scenario is +# never executed; the file only exists to silence that log noise. +prerun: false +driver: + name: default +platforms: + - name: placeholder +provisioner: + name: ansible diff --git a/molecule/elasticsearch_diagnostics/converge.yml b/molecule/elasticsearch_diagnostics/converge.yml index ae0013e7..4d0461eb 100644 --- a/molecule/elasticsearch_diagnostics/converge.yml +++ b/molecule/elasticsearch_diagnostics/converge.yml @@ -36,8 +36,9 @@ line: "bogus.nonexistent.setting: true" - name: Attempt restart with bad config (should fail with diagnostics) - ansible.builtin.include_tasks: - file: "{{ lookup('env', 'ANSIBLE_COLLECTIONS_PATH') | default(lookup('env', 'HOME') + '/.ansible/collections', true) }}/ansible_collections/oddly/elasticstack/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml" + ansible.builtin.include_role: + name: oddly.elasticstack.elasticsearch + tasks_from: restart_and_verify_elasticsearch.yml - name: This should not be reached ansible.builtin.fail: @@ -72,5 +73,6 @@ seconds: 5 - name: Restart Elasticsearch with restored config - ansible.builtin.include_tasks: - file: "{{ lookup('env',
'ANSIBLE_COLLECTIONS_PATH') | default(lookup('env', 'HOME') + '/.ansible/collections', true) }}/ansible_collections/oddly/elasticstack/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml" + ansible.builtin.include_role: + name: oddly.elasticstack.elasticsearch + tasks_from: restart_and_verify_elasticsearch.yml diff --git a/molecule/logstash_custom_pipeline/verify.yml b/molecule/logstash_custom_pipeline/verify.yml index 2fc03b94..01af0ddd 100644 --- a/molecule/logstash_custom_pipeline/verify.yml +++ b/molecule/logstash_custom_pipeline/verify.yml @@ -60,8 +60,13 @@ - name: Wait for Logstash to process events ansible.builtin.wait_for: path: /var/log/logstash/custom-output.log - timeout: 60 - msg: "Logstash custom output file not created" + # Wait for actual content, not just file creation: Logstash opens the + # output file when the pipeline starts but the generator events still + # need to flow through batch and flush, which is noticeably slower on + # resource-constrained runners. + search_regex: processed_by + timeout: 120 + msg: "Logstash custom output did not contain processed events within 120s" - name: Check output file has data ansible.builtin.slurp: diff --git a/requirements-test.txt b/requirements-test.txt index 7d7ad35c..175d7bc8 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,9 @@ -ansible-core>=2.18,<2.21 +# Pin below 2.19 until this collection catches up with the variable-scoping +# changes in that release: role_path no longer leaks across play boundaries, +# which breaks include_tasks callers that rely on {{ role_path }}/.. paths +# (the elasticsearch_diagnostics molecule scenario hit this). Revisit once +# the full matrix is known to pass on 2.19. 
+ansible-core>=2.18,<2.19 ansible-lint molecule pytest diff --git a/scripts/check-ci-coverage.sh b/scripts/check-ci-coverage.sh index 33cc41bd..7754177e 100755 --- a/scripts/check-ci-coverage.sh +++ b/scripts/check-ci-coverage.sh @@ -10,7 +10,7 @@ MOLECULE_DIR="$REPO_ROOT/molecule" EXIT_CODE=0 # Scenarios that are not standalone tests (utility dirs, shared includes) -EXCLUDED_SCENARIOS="shared plugins" +EXCLUDED_SCENARIOS="default shared plugins" echo "=== Molecule scenario CI coverage check ===" echo