diff --git a/templates/master/00-master/two-node-with-fencing/files/hacluster-taint-sudoers.yaml b/templates/master/00-master/two-node-with-fencing/files/hacluster-taint-sudoers.yaml new file mode 100644 index 0000000000..cdca638b9d --- /dev/null +++ b/templates/master/00-master/two-node-with-fencing/files/hacluster-taint-sudoers.yaml @@ -0,0 +1,6 @@ +mode: 0440 +path: "/etc/sudoers.d/hacluster-taint" +contents: + inline: | + hacluster ALL=(root) NOPASSWD: /usr/bin/systemctl start --no-block taint-node@* + hacluster ALL=(root) NOPASSWD: /usr/bin/systemctl start --no-block untaint-node@* diff --git a/templates/master/00-master/two-node-with-fencing/files/taint-fenced-node.yaml b/templates/master/00-master/two-node-with-fencing/files/taint-fenced-node.yaml new file mode 100644 index 0000000000..94189783f1 --- /dev/null +++ b/templates/master/00-master/two-node-with-fencing/files/taint-fenced-node.yaml @@ -0,0 +1,41 @@ +mode: 0755 +path: "/usr/local/bin/taint-fenced-node.sh" +contents: + inline: | + #!/bin/bash + set -euo pipefail + FENCED_NODE="$1" + KUBECONFIG="/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig" + TAINT_KEY="node.kubernetes.io/out-of-service" + TAINT_VALUE="nodeshutdown" + ANNOTATION_KEY="node.kubernetes.io/out-of-service-applied-by" + ANNOTATION_VALUE="pacemaker" + OC="oc --kubeconfig=$KUBECONFIG" + + if [[ -z "${FENCED_NODE:-}" ]]; then + logger -t taint-fenced-node "ERROR: Node name is empty or not set" + exit 1 + fi + + logger -t taint-fenced-node "Applying out-of-service taint to fenced node ${FENCED_NODE}" + + PATCH=$(cat <&1) && { + logger -t taint-fenced-node "Successfully tainted and annotated ${FENCED_NODE}" + exit 0 + } + + logger -t taint-fenced-node "Attempt ${attempt}/${MAX_RETRIES} failed: ${OC_ERR}, retrying in ${RETRY_INTERVAL}s" + sleep $RETRY_INTERVAL + done + + logger -t taint-fenced-node "ERROR: Failed to taint ${FENCED_NODE} after ${MAX_RETRIES} attempts" + exit 1 diff --git 
a/templates/master/00-master/two-node-with-fencing/files/tnf-taint-alert.yaml b/templates/master/00-master/two-node-with-fencing/files/tnf-taint-alert.yaml
new file mode 100644
index 0000000000..1285e24c8d
--- /dev/null
+++ b/templates/master/00-master/two-node-with-fencing/files/tnf-taint-alert.yaml
@@ -0,0 +1,23 @@
+mode: 0755
+path: "/var/lib/pacemaker/alerts/tnf-taint-alert.sh"
+contents:
+  inline: |
+    #!/bin/bash
+    # Pacemaker alert agent for fencing events.
+    #
+    # Acts only when CRM_alert_kind=fencing, CRM_alert_rc=0 and CRM_alert_node
+    # is non-empty, i.e. on a successful fence completion for a named node.
+    # Alerts of any other kind fail the CRM_alert_kind test below, so they
+    # are already excluded and the agent exits 0 without doing anything.
+    #
+    # On a match, systemd is asked (through sudo) to start the templated
+    # taint-node@<node>.service without blocking; the agent exits 1 only if
+    # that start request fails.
+    if [ "$CRM_alert_kind" = "fencing" ] && [ "$CRM_alert_rc" = "0" ] && [ -n "$CRM_alert_node" ]; then
+      logger -t tnf-taint-alert "Fencing succeeded for ${CRM_alert_node}, triggering taint service"
+      if ! sudo systemctl start --no-block "taint-node@${CRM_alert_node}.service"; then
+        logger -t tnf-taint-alert "ERROR: Failed to start taint-node@${CRM_alert_node}.service"
+        exit 1
+      fi
+    fi
+    exit 0
diff --git a/templates/master/00-master/two-node-with-fencing/files/tnf-untaint-alert.yaml b/templates/master/00-master/two-node-with-fencing/files/tnf-untaint-alert.yaml
new file mode 100644
index 0000000000..4f10b10be0
--- /dev/null
+++ b/templates/master/00-master/two-node-with-fencing/files/tnf-untaint-alert.yaml
@@ -0,0 +1,23 @@
+mode: 0755
+path: "/var/lib/pacemaker/alerts/tnf-untaint-alert.sh"
+contents:
+  inline: |
+    #!/bin/bash
+    # Pacemaker alert agent for node-rejoin events.
+    # Registered with: select_nodes
+    #
+    # Triggers on: CRM_alert_kind=node + CRM_alert_desc=member, i.e. corosync
+    # membership (rejoin after fence), and only when CRM_alert_node is
+    # non-empty. Any other alert falls through and the agent exits 0.
+    #
+    # On a match, systemd is asked (through sudo) to start the templated
+    # untaint-node@<node>.service without blocking; the agent exits 1 only
+    # if that start request fails.
+    if [ "$CRM_alert_kind" = "node" ] && [ "$CRM_alert_desc" = "member" ] && [ -n "$CRM_alert_node" ]; then
+      logger -t tnf-untaint-alert "Node ${CRM_alert_node} rejoined cluster (membership), triggering untaint service"
+      if ! sudo systemctl start --no-block "untaint-node@${CRM_alert_node}.service"; then
+        logger -t tnf-untaint-alert "ERROR: Failed to start untaint-node@${CRM_alert_node}.service"
+        exit 1
+      fi
+    fi
+    exit 0
diff --git a/templates/master/00-master/two-node-with-fencing/files/untaint-fenced-node.yaml b/templates/master/00-master/two-node-with-fencing/files/untaint-fenced-node.yaml
new file mode 100644
index 0000000000..528f5d7e7e
--- /dev/null
+++ b/templates/master/00-master/two-node-with-fencing/files/untaint-fenced-node.yaml
@@ -0,0 +1,155 @@
+mode: 0755
+path: "/usr/local/bin/untaint-fenced-node.sh"
+contents:
+  inline: |
+    #!/bin/bash
+    # Remove the pacemaker-applied out-of-service taint and its marker
+    # annotation from a node that has rejoined the cluster.
+    #
+    # Usage: untaint-fenced-node.sh <node-name>
+    # Logs through logger(1) with the tag "untaint-fenced-node".
+    set -euo pipefail
+    # Default to empty: with a bare "$1", `set -u` would abort on a missing
+    # argument before the logged check below could report it.
+    REJOINED_NODE="${1:-}"
+    KUBECONFIG="/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig"
+    TAINT_KEY="node.kubernetes.io/out-of-service"
+    ANNOTATION_KEY="node.kubernetes.io/out-of-service-applied-by"
+    ANNOTATION_VALUE="pacemaker"
+    # Expanded unquoted at call sites so it splits into command and flag.
+    OC="oc --kubeconfig=$KUBECONFIG"
+
+    if [[ -z "${REJOINED_NODE:-}" ]]; then
+      logger -t untaint-fenced-node "ERROR: Node name is empty or not set"
+      exit 1
+    fi
+
+    logger -t untaint-fenced-node "Node ${REJOINED_NODE} rejoined cluster, checking if untaint is needed"
+
+    # Wait for pacemaker to report the node as Online.
+    # The alert fires on corosync membership ("member"), but pacemaker Online
+    # confirms the full cluster join succeeded. This prevents untainting a node
+    # that joined corosync but may still fail pacemaker startup and get re-fenced.
+    MAX_PCMK_WAIT=180
+    PCMK_INTERVAL=5
+    PCMK_ONLINE=false
+    for i in $(seq 1 $((MAX_PCMK_WAIT / PCMK_INTERVAL))); do
+      # Compare whole tokens: a grep -w regex match would also accept e.g. "<name>-x".
+      if crm_mon -1 2>/dev/null | awk -v node="$REJOINED_NODE" '/Online:/ { gsub(/[][]/, " "); for (f = 1; f <= NF; f++) if ($f == node) hit = 1 } END { exit !hit }'; then
+        PCMK_ONLINE=true
+        break
+      fi
+      logger -t untaint-fenced-node "Waiting for pacemaker to report ${REJOINED_NODE} as Online (attempt ${i})"
+      sleep $PCMK_INTERVAL
+    done
+
+    if [ "$PCMK_ONLINE" != "true" ]; then
+      logger -t untaint-fenced-node "ERROR: Pacemaker does not report ${REJOINED_NODE} as Online after ${MAX_PCMK_WAIT}s - aborting untaint"
+      exit 1
+    fi
+
+    logger -t untaint-fenced-node "Pacemaker reports ${REJOINED_NODE} as Online"
+
+    # Wait for the rejoined node's kubelet to become Ready. This must happen
+    # before the annotation check: the taint script may still be retrying against
+    # the API while etcd recovers quorum, and kubelet Ready is a natural signal
+    # that the cluster is healthy enough for the taint write to have completed.
+    MAX_READY_WAIT=120
+    READY_INTERVAL=5
+    for i in $(seq 1 $((MAX_READY_WAIT / READY_INTERVAL))); do
+      NODE_READY=$($OC get node "$REJOINED_NODE" -o json 2>/dev/null | jq -r '.status.conditions[] | select(.type=="Ready") | .status // empty') || true
+      if [ "$NODE_READY" = "True" ]; then
+        break
+      fi
+      logger -t untaint-fenced-node "Waiting for node ${REJOINED_NODE} to become Ready (attempt ${i})"
+      sleep $READY_INTERVAL
+    done
+
+    if [ "$NODE_READY" != "True" ]; then
+      logger -t untaint-fenced-node "WARNING: Node ${REJOINED_NODE} not Ready after ${MAX_READY_WAIT}s - proceeding with untaint anyway"
+    fi
+
+    # Wait for VolumeAttachments to drain from the fenced node.
+    # The out-of-service taint triggers immediate pod eviction and volume
+    # detachment. We must wait for this to complete before removing the
+    # taint, otherwise pods remain on the rebooted node and wait for the
+    # local storage stack to recover instead of failing over to the
+    # surviving node where volumes can attach immediately.
+    MAX_VA_WAIT=300
+    VA_INTERVAL=5
+    # Poll until no VolumeAttachment references the node. A failed query is
+    # recorded as "-1" (unknown) and retried; it is never read as "0", so an
+    # API outage cannot skip the drain wait. The first poll doubles as the
+    # initial check: a node with nothing attached leaves the loop at once.
+    for i in $(seq 1 $((MAX_VA_WAIT / VA_INTERVAL))); do
+      VA_COUNT=$($OC get volumeattachments -o json 2>/dev/null \
+        | jq --arg node "$REJOINED_NODE" '[.items[] | select(.spec.nodeName == $node)] | length') || VA_COUNT="-1"
+      if [ "$VA_COUNT" = "0" ]; then
+        logger -t untaint-fenced-node "All VolumeAttachments detached from ${REJOINED_NODE}"
+        break
+      fi
+      if [ "$VA_COUNT" = "-1" ]; then
+        logger -t untaint-fenced-node "API unavailable checking VolumeAttachments (attempt ${i})"
+      else
+        logger -t untaint-fenced-node "Waiting for ${VA_COUNT} VolumeAttachment(s) to detach from ${REJOINED_NODE} (attempt ${i})"
+      fi
+      sleep $VA_INTERVAL
+    done
+    # Best effort: once the deadline passes the untaint goes ahead regardless.
+    if [ "$VA_COUNT" != "0" ]; then
+      logger -t untaint-fenced-node "WARNING: ${VA_COUNT} VolumeAttachment(s) still on ${REJOINED_NODE} after ${MAX_VA_WAIT}s - proceeding with untaint"
+    fi
+
+    # Check if the taint was applied by pacemaker
+    NODE_JSON=$($OC get node "$REJOINED_NODE" -o json 2>/dev/null) || {
+      logger -t untaint-fenced-node "ERROR: Cannot reach API to check annotation - aborting"
+      exit 1
+    }
+    APPLIED_BY=$(echo "$NODE_JSON" | jq -r --arg key "$ANNOTATION_KEY" '.metadata.annotations[$key] // empty')
+    # A node without the expected annotation value is left untouched.
+    if [ "$APPLIED_BY" != "$ANNOTATION_VALUE" ]; then
+      logger -t untaint-fenced-node "Node ${REJOINED_NODE} has no pacemaker out-of-service annotation - nothing to do"
+      exit 0
+    fi
+
+    logger -t untaint-fenced-node "Removing out-of-service taint and annotation from ${REJOINED_NODE}"
+
+    # Remove taint and annotation with retries.
+    # If taint was already removed, we still proceed to remove the annotation.
+    MAX_RETRIES=12
+    RETRY_INTERVAL=5
+    for attempt in $(seq 1 $MAX_RETRIES); do
+      # Re-check annotation in case another process already cleaned up
+      NODE_JSON=$($OC get node "$REJOINED_NODE" -o json 2>/dev/null) || {
+        logger -t untaint-fenced-node "Attempt ${attempt}/${MAX_RETRIES}: oc get failed during annotation re-check - retrying in ${RETRY_INTERVAL}s"
+        sleep $RETRY_INTERVAL
+        continue
+      }
+      APPLIED_BY=$(echo "$NODE_JSON" | jq -r --arg key "$ANNOTATION_KEY" '.metadata.annotations[$key] // empty')
+      if [ "$APPLIED_BY" != "$ANNOTATION_VALUE" ]; then
+        logger -t untaint-fenced-node "Annotation already removed from ${REJOINED_NODE} - done"
+        exit 0
+      fi
+
+      $OC adm taint node "$REJOINED_NODE" "${TAINT_KEY}-" 2>/dev/null || true
+      # Verify the taint is actually gone - treat oc failures as retryable
+      NODE_JSON=$($OC get node "$REJOINED_NODE" -o json 2>/dev/null) || {
+        logger -t untaint-fenced-node "Attempt ${attempt}/${MAX_RETRIES}: oc get failed during verification - retrying in ${RETRY_INTERVAL}s"
+        sleep $RETRY_INTERVAL
+        continue
+      }
+      REMAINING_TAINT=$(echo "$NODE_JSON" | jq -r --arg key "$TAINT_KEY" '.spec.taints // [] | map(select(.key == $key)) | .[0].key // empty')
+      REMAINING_ANNOT=$(echo "$NODE_JSON" | jq -r --arg key "$ANNOTATION_KEY" '.metadata.annotations[$key] // empty')
+      # Drop the marker annotation only once the taint is confirmed gone; if it
+      # went first, the re-check above would exit 0 and strand a leftover taint.
+      if [ -z "$REMAINING_TAINT" ] && $OC annotate node "$REJOINED_NODE" "${ANNOTATION_KEY}-" 2>/dev/null; then
+        logger -t untaint-fenced-node "Successfully untainted and removed annotation from ${REJOINED_NODE}"
+        exit 0
+      fi
+
+      logger -t untaint-fenced-node "Attempt ${attempt}/${MAX_RETRIES}: taint='${REMAINING_TAINT}' annotation='${REMAINING_ANNOT}' - retrying in ${RETRY_INTERVAL}s"
+      sleep $RETRY_INTERVAL
+    done
+
+    logger -t untaint-fenced-node "ERROR: Failed to untaint ${REJOINED_NODE} after ${MAX_RETRIES} attempts"
+    exit 1
diff --git a/templates/master/00-master/two-node-with-fencing/units/taint-node@.service.yaml
b/templates/master/00-master/two-node-with-fencing/units/taint-node@.service.yaml
new file mode 100644
index 0000000000..f762da2a29
--- /dev/null
+++ b/templates/master/00-master/two-node-with-fencing/units/taint-node@.service.yaml
@@ -0,0 +1,13 @@
+name: "taint-node@.service"
+contents: |
+  # Template unit: the instance name (%i) is the node to taint and is passed
+  # to the script as its only argument. Started on demand; it has no
+  # [Install] section.
+  [Unit]
+  Description=Taint fenced node %i in Kubernetes
+  After=network.target
+
+  [Service]
+  Type=oneshot
+  ExecStart=/usr/local/bin/taint-fenced-node.sh %i
+  TimeoutStartSec=600
diff --git a/templates/master/00-master/two-node-with-fencing/units/untaint-node@.service.yaml b/templates/master/00-master/two-node-with-fencing/units/untaint-node@.service.yaml
new file mode 100644
index 0000000000..02d5470636
--- /dev/null
+++ b/templates/master/00-master/two-node-with-fencing/units/untaint-node@.service.yaml
@@ -0,0 +1,18 @@
+name: "untaint-node@.service"
+contents: |
+  # Template unit: the instance name (%i) is the rejoined node and is passed
+  # to the script as its only argument.
+  [Unit]
+  Description=Untaint rejoined node %i in Kubernetes
+  # Ordered after the matching taint instance so that, when both are queued,
+  # the untaint does not start before the taint job has finished.
+  After=network.target taint-node@%i.service
+
+  [Service]
+  Type=oneshot
+  ExecStart=/usr/local/bin/untaint-fenced-node.sh %i
+  # The script can sleep for up to 180s (pacemaker Online) + 120s (kubelet
+  # Ready) + 300s (VolumeAttachment drain) + 60s (untaint retries) = 660s
+  # before counting any command run time, so the limit must sit above that
+  # or systemd stops the job during the final removal stage.
+  TimeoutStartSec=900