Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Sudoers drop-in written to /etc/sudoers.d/hacluster-taint with mode 0440.
# Grants the hacluster user password-less root execution of
# `/usr/bin/systemctl start --no-block` for taint-node@* and untaint-node@*
# unit names.
# NOTE(review): sudoers wildcards in command arguments match across word
# boundaries, so `*` may also admit additional arguments after the unit
# name - confirm this is acceptable or tighten the pattern.
mode: 0440
path: "/etc/sudoers.d/hacluster-taint"
contents:
inline: |
hacluster ALL=(root) NOPASSWD: /usr/bin/systemctl start --no-block taint-node@*
hacluster ALL=(root) NOPASSWD: /usr/bin/systemctl start --no-block untaint-node@*
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
mode: 0755
path: "/usr/local/bin/taint-fenced-node.sh"
contents:
inline: |
#!/bin/bash
# Apply the out-of-service taint and the "applied-by" annotation to a fenced
# node.
#
# Usage: taint-fenced-node.sh <node-name>
#
# The node is patched through the localhost kubeconfig. A failed patch is
# retried every RETRY_INTERVAL seconds, up to MAX_RETRIES attempts. Progress
# and errors are reported through logger(1) with the tag "taint-fenced-node".
#
# Exit status:
#   0  the patch was applied
#   1  the node name is missing/empty, or every attempt failed
set -euo pipefail

# Default to empty so a missing argument reaches the logged check below
# instead of aborting under `set -u` with an "unbound variable" error.
FENCED_NODE="${1:-}"
KUBECONFIG="/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig"
TAINT_KEY="node.kubernetes.io/out-of-service"
TAINT_VALUE="nodeshutdown"
ANNOTATION_KEY="node.kubernetes.io/out-of-service-applied-by"
ANNOTATION_VALUE="pacemaker"
# Command kept as an array so every argument stays exactly one word.
OC=(oc "--kubeconfig=${KUBECONFIG}")

# Send one message to the system log under this script's tag.
log() {
  logger -t taint-fenced-node "$1"
}

if [[ -z "$FENCED_NODE" ]]; then
  log "ERROR: Node name is empty or not set"
  exit 1
fi

log "Applying out-of-service taint to fenced node ${FENCED_NODE}"

# JSON patch body: sets the annotation and the NoExecute taint in one request.
printf -v PATCH \
  '{"metadata":{"annotations":{"%s":"%s"}},"spec":{"taints":[{"key":"%s","value":"%s","effect":"NoExecute"}]}}' \
  "$ANNOTATION_KEY" "$ANNOTATION_VALUE" "$TAINT_KEY" "$TAINT_VALUE"

MAX_RETRIES=120
RETRY_INTERVAL=5
for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
  # stderr is captured together with stdout so the failure reason can be logged.
  if OC_ERR=$("${OC[@]}" patch node "$FENCED_NODE" --type=strategic -p "$PATCH" 2>&1); then
    log "Successfully tainted and annotated ${FENCED_NODE}"
    exit 0
  fi

  log "Attempt ${attempt}/${MAX_RETRIES} failed: ${OC_ERR}, retrying in ${RETRY_INTERVAL}s"
  sleep "$RETRY_INTERVAL"
done

log "ERROR: Failed to taint ${FENCED_NODE} after ${MAX_RETRIES} attempts"
exit 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
mode: 0755
path: "/var/lib/pacemaker/alerts/tnf-taint-alert.sh"
contents:
inline: |
#!/bin/bash
# Pacemaker alert agent for fencing events.
#
# Acts only when CRM_alert_kind is "fencing", CRM_alert_rc is "0" (a
# successful fence completion) and CRM_alert_node is non-empty; every other
# alert is ignored and the agent exits 0.
#
# On a match it asks systemd, through sudo, to start
# taint-node@<node>.service without waiting for it to finish. Exits 1 if
# that start request fails.

# Guard: anything that is not a successful fencing alert for a named node.
if [[ "$CRM_alert_kind" != "fencing" || "$CRM_alert_rc" != "0" || -z "$CRM_alert_node" ]]; then
  exit 0
fi

taint_unit="taint-node@${CRM_alert_node}.service"

logger -t tnf-taint-alert "Fencing succeeded for ${CRM_alert_node}, triggering taint service"

if sudo systemctl start --no-block "$taint_unit"; then
  exit 0
fi

logger -t tnf-taint-alert "ERROR: Failed to start taint-node@${CRM_alert_node}.service"
exit 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
mode: 0755
path: "/var/lib/pacemaker/alerts/tnf-untaint-alert.sh"
contents:
inline: |
#!/bin/bash
# Pacemaker alert agent for node-rejoin events.
# Registered with: select_nodes
#
# Acts only when CRM_alert_kind is "node", CRM_alert_desc is "member"
# (corosync membership, i.e. a rejoin after fence) and CRM_alert_node is
# non-empty; every other alert is ignored and the agent exits 0.
#
# On a match it asks systemd, through sudo, to start
# untaint-node@<node>.service without waiting for it to finish. Exits 1 if
# that start request fails.

# Guard: anything that is not a membership alert for a named node.
if [[ "$CRM_alert_kind" != "node" || "$CRM_alert_desc" != "member" || -z "$CRM_alert_node" ]]; then
  exit 0
fi

untaint_unit="untaint-node@${CRM_alert_node}.service"

logger -t tnf-untaint-alert "Node ${CRM_alert_node} rejoined cluster (membership), triggering untaint service"

if sudo systemctl start --no-block "$untaint_unit"; then
  exit 0
fi

logger -t tnf-untaint-alert "ERROR: Failed to start untaint-node@${CRM_alert_node}.service"
exit 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
mode: 0755
path: "/usr/local/bin/untaint-fenced-node.sh"
contents:
inline: |
#!/bin/bash
# Remove the pacemaker-applied out-of-service taint from a node that has
# rejoined the cluster.
#
# Usage: untaint-fenced-node.sh <node-name>
#
# Sequence:
#   1. Wait for pacemaker (crm_mon) to list the node as Online; abort if it
#      never does.
#   2. Wait for the node's Ready condition to become True; continue with a
#      warning on timeout.
#   3. Wait for VolumeAttachments on the node to drain; continue with a
#      warning on timeout.
#   4. If the node carries the pacemaker annotation, remove the taint and the
#      annotation, verifying the result and retrying on failure.
#
# All progress is reported through logger(1) with the tag
# "untaint-fenced-node".
#
# Exit status:
#   0  taint and annotation removed, or the node has no pacemaker annotation
#   1  node name missing/empty, pacemaker never reported Online, the API
#      could not be reached for the annotation check, or removal could not
#      be verified within MAX_RETRIES attempts
set -euo pipefail

# Default to empty so a missing argument reaches the logged check below
# instead of aborting under `set -u` with an "unbound variable" error.
REJOINED_NODE="${1:-}"
KUBECONFIG="/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig"
TAINT_KEY="node.kubernetes.io/out-of-service"
ANNOTATION_KEY="node.kubernetes.io/out-of-service-applied-by"
ANNOTATION_VALUE="pacemaker"
# Command kept as an array so every argument stays exactly one word.
OC=(oc "--kubeconfig=${KUBECONFIG}")

# Send one message to the system log under this script's tag.
log() {
  logger -t untaint-fenced-node "$1"
}

# Print the number of VolumeAttachments whose spec.nodeName is the rejoined
# node. Returns non-zero when oc or jq fails (pipefail is in effect).
count_volume_attachments() {
  "${OC[@]}" get volumeattachments -o json 2>/dev/null \
    | jq --arg node "$REJOINED_NODE" '[.items[] | select(.spec.nodeName == $node)] | length'
}

if [[ -z "$REJOINED_NODE" ]]; then
  log "ERROR: Node name is empty or not set"
  exit 1
fi

log "Node ${REJOINED_NODE} rejoined cluster, checking if untaint is needed"

# Wait for pacemaker to report the node as Online.
# The alert fires on corosync membership ("member"), but pacemaker Online
# confirms the full cluster join succeeded. This prevents untainting a node
# that joined corosync but may still fail pacemaker startup and get re-fenced.
MAX_PCMK_WAIT=180
PCMK_INTERVAL=5
PCMK_ONLINE=false
for ((i = 1; i <= MAX_PCMK_WAIT / PCMK_INTERVAL; i++)); do
  # -F matches the node name literally (dots are not regex wildcards).
  # -c with output discarded is used instead of -q on purpose: the final grep
  # consumes all input, so an early exit cannot SIGPIPE the earlier stages
  # and make the pipeline fail under pipefail.
  if crm_mon -1 2>/dev/null | grep "Online:" | grep -cwF -- "$REJOINED_NODE" >/dev/null; then
    PCMK_ONLINE=true
    break
  fi
  log "Waiting for pacemaker to report ${REJOINED_NODE} as Online (attempt ${i})"
  sleep "$PCMK_INTERVAL"
done

if [[ "$PCMK_ONLINE" != "true" ]]; then
  log "ERROR: Pacemaker does not report ${REJOINED_NODE} as Online after ${MAX_PCMK_WAIT}s - aborting untaint"
  exit 1
fi

log "Pacemaker reports ${REJOINED_NODE} as Online"

# Wait for the rejoined node's kubelet to become Ready.
# This must happen before the annotation check: the taint script may still
# be retrying against the API while etcd recovers quorum, and kubelet Ready
# is a natural signal that the cluster is healthy enough
# for the taint write to have completed.
MAX_READY_WAIT=120
READY_INTERVAL=5
NODE_READY=""
for ((i = 1; i <= MAX_READY_WAIT / READY_INTERVAL; i++)); do
  # Best effort: an API or jq failure leaves NODE_READY empty and we retry.
  NODE_READY=$("${OC[@]}" get node "$REJOINED_NODE" -o json 2>/dev/null \
    | jq -r '.status.conditions[] | select(.type=="Ready") | .status // empty') || true
  if [[ "$NODE_READY" == "True" ]]; then
    break
  fi
  log "Waiting for node ${REJOINED_NODE} to become Ready (attempt ${i})"
  sleep "$READY_INTERVAL"
done

if [[ "$NODE_READY" != "True" ]]; then
  log "WARNING: Node ${REJOINED_NODE} not Ready after ${MAX_READY_WAIT}s - proceeding with untaint anyway"
fi

# Wait for VolumeAttachments to drain from the fenced node.
# The out-of-service taint triggers immediate pod eviction and volume
# detachment. We must wait for this to complete before removing the
# taint, otherwise pods remain on the rebooted node and wait for the
# local storage stack to recover instead of failing over to the
# surviving node where volumes can attach immediately.
MAX_VA_WAIT=300
VA_INTERVAL=5
# "-1" marks a failed query. A failure on this first query is treated as
# "unknown" and enters the wait loop, exactly like a failure inside the loop,
# rather than being mistaken for "nothing attached".
VA_COUNT=$(count_volume_attachments) || VA_COUNT="-1"

if [[ "$VA_COUNT" != "0" ]]; then
  if [[ "$VA_COUNT" != "-1" ]]; then
    log "Waiting for ${VA_COUNT} VolumeAttachment(s) to detach from ${REJOINED_NODE}"
  fi
  for ((i = 1; i <= MAX_VA_WAIT / VA_INTERVAL; i++)); do
    VA_COUNT=$(count_volume_attachments) || VA_COUNT="-1"
    if [[ "$VA_COUNT" == "0" ]]; then
      log "All VolumeAttachments detached from ${REJOINED_NODE}"
      break
    fi
    if [[ "$VA_COUNT" == "-1" ]]; then
      log "API unavailable checking VolumeAttachments (attempt ${i})"
    else
      log "Waiting for ${VA_COUNT} VolumeAttachment(s) to detach from ${REJOINED_NODE} (attempt ${i})"
    fi
    sleep "$VA_INTERVAL"
  done
  if [[ "$VA_COUNT" != "0" ]]; then
    log "WARNING: ${VA_COUNT} VolumeAttachment(s) still on ${REJOINED_NODE} after ${MAX_VA_WAIT}s - proceeding with untaint"
  fi
fi

# Check if the taint was applied by pacemaker
NODE_JSON=$("${OC[@]}" get node "$REJOINED_NODE" -o json 2>/dev/null) || {
  log "ERROR: Cannot reach API to check annotation - aborting"
  exit 1
}
APPLIED_BY=$(jq -r --arg key "$ANNOTATION_KEY" '.metadata.annotations[$key] // empty' <<<"$NODE_JSON")
if [[ "$APPLIED_BY" != "$ANNOTATION_VALUE" ]]; then
  log "Node ${REJOINED_NODE} has no pacemaker out-of-service annotation - nothing to do"
  exit 0
fi

log "Removing out-of-service taint and annotation from ${REJOINED_NODE}"

# Remove taint and annotation with retries.
# If taint was already removed, we still proceed to remove the annotation.
MAX_RETRIES=12
RETRY_INTERVAL=5
for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
  # Re-check annotation in case another process already cleaned up
  NODE_JSON=$("${OC[@]}" get node "$REJOINED_NODE" -o json 2>/dev/null) || {
    log "Attempt ${attempt}/${MAX_RETRIES}: oc get failed during annotation re-check - retrying in ${RETRY_INTERVAL}s"
    sleep "$RETRY_INTERVAL"
    continue
  }
  APPLIED_BY=$(jq -r --arg key "$ANNOTATION_KEY" '.metadata.annotations[$key] // empty' <<<"$NODE_JSON")
  if [[ "$APPLIED_BY" != "$ANNOTATION_VALUE" ]]; then
    log "Annotation already removed from ${REJOINED_NODE} - done"
    exit 0
  fi

  # Intentionally best effort: the outcome is established by the verification
  # read below, not by these exit codes.
  "${OC[@]}" adm taint node "$REJOINED_NODE" "${TAINT_KEY}-" 2>/dev/null || true
  "${OC[@]}" annotate node "$REJOINED_NODE" "${ANNOTATION_KEY}-" 2>/dev/null || true

  # Verify both are actually gone - treat oc failures as retryable
  NODE_JSON=$("${OC[@]}" get node "$REJOINED_NODE" -o json 2>/dev/null) || {
    log "Attempt ${attempt}/${MAX_RETRIES}: oc get failed during verification - retrying in ${RETRY_INTERVAL}s"
    sleep "$RETRY_INTERVAL"
    continue
  }
  REMAINING_TAINT=$(jq -r --arg key "$TAINT_KEY" '.spec.taints // [] | map(select(.key == $key)) | .[0].key // empty' <<<"$NODE_JSON")
  REMAINING_ANNOT=$(jq -r --arg key "$ANNOTATION_KEY" '.metadata.annotations[$key] // empty' <<<"$NODE_JSON")
  if [[ -z "$REMAINING_TAINT" && -z "$REMAINING_ANNOT" ]]; then
    log "Successfully untainted and removed annotation from ${REJOINED_NODE}"
    exit 0
  fi

  log "Attempt ${attempt}/${MAX_RETRIES}: taint='${REMAINING_TAINT}' annotation='${REMAINING_ANNOT}' - retrying in ${RETRY_INTERVAL}s"
  sleep "$RETRY_INTERVAL"
done

log "ERROR: Failed to untaint ${REJOINED_NODE} after ${MAX_RETRIES} attempts"
exit 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# systemd template unit: one instance per node name.
# The instance string (%i) is passed as the single argument to
# /usr/local/bin/taint-fenced-node.sh, which runs as a oneshot job ordered
# after network.target. systemd gives up on the start after 600 seconds
# (TimeoutStartSec).
# NOTE(review): confirm the script's own retry budget fits inside this
# timeout, otherwise systemd will stop it before its final error is logged.
name: "taint-node@.service"
contents: |
[Unit]
Description=Taint fenced node %i in Kubernetes
After=network.target

[Service]
Type=oneshot
ExecStart=/usr/local/bin/taint-fenced-node.sh %i
TimeoutStartSec=600
Comment thread
vimauro marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# systemd template unit: one instance per node name.
# The instance string (%i) is passed as the single argument to
# /usr/local/bin/untaint-fenced-node.sh, which runs as a oneshot job.
# After= orders this unit behind network.target and behind the taint unit of
# the same instance (taint-node@%i.service); it is an ordering constraint
# only and does not pull that unit in. systemd gives up on the start after
# 600 seconds (TimeoutStartSec).
name: "untaint-node@.service"
contents: |
[Unit]
Description=Untaint rejoined node %i in Kubernetes
After=network.target taint-node@%i.service

[Service]
Type=oneshot
ExecStart=/usr/local/bin/untaint-fenced-node.sh %i
TimeoutStartSec=600