Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions agent/conf/agent.properties
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,31 @@ iscsi.session.cleanup.enabled=false
# This parameter specifies if the host must be rebooted when something goes wrong with the heartbeat.
#reboot.host.and.alert.management.on.heartbeat.timeout=true

# Action taken by kvmheartbeat.sh / kvmspheartbeat.sh when a storage heartbeat
# write fails persistently. When set to a non-default value, it supersedes the
# legacy boolean property 'reboot.host.and.alert.management.on.heartbeat.timeout'.
#
# Allowed values:
# hard-reboot - immediate sysrq-trigger reboot (default; 'reboot' kept as alias;
# any unrecognized value is handled the same way).
# Required default for setups where a stale NFSv3 mount can prevent
# a graceful shutdown from completing.
# graceful-reboot - 'systemctl reboot' instead of sysrq; allows VMs to stop cleanly.
# Use only if a stale storage mount cannot block shutdown.
# restart-agent - restart cloudstack-agent only; running VMs are preserved.
# log-only - log + alert; take no automatic action (admin must investigate).
# custom - invoke the script at 'kvm.heartbeat.fence.custom.script' (see below).
# Script is called with one positional arg: the heartbeat script name
# (e.g. 'kvmheartbeat.sh'). Falls back to hard-reboot if missing or
# not executable.
#
# The non-default values are recommended for setups using LINSTOR/DRBD or any local
# storage with replication, where transient I/O contention can cause a heartbeat
# write to time out without the host actually being unhealthy.
#kvm.heartbeat.fence.action=hard-reboot

# Path to the operator-supplied script invoked when kvm.heartbeat.fence.action=custom.
#kvm.heartbeat.fence.custom.script=/etc/cloudstack/agent/heartbeat-fence-custom.sh

# Enables manually setting CPU's topology on KVM's VM.
#enable.manually.setting.cpu.topology.on.kvm.vm=true

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,44 @@ public class AgentProperties{
public static final Property<Boolean> REBOOT_HOST_AND_ALERT_MANAGEMENT_ON_HEARTBEAT_TIMEOUT
= new Property<>("reboot.host.and.alert.management.on.heartbeat.timeout", true);

/**
 * Selects what the KVM agent's storage heartbeat scripts ({@code kvmheartbeat.sh} and
 * {@code kvmspheartbeat.sh}) do once writing the heartbeat to storage keeps failing.
 * The heartbeat shell scripts read this key directly from agent.properties.
 * Supported values:
 * <ul>
 * <li>{@code hard-reboot} (the default; {@code reboot} is accepted as an alias): reboots
 * immediately through the sysrq trigger. This has to stay the default for setups where a
 * stale NFSv3 mount can keep a graceful shutdown from completing.</li>
 * <li>{@code graceful-reboot}: runs {@code systemctl reboot} instead of sysrq so VMs can
 * stop cleanly. Use it only when a stale storage mount cannot block shutdown.</li>
 * <li>{@code restart-agent}: restarts only cloudstack-agent; running VMs are preserved.</li>
 * <li>{@code log-only}: log + alert, with no automatic action (an admin must investigate).</li>
 * <li>{@code custom}: runs the script configured by {@link #KVM_HEARTBEAT_FENCE_CUSTOM_SCRIPT}
 * (default {@code /etc/cloudstack/agent/heartbeat-fence-custom.sh}), passing the heartbeat
 * script name (e.g. {@code kvmheartbeat.sh}) as its only argument. A missing or
 * non-executable script results in a fall back to {@code hard-reboot}.</li>
 * </ul>
 * Non-default values are recommended for setups using LINSTOR/DRBD or other replicated
 * local storage, where transient I/O contention can make a heartbeat write time out even
 * though the host itself is healthy.<br>
 * Data type: String.<br>
 * Default value: {@code hard-reboot}
 */
public static final Property<String> KVM_HEARTBEAT_FENCE_ACTION = new Property<>(
        "kvm.heartbeat.fence.action", "hard-reboot");

/**
 * Location of the operator-supplied script that is run when
 * {@link #KVM_HEARTBEAT_FENCE_ACTION} is {@code custom}. The script has to be executable;
 * it receives exactly one positional argument, the name of the heartbeat script that
 * triggered the fence (e.g. {@code kvmheartbeat.sh}). The heartbeat shell scripts read
 * this key directly from agent.properties.<br>
 * Data type: String.<br>
 * Default value: {@code /etc/cloudstack/agent/heartbeat-fence-custom.sh}
 */
public static final Property<String> KVM_HEARTBEAT_FENCE_CUSTOM_SCRIPT = new Property<>(
        "kvm.heartbeat.fence.custom.script", "/etc/cloudstack/agent/heartbeat-fence-custom.sh");

/**
* Enables manually setting CPU's topology on KVM's VM. <br>
* Data type: Boolean.<br>
Expand Down
85 changes: 85 additions & 0 deletions scripts/vm/hypervisor/kvm/kvmha-fence.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Shared fence-action helper for kvmheartbeat.sh and kvmspheartbeat.sh.
# Sourced by both scripts; do not invoke directly.
#
# Usage from caller:
# source "$(dirname "$0")/kvmha-fence.sh"
# fence_action "kvmheartbeat.sh" # script name passed for log tagging

# Path of the agent configuration file that fence_action reads its settings from.
# A non-empty AGENT_PROPS already set by the caller is kept; otherwise the
# standard location is used.
: "${AGENT_PROPS:=/etc/cloudstack/agent/agent.properties}"

# Private helper: force an immediate reboot through the kernel sysrq trigger.
# Shared by every path of fence_action that ends in a hard reboot.
# Exits the calling script with the status of the sysrq write.
_fence_hard_reboot() {
    # sync runs in the background so the fixed 5 second grace period bounds the wait
    # (presumably so a hung storage mount cannot stall the fence -- confirm).
    sync &
    sleep 5
    echo b > /proc/sysrq-trigger
    exit $?
}

# fence_action <script-name>
#
# Carries out the configured fence action after a persistent heartbeat write failure.
#
# Arguments:
#   $1  name of the calling heartbeat script, used to tag log lines and passed to
#       the custom script (defaults to "kvmha").
#
# Settings, read from the file named by $AGENT_PROPS when it is readable (the last
# matching line wins, commented-out lines are ignored):
#   kvm.heartbeat.fence.action         log-only | restart-agent | graceful-reboot |
#                                      custom | hard-reboot | reboot
#   kvm.heartbeat.fence.custom.script  path of the script run for 'custom'
#
# Any other action value is logged as unrecognized and handled as hard-reboot.
#
# This file is sourced, so every branch ends in 'exit' and therefore terminates the
# calling heartbeat script; the function never returns to its caller.
fence_action() {
    local source_script="${1:-kvmha}"
    local FENCE_ACTION="hard-reboot"
    local CUSTOM_SCRIPT="/etc/cloudstack/agent/heartbeat-fence-custom.sh"

    if [ -r "$AGENT_PROPS" ]; then
        # Action: take everything after the first '=' and drop all whitespace.
        local val
        val=$(grep -E '^[[:space:]]*kvm\.heartbeat\.fence\.action[[:space:]]*=' "$AGENT_PROPS" | tail -n 1 | cut -d= -f2- | tr -d '[:space:]')
        [ -n "$val" ] && FENCE_ACTION="$val"
        # Script path: only trim the ends so that inner spaces in the path survive.
        local cval
        cval=$(grep -E '^[[:space:]]*kvm\.heartbeat\.fence\.custom\.script[[:space:]]*=' "$AGENT_PROPS" | tail -n 1 | cut -d= -f2- | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
        [ -n "$cval" ] && CUSTOM_SCRIPT="$cval"
    fi

    case "$FENCE_ACTION" in
        log-only)
            /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'log-only' selected — taking no automatic action. Operator must investigate."
            exit 0
            ;;
        restart-agent)
            /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'restart-agent' — restarting cloudstack-agent (running VMs preserved)."
            sync &
            sleep 2
            systemctl restart cloudstack-agent
            exit $?
            ;;
        graceful-reboot)
            /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'graceful-reboot' — rebooting via systemctl (allows running VMs to stop cleanly)."
            sync &
            sleep 5
            systemctl reboot
            exit $?
            ;;
        custom)
            # -x alone is also true for a searchable directory; require a regular file
            # so that a bad path takes the hard-reboot fallback instead of failing to
            # execute and leaving the host unfenced.
            if [ -f "$CUSTOM_SCRIPT" ] && [ -x "$CUSTOM_SCRIPT" ]; then
                /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'custom' — running ${CUSTOM_SCRIPT}."
                sync &
                sleep 2
                "$CUSTOM_SCRIPT" "$source_script"
                exit $?
            else
                /usr/bin/logger -t heartbeat "${source_script}: heartbeat write to storage failed; fence action 'custom' selected but ${CUSTOM_SCRIPT} is missing or not executable — falling back to hard-reboot."
                _fence_hard_reboot
            fi
            ;;
        hard-reboot|reboot)
            # 'reboot' kept as alias for back-compat with pre-existing deployments.
            /usr/bin/logger -t heartbeat "${source_script} will reboot system because it was unable to write the heartbeat to the storage."
            _fence_hard_reboot
            ;;
        *)
            # Unknown or mistyped value: still fence the safe way, but leave a trace so
            # the operator can see that the configured value was not honoured.
            /usr/bin/logger -t heartbeat "${source_script}: unrecognized kvm.heartbeat.fence.action value '${FENCE_ACTION}' — falling back to hard-reboot."
            /usr/bin/logger -t heartbeat "${source_script} will reboot system because it was unable to write the heartbeat to the storage."
            _fence_hard_reboot
            ;;
    esac
}
8 changes: 3 additions & 5 deletions scripts/vm/hypervisor/kvm/kvmheartbeat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,9 @@ then
exit 0
elif [ "$cflag" == "1" ]
then
/usr/bin/logger -t heartbeat "kvmheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
sync &
sleep 5
echo b > /proc/sysrq-trigger
exit $?
# shellcheck disable=SC1091
. "$(dirname "$0")/kvmha-fence.sh"
fence_action "kvmheartbeat.sh"
else
write_hbLog
exit $?
Expand Down
8 changes: 3 additions & 5 deletions scripts/vm/hypervisor/kvm/kvmspheartbeat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,7 @@ deleteVMs() {

# -c mode: the heartbeat could not be written, so fence this host.
if [ "$cflag" == "1" ]
then
    # Hand over to the shared fence helper first. fence_action ends the script with
    # 'exit' on every path, so the lines after this inner 'if' only run when the
    # helper could not be loaded.
    # shellcheck disable=SC1091
    if . "$(dirname "$0")/kvmha-fence.sh" && declare -F fence_action > /dev/null
    then
        fence_action "kvmspheartbeat.sh"
    fi
    # Fallback: without the helper, keep the legacy behaviour (immediate sysrq
    # reboot) rather than leaving the host unfenced.
    /usr/bin/logger -t heartbeat "kvmspheartbeat.sh will reboot system because it was unable to write the heartbeat to the storage."
    sync &
    sleep 5
    echo b > /proc/sysrq-trigger
    exit $?
fi
Loading