Skip to content

Commit 0deb4c4

Browse files
committed
fix(nas-backup): timeout, free-space check, trap-based cleanup with VM resume
Three independent reliability fixes for the KVM NAS backup script, layered on top of the existing quiesce + EXIT_CLEANUP_FAILED groundwork: 1. BACKUP_TIMEOUT env var (default 6h) bounds the libvirt domjobinfo wait loop in backup_running_vm. Today a stuck QEMU backup holds the agent's command slot until the orchestrator-level timeout fires. The new guard issues domjobabort and exits non-zero so the agent reclaims the slot promptly. 2. MIN_FREE_SPACE env var (default 1 GiB) + check_free_space() runs after mount and before any qemu-img convert in both backup_running_vm and backup_stopped_vm. Fail-fast on a near-full NAS instead of failing mid-write halfway through a multi-GiB convert. 3. trap cleanup EXIT replaces the six explicit cleanup() call sites as the primary cleanup mechanism so orphan NFS mounts no longer accumulate when the script dies to SIGTERM, SIGINT, or any uncaught set -e failure between the explicit call sites. cleanup() is now guarded by CLEANUP_DONE so the trap doesn't re-run an already-completed cleanup from an explicit call. cleanup() additionally resumes the VM if it's still paused — backup-begin holds the guest paused briefly and a failed backup mid-pause currently leaves the guest stuck in 'paused' state until an operator intervenes. Targets main; supersedes the 4.20-targeted version of this PR.
1 parent a0aafe2 commit 0deb4c4

1 file changed

Lines changed: 71 additions & 3 deletions

File tree

scripts/vm/hypervisor/kvm/nasbackup.sh

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,20 @@ logFile="/var/log/cloudstack/agent/agent.log"
3737

3838
EXIT_CLEANUP_FAILED=20
3939

40+
# Backup job timeout in seconds (default: 6 hours). Guards the libvirt
41+
# domjobinfo wait loop so a stuck QEMU backup eventually fails the script
42+
# instead of holding the agent's command slot indefinitely.
43+
BACKUP_TIMEOUT=${BACKUP_TIMEOUT:-21600}
44+
45+
# Minimum free space required on the mounted backup target, in bytes
46+
# (default: 1 GiB). Checked after mount, before any qemu-img convert, so
47+
# we fail fast rather than mid-write when the NAS is near-full.
48+
MIN_FREE_SPACE=${MIN_FREE_SPACE:-1073741824}
49+
50+
# Guards cleanup() against double-execution when both an explicit call
51+
# and the EXIT trap fire (e.g. error path calls cleanup; exit 1 → trap).
52+
CLEANUP_DONE=0
53+
4054
log() {
4155
[[ "$verb" -eq 1 ]] && builtin echo "$@"
4256
if [[ "$1" == "-ne" || "$1" == "-e" || "$1" == "-n" ]]; then
@@ -111,6 +125,7 @@ get_linstor_uuid_from_path() {
111125

112126
backup_running_vm() {
113127
mount_operation
128+
check_free_space
114129
mkdir -p "$dest" || { echo "Failed to create backup directory $dest"; exit 1; }
115130

116131
name="root"
@@ -160,6 +175,9 @@ backup_running_vm() {
160175
virsh -c qemu:///system domiflist $VM > $dest/domiflist.xml 2>/dev/null
161176
virsh -c qemu:///system domblklist $VM > $dest/domblklist.xml 2>/dev/null
162177

178+
# Bound the wait so a stuck QEMU backup eventually aborts instead of holding
179+
# the agent's command slot until the orchestrator-level timeout fires.
180+
local elapsed=0
163181
while true; do
164182
status=$(virsh -c qemu:///system domjobinfo $VM --completed --keep-completed | awk '/Job type:/ {print $3}')
165183
case "$status" in
@@ -169,7 +187,13 @@ backup_running_vm() {
169187
echo "Virsh backup job failed"
170188
cleanup ;;
171189
esac
190+
if [[ $elapsed -ge $BACKUP_TIMEOUT ]]; then
191+
echo "Backup timed out after ${BACKUP_TIMEOUT}s for VM $VM"
192+
virsh -c qemu:///system domjobabort $VM > /dev/null 2>&1 || true
193+
exit 1
194+
fi
172195
sleep 5
196+
elapsed=$((elapsed + 5))
173197
done
174198

175199
# Use qemu-img convert to sparsify linstor backups which get bloated due to virsh backup-begin.
@@ -204,6 +228,7 @@ backup_running_vm() {
204228

205229
backup_stopped_vm() {
206230
mount_operation
231+
check_free_space
207232
mkdir -p "$dest" || { echo "Failed to create backup directory $dest"; exit 1; }
208233

209234
IFS=","
@@ -262,19 +287,62 @@ mount_operation() {
262287
fi
263288
}
264289

290+
check_free_space() {
291+
local free_bytes
292+
free_bytes=$(df -P "$mount_point" 2>/dev/null | awk 'NR==2 {print $4}')
293+
if [[ -n "$free_bytes" ]]; then
294+
# df -P reports 1K blocks; convert to bytes.
295+
free_bytes=$((free_bytes * 1024))
296+
if [[ $free_bytes -lt $MIN_FREE_SPACE ]]; then
297+
echo "Insufficient free space on backup target: $((free_bytes / 1048576)) MB available, $((MIN_FREE_SPACE / 1048576)) MB required"
298+
exit 1
299+
fi
300+
log -ne "Backup target has $((free_bytes / 1073741824)) GB free space"
301+
fi
302+
}
303+
265304
cleanup() {
305+
# Idempotent: skip if a prior explicit call already ran. Without this guard,
306+
# the EXIT trap would re-run cleanup and fail on the already-unmounted point.
307+
[[ $CLEANUP_DONE -eq 1 ]] && return 0
308+
CLEANUP_DONE=1
309+
266310
local status=0
267311

268-
rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; }
269-
umount "$mount_point" || { echo "Failed to unmount $mount_point"; status=1; }
270-
rmdir "$mount_point" || { echo "Failed to remove mount point $mount_point"; status=1; }
312+
# If the VM was paused mid-backup (e.g. backup-begin succeeded but the script
313+
# is exiting on error or signal), resume it. Without this a failed backup
314+
# leaves the guest stuck in 'paused' state until an operator intervenes.
315+
if [[ -n "$VM" ]]; then
316+
local vm_state
317+
vm_state=$(virsh -c qemu:///system domstate "$VM" 2>/dev/null || true)
318+
if [[ "$vm_state" == "paused" ]]; then
319+
log -ne "Resuming paused VM $VM during backup cleanup"
320+
if ! virsh -c qemu:///system resume "$VM" > /dev/null 2>&1; then
321+
echo "Failed to resume VM $VM"
322+
status=1
323+
fi
324+
fi
325+
fi
326+
327+
if [[ -n "$dest" && -d "$dest" ]]; then
328+
rm -rf "$dest" || { echo "Failed to delete $dest"; status=1; }
329+
fi
330+
if [[ -n "$mount_point" && -d "$mount_point" ]]; then
331+
umount "$mount_point" 2>/dev/null || { echo "Failed to unmount $mount_point"; status=1; }
332+
rmdir "$mount_point" 2>/dev/null || true
333+
fi
271334

272335
if [[ $status -ne 0 ]]; then
273336
echo "Backup cleanup failed"
274337
exit $EXIT_CLEANUP_FAILED
275338
fi
276339
}
277340

341+
# Trap ensures cleanup runs on any exit path — including SIGTERM/SIGINT and
342+
# unexpected errors caught by set -e — not just the explicit failure branches.
343+
# Prevents orphan NFS mounts from accumulating after non-graceful exits.
344+
trap cleanup EXIT
345+
278346
function usage {
279347
echo ""
280348
echo "Usage: $0 -o <operation> -v|--vm <domain name> -t <storage type> -s <storage address> -m <mount options> -p <backup path> -d <disks path> -q|--quiesce <true|false>"

0 commit comments

Comments
 (0)