diff --git a/conf/db/upgrade/V4.8.0.8__schema.sql b/conf/db/upgrade/V4.8.0.8__schema.sql
new file mode 100644
index 00000000000..c91012bda12
--- /dev/null
+++ b/conf/db/upgrade/V4.8.0.8__schema.sql
@@ -0,0 +1,2 @@
+-- ZSV-10538 snapshot commit-delete idempotency: latch column
+CALL ADD_COLUMN('VolumeSnapshotVO', 'deletingSince', 'TIMESTAMP', 1, NULL);
diff --git a/conf/i18n/messages_en_US.properties b/conf/i18n/messages_en_US.properties
index 381ddbb74aa..0580d4a1683 100755
--- a/conf/i18n/messages_en_US.properties
+++ b/conf/i18n/messages_en_US.properties
@@ -4831,3 +4831,4 @@ GreaterThan = GreaterThan
LessThan = LessThan
LessThanOrEqualTo = LessThanOrEqualTo
resource[%s]\ doesn't\ support\ zwatch\ return\ with\ clause = resource[{0}] doesn''t support zwatch return with clause
+snapshot[uuid\:%s,\ name\:%s]\ is\ in\ Deleting\ status,\ operation[%s]\ is\ rejected;\ allowed\ messages%s = snapshot[uuid:{0}, name:{1}] is in Deleting status, operation[{2}] is rejected; allowed messages{3}
diff --git a/conf/i18n/messages_zh_CN.properties b/conf/i18n/messages_zh_CN.properties
index fd25401733e..2bec8787075 100755
--- a/conf/i18n/messages_zh_CN.properties
+++ b/conf/i18n/messages_zh_CN.properties
@@ -4831,3 +4831,4 @@ GreaterThan = 大于
LessThan = 小于
LessThanOrEqualTo = 小于等于
resource[%s]\ doesn't\ support\ zwatch\ return\ with\ clause = 资源[{0}]不支持ZWatch Return WITH子句
+snapshot[uuid\:%s,\ name\:%s]\ is\ in\ Deleting\ status,\ operation[%s]\ is\ rejected;\ allowed\ messages%s = 快照[uuid:{0}, name:{1}]处于Deleting状态,拒绝操作[{2}];当前状态允许的消息列表{3}
diff --git a/conf/springConfigXml/volumeSnapshot.xml b/conf/springConfigXml/volumeSnapshot.xml
index f2ad0dc93a4..01eaa9c440c 100755
--- a/conf/springConfigXml/volumeSnapshot.xml
+++ b/conf/springConfigXml/volumeSnapshot.xml
@@ -58,4 +58,10 @@
+
+
+
+
+
+
diff --git a/header/src/main/java/org/zstack/header/storage/snapshot/VolumeSnapshotAO.java b/header/src/main/java/org/zstack/header/storage/snapshot/VolumeSnapshotAO.java
index 434cf362b81..a78ab76b83e 100755
--- a/header/src/main/java/org/zstack/header/storage/snapshot/VolumeSnapshotAO.java
+++ b/header/src/main/java/org/zstack/header/storage/snapshot/VolumeSnapshotAO.java
@@ -76,6 +76,9 @@ public class VolumeSnapshotAO extends ResourceVO implements ShadowEntity {
@Column
private Timestamp lastOpDate;
+ @Column
+ private Timestamp deletingSince;
+
@Transient
private VolumeSnapshotAO shadow;
@@ -152,6 +155,14 @@ public void setLastOpDate(Timestamp lastOpDate) {
this.lastOpDate = lastOpDate;
}
+ public Timestamp getDeletingSince() {
+ return deletingSince;
+ }
+
+ public void setDeletingSince(Timestamp deletingSince) {
+ this.deletingSince = deletingSince;
+ }
+
public String getVolumeUuid() {
return volumeUuid;
}
diff --git a/header/src/main/java/org/zstack/header/storage/snapshot/VolumeSnapshotAO_.java b/header/src/main/java/org/zstack/header/storage/snapshot/VolumeSnapshotAO_.java
index 20c2d132348..6dd1a0fb7b1 100755
--- a/header/src/main/java/org/zstack/header/storage/snapshot/VolumeSnapshotAO_.java
+++ b/header/src/main/java/org/zstack/header/storage/snapshot/VolumeSnapshotAO_.java
@@ -29,4 +29,5 @@ public class VolumeSnapshotAO_ extends ResourceVO_ {
public static volatile SingularAttribute status;
public static volatile SingularAttribute createDate;
public static volatile SingularAttribute lastOpDate;
+ public static volatile SingularAttribute deletingSince;
}
diff --git a/storage/src/main/java/org/zstack/storage/snapshot/SnapshotReconcileOnStart.java b/storage/src/main/java/org/zstack/storage/snapshot/SnapshotReconcileOnStart.java
new file mode 100644
index 00000000000..e827a07e4eb
--- /dev/null
+++ b/storage/src/main/java/org/zstack/storage/snapshot/SnapshotReconcileOnStart.java
@@ -0,0 +1,51 @@
+package org.zstack.storage.snapshot;
+
+import org.springframework.beans.factory.annotation.Autowire;
+import org.springframework.beans.factory.annotation.Configurable;
+import org.zstack.core.db.Q;
+import org.zstack.header.managementnode.ManagementNodeReadyExtensionPoint;
+import org.zstack.header.storage.snapshot.VolumeSnapshotStatus;
+import org.zstack.header.storage.snapshot.VolumeSnapshotVO;
+import org.zstack.header.storage.snapshot.VolumeSnapshotVO_;
+import org.zstack.utils.Utils;
+import org.zstack.utils.logging.CLogger;
+
+import java.util.List;
+
+/**
+ * ZSV-10538: passive reconcile for snapshots stuck in Deleting status.
+ *
+ * When MN crashes mid-deletion, the VolumeSnapshotVO row stays in Deleting (this is the
+ * persistent latch designed in docs/snapshot-single-delete/18-idempotency-design-status-deleting.md).
+ * On the next MN startup we scan for such rows and emit a WARN log per stuck snapshot so
+ * operators can decide whether to resume by re-issuing DeleteVolumeSnapshot{,Group}.
+ *
+ * NOTE: we deliberately do NOT auto-retry. SBLK host failover / lock conflicts make
+ * unattended retry risky; the safer choice is to let the operator re-issue the same API
+ * which routes through the idempotent resume path in VolumeSnapshotTreeBase.
+ */
+@Configurable(preConstruction = true, autowire = Autowire.BY_TYPE)
+public class SnapshotReconcileOnStart implements ManagementNodeReadyExtensionPoint {
+ private static final CLogger logger = Utils.getLogger(SnapshotReconcileOnStart.class);
+
+ @Override
+ public void managementNodeReady() {
+ List stuck = Q.New(VolumeSnapshotVO.class)
+ .eq(VolumeSnapshotVO_.status, VolumeSnapshotStatus.Deleting)
+ .list();
+ if (stuck.isEmpty()) {
+ return;
+ }
+ logger.warn(String.format(
+ "[ZSV-10538] Found %d snapshot(s) in Deleting status at MN startup. " +
+ "These were left mid-deletion by a previous MN session. No automatic action taken. " +
+ "Re-issue APIDeleteVolumeSnapshot{,Group} to resume; the API is idempotent.",
+ stuck.size()));
+ for (VolumeSnapshotVO v : stuck) {
+ logger.warn(String.format(
+ " - snapshot[uuid:%s, name:%s, volumeUuid:%s, treeUuid:%s, deletingSince:%s, installPath:%s]",
+ v.getUuid(), v.getName(), v.getVolumeUuid(), v.getTreeUuid(),
+ v.getDeletingSince(), v.getPrimaryStorageInstallPath()));
+ }
+ }
+}
diff --git a/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotErrors.java b/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotErrors.java
index 51e2673cd7a..6b18f820d05 100755
--- a/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotErrors.java
+++ b/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotErrors.java
@@ -5,7 +5,13 @@
public enum VolumeSnapshotErrors {
NOT_IN_CORRECT_STATE(1000),
FULL_SNAPSHOT_ERROR(1001),
- BATCH_DELETE_ERROR(1002);
+ BATCH_DELETE_ERROR(1002),
+ /* ZSV-10538: snapshot is currently being deleted by an in-flight task */
+ DELETING_IN_PROGRESS(1101),
+ /* ZSV-10538: previous deletion did not finish; resume by retrying the same API */
+ DELETING_RETRY_HINT(1102),
+ /* ZSV-10538: operation rejected because snapshot is in Deleting status */
+ OPERATION_REJECTED_DURING_DELETING(1103);
private String code;
diff --git a/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java b/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java
index beb8b044d1a..501cca9c56e 100755
--- a/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java
+++ b/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java
@@ -111,8 +111,15 @@ public class VolumeSnapshotTreeBase {
APIShrinkVolumeSnapshotMsg.class.getName()
);
+ // ZSV-10538: Deleting allows the idempotent delete API (resume),
+ // the internal deletion msg, and read-only query APIs.
+ // All other writes (Revert / CreateChild / CreateImage / Backup ...) are rejected
+ // by the whitelist with VolumeSnapshotErrors.OPERATION_REJECTED_DURING_DELETING.
allowedStatus.addState(VolumeSnapshotStatus.Deleting,
- VolumeSnapshotDeletionMsg.class.getName());
+ VolumeSnapshotDeletionMsg.class.getName(),
+ APIDeleteVolumeSnapshotMsg.class.getName(),
+ APIQueryVolumeSnapshotMsg.class.getName(),
+ APIGetVolumeSnapshotSizeMsg.class.getName());
allowedStatus.addState(VolumeSnapshotStatus.Creating,
VolumeSnapshotDeletionMsg.class.getName());
@@ -149,11 +156,17 @@ public VolumeSnapshotTreeBase(VolumeSnapshotVO vo, boolean syncOnVolume) {
private ErrorCode isOperationAllowed(Message msg) {
if (allowedStatus.isOperationAllowed(msg.getClass().getName(), currentRoot.getStatus().toString())) {
return null;
- } else {
- return err(VolumeSnapshotErrors.NOT_IN_CORRECT_STATE,
- "snapshot[uuid:%s, name:%s]'s status[%s] is not allowed for message[%s], allowed status%s",
- currentRoot.getUuid(), currentRoot.getName(), currentRoot.getStatus(), msg.getClass().getName(), allowedStatus.getStatesForOperation(msg.getClass().getName()));
}
+ // ZSV-10538: surface a dedicated code when the rejection is due to Deleting status
+ if (currentRoot.getStatus() == VolumeSnapshotStatus.Deleting) {
+ return err(VolumeSnapshotErrors.OPERATION_REJECTED_DURING_DELETING,
+ "snapshot[uuid:%s, name:%s] is in Deleting status, operation[%s] is rejected; allowed messages%s",
+ currentRoot.getUuid(), currentRoot.getName(), msg.getClass().getName(),
+ allowedStatus.getStatesForOperation(msg.getClass().getName()));
+ }
+ return err(VolumeSnapshotErrors.NOT_IN_CORRECT_STATE,
+ "snapshot[uuid:%s, name:%s]'s status[%s] is not allowed for message[%s], allowed status%s",
+ currentRoot.getUuid(), currentRoot.getName(), currentRoot.getStatus(), msg.getClass().getName(), allowedStatus.getStatesForOperation(msg.getClass().getName()));
}
private void refreshVO() {
@@ -826,6 +839,38 @@ public void handle(ErrorCode errCode, Map data) {
}
private void deleteSingleFlows() {
+ // ZSV-10538: latch status=Deleting BEFORE any physical operation as the persistent idempotency signal.
+ // NoRollbackFlow on purpose: if anything below fails, the row stays in Deleting so the user
+ // can re-issue the same API and the API entry routes through the resume branch (see handle(APIDeleteVolumeSnapshotMsg)).
+ flow(new NoRollbackFlow() {
+ String __name__ = "change-snapshot-status-to-Deleting";
+
+ @Override
+ public void run(FlowTrigger trigger, Map data) {
+ if (currentRoot.getStatus() == VolumeSnapshotStatus.Deleting) {
+ // idempotent: a previous interrupted attempt already latched the flag
+ trigger.next();
+ return;
+ }
+ new SQLBatch() {
+ @Override
+ protected void scripts() {
+ int affected = sql("update VolumeSnapshotVO s set s.status = :st, s.deletingSince = :ts " +
+ "where s.uuid = :uuid and s.status <> :st")
+ .param("st", VolumeSnapshotStatus.Deleting)
+ .param("ts", new java.sql.Timestamp(System.currentTimeMillis()))
+ .param("uuid", currentRoot.getUuid())
+ .execute();
+ if (affected > 0) {
+ currentRoot.setStatus(VolumeSnapshotStatus.Deleting);
+ currentRoot.setDeletingSince(new java.sql.Timestamp(System.currentTimeMillis()));
+ }
+ }
+ }.execute();
+ trigger.next();
+ }
+ });
+
flow(new NoRollbackFlow() {
VmInstanceState vmState;
String srcSnapshotParentPath;
@@ -873,11 +918,30 @@ public void fail(ErrorCode errorCode) {
}
private void stepDelete(Completion completion) {
+ // ZSV-10538: re-read vmState every round so a VM that starts mid-flight is observed (R7)
+ if (volume.getVmInstanceUuid() != null) {
+ VmInstanceState fresh = Q.New(VmInstanceVO.class)
+ .eq(VmInstanceVO_.uuid, volume.getVmInstanceUuid())
+ .select(VmInstanceVO_.state).findValue();
+ if (fresh != null) {
+ vmState = fresh;
+ }
+ }
+
List vos = Q.New(VolumeSnapshotVO.class).eq(VolumeSnapshotVO_.treeUuid, currentRoot.getTreeUuid()).list();
boolean current = Q.New(VolumeSnapshotTreeVO.class).eq(VolumeSnapshotTreeVO_.uuid, currentRoot.getTreeUuid())
.select(VolumeSnapshotTreeVO_.current).findValue();
VolumeTree volumeTree = VolumeTree.fromVOs(vos, current, VolumeInventory.valueOf(volume));
- List children = volumeTree.getSnapshotLeaf(currentRoot.getUuid()).getChildren();
+ // ZSV-10538: currentRoot may have been hardDeleted by a previous (interrupted) run.
+ // When the row is gone, the snapshot is already removed → idempotent no-op.
+ VolumeTree.VolumeSnapshotLeaf rootLeaf = volumeTree.getSnapshotLeaf(currentRoot.getUuid());
+ if (rootLeaf == null) {
+ logger.debug(String.format("stepDelete: snapshot[uuid:%s] already removed from tree[uuid:%s], idempotent no-op",
+ currentRoot.getUuid(), currentRoot.getTreeUuid()));
+ completion.success();
+ return;
+ }
+ List children = rootLeaf.getChildren();
if (children.isEmpty()) {
deleteVolumeSnapshotAndSyncVolumeSize(completion);
@@ -2990,6 +3054,35 @@ public void handle(ErrorCode errCode, Map data) {
private void handle(final APIDeleteVolumeSnapshotMsg msg) {
final APIDeleteVolumeSnapshotEvent evt = new APIDeleteVolumeSnapshotEvent(msg.getId());
+ // ZSV-10538: idempotent entry routing based on persistent status.
+ // 1) row gone -> success (already fully deleted by a prior attempt)
+ // 2) status=Deleted -> success (terminal state; just publish)
+ // 3) status=Deleting -> log a resume hint and fall through to the regular
+ // deletion path; the change-status flow detects Deleting
+ // and short-circuits, then stepDelete re-reads DB each
+ // round to converge from whatever the interrupted step left.
+ // 4) status=Ready -> normal first-time deletion path
+ VolumeSnapshotVO latest = dbf.findByUuid(msg.getUuid(), VolumeSnapshotVO.class);
+ if (latest == null) {
+ logger.debug(String.format("APIDeleteVolumeSnapshotMsg: snapshot[uuid:%s] row not found, returning success idempotently",
+ msg.getUuid()));
+ bus.publish(evt);
+ return;
+ }
+ if (latest.getStatus() == VolumeSnapshotStatus.Deleted) {
+ logger.debug(String.format("APIDeleteVolumeSnapshotMsg: snapshot[uuid:%s] already in Deleted state, returning success idempotently",
+ msg.getUuid()));
+ bus.publish(evt);
+ return;
+ }
+ if (latest.getStatus() == VolumeSnapshotStatus.Deleting) {
+ logger.warn(String.format("APIDeleteVolumeSnapshotMsg: snapshot[uuid:%s, name:%s] is in Deleting status (since %s); " +
+ "resuming previously interrupted deletion (errorCode hint: %s)",
+ latest.getUuid(), latest.getName(), latest.getDeletingSince(),
+ VolumeSnapshotErrors.DELETING_RETRY_HINT));
+ currentRoot = latest;
+ }
+
deleteVolumeSnapshot(msg, new Completion(msg) {
@Override
public void success() {
diff --git a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java
index fcb6907ca9e..3894d7b26e3 100644
--- a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java
+++ b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java
@@ -27,6 +27,7 @@
import org.zstack.header.message.NeedReplyMessage;
import org.zstack.header.storage.snapshot.DeleteVolumeSnapshotMsg;
import org.zstack.header.storage.snapshot.VolumeSnapshotConstant;
+import org.zstack.header.storage.snapshot.VolumeSnapshotStatus;
import org.zstack.header.storage.snapshot.VolumeSnapshotVO;
import org.zstack.header.storage.snapshot.group.*;
import org.zstack.header.tpm.entity.TpmVO;
@@ -225,6 +226,17 @@ private void handle(DeleteVolumeSnapshotGroupInnerMsg msg) {
logger.debug(String.format("skip snapshots not belong to origin vm[uuid:%s]", self.getVmInstanceUuid()));
}
+ // ZSV-10538: surface resume hint when one or more members are already in Deleting
+ // (left over from a previous interrupted group-delete). Per-snapshot routing in
+ // VolumeSnapshotTreeBase.handle(APIDeleteVolumeSnapshotMsg) / DeleteVolumeSnapshotMsg
+ // handles the actual resume; here we only log so operators understand what is happening.
+ long deleting = snapshots.stream().filter(s -> s.getStatus() == VolumeSnapshotStatus.Deleting).count();
+ if (deleting > 0) {
+ logger.warn(String.format("snapshot group[uuid:%s, name:%s]: %d/%d members are in Deleting status; " +
+ "this delete will resume the previously interrupted group deletion",
+ self.getUuid(), self.getName(), deleting, snapshots.size()));
+ }
+
SimpleFlowChain.of("delete-volume-snapshot-group")
.then("delete-volume-snapshots", (trigger) ->
new While<>(snapshots).step((snapshot, compl) -> {