Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions conf/db/upgrade/V4.8.0.8__schema.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- ZSV-10538 snapshot commit-delete idempotency: latch column
CALL ADD_COLUMN('VolumeSnapshotVO', 'deletingSince', 'TIMESTAMP', 1, NULL);
1 change: 1 addition & 0 deletions conf/i18n/messages_en_US.properties
Original file line number Diff line number Diff line change
Expand Up @@ -4831,3 +4831,4 @@ GreaterThan = GreaterThan
LessThan = LessThan
LessThanOrEqualTo = LessThanOrEqualTo
resource[%s]\ doesn't\ support\ zwatch\ return\ with\ clause = resource[{0}] doesn''t support zwatch return with clause
snapshot[uuid\:%s,\ name\:%s]\ is\ in\ Deleting\ status,\ operation[%s]\ is\ rejected;\ allowed\ messages%s = snapshot[uuid:{0}, name:{1}] is in Deleting status, operation[{2}] is rejected; allowed messages{3}
1 change: 1 addition & 0 deletions conf/i18n/messages_zh_CN.properties
Original file line number Diff line number Diff line change
Expand Up @@ -4831,3 +4831,4 @@ GreaterThan = 大于
LessThan = 小于
LessThanOrEqualTo = 小于等于
resource[%s]\ doesn't\ support\ zwatch\ return\ with\ clause = 资源[{0}]不支持ZWatch Return WITH子句
snapshot[uuid\:%s,\ name\:%s]\ is\ in\ Deleting\ status,\ operation[%s]\ is\ rejected;\ allowed\ messages%s = 快照[uuid:{0}, name:{1}]处于Deleting状态,拒绝操作[{2}];当前状态允许的消息列表{3}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

建议在“消息列表”前补充分隔符,避免占位符直连文本。

当前中文文案为“当前状态允许的消息列表{3}”,建议改为“当前状态允许的消息列表:{3}”,提升用户阅读体验。

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@conf/i18n/messages_zh_CN.properties` at line 4834, Update the translation
value for the key starting with "snapshot[uuid\:%s,\ name\:%s]\ is\ in\
Deleting\ status,\ operation[%s]\ is\ rejected;\ allowed\ messages%s" so the
Chinese text inserts a separator before the placeholder {3}; change
"当前状态允许的消息列表{3}" to "当前状态允许的消息列表:{3}" to avoid the placeholder being
concatenated with the preceding text.

6 changes: 6 additions & 0 deletions conf/springConfigXml/volumeSnapshot.xml
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,10 @@
<zstack:extension interface="org.zstack.storage.snapshot.group.MemorySnapshotGroupReferenceFactory" />
</zstack:plugin>
</bean>

<bean id="SnapshotReconcileOnStart" class="org.zstack.storage.snapshot.SnapshotReconcileOnStart">
<zstack:plugin>
<zstack:extension interface="org.zstack.header.managementnode.ManagementNodeReadyExtensionPoint"/>
</zstack:plugin>
</bean>
</beans>
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ public class VolumeSnapshotAO extends ResourceVO implements ShadowEntity {
@Column
private Timestamp lastOpDate;

@Column
private Timestamp deletingSince;

@Transient
private VolumeSnapshotAO shadow;

Expand Down Expand Up @@ -152,6 +155,14 @@ public void setLastOpDate(Timestamp lastOpDate) {
this.lastOpDate = lastOpDate;
}

public Timestamp getDeletingSince() {
return deletingSince;
}

public void setDeletingSince(Timestamp deletingSince) {
this.deletingSince = deletingSince;
}

public String getVolumeUuid() {
return volumeUuid;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,5 @@ public class VolumeSnapshotAO_ extends ResourceVO_ {
public static volatile SingularAttribute<VolumeSnapshotAO, VolumeSnapshotStatus> status;
public static volatile SingularAttribute<VolumeSnapshotAO, Timestamp> createDate;
public static volatile SingularAttribute<VolumeSnapshotAO, Timestamp> lastOpDate;
public static volatile SingularAttribute<VolumeSnapshotAO, Timestamp> deletingSince;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package org.zstack.storage.snapshot;

import org.springframework.beans.factory.annotation.Autowire;
import org.springframework.beans.factory.annotation.Configurable;
import org.zstack.core.db.Q;
import org.zstack.header.managementnode.ManagementNodeReadyExtensionPoint;
import org.zstack.header.storage.snapshot.VolumeSnapshotStatus;
import org.zstack.header.storage.snapshot.VolumeSnapshotVO;
import org.zstack.header.storage.snapshot.VolumeSnapshotVO_;
import org.zstack.utils.Utils;
import org.zstack.utils.logging.CLogger;

import java.util.List;

/**
* ZSV-10538: passive reconcile for snapshots stuck in Deleting status.
*
* When MN crashes mid-deletion, the VolumeSnapshotVO row stays in Deleting (this is the
* persistent latch designed in docs/snapshot-single-delete/18-idempotency-design-status-deleting.md).
* On the next MN startup we scan for such rows and emit a WARN log per stuck snapshot so
* operators can decide whether to resume by re-issuing DeleteVolumeSnapshot{,Group}.
*
* NOTE: we deliberately do NOT auto-retry. SBLK host failover / lock conflicts make
* unattended retry risky; the safer choice is to let the operator re-issue the same API
* which routes through the idempotent resume path in VolumeSnapshotTreeBase.
*/
@Configurable(preConstruction = true, autowire = Autowire.BY_TYPE)
public class SnapshotReconcileOnStart implements ManagementNodeReadyExtensionPoint {
private static final CLogger logger = Utils.getLogger(SnapshotReconcileOnStart.class);

@Override
public void managementNodeReady() {
List<VolumeSnapshotVO> stuck = Q.New(VolumeSnapshotVO.class)
.eq(VolumeSnapshotVO_.status, VolumeSnapshotStatus.Deleting)
.list();
if (stuck.isEmpty()) {
return;
}
logger.warn(String.format(
"[ZSV-10538] Found %d snapshot(s) in Deleting status at MN startup. " +
"These were left mid-deletion by a previous MN session. No automatic action taken. " +
"Re-issue APIDeleteVolumeSnapshot{,Group} to resume; the API is idempotent.",
stuck.size()));
for (VolumeSnapshotVO v : stuck) {
logger.warn(String.format(
" - snapshot[uuid:%s, name:%s, volumeUuid:%s, treeUuid:%s, deletingSince:%s, installPath:%s]",
v.getUuid(), v.getName(), v.getVolumeUuid(), v.getTreeUuid(),
v.getDeletingSince(), v.getPrimaryStorageInstallPath()));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@
public enum VolumeSnapshotErrors {
NOT_IN_CORRECT_STATE(1000),
FULL_SNAPSHOT_ERROR(1001),
BATCH_DELETE_ERROR(1002);
BATCH_DELETE_ERROR(1002),
/* ZSV-10538: snapshot is currently being deleted by an in-flight task */
DELETING_IN_PROGRESS(1101),
/* ZSV-10538: previous deletion did not finish; resume by retrying the same API */
DELETING_RETRY_HINT(1102),
/* ZSV-10538: operation rejected because snapshot is in Deleting status */
OPERATION_REJECTED_DURING_DELETING(1103);

private String code;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,15 @@ public class VolumeSnapshotTreeBase {
APIShrinkVolumeSnapshotMsg.class.getName()
);

// ZSV-10538: Deleting allows the idempotent delete API (resume),
// the internal deletion msg, and read-only query APIs.
// All other writes (Revert / CreateChild / CreateImage / Backup ...) are rejected
// by the whitelist with VolumeSnapshotErrors.OPERATION_REJECTED_DURING_DELETING.
allowedStatus.addState(VolumeSnapshotStatus.Deleting,
VolumeSnapshotDeletionMsg.class.getName());
VolumeSnapshotDeletionMsg.class.getName(),
APIDeleteVolumeSnapshotMsg.class.getName(),
APIQueryVolumeSnapshotMsg.class.getName(),
APIGetVolumeSnapshotSizeMsg.class.getName());

allowedStatus.addState(VolumeSnapshotStatus.Creating,
VolumeSnapshotDeletionMsg.class.getName());
Expand Down Expand Up @@ -149,11 +156,17 @@ public VolumeSnapshotTreeBase(VolumeSnapshotVO vo, boolean syncOnVolume) {
private ErrorCode isOperationAllowed(Message msg) {
if (allowedStatus.isOperationAllowed(msg.getClass().getName(), currentRoot.getStatus().toString())) {
return null;
} else {
return err(VolumeSnapshotErrors.NOT_IN_CORRECT_STATE,
"snapshot[uuid:%s, name:%s]'s status[%s] is not allowed for message[%s], allowed status%s",
currentRoot.getUuid(), currentRoot.getName(), currentRoot.getStatus(), msg.getClass().getName(), allowedStatus.getStatesForOperation(msg.getClass().getName()));
}
// ZSV-10538: surface a dedicated code when the rejection is due to Deleting status
if (currentRoot.getStatus() == VolumeSnapshotStatus.Deleting) {
return err(VolumeSnapshotErrors.OPERATION_REJECTED_DURING_DELETING,
"snapshot[uuid:%s, name:%s] is in Deleting status, operation[%s] is rejected; allowed messages%s",
currentRoot.getUuid(), currentRoot.getName(), msg.getClass().getName(),
allowedStatus.getStatesForOperation(msg.getClass().getName()));
}
return err(VolumeSnapshotErrors.NOT_IN_CORRECT_STATE,
"snapshot[uuid:%s, name:%s]'s status[%s] is not allowed for message[%s], allowed status%s",
currentRoot.getUuid(), currentRoot.getName(), currentRoot.getStatus(), msg.getClass().getName(), allowedStatus.getStatesForOperation(msg.getClass().getName()));
}

private void refreshVO() {
Expand Down Expand Up @@ -826,6 +839,38 @@ public void handle(ErrorCode errCode, Map data) {
}

private void deleteSingleFlows() {
// ZSV-10538: latch status=Deleting BEFORE any physical operation as the persistent idempotency signal.
// NoRollbackFlow on purpose: if anything below fails, the row stays in Deleting so the user
// can re-issue the same API and the API entry routes through the resume branch (see handle(APIDeleteVolumeSnapshotMsg)).
flow(new NoRollbackFlow() {
String __name__ = "change-snapshot-status-to-Deleting";

@Override
public void run(FlowTrigger trigger, Map data) {
if (currentRoot.getStatus() == VolumeSnapshotStatus.Deleting) {
// idempotent: a previous interrupted attempt already latched the flag
trigger.next();
return;
}
new SQLBatch() {
@Override
protected void scripts() {
int affected = sql("update VolumeSnapshotVO s set s.status = :st, s.deletingSince = :ts " +
"where s.uuid = :uuid and s.status <> :st")
.param("st", VolumeSnapshotStatus.Deleting)
.param("ts", new java.sql.Timestamp(System.currentTimeMillis()))
.param("uuid", currentRoot.getUuid())
.execute();
if (affected > 0) {
currentRoot.setStatus(VolumeSnapshotStatus.Deleting);
currentRoot.setDeletingSince(new java.sql.Timestamp(System.currentTimeMillis()));
}
}
}.execute();
trigger.next();
}
});

flow(new NoRollbackFlow() {
VmInstanceState vmState;
String srcSnapshotParentPath;
Expand Down Expand Up @@ -873,11 +918,30 @@ public void fail(ErrorCode errorCode) {
}

private void stepDelete(Completion completion) {
// ZSV-10538: re-read vmState every round so a VM that starts mid-flight is observed (R7)
if (volume.getVmInstanceUuid() != null) {
VmInstanceState fresh = Q.New(VmInstanceVO.class)
.eq(VmInstanceVO_.uuid, volume.getVmInstanceUuid())
.select(VmInstanceVO_.state).findValue();
if (fresh != null) {
vmState = fresh;
}
}

List<VolumeSnapshotVO> vos = Q.New(VolumeSnapshotVO.class).eq(VolumeSnapshotVO_.treeUuid, currentRoot.getTreeUuid()).list();
boolean current = Q.New(VolumeSnapshotTreeVO.class).eq(VolumeSnapshotTreeVO_.uuid, currentRoot.getTreeUuid())
.select(VolumeSnapshotTreeVO_.current).findValue();
VolumeTree volumeTree = VolumeTree.fromVOs(vos, current, VolumeInventory.valueOf(volume));
List<VolumeTree.VolumeSnapshotLeaf> children = volumeTree.getSnapshotLeaf(currentRoot.getUuid()).getChildren();
// ZSV-10538: currentRoot may have been hardDeleted by a previous (interrupted) run.
// When the row is gone, the snapshot is already removed → idempotent no-op.
VolumeTree.VolumeSnapshotLeaf rootLeaf = volumeTree.getSnapshotLeaf(currentRoot.getUuid());
if (rootLeaf == null) {
logger.debug(String.format("stepDelete: snapshot[uuid:%s] already removed from tree[uuid:%s], idempotent no-op",
currentRoot.getUuid(), currentRoot.getTreeUuid()));
completion.success();
return;
}
List<VolumeTree.VolumeSnapshotLeaf> children = rootLeaf.getChildren();

if (children.isEmpty()) {
deleteVolumeSnapshotAndSyncVolumeSize(completion);
Expand Down Expand Up @@ -2990,6 +3054,35 @@ public void handle(ErrorCode errCode, Map data) {
private void handle(final APIDeleteVolumeSnapshotMsg msg) {
final APIDeleteVolumeSnapshotEvent evt = new APIDeleteVolumeSnapshotEvent(msg.getId());

// ZSV-10538: idempotent entry routing based on persistent status.
// 1) row gone -> success (already fully deleted by a prior attempt)
// 2) status=Deleted -> success (terminal state; just publish)
// 3) status=Deleting -> log a resume hint and fall through to the regular
// deletion path; the change-status flow detects Deleting
// and short-circuits, then stepDelete re-reads DB each
// round to converge from whatever the interrupted step left.
// 4) status=Ready -> normal first-time deletion path
VolumeSnapshotVO latest = dbf.findByUuid(msg.getUuid(), VolumeSnapshotVO.class);
if (latest == null) {
logger.debug(String.format("APIDeleteVolumeSnapshotMsg: snapshot[uuid:%s] row not found, returning success idempotently",
msg.getUuid()));
bus.publish(evt);
return;
}
if (latest.getStatus() == VolumeSnapshotStatus.Deleted) {
logger.debug(String.format("APIDeleteVolumeSnapshotMsg: snapshot[uuid:%s] already in Deleted state, returning success idempotently",
msg.getUuid()));
bus.publish(evt);
return;
}
if (latest.getStatus() == VolumeSnapshotStatus.Deleting) {
logger.warn(String.format("APIDeleteVolumeSnapshotMsg: snapshot[uuid:%s, name:%s] is in Deleting status (since %s); " +
"resuming previously interrupted deletion (errorCode hint: %s)",
latest.getUuid(), latest.getName(), latest.getDeletingSince(),
VolumeSnapshotErrors.DELETING_RETRY_HINT));
currentRoot = latest;
}

deleteVolumeSnapshot(msg, new Completion(msg) {
@Override
public void success() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.zstack.header.message.NeedReplyMessage;
import org.zstack.header.storage.snapshot.DeleteVolumeSnapshotMsg;
import org.zstack.header.storage.snapshot.VolumeSnapshotConstant;
import org.zstack.header.storage.snapshot.VolumeSnapshotStatus;
import org.zstack.header.storage.snapshot.VolumeSnapshotVO;
import org.zstack.header.storage.snapshot.group.*;
import org.zstack.header.tpm.entity.TpmVO;
Expand Down Expand Up @@ -225,6 +226,17 @@ private void handle(DeleteVolumeSnapshotGroupInnerMsg msg) {
logger.debug(String.format("skip snapshots not belong to origin vm[uuid:%s]", self.getVmInstanceUuid()));
}

// ZSV-10538: surface resume hint when one or more members are already in Deleting
// (left over from a previous interrupted group-delete). Per-snapshot routing in
// VolumeSnapshotTreeBase.handle(APIDeleteVolumeSnapshotMsg) / DeleteVolumeSnapshotMsg
// handles the actual resume; here we only log so operators understand what is happening.
long deleting = snapshots.stream().filter(s -> s.getStatus() == VolumeSnapshotStatus.Deleting).count();
if (deleting > 0) {
logger.warn(String.format("snapshot group[uuid:%s, name:%s]: %d/%d members are in Deleting status; " +
"this delete will resume the previously interrupted group deletion",
self.getUuid(), self.getName(), deleting, snapshots.size()));
}

SimpleFlowChain.of("delete-volume-snapshot-group")
.then("delete-volume-snapshots", (trigger) ->
new While<>(snapshots).step((snapshot, compl) -> {
Expand Down