diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py
index 3fa4df738784..b973e98b995b 100644
--- a/src/diffusers/models/transformers/auraflow_transformer_2d.py
+++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py
@@ -76,6 +76,19 @@ def pe_selection_index_based_on_dim(self, h, w):
         h_p, w_p = h // self.patch_size, w // self.patch_size
         h_max, w_max = int(self.pos_embed_max_size**0.5), int(self.pos_embed_max_size**0.5)
 
+        # Guard against inputs larger than the pretrained positional embedding grid:
+        # without this check the centered crop produces negative / out-of-range
+        # indices, which silently corrupt the output on CPU and trigger a
+        # `vectorized_gather_kernel` device-side assert on CUDA that tears down
+        # the entire process (see #12656).
+        if h_p > h_max or w_p > w_max:
+            raise ValueError(
+                f"Input latent size ({h_p}, {w_p}) exceeds the pretrained positional "
+                f"embedding grid ({h_max}, {w_max}). The positional embedding supports "
+                f"latents up to ({h_max * self.patch_size}, {w_max * self.patch_size}) "
+                f"pixels at patch_size={self.patch_size}."
+            )
+
         # Calculate the top-left corner indices for the centered patch grid
         starth = h_max // 2 - h_p // 2
         startw = w_max // 2 - w_p // 2