Skip to content
21 changes: 21 additions & 0 deletions lib/Transforms/Scalar/GVN.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,17 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
StoredVal->getType()->isArrayTy())
return false;

// HLSL Change Begin - Reject types where padded and primitive sizes differ.
// Coercion would create bitcasts between mismatched sizes.
Type *StoredValTy = StoredVal->getType();
uint64_t StoredPrimBits = StoredValTy->getPrimitiveSizeInBits();
uint64_t LoadPrimBits = LoadTy->getPrimitiveSizeInBits();
if (StoredPrimBits && DL.getTypeSizeInBits(StoredValTy) != StoredPrimBits)
return false;
if (LoadPrimBits && DL.getTypeSizeInBits(LoadTy) != LoadPrimBits)
return false;
// HLSL Change End

// The store has to be at least as big as the load.
if (DL.getTypeSizeInBits(StoredVal->getType()) <
DL.getTypeSizeInBits(LoadTy))
Expand Down Expand Up @@ -1942,6 +1953,16 @@ bool GVN::processLoad(LoadInst *L) {
if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
Value *StoredVal = DepSI->getValueOperand();

// HLSL Change Begin - Defense-in-depth: skip cross-type forwarding for
// padded types (e.g., min precision vectors).
if (StoredVal->getType() != L->getType()) {
Type *StoredTy = StoredVal->getType();
uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits();
if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits)
return false;
}
// HLSL Change End

// The store and load are to a must-aliased pointer, but they may not
// actually have the same type. See if we know how to reuse the stored
// value (depending on its type).
Expand Down
14 changes: 10 additions & 4 deletions lib/Transforms/Scalar/SROA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1671,7 +1671,10 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
// extremely poorly defined currently. The long-term goal is to remove GEPing
// over a vector from the IR completely.
if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
// HLSL Change: Use alloc size for element stride to account for padded
// types.
unsigned ElementSizeInBits =
DL.getTypeAllocSizeInBits(VecTy->getScalarType());
if (ElementSizeInBits % 8 != 0) {
// GEPs over non-multiple of 8 size vector elements are invalid.
return nullptr;
Expand Down Expand Up @@ -2134,7 +2137,8 @@ static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P,

// Try each vector type, and return the one which works.
auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
// HLSL Change: Use alloc size to match GEP offset stride for padded types.
uint64_t ElementSize = DL.getTypeAllocSizeInBits(VTy->getElementType());

// While the definition of LLVM vectors is bitpacked, we don't support sizes
// that aren't byte sized.
Expand Down Expand Up @@ -2492,12 +2496,14 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
: nullptr),
VecTy(PromotableVecTy),
ElementTy(VecTy ? VecTy->getElementType() : nullptr),
ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
// HLSL Change: Use alloc size to match GEP offset stride for padded
// types.
ElementSize(VecTy ? DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0),
BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
IRB(NewAI.getContext(), ConstantFolder()) {
if (VecTy) {
assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
assert((DL.getTypeAllocSizeInBits(ElementTy) % 8) == 0 && // HLSL Change
"Only multiple-of-8 sized vector elements are viable");
++NumVectorized;
}
Expand Down
154 changes: 154 additions & 0 deletions test/Transforms/GVN/min-precision-padding.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
; RUN: opt < %s -basicaa -gvn -S | FileCheck %s

; Regression test for min precision vector GVN miscompilation.
; DXC's data layout pads i16 to 32 bits (i16:32). GVN must not:
; 1. Coerce padded vector types via bitcast (CanCoerceMustAliasedValueToLoad)
; 2. Forward a zeroinitializer store past partial element stores (processLoad)
;
; Without the fix, GVN would forward the zeroinitializer vector load, producing
; incorrect all-zero results for elements that were individually written.

target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
target triple = "dxil-ms-dx"

; Test 1: GVN must not forward zeroinitializer past element store for <3 x i16>.
; The store of zeroinitializer to %dst is followed by an element store to
; %dst[0], then a vector load of %dst. GVN must not replace the vector load
; with the zeroinitializer.

; CHECK-LABEL: @test_no_forward_i16_vec3
; CHECK: store <3 x i16> zeroinitializer
; CHECK: store i16 %val
; The vector load must survive — GVN must not replace it with zeroinitializer.
; CHECK: %result = load <3 x i16>
; CHECK: ret <3 x i16> %result
; With i16 padded to 32 bits (i16:32 in the datalayout), <3 x i16> has a
; primitive size of 48 bits but a larger padded in-memory size.
define <3 x i16> @test_no_forward_i16_vec3(i16 %val) {
entry:
%dst = alloca <3 x i16>, align 4
; Zero the whole vector, then overwrite lane 0 with %val.
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%elem0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %val, i16* %elem0, align 4
; Must observe %val in lane 0 — GVN may not forward the zero vector here.
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 2: Same pattern with <3 x half> (f16:32 padding).

; CHECK-LABEL: @test_no_forward_f16_vec3
; CHECK: store <3 x half> zeroinitializer
; CHECK: store half %val
; CHECK: %result = load <3 x half>
; CHECK: ret <3 x half> %result
; Same pattern as the i16 case, exercising the f16:32 padding spec.
define <3 x half> @test_no_forward_f16_vec3(half %val) {
entry:
%dst = alloca <3 x half>, align 4
; Zero the vector, then overwrite lane 0; the reload must keep %val.
store <3 x half> zeroinitializer, <3 x half>* %dst, align 4
%elem0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0
store half %val, half* %elem0, align 4
%result = load <3 x half>, <3 x half>* %dst, align 4
ret <3 x half> %result
}

; Test 3: Multiple element stores — all must survive.
; Stores to elements 0, 1, 2 of a <3 x i16> vector after zeroinitializer.

; CHECK-LABEL: @test_no_forward_i16_vec3_all_elems
; CHECK: store <3 x i16> zeroinitializer
; CHECK: store i16 %v0
; CHECK: store i16 %v1
; CHECK: store i16 %v2
; CHECK: %result = load <3 x i16>
; CHECK: ret <3 x i16> %result
; Every lane is individually rewritten after the zero store, so the final
; load depends on all three scalar stores — none may be folded away.
define <3 x i16> @test_no_forward_i16_vec3_all_elems(i16 %v0, i16 %v1, i16 %v2) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 4: Coercion rejection — store a <3 x i16> vector, load as different type.
; GVN must not attempt bitcast coercion on padded types.
; If coercion happened, the load would be eliminated and replaced with a bitcast.

; CHECK-LABEL: @test_no_coerce_i16_vec3
; CHECK: store <3 x i16>
; CHECK: load i96
; CHECK-NOT: bitcast
; CHECK: ret
define i96 @test_no_coerce_i16_vec3(<3 x i16> %v) {
entry:
%ptr = alloca <3 x i16>, align 4
store <3 x i16> %v, <3 x i16>* %ptr, align 4
; Reload through a differently-typed pointer: this forces GVN's coercion
; path, which must bail out rather than bitcast the padded vector.
%iptr = bitcast <3 x i16>* %ptr to i96*
%result = load i96, i96* %iptr, align 4
ret i96 %result
}

; Test 5: Long vector variant — <5 x i16> (exceeds 4-element native size).

; CHECK-LABEL: @test_no_forward_i16_vec5
; CHECK: store <5 x i16> zeroinitializer
; CHECK: store i16 %val
; CHECK: %result = load <5 x i16>
; CHECK: ret <5 x i16> %result
; Long-vector variant (<5 x i16>, wider than 4 elements) of test 1:
; zero store, lane-0 overwrite, reload that must keep %val.
define <5 x i16> @test_no_forward_i16_vec5(i16 %val) {
entry:
%dst = alloca <5 x i16>, align 4
store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4
%elem0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0
store i16 %val, i16* %elem0, align 4
%result = load <5 x i16>, <5 x i16>* %dst, align 4
ret <5 x i16> %result
}

; Test 6: Long vector variant — <8 x half>.

; CHECK-LABEL: @test_no_forward_f16_vec8
; CHECK: store <8 x half> zeroinitializer
; CHECK: store half %val
; CHECK: %result = load <8 x half>
; CHECK: ret <8 x half> %result
; Long-vector variant (<8 x half>) of the f16 case: zero store, lane-0
; overwrite, reload that must keep %val.
define <8 x half> @test_no_forward_f16_vec8(half %val) {
entry:
%dst = alloca <8 x half>, align 4
store <8 x half> zeroinitializer, <8 x half>* %dst, align 4
%elem0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0
store half %val, half* %elem0, align 4
%result = load <8 x half>, <8 x half>* %dst, align 4
ret <8 x half> %result
}

; Test 7: Same-type store-to-load forwarding must still work for padded types.
; GVN should forward %v directly — no intervening writes, same type.

; CHECK-LABEL: @test_same_type_forward_i16_vec3
; The load should be eliminated and %v returned directly.
; CHECK-NOT: load
; CHECK: ret <3 x i16> %v
; Negative-space check: with identical store/load types and no intervening
; writes, forwarding is still legal — GVN should eliminate this load.
define <3 x i16> @test_same_type_forward_i16_vec3(<3 x i16> %v) {
entry:
%ptr = alloca <3 x i16>, align 4
store <3 x i16> %v, <3 x i16>* %ptr, align 4
%result = load <3 x i16>, <3 x i16>* %ptr, align 4
ret <3 x i16> %result
}

; Test 8: Same-type forwarding for <3 x half>.

; CHECK-LABEL: @test_same_type_forward_f16_vec3
; CHECK-NOT: load
; CHECK: ret <3 x half> %v
; Same-type forwarding must also keep working for padded half vectors.
define <3 x half> @test_same_type_forward_f16_vec3(<3 x half> %v) {
entry:
%ptr = alloca <3 x half>, align 4
store <3 x half> %v, <3 x half>* %ptr, align 4
%result = load <3 x half>, <3 x half>* %ptr, align 4
ret <3 x half> %result
}
149 changes: 149 additions & 0 deletions test/Transforms/SROA/min-precision-padding.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
; RUN: opt < %s -sroa -S | FileCheck %s

; Regression test for SROA miscompilation of min precision vector element access.
; DXC's data layout pads i16/f16 to 32 bits (i16:32, f16:32), so GEP offsets
; between vector elements are 4 bytes apart. SROA must use alloc size (not
; primitive size) for element stride, otherwise element stores get misplaced.

target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
target triple = "dxil-ms-dx"

; Test 1: Element-wise write to <3 x i16> vector.
; SROA must map GEP byte offsets to correct element indices using alloc size
; (4 bytes per i16), not primitive size (2 bytes). All stores must survive
; with correct indices, and the final vector load must be preserved.

; CHECK-LABEL: @test_sroa_i16_vec3
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 0
; CHECK: store i16 %v0
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %v1
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2
; CHECK: store i16 %v2
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
; With i16:32 padding, consecutive element GEPs land 4 bytes apart; SROA
; must divide byte offsets by the 4-byte alloc size to recover lane indices.
define <3 x i16> @test_sroa_i16_vec3(i16 %v0, i16 %v1, i16 %v2) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 2: Same pattern with <3 x half> (f16:32 padding).

; CHECK-LABEL: @test_sroa_f16_vec3
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 0
; CHECK: store half %v0
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 1
; CHECK: store half %v1
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 2
; CHECK: store half %v2
; CHECK: load <3 x half>
; CHECK: ret <3 x half>
; Same lane-indexing check as test 1, but with f16:32-padded halves.
define <3 x half> @test_sroa_f16_vec3(half %v0, half %v1, half %v2) {
entry:
%dst = alloca <3 x half>, align 4
store <3 x half> zeroinitializer, <3 x half>* %dst, align 4
%e0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0
store half %v0, half* %e0, align 4
%e1 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 1
store half %v1, half* %e1, align 4
%e2 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 2
store half %v2, half* %e2, align 4
%result = load <3 x half>, <3 x half>* %dst, align 4
ret <3 x half> %result
}

; Test 3: Partial write — only element 1 is stored. SROA must index it correctly.

; CHECK-LABEL: @test_sroa_i16_vec3_elem1
; Element 1 store must be correctly placed at GEP index 1, not index 2.
; Without the fix, byte offset 4 / prim_size 2 = index 2 (wrong).
; With the fix, byte offset 4 / alloc_size 4 = index 1 (correct).
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %val
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
; Only lane 1 is written. Its GEP is at byte offset 4; dividing by the
; 4-byte alloc size yields index 1 (primitive size 2 would wrongly give 2).
define <3 x i16> @test_sroa_i16_vec3_elem1(i16 %val) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %val, i16* %e1, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 4: Element 2 store — verifies highest index is correct.

; CHECK-LABEL: @test_sroa_i16_vec3_elem2
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2
; CHECK: store i16 %val
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
; Highest-lane variant: the lone store targets index 2 and must stay there.
define <3 x i16> @test_sroa_i16_vec3_elem2(i16 %val) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %val, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 5: Long vector — <5 x i16> (exceeds 4-element native size).

; CHECK-LABEL: @test_sroa_i16_vec5
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 0
; CHECK: store i16 %v0
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %v1
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 4
; CHECK: store i16 %v4
; CHECK: load <5 x i16>
; CHECK: ret <5 x i16>
; Long-vector variant (<5 x i16>): all five lanes written individually;
; each GEP offset must map back to its own lane index.
define <5 x i16> @test_sroa_i16_vec5(i16 %v0, i16 %v1, i16 %v2, i16 %v3, i16 %v4) {
entry:
%dst = alloca <5 x i16>, align 4
store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4
%e0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%e3 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 3
store i16 %v3, i16* %e3, align 4
%e4 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 4
store i16 %v4, i16* %e4, align 4
%result = load <5 x i16>, <5 x i16>* %dst, align 4
ret <5 x i16> %result
}

; Test 6: Long vector — <8 x half>.

; CHECK-LABEL: @test_sroa_f16_vec8_partial
; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 0
; CHECK: store half %v0
; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 7
; CHECK: store half %v7
; CHECK: load <8 x half>
; CHECK: ret <8 x half>
; Sparse writes to the first and last lanes of an <8 x half>; lane 7's GEP
; sits at the largest byte offset, so index recovery must not drift.
define <8 x half> @test_sroa_f16_vec8_partial(half %v0, half %v7) {
entry:
%dst = alloca <8 x half>, align 4
store <8 x half> zeroinitializer, <8 x half>* %dst, align 4
%e0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0
store half %v0, half* %e0, align 4
%e7 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 7
store half %v7, half* %e7, align 4
%result = load <8 x half>, <8 x half>* %dst, align 4
ret <8 x half> %result
}
Loading