Skip to content
21 changes: 21 additions & 0 deletions lib/Transforms/Scalar/GVN.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,17 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
StoredVal->getType()->isArrayTy())
return false;

// HLSL Change Begin - Reject types where padded and primitive sizes differ.
// Coercion would create bitcasts between mismatched sizes.
Type *StoredValTy = StoredVal->getType();
uint64_t StoredPrimBits = StoredValTy->getPrimitiveSizeInBits();
uint64_t LoadPrimBits = LoadTy->getPrimitiveSizeInBits();
if (StoredPrimBits && DL.getTypeSizeInBits(StoredValTy) != StoredPrimBits)
return false;
if (LoadPrimBits && DL.getTypeSizeInBits(LoadTy) != LoadPrimBits)
return false;
// HLSL Change End

// The store has to be at least as big as the load.
if (DL.getTypeSizeInBits(StoredVal->getType()) <
DL.getTypeSizeInBits(LoadTy))
Expand Down Expand Up @@ -1942,6 +1953,16 @@ bool GVN::processLoad(LoadInst *L) {
if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
Value *StoredVal = DepSI->getValueOperand();

// HLSL Change Begin - Defense-in-depth: skip cross-type forwarding for
// padded types (e.g., min precision vectors).
if (StoredVal->getType() != L->getType()) {
Type *StoredTy = StoredVal->getType();
uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits();
if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits)
return false;
}
// HLSL Change End

// The store and load are to a must-aliased pointer, but they may not
// actually have the same type. See if we know how to reuse the stored
// value (depending on its type).
Expand Down
14 changes: 10 additions & 4 deletions lib/Transforms/Scalar/SROA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1671,7 +1671,10 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
// extremely poorly defined currently. The long-term goal is to remove GEPing
// over a vector from the IR completely.
if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
// HLSL Change: Use alloc size for element stride to account for padded
// types.
unsigned ElementSizeInBits =
DL.getTypeAllocSizeInBits(VecTy->getScalarType());
if (ElementSizeInBits % 8 != 0) {
// GEPs over non-multiple of 8 size vector elements are invalid.
return nullptr;
Expand Down Expand Up @@ -2134,7 +2137,8 @@ static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P,

// Try each vector type, and return the one which works.
auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
// HLSL Change: Use alloc size to match GEP offset stride for padded types.
uint64_t ElementSize = DL.getTypeAllocSizeInBits(VTy->getElementType());

// While the definition of LLVM vectors is bitpacked, we don't support sizes
// that aren't byte sized.
Expand Down Expand Up @@ -2492,12 +2496,14 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
: nullptr),
VecTy(PromotableVecTy),
ElementTy(VecTy ? VecTy->getElementType() : nullptr),
ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
// HLSL Change: Use alloc size to match GEP offset stride for padded
// types.
ElementSize(VecTy ? DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0),
BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
IRB(NewAI.getContext(), ConstantFolder()) {
if (VecTy) {
assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
assert((DL.getTypeAllocSizeInBits(ElementTy) % 8) == 0 && // HLSL Change
"Only multiple-of-8 sized vector elements are viable");
++NumVectorized;
}
Expand Down
154 changes: 154 additions & 0 deletions test/Transforms/GVN/min-precision-padding.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
; RUN: opt < %s -basicaa -gvn -S | FileCheck %s

; Regression test for min precision vector GVN miscompilation.
; DXC's data layout pads i16 to 32 bits (i16:32). GVN must not:
; 1. Coerce padded vector types via bitcast (CanCoerceMustAliasedValueToLoad)
; 2. Forward a zeroinitializer store past partial element stores (processLoad)
;
; Without the fix, GVN would forward the zeroinitializer vector load, producing
; incorrect all-zero results for elements that were individually written.

target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
target triple = "dxil-ms-dx"

; Test 1: GVN must not forward zeroinitializer past element store for <3 x i16>.
; The store of zeroinitializer to %dst is followed by an element store to
; %dst[0], then a vector load of %dst. GVN must not replace the vector load
; with the zeroinitializer.

; CHECK-LABEL: @test_no_forward_i16_vec3
; CHECK: store <3 x i16> zeroinitializer
; CHECK: store i16 %val
; The vector load must survive — GVN must not replace it with zeroinitializer.
; CHECK: %result = load <3 x i16>
; CHECK: ret <3 x i16> %result
; With i16 padded to 32 bits (i16:32 in the datalayout), <3 x i16> has a
; primitive size of 48 bits but a larger padded in-memory size.
define <3 x i16> @test_no_forward_i16_vec3(i16 %val) {
entry:
%dst = alloca <3 x i16>, align 4
; Zero the whole vector, then overwrite lane 0 with %val.
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%elem0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %val, i16* %elem0, align 4
; Must observe %val in lane 0 — GVN may not forward the zero vector here.
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 2: Same pattern with <3 x half> (f16:32 padding).

; CHECK-LABEL: @test_no_forward_f16_vec3
; CHECK: store <3 x half> zeroinitializer
; CHECK: store half %val
; CHECK: %result = load <3 x half>
; CHECK: ret <3 x half> %result
; Same pattern as the i16 case, exercising the f16:32 padding spec.
define <3 x half> @test_no_forward_f16_vec3(half %val) {
entry:
%dst = alloca <3 x half>, align 4
; Zero the vector, then overwrite lane 0; the reload must keep %val.
store <3 x half> zeroinitializer, <3 x half>* %dst, align 4
%elem0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0
store half %val, half* %elem0, align 4
%result = load <3 x half>, <3 x half>* %dst, align 4
ret <3 x half> %result
}

; Test 3: Multiple element stores — all must survive.
; Stores to elements 0, 1, 2 of a <3 x i16> vector after zeroinitializer.

; CHECK-LABEL: @test_no_forward_i16_vec3_all_elems
; CHECK: store <3 x i16> zeroinitializer
; CHECK: store i16 %v0
; CHECK: store i16 %v1
; CHECK: store i16 %v2
; CHECK: %result = load <3 x i16>
; CHECK: ret <3 x i16> %result
; Every lane is individually rewritten after the zero store, so the final
; load depends on all three scalar stores — none may be folded away.
define <3 x i16> @test_no_forward_i16_vec3_all_elems(i16 %v0, i16 %v1, i16 %v2) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 4: Coercion rejection — store a <3 x i16> vector, load as different type.
; GVN must not attempt bitcast coercion on padded types.
; If coercion happened, the load would be eliminated and replaced with a bitcast.

; CHECK-LABEL: @test_no_coerce_i16_vec3
; CHECK: store <3 x i16>
; CHECK: load i96
; CHECK-NOT: bitcast
; CHECK: ret
define i96 @test_no_coerce_i16_vec3(<3 x i16> %v) {
entry:
%ptr = alloca <3 x i16>, align 4
store <3 x i16> %v, <3 x i16>* %ptr, align 4
; Reload through a differently-typed pointer: this forces GVN's coercion
; path, which must bail out rather than bitcast the padded vector.
%iptr = bitcast <3 x i16>* %ptr to i96*
%result = load i96, i96* %iptr, align 4
ret i96 %result
}

; Test 5: Long vector variant — <5 x i16> (exceeds 4-element native size).

; CHECK-LABEL: @test_no_forward_i16_vec5
; CHECK: store <5 x i16> zeroinitializer
; CHECK: store i16 %val
; CHECK: %result = load <5 x i16>
; CHECK: ret <5 x i16> %result
; Long-vector variant (<5 x i16>, wider than 4 elements) of test 1:
; zero store, lane-0 overwrite, reload that must keep %val.
define <5 x i16> @test_no_forward_i16_vec5(i16 %val) {
entry:
%dst = alloca <5 x i16>, align 4
store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4
%elem0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0
store i16 %val, i16* %elem0, align 4
%result = load <5 x i16>, <5 x i16>* %dst, align 4
ret <5 x i16> %result
}

; Test 6: Long vector variant — <8 x half>.

; CHECK-LABEL: @test_no_forward_f16_vec8
; CHECK: store <8 x half> zeroinitializer
; CHECK: store half %val
; CHECK: %result = load <8 x half>
; CHECK: ret <8 x half> %result
; Long-vector variant (<8 x half>) of the f16 case: zero store, lane-0
; overwrite, reload that must keep %val.
define <8 x half> @test_no_forward_f16_vec8(half %val) {
entry:
%dst = alloca <8 x half>, align 4
store <8 x half> zeroinitializer, <8 x half>* %dst, align 4
%elem0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0
store half %val, half* %elem0, align 4
%result = load <8 x half>, <8 x half>* %dst, align 4
ret <8 x half> %result
}

; Test 7: Same-type store-to-load forwarding must still work for padded types.
; GVN should forward %v directly — no intervening writes, same type.

; CHECK-LABEL: @test_same_type_forward_i16_vec3
; The load should be eliminated and %v returned directly.
; CHECK-NOT: load
; CHECK: ret <3 x i16> %v
; Negative-space check: with identical store/load types and no intervening
; writes, forwarding is still legal — GVN should eliminate this load.
define <3 x i16> @test_same_type_forward_i16_vec3(<3 x i16> %v) {
entry:
%ptr = alloca <3 x i16>, align 4
store <3 x i16> %v, <3 x i16>* %ptr, align 4
%result = load <3 x i16>, <3 x i16>* %ptr, align 4
ret <3 x i16> %result
}

; Test 8: Same-type forwarding for <3 x half>.

; CHECK-LABEL: @test_same_type_forward_f16_vec3
; CHECK-NOT: load
; CHECK: ret <3 x half> %v
; Same-type forwarding must also keep working for padded half vectors.
define <3 x half> @test_same_type_forward_f16_vec3(<3 x half> %v) {
entry:
%ptr = alloca <3 x half>, align 4
store <3 x half> %v, <3 x half>* %ptr, align 4
%result = load <3 x half>, <3 x half>* %ptr, align 4
ret <3 x half> %result
}
149 changes: 149 additions & 0 deletions test/Transforms/SROA/min-precision-padding.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
; RUN: opt < %s -sroa -S | FileCheck %s

; Regression test for SROA miscompilation of min precision vector element access.
; DXC's data layout pads i16/f16 to 32 bits (i16:32, f16:32), so GEP offsets
; between vector elements are 4 bytes apart. SROA must use alloc size (not
; primitive size) for element stride, otherwise element stores get misplaced.

target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
target triple = "dxil-ms-dx"

; Test 1: Element-wise write to <3 x i16> vector.
; SROA must map GEP byte offsets to correct element indices using alloc size
; (4 bytes per i16), not primitive size (2 bytes). All stores must survive
; with correct indices, and the final vector load must be preserved.

; CHECK-LABEL: @test_sroa_i16_vec3
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 0
; CHECK: store i16 %v0
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %v1
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2
; CHECK: store i16 %v2
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
; With i16:32 padding, consecutive element GEPs land 4 bytes apart; SROA
; must divide byte offsets by the 4-byte alloc size to recover lane indices.
define <3 x i16> @test_sroa_i16_vec3(i16 %v0, i16 %v1, i16 %v2) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 2: Same pattern with <3 x half> (f16:32 padding).

; CHECK-LABEL: @test_sroa_f16_vec3
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 0
; CHECK: store half %v0
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 1
; CHECK: store half %v1
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 2
; CHECK: store half %v2
; CHECK: load <3 x half>
; CHECK: ret <3 x half>
; Same lane-indexing check as test 1, but with f16:32-padded halves.
define <3 x half> @test_sroa_f16_vec3(half %v0, half %v1, half %v2) {
entry:
%dst = alloca <3 x half>, align 4
store <3 x half> zeroinitializer, <3 x half>* %dst, align 4
%e0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0
store half %v0, half* %e0, align 4
%e1 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 1
store half %v1, half* %e1, align 4
%e2 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 2
store half %v2, half* %e2, align 4
%result = load <3 x half>, <3 x half>* %dst, align 4
ret <3 x half> %result
}

; Test 3: Partial write — only element 1 is stored. SROA must index it correctly.

; CHECK-LABEL: @test_sroa_i16_vec3_elem1
; Element 1 store must be correctly placed at GEP index 1, not index 2.
; Without the fix, byte offset 4 / prim_size 2 = index 2 (wrong).
; With the fix, byte offset 4 / alloc_size 4 = index 1 (correct).
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %val
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
; Only lane 1 is written. Its GEP is at byte offset 4; dividing by the
; 4-byte alloc size yields index 1 (primitive size 2 would wrongly give 2).
define <3 x i16> @test_sroa_i16_vec3_elem1(i16 %val) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %val, i16* %e1, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 4: Element 2 store — verifies highest index is correct.

; CHECK-LABEL: @test_sroa_i16_vec3_elem2
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2
; CHECK: store i16 %val
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
; Highest-lane variant: the lone store targets index 2 and must stay there.
define <3 x i16> @test_sroa_i16_vec3_elem2(i16 %val) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %val, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 5: Long vector — <5 x i16> (exceeds 4-element native size).

; CHECK-LABEL: @test_sroa_i16_vec5
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 0
; CHECK: store i16 %v0
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %v1
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 4
; CHECK: store i16 %v4
; CHECK: load <5 x i16>
; CHECK: ret <5 x i16>
; Long-vector variant (<5 x i16>): all five lanes written individually;
; each GEP offset must map back to its own lane index.
define <5 x i16> @test_sroa_i16_vec5(i16 %v0, i16 %v1, i16 %v2, i16 %v3, i16 %v4) {
entry:
%dst = alloca <5 x i16>, align 4
store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4
%e0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%e3 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 3
store i16 %v3, i16* %e3, align 4
%e4 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 4
store i16 %v4, i16* %e4, align 4
%result = load <5 x i16>, <5 x i16>* %dst, align 4
ret <5 x i16> %result
}

; Test 6: Long vector — <8 x half>.

; CHECK-LABEL: @test_sroa_f16_vec8_partial
; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 0
; CHECK: store half %v0
; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 7
; CHECK: store half %v7
; CHECK: load <8 x half>
; CHECK: ret <8 x half>
; Sparse writes to the first and last lanes of an <8 x half>; lane 7's GEP
; sits at the largest byte offset, so index recovery must not drift.
define <8 x half> @test_sroa_f16_vec8_partial(half %v0, half %v7) {
entry:
%dst = alloca <8 x half>, align 4
store <8 x half> zeroinitializer, <8 x half>* %dst, align 4
%e0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0
store half %v0, half* %e0, align 4
%e7 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 7
store half %v7, half* %e7, align 4
%result = load <8 x half>, <8 x half>* %dst, align 4
ret <8 x half> %result
}
Loading