-
Notifications
You must be signed in to change notification settings - Fork 851
Fix GVN and SROA miscompilation of min precision vector element access #8269
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
alsepkow
merged 8 commits into
microsoft:main
from
alsepkow:user/alsepkow/fix-min-precision-opt-bugs
Apr 1, 2026
Merged
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
b34136b
Fix GVN and SROA miscompilation of min precision vector element access
alsepkow 70294fa
chore: autopublish 2026-03-14T06:53:16Z
github-actions[bot] e48fba7
Apply clang-format to SROA.cpp HLSL Change comments
alsepkow c0aadc6
Merge branch 'user/alsepkow/fix-min-precision-opt-bugs' of https://gi…
alsepkow b74226c
Add regression tests for min precision GVN and SROA fixes
alsepkow 75c8dd7
Narrow GVN processLoad guard to allow same-type forwarding
alsepkow d3479c7
Address review: clarify comments, add HLSL Change marker, tighten tests
alsepkow 5b5d05c
Trim verbose comments to focus on why, not what
alsepkow File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,154 @@ | ||
| ; RUN: opt < %s -basicaa -gvn -S | FileCheck %s | ||
|
|
||
| ; Regression test for min precision vector GVN miscompilation. | ||
| ; DXC's data layout pads i16 to 32 bits (i16:32). GVN must not: | ||
| ; 1. Coerce padded vector types via bitcast (CanCoerceMustAliasedValueToLoad) | ||
| ; 2. Forward a zeroinitializer store past partial element stores (processLoad) | ||
| ; | ||
| ; Without the fix, GVN would forward the zeroinitializer vector load, producing | ||
| ; incorrect all-zero results for elements that were individually written. | ||
|
|
||
| target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" | ||
| target triple = "dxil-ms-dx" | ||
|
|
||
| ; Test 1: GVN must not forward zeroinitializer past element store for <3 x i16>. | ||
| ; The store of zeroinitializer to %dst is followed by an element store to | ||
| ; %dst[0], then a vector load of %dst. GVN must not replace the vector load | ||
| ; with the zeroinitializer. | ||
|
|
||
| ; CHECK-LABEL: @test_no_forward_i16_vec3 | ||
| ; CHECK: store <3 x i16> zeroinitializer | ||
| ; CHECK: store i16 %val | ||
| ; The vector load must survive — GVN must not replace it with zeroinitializer. | ||
| ; CHECK: %result = load <3 x i16> | ||
| ; CHECK: ret <3 x i16> %result | ||
| define <3 x i16> @test_no_forward_i16_vec3(i16 %val) { | ||
| entry: | ||
| %dst = alloca <3 x i16>, align 4 | ||
| store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 | ||
| %elem0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 | ||
| store i16 %val, i16* %elem0, align 4 | ||
| %result = load <3 x i16>, <3 x i16>* %dst, align 4 | ||
| ret <3 x i16> %result | ||
| } | ||
|
|
||
| ; Test 2: Same pattern with <3 x half> (f16:32 padding). | ||
|
|
||
| ; CHECK-LABEL: @test_no_forward_f16_vec3 | ||
| ; CHECK: store <3 x half> zeroinitializer | ||
| ; CHECK: store half %val | ||
| ; CHECK: %result = load <3 x half> | ||
| ; CHECK: ret <3 x half> %result | ||
| define <3 x half> @test_no_forward_f16_vec3(half %val) { | ||
| entry: | ||
| %dst = alloca <3 x half>, align 4 | ||
| store <3 x half> zeroinitializer, <3 x half>* %dst, align 4 | ||
| %elem0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0 | ||
| store half %val, half* %elem0, align 4 | ||
| %result = load <3 x half>, <3 x half>* %dst, align 4 | ||
| ret <3 x half> %result | ||
| } | ||
|
|
||
| ; Test 3: Multiple element stores — all must survive. | ||
| ; Stores to elements 0, 1, 2 of a <3 x i16> vector after zeroinitializer. | ||
|
|
||
| ; CHECK-LABEL: @test_no_forward_i16_vec3_all_elems | ||
| ; CHECK: store <3 x i16> zeroinitializer | ||
| ; CHECK: store i16 %v0 | ||
| ; CHECK: store i16 %v1 | ||
| ; CHECK: store i16 %v2 | ||
| ; CHECK: %result = load <3 x i16> | ||
| ; CHECK: ret <3 x i16> %result | ||
| define <3 x i16> @test_no_forward_i16_vec3_all_elems(i16 %v0, i16 %v1, i16 %v2) { | ||
| entry: | ||
| %dst = alloca <3 x i16>, align 4 | ||
| store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 | ||
| %e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 | ||
| store i16 %v0, i16* %e0, align 4 | ||
| %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 | ||
| store i16 %v1, i16* %e1, align 4 | ||
| %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 | ||
| store i16 %v2, i16* %e2, align 4 | ||
| %result = load <3 x i16>, <3 x i16>* %dst, align 4 | ||
| ret <3 x i16> %result | ||
| } | ||
|
|
||
| ; Test 4: Coercion rejection — store a <3 x i16> vector, load as different type. | ||
| ; GVN must not attempt bitcast coercion on padded types. | ||
| ; If coercion happened, the load would be eliminated and replaced with a bitcast. | ||
|
|
||
| ; CHECK-LABEL: @test_no_coerce_i16_vec3 | ||
| ; CHECK: store <3 x i16> | ||
| ; CHECK: load i96 | ||
| ; CHECK-NOT: bitcast | ||
| ; CHECK: ret | ||
| define i96 @test_no_coerce_i16_vec3(<3 x i16> %v) { | ||
| entry: | ||
| %ptr = alloca <3 x i16>, align 4 | ||
| store <3 x i16> %v, <3 x i16>* %ptr, align 4 | ||
| %iptr = bitcast <3 x i16>* %ptr to i96* | ||
| %result = load i96, i96* %iptr, align 4 | ||
| ret i96 %result | ||
| } | ||
|
|
||
| ; Test 5: Long vector variant — <5 x i16> (exceeds 4-element native size). | ||
|
|
||
| ; CHECK-LABEL: @test_no_forward_i16_vec5 | ||
| ; CHECK: store <5 x i16> zeroinitializer | ||
| ; CHECK: store i16 %val | ||
| ; CHECK: %result = load <5 x i16> | ||
| ; CHECK: ret <5 x i16> %result | ||
| define <5 x i16> @test_no_forward_i16_vec5(i16 %val) { | ||
| entry: | ||
| %dst = alloca <5 x i16>, align 4 | ||
| store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4 | ||
| %elem0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0 | ||
| store i16 %val, i16* %elem0, align 4 | ||
| %result = load <5 x i16>, <5 x i16>* %dst, align 4 | ||
| ret <5 x i16> %result | ||
| } | ||
|
|
||
| ; Test 6: Long vector variant — <8 x half>. | ||
|
|
||
| ; CHECK-LABEL: @test_no_forward_f16_vec8 | ||
| ; CHECK: store <8 x half> zeroinitializer | ||
| ; CHECK: store half %val | ||
| ; CHECK: %result = load <8 x half> | ||
| ; CHECK: ret <8 x half> %result | ||
| define <8 x half> @test_no_forward_f16_vec8(half %val) { | ||
| entry: | ||
| %dst = alloca <8 x half>, align 4 | ||
| store <8 x half> zeroinitializer, <8 x half>* %dst, align 4 | ||
| %elem0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0 | ||
| store half %val, half* %elem0, align 4 | ||
| %result = load <8 x half>, <8 x half>* %dst, align 4 | ||
| ret <8 x half> %result | ||
| } | ||
|
|
||
| ; Test 7: Same-type store-to-load forwarding must still work for padded types. | ||
| ; GVN should forward %v directly — no intervening writes, same type. | ||
|
|
||
| ; CHECK-LABEL: @test_same_type_forward_i16_vec3 | ||
| ; The load should be eliminated and %v returned directly. | ||
| ; CHECK-NOT: load | ||
| ; CHECK: ret <3 x i16> %v | ||
| define <3 x i16> @test_same_type_forward_i16_vec3(<3 x i16> %v) { | ||
| entry: | ||
| %ptr = alloca <3 x i16>, align 4 | ||
| store <3 x i16> %v, <3 x i16>* %ptr, align 4 | ||
| %result = load <3 x i16>, <3 x i16>* %ptr, align 4 | ||
| ret <3 x i16> %result | ||
| } | ||
|
|
||
| ; Test 8: Same-type forwarding for <3 x half>. | ||
|
|
||
| ; CHECK-LABEL: @test_same_type_forward_f16_vec3 | ||
| ; CHECK-NOT: load | ||
| ; CHECK: ret <3 x half> %v | ||
| define <3 x half> @test_same_type_forward_f16_vec3(<3 x half> %v) { | ||
| entry: | ||
| %ptr = alloca <3 x half>, align 4 | ||
| store <3 x half> %v, <3 x half>* %ptr, align 4 | ||
| %result = load <3 x half>, <3 x half>* %ptr, align 4 | ||
| ret <3 x half> %result | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,149 @@ | ||
| ; RUN: opt < %s -sroa -S | FileCheck %s | ||
|
|
||
| ; Regression test for SROA miscompilation of min precision vector element access. | ||
| ; DXC's data layout pads i16/f16 to 32 bits (i16:32, f16:32), so GEP offsets | ||
| ; between vector elements are 4 bytes apart. SROA must use alloc size (not | ||
| ; primitive size) for element stride, otherwise element stores get misplaced. | ||
|
|
||
| target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" | ||
| target triple = "dxil-ms-dx" | ||
|
|
||
| ; Test 1: Element-wise write to <3 x i16> vector. | ||
| ; SROA must map GEP byte offsets to correct element indices using alloc size | ||
| ; (4 bytes per i16), not primitive size (2 bytes). All stores must survive | ||
| ; with correct indices, and the final vector load must be preserved. | ||
|
|
||
| ; CHECK-LABEL: @test_sroa_i16_vec3 | ||
| ; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 0 | ||
| ; CHECK: store i16 %v0 | ||
| ; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1 | ||
| ; CHECK: store i16 %v1 | ||
| ; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2 | ||
| ; CHECK: store i16 %v2 | ||
| ; CHECK: load <3 x i16> | ||
| ; CHECK: ret <3 x i16> | ||
| define <3 x i16> @test_sroa_i16_vec3(i16 %v0, i16 %v1, i16 %v2) { | ||
| entry: | ||
| %dst = alloca <3 x i16>, align 4 | ||
| store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 | ||
| %e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 | ||
| store i16 %v0, i16* %e0, align 4 | ||
| %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 | ||
| store i16 %v1, i16* %e1, align 4 | ||
| %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 | ||
| store i16 %v2, i16* %e2, align 4 | ||
| %result = load <3 x i16>, <3 x i16>* %dst, align 4 | ||
| ret <3 x i16> %result | ||
| } | ||
|
|
||
| ; Test 2: Same pattern with <3 x half> (f16:32 padding). | ||
|
|
||
| ; CHECK-LABEL: @test_sroa_f16_vec3 | ||
| ; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 0 | ||
| ; CHECK: store half %v0 | ||
| ; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 1 | ||
| ; CHECK: store half %v1 | ||
| ; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 2 | ||
| ; CHECK: store half %v2 | ||
| ; CHECK: load <3 x half> | ||
| ; CHECK: ret <3 x half> | ||
| define <3 x half> @test_sroa_f16_vec3(half %v0, half %v1, half %v2) { | ||
| entry: | ||
| %dst = alloca <3 x half>, align 4 | ||
| store <3 x half> zeroinitializer, <3 x half>* %dst, align 4 | ||
| %e0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0 | ||
| store half %v0, half* %e0, align 4 | ||
| %e1 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 1 | ||
| store half %v1, half* %e1, align 4 | ||
| %e2 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 2 | ||
| store half %v2, half* %e2, align 4 | ||
| %result = load <3 x half>, <3 x half>* %dst, align 4 | ||
| ret <3 x half> %result | ||
| } | ||
|
|
||
| ; Test 3: Partial write — only element 1 is stored. SROA must index it correctly. | ||
|
|
||
| ; CHECK-LABEL: @test_sroa_i16_vec3_elem1 | ||
| ; Element 1 store must be correctly placed at GEP index 1, not index 2. | ||
| ; Without the fix, byte offset 4 / prim_size 2 = index 2 (wrong). | ||
| ; With the fix, byte offset 4 / alloc_size 4 = index 1 (correct). | ||
| ; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1 | ||
| ; CHECK: store i16 %val | ||
| ; CHECK: load <3 x i16> | ||
| ; CHECK: ret <3 x i16> | ||
| define <3 x i16> @test_sroa_i16_vec3_elem1(i16 %val) { | ||
| entry: | ||
| %dst = alloca <3 x i16>, align 4 | ||
| store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 | ||
| %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 | ||
| store i16 %val, i16* %e1, align 4 | ||
| %result = load <3 x i16>, <3 x i16>* %dst, align 4 | ||
| ret <3 x i16> %result | ||
| } | ||
|
|
||
| ; Test 4: Element 2 store — verifies highest index is correct. | ||
|
|
||
| ; CHECK-LABEL: @test_sroa_i16_vec3_elem2 | ||
| ; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2 | ||
| ; CHECK: store i16 %val | ||
| ; CHECK: load <3 x i16> | ||
| ; CHECK: ret <3 x i16> | ||
| define <3 x i16> @test_sroa_i16_vec3_elem2(i16 %val) { | ||
| entry: | ||
| %dst = alloca <3 x i16>, align 4 | ||
| store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 | ||
| %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 | ||
| store i16 %val, i16* %e2, align 4 | ||
| %result = load <3 x i16>, <3 x i16>* %dst, align 4 | ||
| ret <3 x i16> %result | ||
| } | ||
|
|
||
| ; Test 5: Long vector — <5 x i16> (exceeds 4-element native size). | ||
|
|
||
| ; CHECK-LABEL: @test_sroa_i16_vec5 | ||
| ; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 0 | ||
| ; CHECK: store i16 %v0 | ||
| ; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 1 | ||
| ; CHECK: store i16 %v1 | ||
| ; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 4 | ||
| ; CHECK: store i16 %v4 | ||
| ; CHECK: load <5 x i16> | ||
| ; CHECK: ret <5 x i16> | ||
| define <5 x i16> @test_sroa_i16_vec5(i16 %v0, i16 %v1, i16 %v2, i16 %v3, i16 %v4) { | ||
| entry: | ||
| %dst = alloca <5 x i16>, align 4 | ||
| store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4 | ||
| %e0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0 | ||
| store i16 %v0, i16* %e0, align 4 | ||
| %e1 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 1 | ||
| store i16 %v1, i16* %e1, align 4 | ||
| %e2 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 2 | ||
| store i16 %v2, i16* %e2, align 4 | ||
| %e3 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 3 | ||
| store i16 %v3, i16* %e3, align 4 | ||
| %e4 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 4 | ||
| store i16 %v4, i16* %e4, align 4 | ||
| %result = load <5 x i16>, <5 x i16>* %dst, align 4 | ||
| ret <5 x i16> %result | ||
| } | ||
|
|
||
| ; Test 6: Long vector — <8 x half>. | ||
|
|
||
| ; CHECK-LABEL: @test_sroa_f16_vec8_partial | ||
| ; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 0 | ||
| ; CHECK: store half %v0 | ||
| ; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 7 | ||
| ; CHECK: store half %v7 | ||
| ; CHECK: load <8 x half> | ||
| ; CHECK: ret <8 x half> | ||
| define <8 x half> @test_sroa_f16_vec8_partial(half %v0, half %v7) { | ||
| entry: | ||
| %dst = alloca <8 x half>, align 4 | ||
| store <8 x half> zeroinitializer, <8 x half>* %dst, align 4 | ||
| %e0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0 | ||
| store half %v0, half* %e0, align 4 | ||
| %e7 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 7 | ||
| store half %v7, half* %e7, align 4 | ||
| %result = load <8 x half>, <8 x half>* %dst, align 4 | ||
| ret <8 x half> %result | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.