-
Notifications
You must be signed in to change notification settings - Fork 12.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] combineEXTRACT_SUBVECTOR - fold extract_subvector(subv_broadcast_load(ptr),0) -> load(ptr) #126523
Conversation
…t_load(ptr),0) -> load(ptr) This can typically be handled by SimplifyDemandedVectorElts, but that will fail when there are multiple uses of the subv_broadcast_load node. However, if there's just one use of the load result (and the rest are uses of the memory chain), we can still replace with a load and update the chain accordingly. Noticed on llvm#126517
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesThis is typically handled by SimplifyDemandedVectorElts, but that will fail when there are multiple uses of the subv_broadcast_load node. However, if there's just one use of the load result (and the rest are uses of the memory chain), we can still replace with a load and update the chain accordingly. Noticed on #126517 Full diff: https://github.com/llvm/llvm-project/pull/126523.diff 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 995b4de12ce12c..601bf7bb6b817d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58477,10 +58477,26 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
- // If we're extracting a broadcasted subvector, just use the lowest subvector.
- if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
- cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
- return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
+ // Check if we're extracting a whole broadcasted subvector.
+ if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
+ EVT MemVT = MemIntr->getMemoryVT();
+ if (MemVT == VT) {
+ // Just use the lowest subvector.
+ if (IdxVal != 0)
+ return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
+ // If this is the only use, we can replace with a regular load (this may
+ // have been missed by SimplifyDemandedVectorElts due to extra uses of the
+ // memory chain).
+ if (InVec.hasOneUse()) {
+ SDValue Ld =
+ DAG.getLoad(MemVT, DL, MemIntr->getChain(), MemIntr->getBasePtr(),
+ MemIntr->getMemOperand());
+ DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), Ld.getValue(1));
+ return Ld;
+ }
+ }
+ }
// Attempt to extract from the source of a shuffle vector.
if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 3d72319f59ca9e..e47a9ac3a0c0bf 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3634,19 +3634,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -3820,19 +3819,18 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovdqa (%rdi), %xmm2
; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index acedcf42639066..a3f134922ba3cc 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -4044,18 +4044,17 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
-; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7]
+; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
@@ -4263,17 +4262,16 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
;
; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
-; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
…t_load(ptr),0) -> load(ptr) (llvm#126523) This is typically handled by SimplifyDemandedVectorElts, but that will fail when there are multiple uses of the subv_broadcast_load node. However, if there's just one use of the load result (and the rest are uses of the memory chain), we can still replace with a load and update the chain accordingly. Noticed on llvm#126517
…t_load(ptr),0) -> load(ptr) (llvm#126523) This is typically handled by SimplifyDemandedVectorElts, but that will fail when there are multiple uses of the subv_broadcast_load node. However, if there's just one use of the load result (and the rest are uses of the memory chain), we can still replace with a load and update the chain accordingly. Noticed on llvm#126517
This is typically handled by SimplifyDemandedVectorElts, but that will fail when there are multiple uses of the subv_broadcast_load node. However, if there's just one use of the load result (and the rest are uses of the memory chain), we can still replace with a load and update the chain accordingly.
Noticed on #126517