From 33ef3fa0318456ac5ed9e227c53cf106e1f0e117 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 27 Feb 2024 10:53:50 +0000
Subject: [PATCH] [X86] ReplaceNodeResults - truncate sub-128-bit vectors as
 shuffles directly

We were scalarizing these truncations, but in most cases we can widen the
source vector to 128-bits and perform the truncation as a shuffle directly
(which will usually lower as a PACK or PSHUFB).

For the cases where the widening and shuffle aren't legal, we can leave it
to generic legalization to scalarize for us.

Fixes #81883
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 26 +++++++++++++-------------
 llvm/test/CodeGen/X86/extract-concat.ll | 19 +++++++------------
 llvm/test/CodeGen/X86/vec_anyext.ll     | 23 +++--------------------
 llvm/test/CodeGen/X86/vec_cast.ll       |  2 +-
 4 files changed, 24 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a86f13135173b0d..3b8008c14932339 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32341,20 +32341,20 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
   }
 
-  if (128 % InBits == 0) {
+  if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
     // 128 bit and smaller inputs should avoid truncate all together and
-    // just use a build_vector that will become a shuffle.
-    // TODO: Widen and use a shuffle directly?
-    SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
-    // Use the original element count so we don't do more scalar opts than
-    // necessary.
-    for (unsigned i=0; i < MinElts; ++i) {
-      SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
-                                DAG.getIntPtrConstant(i, dl));
-      Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
-    }
-    Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
-    return;
+    // use a shuffle.
+    if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
+      int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
+      SmallVector<int, 16> TruncMask(WidenNumElts, -1);
+      for (unsigned I = 0; I < MinElts; ++I)
+        TruncMask[I] = Scale * I;
+      SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
+      WidenIn = DAG.getBitcast(WidenVT, WidenIn);
+      Results.push_back(
+          DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
+      return;
+    }
   }
 
   // With AVX512 there are some cases that can use a target specific
diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll
index 93dbe99882fe0b5..e7415dcf229f40c 100644
--- a/llvm/test/CodeGen/X86/extract-concat.ll
+++ b/llvm/test/CodeGen/X86/extract-concat.ll
@@ -9,22 +9,17 @@ define void @foo(<4 x float> %in, ptr %out) {
 ; SSE2-LABEL: foo:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    shll $8, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    orl $-16777216, %eax # imm = 0xFF000000
-; SSE2-NEXT:    movl %eax, (%rdi)
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movd %xmm0, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: foo:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE42-NEXT:    movl $255, %eax
 ; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE42-NEXT:    movd %xmm0, (%rdi)
@@ -33,7 +28,7 @@ define void @foo(<4 x float> %in, ptr %out) {
 ; AVX-LABEL: foo:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    movl $255, %eax
 ; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll
index 09e4a4b3a773d16..e229165be967a56 100644
--- a/llvm/test/CodeGen/X86/vec_anyext.ll
+++ b/llvm/test/CodeGen/X86/vec_anyext.ll
@@ -112,27 +112,10 @@ define <4 x i8> @func_8_16(ptr %a, ptr %b) nounwind {
 ;
 ; X64-LABEL: func_8_16:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    vmovd %eax, %xmm0
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shrl $16, %ecx
-; X64-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    shrq $32, %rcx
-; X64-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; X64-NEXT:    shrq $48, %rax
-; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; X64-NEXT:    movq (%rsi), %rax
-; X64-NEXT:    vmovd %eax, %xmm1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shrl $16, %ecx
-; X64-NEXT:    vpinsrb $1, %ecx, %xmm1, %xmm1
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    shrq $32, %rcx
-; X64-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
-; X64-NEXT:    shrq $48, %rax
-; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; X64-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; X64-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X64-NEXT:    retq
   %F = load <4 x i16>, ptr %a
   %G = trunc <4 x i16> %F to <4 x i8>
diff --git a/llvm/test/CodeGen/X86/vec_cast.ll b/llvm/test/CodeGen/X86/vec_cast.ll
index e0089354cc95308..0a6bc2f59b685b7 100644
--- a/llvm/test/CodeGen/X86/vec_cast.ll
+++ b/llvm/test/CodeGen/X86/vec_cast.ll
@@ -156,7 +156,7 @@ define <3 x i16> @h(<3 x i32> %a) nounwind {
 ; CHECK-WIN-LABEL: h:
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    movdqa (%rcx), %xmm0
-; CHECK-WIN-NEXT:    movl (%rcx), %eax
+; CHECK-WIN-NEXT:    movd %xmm0, %eax
 ; CHECK-WIN-NEXT:    pextrw $2, %xmm0, %edx
 ; CHECK-WIN-NEXT:    pextrw $4, %xmm0, %ecx
 ; CHECK-WIN-NEXT:    # kill: def $ax killed $ax killed $eax
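
For reference, here is a minimal standalone sketch (not part of the patch)
of the shuffle mask the new code builds, using the <4 x i16> -> <4 x i8>
truncate from func_8_16 in vec_anyext.ll as the worked example. It is plain
C++ with no LLVM dependencies; the variable names mirror the patch, but the
program itself is illustrative only.

  #include <cstdio>
  #include <vector>

  int main() {
    // <4 x i16> -> <4 x i8>: the source is widened to 128 bits (<8 x i16>)
    // and bitcast to <16 x i8>; the truncate then keeps every Scale-th byte.
    const unsigned InEltBits = 16, EltBits = 8;  // i16 -> i8
    const unsigned MinElts = 4;                  // original element count
    const unsigned WidenNumElts = 128 / EltBits; // 16 byte lanes in the result
    const int Scale = InEltBits / EltBits;       // 2 bytes per i16 element

    std::vector<int> TruncMask(WidenNumElts, -1); // -1 == undef lane
    for (unsigned I = 0; I < MinElts; ++I)
      TruncMask[I] = Scale * I; // low byte of each i16 (little-endian)

    for (int M : TruncMask)
      std::printf("%d ", M); // prints: 0 2 4 6 then -1 for the undef lanes
    std::printf("\n");
    return 0;
  }

The mask matches the xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u] vpshufb operand
in the updated func_8_16 CHECK lines. On SSE2 targets without PSHUFB the
same shuffle instead lowers through PAND + PACKUSWB steps, as the updated
extract-concat.ll SSE2 checks show.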