Skip to content

Commit

Permalink
[X86] ReplaceNodeResults - truncate sub-128-bit vectors as shuffles d…
Browse files Browse the repository at this point in the history
…irectly

We were scalarizing these truncations, but in most cases we can widen the source vector to 128-bits and perform the truncation as a shuffle directly (which will usually lower as a PACK or PSHUFB).

For the cases where the widening and shuffle isn't legal we can leave it to generic legalization to scalarize for us.

Fixes llvm#81883
  • Loading branch information
RKSimon committed Feb 27, 2024
1 parent 2c9b6c1 commit 33ef3fa
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 46 deletions.
26 changes: 13 additions & 13 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32341,20 +32341,20 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
}

if (128 % InBits == 0) {
if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
// 128 bit and smaller inputs should avoid truncate all together and
// just use a build_vector that will become a shuffle.
// TODO: Widen and use a shuffle directly?
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
// Use the original element count so we don't do more scalar opts than
// necessary.
for (unsigned i=0; i < MinElts; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
DAG.getIntPtrConstant(i, dl));
Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
}
Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
return;
// use a shuffle.
if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
SmallVector<int, 16> TruncMask(WidenNumElts, -1);
for (unsigned I = 0; I < MinElts; ++I)
TruncMask[I] = Scale * I;
SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
WidenIn = DAG.getBitcast(WidenVT, WidenIn);
Results.push_back(
DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
return;
}
}

// With AVX512 there are some cases that can use a target specific
Expand Down
19 changes: 7 additions & 12 deletions llvm/test/CodeGen/X86/extract-concat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,17 @@ define void @foo(<4 x float> %in, ptr %out) {
; SSE2-LABEL: foo:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: orl %eax, %ecx
; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: shll $16, %eax
; SSE2-NEXT: orl %ecx, %eax
; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000
; SSE2-NEXT: movl %eax, (%rdi)
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE42-LABEL: foo:
; SSE42: # %bb.0:
; SSE42-NEXT: cvttps2dq %xmm0, %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movl $255, %eax
; SSE42-NEXT: pinsrb $3, %eax, %xmm0
; SSE42-NEXT: movd %xmm0, (%rdi)
Expand All @@ -33,7 +28,7 @@ define void @foo(<4 x float> %in, ptr %out) {
; AVX-LABEL: foo:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: movl $255, %eax
; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)
Expand Down
23 changes: 3 additions & 20 deletions llvm/test/CodeGen/X86/vec_anyext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -112,27 +112,10 @@ define <4 x i8> @func_8_16(ptr %a, ptr %b) nounwind {
;
; X64-LABEL: func_8_16:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrl $16, %ecx
; X64-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: shrq $32, %rcx
; X64-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
; X64-NEXT: shrq $48, %rax
; X64-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT: movq (%rsi), %rax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrl $16, %ecx
; X64-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: shrq $32, %rcx
; X64-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
; X64-NEXT: shrq $48, %rax
; X64-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; X64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; X64-NEXT: retq
%F = load <4 x i16>, ptr %a
%G = trunc <4 x i16> %F to <4 x i8>
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/vec_cast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ define <3 x i16> @h(<3 x i32> %a) nounwind {
; CHECK-WIN-LABEL: h:
; CHECK-WIN: # %bb.0:
; CHECK-WIN-NEXT: movdqa (%rcx), %xmm0
; CHECK-WIN-NEXT: movl (%rcx), %eax
; CHECK-WIN-NEXT: movd %xmm0, %eax
; CHECK-WIN-NEXT: pextrw $2, %xmm0, %edx
; CHECK-WIN-NEXT: pextrw $4, %xmm0, %ecx
; CHECK-WIN-NEXT: # kill: def $ax killed $ax killed $eax
Expand Down

0 comments on commit 33ef3fa

Please sign in to comment.