Skip to content

Commit 7d076e1

Browse files
committed
[PowerPC] Fix computation of offset for load-and-splat for permuted loads
Unfortunately, this is another regression from my canonicalization patch (1fed131). That patch contained two implicit assumptions: (1) that we would have a permuted load only if we are loading a partial vector, and (2) that a partial vector load would necessarily be as wide as the splat. However, assumption 2 is not correct, since it is possible to do a wider load and splat only half of it. This patch corrects the offset computation by simply checking whether the load is permuted and adjusting the offset if it is.
1 parent 032b78a commit 7d076e1

File tree

2 files changed

+106
-8
lines changed

2 files changed

+106
-8
lines changed

llvm/lib/Target/PowerPC/PPCISelLowering.cpp

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9126,13 +9126,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
91269126
Op0.getOperand(1));
91279127
}
91289128

9129-
static const SDValue *getNormalLoadInput(const SDValue &Op) {
9129+
static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
91309130
const SDValue *InputLoad = &Op;
91319131
if (InputLoad->getOpcode() == ISD::BITCAST)
91329132
InputLoad = &InputLoad->getOperand(0);
91339133
if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
9134-
InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED)
9134+
InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
9135+
IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
91359136
InputLoad = &InputLoad->getOperand(0);
9137+
}
91369138
if (InputLoad->getOpcode() != ISD::LOAD)
91379139
return nullptr;
91389140
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
@@ -9304,7 +9306,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
93049306

93059307
if (!BVNIsConstantSplat || SplatBitSize > 32) {
93069308

9307-
const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
9309+
bool IsPermutedLoad = false;
9310+
const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
93089311
// Handle load-and-splat patterns as we have instructions that will do this
93099312
// in one go.
93109313
if (InputLoad && DAG.isSplatValue(Op, true)) {
@@ -9927,14 +9930,25 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
99279930
// If this is a load-and-splat, we can do that with a single instruction
99289931
// in some cases. However if the load has multiple uses, we don't want to
99299932
// combine it because that will just produce multiple loads.
9930-
const SDValue *InputLoad = getNormalLoadInput(V1);
9933+
bool IsPermutedLoad = false;
9934+
const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
99319935
if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
99329936
(PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
99339937
InputLoad->hasOneUse()) {
99349938
bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
99359939
int SplatIdx =
99369940
PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
99379941

9942+
// The splat index for permuted loads will be in the left half of the vector
9943+
// which is strictly wider than the loaded value by 8 bytes. So we need to
9944+
// adjust the splat index to point to the correct address in memory.
9945+
if (IsPermutedLoad) {
9946+
assert(isLittleEndian && "Unexpected permuted load on big endian target");
9947+
SplatIdx += IsFourByte ? 2 : 1;
9948+
// The element-count selection must be parenthesized: `?:` binds more
// loosely than `<` and `&&`, so the unparenthesized form parses as
// `(SplatIdx < IsFourByte) ? 4 : (2 && "...")`, which is always nonzero
// and would make this bounds check impossible to trigger.
assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
9949+
"Splat of a value outside of the loaded memory");
9950+
}
9951+
99389952
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
99399953
// For 4-byte load-and-splat, we need Power9.
99409954
if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
@@ -9944,10 +9958,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
99449958
else
99459959
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
99469960

9947-
// If we are loading a partial vector, it does not make sense to adjust
9948-
// the base pointer. This happens with (splat (s_to_v_permuted (ld))).
9949-
if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64))
9950-
Offset = 0;
99519961
SDValue BasePtr = LD->getBasePtr();
99529962
if (Offset != 0)
99539963
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),

llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,5 +446,93 @@ entry:
446446
ret <16 x i8> %shuffle
447447
}
448448

449+
; Loads an <8 x i8> and shuffles bytes 4..7 of the loaded value into lanes
; 0..3 of a <16 x i8> (all remaining lanes are undef), then bitcasts the
; result to <4 x i32>. The P9 expectations below advance the pointer by 4
; before the lxvwsx, i.e. the word is splatted from the upper half of the
; 8 bytes in memory.
define dso_local <4 x i32> @testSplat4Low(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
450+
; CHECK-P8-LABEL: testSplat4Low:
451+
; CHECK-P8: # %bb.0: # %entry
452+
; CHECK-P8-NEXT: ld r3, 0(r3)
453+
; CHECK-P8-NEXT: mtfprd f0, r3
454+
; CHECK-P8-NEXT: xxspltw v2, vs0, 0
455+
; CHECK-P8-NEXT: blr
456+
;
457+
; CHECK-P9-LABEL: testSplat4Low:
458+
; CHECK-P9: # %bb.0: # %entry
459+
; CHECK-P9-NEXT: addi r3, r3, 4
460+
; CHECK-P9-NEXT: lxvwsx v2, 0, r3
461+
; CHECK-P9-NEXT: blr
462+
;
463+
; CHECK-NOVSX-LABEL: testSplat4Low:
464+
; CHECK-NOVSX: # %bb.0: # %entry
465+
; CHECK-NOVSX-NEXT: ld r3, 0(r3)
466+
; CHECK-NOVSX-NEXT: addi r4, r1, -16
467+
; CHECK-NOVSX-NEXT: std r3, -16(r1)
468+
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
469+
; CHECK-NOVSX-NEXT: vspltw v2, v2, 2
470+
; CHECK-NOVSX-NEXT: blr
471+
entry:
472+
%0 = load <8 x i8>, <8 x i8>* %ptr, align 8
473+
%vecinit18 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
474+
%1 = bitcast <16 x i8> %vecinit18 to <4 x i32>
475+
ret <4 x i32> %1
476+
}
477+
478+
; Function Attrs: norecurse nounwind readonly
479+
; Loads an <8 x i8> and shuffles bytes 0..3 of the loaded value into lanes
; 0..3 and again into lanes 4..7 of a <16 x i8> (lanes 8..15 are undef),
; then bitcasts the result to <4 x i32>. The P9 expectations below use
; lxvwsx directly on the incoming pointer, with no address adjustment.
define dso_local <4 x i32> @testSplat4hi(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
480+
; CHECK-P8-LABEL: testSplat4hi:
481+
; CHECK-P8: # %bb.0: # %entry
482+
; CHECK-P8-NEXT: ld r3, 0(r3)
483+
; CHECK-P8-NEXT: mtfprd f0, r3
484+
; CHECK-P8-NEXT: xxspltw v2, vs0, 1
485+
; CHECK-P8-NEXT: blr
486+
;
487+
; CHECK-P9-LABEL: testSplat4hi:
488+
; CHECK-P9: # %bb.0: # %entry
489+
; CHECK-P9-NEXT: lxvwsx v2, 0, r3
490+
; CHECK-P9-NEXT: blr
491+
;
492+
; CHECK-NOVSX-LABEL: testSplat4hi:
493+
; CHECK-NOVSX: # %bb.0: # %entry
494+
; CHECK-NOVSX-NEXT: ld r3, 0(r3)
495+
; CHECK-NOVSX-NEXT: addi r4, r1, -16
496+
; CHECK-NOVSX-NEXT: std r3, -16(r1)
497+
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
498+
; CHECK-NOVSX-NEXT: vspltw v2, v2, 3
499+
; CHECK-NOVSX-NEXT: blr
500+
entry:
501+
%0 = load <8 x i8>, <8 x i8>* %ptr, align 8
502+
%vecinit22 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
503+
%1 = bitcast <16 x i8> %vecinit22 to <4 x i32>
504+
ret <4 x i32> %1
505+
}
506+
507+
; Function Attrs: norecurse nounwind readonly
508+
; Loads an <8 x i8> and shuffles all eight loaded bytes into lanes 0..7 and
; again into lanes 8..15 of a <16 x i8>, then bitcasts the result to
; <2 x i64>. The P8 and P9 expectations below are both a single lxvdsx on
; the incoming pointer.
define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
509+
; CHECK-P8-LABEL: testSplat8:
510+
; CHECK-P8: # %bb.0: # %entry
511+
; CHECK-P8-NEXT: lxvdsx v2, 0, r3
512+
; CHECK-P8-NEXT: blr
513+
;
514+
; CHECK-P9-LABEL: testSplat8:
515+
; CHECK-P9: # %bb.0: # %entry
516+
; CHECK-P9-NEXT: lxvdsx v2, 0, r3
517+
; CHECK-P9-NEXT: blr
518+
;
519+
; CHECK-NOVSX-LABEL: testSplat8:
520+
; CHECK-NOVSX: # %bb.0: # %entry
521+
; CHECK-NOVSX-NEXT: ld r3, 0(r3)
522+
; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI19_0@toc@ha
523+
; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI19_0@toc@l
524+
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
525+
; CHECK-NOVSX-NEXT: std r3, -16(r1)
526+
; CHECK-NOVSX-NEXT: addi r3, r1, -16
527+
; CHECK-NOVSX-NEXT: lvx v3, 0, r3
528+
; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2
529+
; CHECK-NOVSX-NEXT: blr
530+
entry:
531+
%0 = load <8 x i8>, <8 x i8>* %ptr, align 8
532+
%vecinit30 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
533+
%1 = bitcast <16 x i8> %vecinit30 to <2 x i64>
534+
ret <2 x i64> %1
535+
}
536+
449537
declare double @dummy() local_unnamed_addr
450538
attributes #0 = { nounwind }

0 commit comments

Comments
 (0)