Skip to content

Commit

Permalink
Reland "[LV]: Teach LV to recursively (de)interleave." (#125094)
Browse files Browse the repository at this point in the history
This patch relands the changes from "[LV]: Teach LV to recursively
(de)interleave.#122989"
    Reason for revert:
- The patch exposed an assert in the vectorizer related to VF difference
between
legacy cost model and VPlan-based cost model because of uncalculated
cost for
VPInstruction which is created by VPlanTransforms as a replacement to
'or disjoint'
       instruction.
VPlanTransforms do that instructions change when there are memory
interleaving and
predicated blocks, but that change didn't cause problems because at most
cases the cost
      difference between legacy/new models is not noticeable.
    - Issue is fixed by #125434

Original patch: #89018
Reviewed-by: paulwalker-arm, Mel-Chen
  • Loading branch information
hassnaaHamdi authored Feb 9, 2025
1 parent 4722200 commit e9a20f7
Show file tree
Hide file tree
Showing 6 changed files with 1,387 additions and 671 deletions.
14 changes: 7 additions & 7 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3439,10 +3439,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
if (hasIrregularType(ScalarTy, DL))
return false;

// We currently only know how to emit interleave/deinterleave with
// Factor=2 for scalable vectors. This is purely an implementation
// limit.
if (VF.isScalable() && InterleaveFactor != 2)
// For scalable vectors, the only interleave factor currently supported
// must be power of 2 since we require the (de)interleave2 intrinsics
// instead of shufflevectors.
if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
return false;

// If the group involves a non-integral pointer, we may not be able to
Expand Down Expand Up @@ -9311,9 +9311,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
// For scalable vectors, the only interleave factor currently supported
// is 2 since we require the (de)interleave2 intrinsics instead of
// shufflevectors.
assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
// must be power of 2 since we require the (de)interleave2 intrinsics
// instead of shufflevectors.
assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
"Unsupported interleave factor for scalable vectors");
return Result;
};
Expand Down
79 changes: 56 additions & 23 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2915,10 +2915,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
Vals,
/*FMFSource=*/nullptr, Name);
assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
"scalable vectors, must be power of 2");
SmallVector<Value *> InterleavingValues(Vals);
// When interleaving, the number of values will be shrunk until we have the
// single final interleaved value.
auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
for (unsigned I = 0; I < Midpoint; ++I)
InterleavingValues[I] = Builder.CreateIntrinsic(
InterleaveTy, Intrinsic::vector_interleave2,
{InterleavingValues[I], InterleavingValues[Midpoint + I]},
/*FMFSource=*/nullptr, Name);
}
return InterleavingValues[0];
}

// Fixed length. Start by concatenating all vectors into a wide vector.
Expand Down Expand Up @@ -3004,15 +3015,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
&InterleaveFactor](Value *MaskForGaps) -> Value * {
if (State.VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
assert(InterleaveFactor == 2 &&
assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
auto *ResBlockInMask = State.get(BlockInMask);
SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
State.VF.getKnownMinValue() * 2, true);
return State.Builder.CreateIntrinsic(
MaskTy, Intrinsic::vector_interleave2, Ops,
/*FMFSource=*/nullptr, "interleaved.mask");
SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
return interleaveVectors(State.Builder, Ops, "interleaved.mask");
}

if (!BlockInMask)
Expand Down Expand Up @@ -3052,22 +3059,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
ArrayRef<VPValue *> VPDefs = definedValues();
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
if (VecTy->isScalableTy()) {
assert(InterleaveFactor == 2 &&
assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");

// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
Value *DI = State.Builder.CreateIntrinsic(
Intrinsic::vector_deinterleave2, VecTy, NewLoad,
/*FMFSource=*/nullptr, "strided.vec");
unsigned J = 0;
for (unsigned I = 0; I < InterleaveFactor; ++I) {
Instruction *Member = Group->getMember(I);
// Scalable vectors cannot use arbitrary shufflevectors (only splats),
// so must use intrinsics to deinterleave.
SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
DeinterleavedValues[0] = NewLoad;
// For the case of InterleaveFactor > 2, we will have to do recursive
// deinterleaving, because the current available deinterleave intrinsic
// supports only Factor of 2, otherwise it will bailout after first
// iteration.
// When deinterleaving, the number of values will double until we
// have "InterleaveFactor".
for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
NumVectors *= 2) {
// Deinterleave the elements within the vector
SmallVector<Value *> TempDeinterleavedValues(NumVectors);
for (unsigned I = 0; I < NumVectors; ++I) {
auto *DiTy = DeinterleavedValues[I]->getType();
TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
/*FMFSource=*/nullptr, "strided.vec");
}
// Extract the deinterleaved values:
for (unsigned I = 0; I < 2; ++I)
for (unsigned J = 0; J < NumVectors; ++J)
DeinterleavedValues[NumVectors * I + J] =
State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
}

if (!Member)
#ifndef NDEBUG
for (Value *Val : DeinterleavedValues)
assert(Val && "NULL Deinterleaved Value");
#endif
for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
Instruction *Member = Group->getMember(I);
Value *StridedVec = DeinterleavedValues[I];
if (!Member) {
// This value is not needed as it's not used
cast<Instruction>(StridedVec)->eraseFromParent();
continue;

Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
}
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
Expand Down
Loading

0 comments on commit e9a20f7

Please sign in to comment.