-
Notifications
You must be signed in to change notification settings - Fork 12.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Reland: [LV]: Teach LV to recursively (de)interleave. #122989
Conversation
This commit relands the changes from "[LV]: Teach LV to recursively (de)interleave. llvm#89018" Reason for revert: - The patch exposed a bug in the IA pass, the bug is now fixed and landed by commit: llvm#122643
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Hassnaa Hamdi (hassnaaHamdi) ChangesThis commit relands the changes from "[LV]: Teach LV to recursively (de)interleave. #89018" Reason for revert:
Patch is 194.17 KiB, truncated to 20.00 KiB below, full version: /~https://github.com/llvm/llvm-project/pull/122989.diff 6 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 744faef1924380..ada2ad96bb9bfd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3503,10 +3503,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
if (hasIrregularType(ScalarTy, DL))
return false;
- // We currently only know how to emit interleave/deinterleave with
- // Factor=2 for scalable vectors. This is purely an implementation
- // limit.
- if (VF.isScalable() && InterleaveFactor != 2)
+ // For scalable vectors, the only interleave factor currently supported
+ // must be power of 2 since we require the (de)interleave2 intrinsics
+ // instead of shufflevectors.
+ if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
return false;
// If the group involves a non-integral pointer, we may not be able to
@@ -9410,9 +9410,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
CM.getWideningDecision(IG->getInsertPos(), VF) ==
LoopVectorizationCostModel::CM_Interleave);
// For scalable vectors, the only interleave factor currently supported
- // is 2 since we require the (de)interleave2 intrinsics instead of
- // shufflevectors.
- assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+ // must be power of 2 since we require the (de)interleave2 intrinsics
+ // instead of shufflevectors.
+ assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
"Unsupported interleave factor for scalable vectors");
return Result;
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4057a51155ece0..414ca5529d9ea0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2860,10 +2860,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
// Scalable vectors cannot use arbitrary shufflevectors (only splats), so
// must use intrinsics to interleave.
if (VecTy->isScalableTy()) {
- VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
- return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
- Vals,
- /*FMFSource=*/nullptr, Name);
+ assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
+ "scalable vectors, must be power of 2");
+ SmallVector<Value *> InterleavingValues(Vals);
+ // When interleaving, the number of values will be shrunk until we have the
+ // single final interleaved value.
+ auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
+ for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
+ InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
+ for (unsigned I = 0; I < Midpoint; ++I)
+ InterleavingValues[I] = Builder.CreateIntrinsic(
+ InterleaveTy, Intrinsic::vector_interleave2,
+ {InterleavingValues[I], InterleavingValues[Midpoint + I]},
+ /*FMFSource=*/nullptr, Name);
+ }
+ return InterleavingValues[0];
}
// Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2949,15 +2960,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
&InterleaveFactor](Value *MaskForGaps) -> Value * {
if (State.VF.isScalable()) {
assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
- assert(InterleaveFactor == 2 &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
auto *ResBlockInMask = State.get(BlockInMask);
- SmallVector<Value *, 2> Ops = {ResBlockInMask, ResBlockInMask};
- auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
- State.VF.getKnownMinValue() * 2, true);
- return State.Builder.CreateIntrinsic(
- MaskTy, Intrinsic::vector_interleave2, Ops,
- /*FMFSource=*/nullptr, "interleaved.mask");
+ SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
+ return interleaveVectors(State.Builder, Ops, "interleaved.mask");
}
if (!BlockInMask)
@@ -2997,22 +3004,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
ArrayRef<VPValue *> VPDefs = definedValues();
const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
if (VecTy->isScalableTy()) {
- assert(InterleaveFactor == 2 &&
+ assert(isPowerOf2_32(InterleaveFactor) &&
"Unsupported deinterleave factor for scalable vectors");
- // Scalable vectors cannot use arbitrary shufflevectors (only splats),
- // so must use intrinsics to deinterleave.
- Value *DI = State.Builder.CreateIntrinsic(
- Intrinsic::vector_deinterleave2, VecTy, NewLoad,
- /*FMFSource=*/nullptr, "strided.vec");
- unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
+ // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+ // so must use intrinsics to deinterleave.
+ SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
+ DeinterleavedValues[0] = NewLoad;
+ // For the case of InterleaveFactor > 2, we will have to do recursive
+ // deinterleaving, because the current available deinterleave intrinsic
+ // supports only Factor of 2, otherwise it will bailout after first
+ // iteration.
+ // When deinterleaving, the number of values will double until we
+ // have "InterleaveFactor".
+ for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
+ NumVectors *= 2) {
+ // Deinterleave the elements within the vector
+ SmallVector<Value *> TempDeinterleavedValues(NumVectors);
+ for (unsigned I = 0; I < NumVectors; ++I) {
+ auto *DiTy = DeinterleavedValues[I]->getType();
+ TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
+ Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
+ /*FMFSource=*/nullptr, "strided.vec");
+ }
+ // Extract the deinterleaved values:
+ for (unsigned I = 0; I < 2; ++I)
+ for (unsigned J = 0; J < NumVectors; ++J)
+ DeinterleavedValues[NumVectors * I + J] =
+ State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
+ }
- if (!Member)
+#ifndef NDEBUG
+ for (Value *Val : DeinterleavedValues)
+ assert(Val && "NULL Deinterleaved Value");
+#endif
+ for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+ Value *StridedVec = DeinterleavedValues[I];
+ if (!Member) {
+ // This value is not needed as it's not used
+ static_cast<Instruction *>(StridedVec)->eraseFromParent();
continue;
-
- Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
+ }
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index bf956227334616..05c0bc0761ea4f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP10]])
; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
@@ -1548,5 +1548,263 @@ end:
ret void
}
+; Check vectorization on an interleaved load/store groups of factor 4
+
+; for (int i = 0; i < 1024; ++i) {
+; dst[i].x = a[i].x + b[i].x;
+; dst[i].y = a[i].y - b[i].y;
+; dst[i].z = a[i].z << b[i].z;
+; dst[i].t = a[i].t >> b[i].t;
+; }
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) {
+; CHECK-LABEL: @interleave_deinterleave(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP7]])
+; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC6]], 1
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC7]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <vscale x 16 x i32>, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC8]])
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } [[STRIDED_VEC9]], 1
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP14]])
+; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP15]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC10]], 1
+; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC11]], 1
+; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
+; CHECK-NEXT: [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
+; CHECK-NEXT: [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP23]])
+; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP24]])
+; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> [[INTERLEAVED_VEC]], <vscale x 8 x i32> [[INTERLEAVED_VEC12]])
+; CHECK-NEXT: store <vscale x 16 x i32> [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4
+; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4
+; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4
+; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
+; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4
+; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]]
+; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8
+; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4
+; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
+; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4
+; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12
+; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4
+; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]]
+; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12
+; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv
+ %1 = load i32, ptr %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv
+ store i32 %add, ptr %arrayidx5, align 4
+ %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4
+ %2 = load i32, ptr %y, align 4
+ %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4
+ %3 = load i32, ptr %y11, align 4
+ %sub = sub nsw i32 %2, %3
+ %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4
+ store i32 %sub, ptr %y14, align 4
+ %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8
+ %4 = load i32, ptr %z, align 4
+ %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8
+ %5 = load i32, ptr %z19, align 4
+ %shl = shl i32 %4, %5
+ %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8
+ store i32 %shl, ptr %z22, align 4
+ %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12
+ %6 = load i32, ptr %t, align 4
+ %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12
+ %7 = load i32, ptr %t27, align 4
+ %shr = ashr i32 %6, %7
+ %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12
+ store i32 %shr, ptr %t30, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; Check vectorization on a reverse interleaved load/store groups of factor 4
+
+; for (int i = 1023; i >= 0; i--) {
+; int a = A[i].x + i;
+; int b = A[i].y - i;
+; int c = A[i].z * i;
+; int d = A[i].t << i;
+; B[i].x = a;
+; B[i].y = b;
+; B[i].z = c;
+; B[i].t = d;
+; }
+
+define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{
+; CHECK-LABEL: @interleave_deinterleave_reverse(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK-NEXT: [[INDUCTION:%.*]] = sub <vscale x 4 x i32> splat (i32 1023), [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDE...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM! This patch is identical to #89018, since the reason for reverting last time was due to a bug in a different IR pass that this PR was exposing. I'm accepting this on the basis that it's identical to before and it has already been thoroughly reviewed and accepted by several reviewers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. I also included some suggestions to help with readability of the tests.
llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
Outdated
Show resolved
Hide resolved
Hello, It seems that this patch broke sve vls bots: |
This looks like the loop vectoriser cost model assert firing. FYI @fhahn.
It's only a bug when building with asserts and just indicates the legacy and new vplan cost models do not agree. I guess this is another example that needs adding to |
It sounds like this somehow introduced a divergence between the cost for interleave groups in the legacy and VPlan cost model. Without seeing the test case it's difficult to tell as the cost computations didn't really change in the patch AFAICT, but I suspect the divergence should be fixed |
Hi, |
This reverts commit 9491f75. This triggers an assert when building with SVE enabled. https://lab.llvm.org/buildbot/#/builders/143/builds/4795
Given that we are close to cutting the release branch (I think) and the bot has been red for a number of days I reverted the change for now and added a reduced test case for the crash in 3418cd0 It looks like there might be some gaps with test coverage for costing scalable interleave groups, |
Hi, |
It sounds like |
This patch relands the changes from "[LV]: Teach LV to recursively (de)interleave.#122989" Reason for revert: - The patch exposed an assert in the vectorizer related to VF difference between legacy cost model and VPlan-based cost model because of uncalculated cost for VPInstruction which is created by VPlanTransforms as a replacement to 'or disjoint' instruction. VPlanTransforms do that instructions change when there are memory interleaving and predicated blocks, but that change didn't cause problems because at most cases the cost difference between legacy/new models is not noticeable. - Issue is fixed by #125434 Original patch: #89018 Reviewed-by: paulwalker-arm, Mel-Chen
This patch relands the changes from "[LV]: Teach LV to recursively (de)interleave.llvm#122989" Reason for revert: - The patch exposed an assert in the vectorizer related to VF difference between legacy cost model and VPlan-based cost model because of uncalculated cost for VPInstruction which is created by VPlanTransforms as a replacement to 'or disjoint' instruction. VPlanTransforms do that instructions change when there are memory interleaving and predicated blocks, but that change didn't cause problems because at most cases the cost difference between legacy/new models is not noticeable. - Issue is fixed by llvm#125434 Original patch: llvm#89018 Reviewed-by: paulwalker-arm, Mel-Chen
This commit relands the changes from "[LV]: Teach LV to recursively (de)interleave. #89018"
Reason for revert: