Skip to content

Commit

Permalink
Add QuadBroadcast optimization for WaveShuffleIndex pattern
Browse files Browse the repository at this point in the history
Adds a new QuadBroadcast intrinsic for  WaveShuffleIndex operation.
  • Loading branch information
SharmaRithik authored and igcbot committed Feb 24, 2025
1 parent 26a57ba commit 34c8a74
Show file tree
Hide file tree
Showing 6 changed files with 281 additions and 52 deletions.
21 changes: 21 additions & 0 deletions IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6051,6 +6051,24 @@ void EmitPass::emitSimdClusteredBroadcast(llvm::Instruction* inst)

}

void EmitPass::emitQuadBroadcast(llvm::Instruction* inst) {
CVariable* data = GetSymbol(inst->getOperand(0));
ConstantInt* laneOp = dyn_cast<ConstantInt>(inst->getOperand(1));
IGC_ASSERT(laneOp && laneOp->getZExtValue() < 4);

if (data->IsUniform()) {
m_encoder->Copy(m_destination, data);
m_encoder->Push();
return;
}

m_encoder->SetNoMask();
m_encoder->SetSrcRegion(0, 4, 4, 0);
m_encoder->SetSrcSubReg(0, laneOp->getZExtValue());
m_encoder->Copy(m_destination, data);
m_encoder->Push();
}

void EmitPass::emitSimdShuffleDown(llvm::Instruction* inst)
{
CVariable* pCurrentData = GetSymbol(inst->getOperand(0));
Expand Down Expand Up @@ -9476,6 +9494,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
case GenISAIntrinsic::GenISA_WaveBroadcast:
emitSimdShuffle(inst);
break;
case GenISAIntrinsic::GenISA_QuadBroadcast:
emitQuadBroadcast(inst);
break;
case GenISAIntrinsic::GenISA_WaveClusteredBroadcast:
emitSimdClusteredBroadcast(inst);
break;
Expand Down
1 change: 1 addition & 0 deletions IGC/Compiler/CISACodeGen/EmitVISAPass.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ class EmitPass : public llvm::FunctionPass
void emitSimdSize(llvm::Instruction* inst);
void emitSimdShuffle(llvm::Instruction* inst);
void emitSimdClusteredBroadcast(llvm::Instruction* inst);
void emitQuadBroadcast(llvm::Instruction* inst);
void emitCrossInstanceMov(const SSource& source, const DstModifier& modifier);
void emitSimdShuffleDown(llvm::Instruction* inst);
void emitSimdShuffleXor(llvm::Instruction* inst);
Expand Down
1 change: 1 addition & 0 deletions IGC/Compiler/CISACodeGen/opCode.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ DECLARE_OPCODE(GenISA_WavePrefix, GenISAIntrinsic, llvm_wavePrefix, false, false
DECLARE_OPCODE(GenISA_QuadPrefix, GenISAIntrinsic, llvm_quadPrefix, false, false, false, false, false, false, false)
DECLARE_OPCODE(GenISA_WaveClusteredPrefix, GenISAIntrinsic, llvm_waveClusteredPrefix, false, false, false, false, false, false, false)
DECLARE_OPCODE(GenISA_WaveShuffleIndex, GenISAIntrinsic, llvm_waveShuffleIndex, false, false, false, false, false, false, false)
DECLARE_OPCODE(GenISA_QuadBroadcast, GenISAIntrinsic, llvm_QuadBroadcast, false, false, false, false, false, false, false)
DECLARE_OPCODE(GenISA_WaveBroadcast, GenISAIntrinsic, llvm_waveBroadcast, false, false, false, false, false, false, false)
DECLARE_OPCODE(GenISA_WaveClusteredBroadcast, GenISAIntrinsic, llvm_waveClusteredBroadcast, false, false, false, false, false, false, false)

Expand Down
164 changes: 112 additions & 52 deletions IGC/Compiler/CustomSafeOptPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,7 @@ void CustomSafeOptPass::visitAnd(BinaryOperator& I) {
// also be written manually as
// uint32_t other_id = sg.get_local_id() ^ XOR_VALUE;
// r = select_from_group(sg, x, other_id);
void CustomSafeOptPass::visitShuffleIndex(llvm::CallInst* I)
{
void CustomSafeOptPass::visitShuffleIndex(llvm::CallInst* I) {
using namespace llvm::PatternMatch;
/*
Pattern match
Expand All @@ -299,87 +298,148 @@ void CustomSafeOptPass::visitShuffleIndex(llvm::CallInst* I)
%xor = xor i16 %[optional1], 1
...[optional2] = %xor
%simdShuffle = call i32 @llvm.genx.GenISA.WaveShuffleIndex.i32(i32 %x, i32 %[optional2], i32 0)
Optional can be any combinations of :
Optional can be any combinations of:
* %and = and i16 %856, 63
* %zext = zext i16 %857 to i32
We ignore any combinations of those, as they don't change the final calculated value,
and different permutations were observed.
*/

auto getInstructionIgnoringAndZext = [](Value* V, unsigned Opcode) -> Instruction* {
while (auto* VI = dyn_cast<Instruction>(V)) {
if (VI->getOpcode() == Opcode) {
return VI;
}
else if (auto* ZI = dyn_cast<ZExtInst>(VI)) {
// Check if zext is from i16 to i32
if (ZI->getSrcTy()->isIntegerTy(16) && ZI->getDestTy()->isIntegerTy(32)) {
V = ZI->getOperand(0); // Skip over zext
}
else {
return nullptr; // Not the zext we are looking for
}
}
else if (VI->getOpcode() == Instruction::And) {
ConstantInt* andValueConstant = dyn_cast<ConstantInt>(VI->getOperand(1));
// We handle "redundant values", so those which bits enable all of
// 32 lanes, so 31, 63 (spotted in nature), 127, 255 etc.
if (andValueConstant && ((andValueConstant->getZExtValue() & 31) != 31)) {
return nullptr;
}
V = VI->getOperand(0); // Skip over and
}
else {
return nullptr; // Not a zext, and, or the specified opcode
}
}
return nullptr; //unreachable
};

Value* indexOp = I->getOperand(1);

// Get helper lanes parameter
ConstantInt* enableHelperLanes = dyn_cast<ConstantInt>(I->getOperand(2));
if (!enableHelperLanes || enableHelperLanes->getZExtValue() != 0) {
if (!enableHelperLanes) {
return;
}

auto getInstructionIgnoringAndZext = []( Value* V, unsigned Opcode ) -> Instruction* {
while( auto* VI = dyn_cast<Instruction>( V ) ) {
if( VI->getOpcode() == Opcode ) {
return VI;
}
else if( auto* ZI = dyn_cast<ZExtInst>( VI ) ) {
// Check if zext is from i16 to i32
if( ZI->getSrcTy()->isIntegerTy( 16 ) && ZI->getDestTy()->isIntegerTy( 32 ) ) {
V = ZI->getOperand( 0 ); // Skip over zext
} else {
return nullptr; // Not the zext we are looking for
// Try QuadBroadcast pattern if helper lanes = 1
if (enableHelperLanes->getZExtValue() == 1) {
auto* zextInst = dyn_cast<ZExtInst>(indexOp);
if (zextInst && zextInst->getSrcTy()->isIntegerTy(16) &&
zextInst->getDestTy()->isIntegerTy(32)) {

auto* andInst = dyn_cast<Instruction>(zextInst->getOperand(0));
if (andInst && andInst->getOpcode() == Instruction::And) {
// Check for mask constant -4 (0xFFFC)
auto* mask = dyn_cast<ConstantInt>(andInst->getOperand(1));
if (mask && mask->getSExtValue() == -4) {
uint32_t laneIdx = 0;
Value* simdLaneOp = andInst->getOperand(0);

// Check for or operation
if (auto* orInst = dyn_cast<Instruction>(simdLaneOp)) {
if (orInst->getOpcode() == Instruction::Or) {
auto* constOffset = dyn_cast<ConstantInt>(orInst->getOperand(1));
// Return if OR value is not a constant or is >= 4
if (!constOffset || constOffset->getZExtValue() >= 4) {
return;
}
laneIdx = constOffset->getZExtValue() & 0x3;
simdLaneOp = orInst->getOperand(0);
}
}
}
else if( VI->getOpcode() == Instruction::And ) {
ConstantInt* andValueConstant = dyn_cast<ConstantInt>( VI->getOperand( 1 ) );
// We handle "redundant values", so those which bits enable all of
// 32 lanes, so 31, 63 (spotted in nature), 127, 255 etc.
if( andValueConstant && (( andValueConstant->getZExtValue() & 31 ) != 31 ) ) {
return nullptr;

// Check for simdLaneId
auto* simdLaneCall = dyn_cast<CallInst>(simdLaneOp);
if (simdLaneCall) {
Function* simdIdF = simdLaneCall->getCalledFunction();
if (simdIdF &&
GenISAIntrinsic::getIntrinsicID(simdIdF) == GenISAIntrinsic::GenISA_simdLaneId) {

// Pattern matched - create QuadBroadcast
IRBuilder<> builder(I);

Function* quadBroadcastFunc = GenISAIntrinsic::getDeclaration(
builder.GetInsertBlock()->getParent()->getParent(),
GenISAIntrinsic::GenISA_QuadBroadcast,
I->getType());

Value* result = builder.CreateCall(quadBroadcastFunc,
{ I->getOperand(0), builder.getInt32(laneIdx) },
"quadBroadcast");

I->replaceAllUsesWith(result);
I->eraseFromParent();
return;
}
}
V = VI->getOperand( 0 ); // Skip over and
} else {
return nullptr; // Not a zext, and, or the specified opcode
}
}
return nullptr; //unreachable
};
}
}

// Try ShuffleXor pattern if helper lanes = 0
if (enableHelperLanes->getZExtValue() != 0) {
return;
}

Instruction* xorInst = getInstructionIgnoringAndZext( I->getOperand( 1 ), Instruction::Xor );
if( !xorInst )
Instruction* xorInst = getInstructionIgnoringAndZext(indexOp, Instruction::Xor);
if (!xorInst)
return;

auto xorOperand = xorInst->getOperand( 0 );
auto xorValueConstant = dyn_cast<ConstantInt> ( xorInst->getOperand( 1 ) );
if( !xorValueConstant )
auto xorOperand = xorInst->getOperand(0);
auto xorValueConstant = dyn_cast<ConstantInt>(xorInst->getOperand(1));
if (!xorValueConstant)
return;

uint64_t xorValue = xorValueConstant->getZExtValue();
if( xorValue >= 16 )
{
if (xorValue >= 16) {
// currently not supported in the emitter
return;
}

auto simdLaneCandidate = getInstructionIgnoringAndZext( xorOperand, Instruction::Call );

auto simdLaneCandidate = getInstructionIgnoringAndZext(xorOperand, Instruction::Call);
if (!simdLaneCandidate)
return;

CallInst* CI = cast<CallInst>( simdLaneCandidate );
CallInst* CI = cast<CallInst>(simdLaneCandidate);
Function* simdIdF = CI->getCalledFunction();
if( !simdIdF || GenISAIntrinsic::getIntrinsicID( simdIdF ) != GenISAIntrinsic::GenISA_simdLaneId)
if (!simdIdF || GenISAIntrinsic::getIntrinsicID(simdIdF) != GenISAIntrinsic::GenISA_simdLaneId)
return;

// since we didn't return earlier, pattern is found

// ShuffleXor pattern found
auto insertShuffleXor = [](IRBuilder<>& builder,
Value* value,
uint32_t xorValue)
{
Function* simdShuffleXorFunc = GenISAIntrinsic::getDeclaration(
builder.GetInsertBlock()->getParent()->getParent(),
GenISAIntrinsic::GenISA_simdShuffleXor,
value->getType());

return builder.CreateCall(simdShuffleXorFunc,
{ value, builder.getInt32(xorValue) }, "simdShuffleXor");
};
Value* value,
uint32_t xorValue) {
Function* simdShuffleXorFunc = GenISAIntrinsic::getDeclaration(
builder.GetInsertBlock()->getParent()->getParent(),
GenISAIntrinsic::GenISA_simdShuffleXor,
value->getType());

return builder.CreateCall(simdShuffleXorFunc,
{ value, builder.getInt32(xorValue) }, "simdShuffleXor");
};

Value* value = I->getOperand(0);
IRBuilder<> builder(I);
Expand Down
128 changes: 128 additions & 0 deletions IGC/Compiler/tests/CustomSafeOptPass/quad_broadcast.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
;=========================== begin_copyright_notice ============================
;
; Copyright (C) 2017-2022 Intel Corporation
;
; SPDX-License-Identifier: MIT
;
;============================ end_copyright_notice =============================

; RUN: igc_opt -igc-custom-safe-opt -S %s -o %t.ll
; RUN: FileCheck %s --input-file=%t.ll

declare i16 @llvm.genx.GenISA.simdLaneId()
declare float @llvm.genx.GenISA.WaveShuffleIndex.f32(float, i32, i32)
declare float @llvm.genx.GenISA.QuadBroadcast.f32(float, i32)

; Test basic quad broadcast pattern for lane 0
; CHECK-LABEL: @test_quad_broadcast_lane0
define float @test_quad_broadcast_lane0(float %x) nounwind {
entry:
%lane = call i16 @llvm.genx.GenISA.simdLaneId()
%masked = and i16 %lane, -4 ; Mask to quad boundary (0xFFFC)
%idx = zext i16 %masked to i32
; CHECK: call float @llvm.genx.GenISA.QuadBroadcast.f32(float %x, i32 0)
%result = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 1)
ret float %result
}

; Test basic quad broadcast pattern for lane 1
; CHECK-LABEL: @test_quad_broadcast_lane1
define float @test_quad_broadcast_lane1(float %x) nounwind {
entry:
%lane = call i16 @llvm.genx.GenISA.simdLaneId()
%lane1 = or i16 %lane, 1 ; Set bit for lane 1
%masked = and i16 %lane1, -4 ; Mask to quad boundary
%idx = zext i16 %masked to i32
; CHECK: call float @llvm.genx.GenISA.QuadBroadcast.f32(float %x, i32 1)
%result = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 1)
ret float %result
}

; Test basic quad broadcast pattern for lane 2
; CHECK-LABEL: @test_quad_broadcast_lane2
define float @test_quad_broadcast_lane2(float %x) nounwind {
entry:
%lane = call i16 @llvm.genx.GenISA.simdLaneId()
%lane2 = or i16 %lane, 2 ; Set bit for lane 2
%masked = and i16 %lane2, -4 ; Mask to quad boundary
%idx = zext i16 %masked to i32
; CHECK: call float @llvm.genx.GenISA.QuadBroadcast.f32(float %x, i32 2)
%result = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 1)
ret float %result
}

; Test basic quad broadcast pattern for lane 3
; CHECK-LABEL: @test_quad_broadcast_lane3
define float @test_quad_broadcast_lane3(float %x) nounwind {
entry:
%lane = call i16 @llvm.genx.GenISA.simdLaneId()
%lane3 = or i16 %lane, 3 ; Set bit for lane 3
%masked = and i16 %lane3, -4 ; Mask to quad boundary
%idx = zext i16 %masked to i32
; CHECK: call float @llvm.genx.GenISA.QuadBroadcast.f32(float %x, i32 3)
%result = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 1)
ret float %result
}

; Test that we don't transform when helper lanes = 0
; CHECK-LABEL: @test_no_transform_helper_lanes
define float @test_no_transform_helper_lanes(float %x) nounwind {
entry:
%lane = call i16 @llvm.genx.GenISA.simdLaneId()
%masked = and i16 %lane, -4
%idx = zext i16 %masked to i32
; CHECK: call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 0)
%result = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 0)
ret float %result
}

; Test that we don't transform when using different AND mask
; CHECK-LABEL: @test_no_transform_different_mask
define float @test_no_transform_different_mask(float %x) nounwind {
entry:
%lane = call i16 @llvm.genx.GenISA.simdLaneId()
%masked = and i16 %lane, -8 ; Different mask, not -4 (0xFFFC)
%idx = zext i16 %masked to i32
; CHECK: call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 1)
%result = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 1)
ret float %result
}

; Test that we don't transform when OR constant is too large
; CHECK-LABEL: @test_no_transform_large_lane
define float @test_no_transform_large_lane(float %x) nounwind {
entry:
%lane = call i16 @llvm.genx.GenISA.simdLaneId()
%lane4 = or i16 %lane, 4 ; Invalid quad lane (must be 0-3)
%masked = and i16 %lane4, -4 ; Mask to quad boundary
%idx = zext i16 %masked to i32
; CHECK: call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 1)
%result = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 1)
ret float %result
}

; Test that we don't transform when OR uses non-constant value
; CHECK-LABEL: @test_no_transform_variable_lane
define float @test_no_transform_variable_lane(float %x, i16 %lane_val) nounwind {
entry:
%lane = call i16 @llvm.genx.GenISA.simdLaneId()
%laneN = or i16 %lane, %lane_val ; Variable lane index
%masked = and i16 %laneN, -4 ; Mask to quad boundary
%idx = zext i16 %masked to i32
; CHECK: call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 1)
%result = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 1)
ret float %result
}

; Test that we don't transform valid quad pattern when helper_lanes = 0
; CHECK-LABEL: @test_no_transform_valid_lane_wrong_helper
define float @test_no_transform_valid_lane_wrong_helper(float %x) nounwind {
entry:
%lane = call i16 @llvm.genx.GenISA.simdLaneId()
%lane1 = or i16 %lane, 1 ; Valid lane (1)
%masked = and i16 %lane1, -4 ; Correct mask
%idx = zext i16 %masked to i32
; CHECK: call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 0)
%result = call float @llvm.genx.GenISA.WaveShuffleIndex.f32(float %x, i32 %idx, i32 0)
ret float %result
}
Loading

0 comments on commit 34c8a74

Please sign in to comment.