diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 152f8ecdf29c..5b18aefbd781 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1128,13 +1128,6 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
                                         const SIProgramInfo &CurrentProgramInfo,
                                         const MachineFunction &MF) const {
-  const Function &F = MF.getFunction();
-
-  // Avoid asserting on erroneous cases.
-  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
-      F.getCallingConv() != CallingConv::SPIR_KERNEL)
-    return;
-
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
@@ -1181,8 +1174,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
   if (STM.isXNACKEnabled())
     Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
 
-  unsigned MaxKernArgAlign;
-  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
+  // FIXME: Should use getKernArgSize
+  Out.kernarg_segment_byte_size =
+      STM.getKernArgSegmentSize(MF.getFunction(), MFI->getExplicitKernArgSize());
   Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
   Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
   Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
@@ -1191,7 +1185,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
   // These alignment values are specified in powers of two, so alignment =
   // 2^n. The minimum alignment is 2^4 = 16.
   Out.kernarg_segment_alignment = std::max((size_t)4,
-      countTrailingZeros(MaxKernArgAlign));
+      countTrailingZeros(MFI->getMaxKernArgAlign()));
 
   if (STM.debuggerEmitPrologue()) {
     Out.debug_wavefront_private_segment_offset_sgpr =
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 29e93a9d9d34..b33079ae4ba0 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -209,16 +209,15 @@ Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
   const Function &F = MF.getFunction();
 
   // Avoid asserting on erroneous cases.
-  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
-      F.getCallingConv() != CallingConv::SPIR_KERNEL)
+  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
     return HSACodeProps;
 
-  unsigned MaxKernArgAlign;
-  HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
-                                                               MaxKernArgAlign);
+  HSACodeProps.mKernargSegmentSize =
+      STM.getKernArgSegmentSize(F, MFI.getExplicitKernArgSize());
   HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
   HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
-  HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u);
+  HSACodeProps.mKernargSegmentAlign =
+      std::max(uint32_t(4), MFI.getMaxKernArgAlign());
   HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
   HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
   HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 583a09e34abf..acdedab7e132 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -30,7 +30,6 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -41,6 +40,18 @@
 #include "llvm/Support/KnownBits.h"
 using namespace llvm;
+static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  MachineFunction &MF = State.getMachineFunction();
+  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+
+  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
+                                         ArgFlags.getOrigAlign());
+  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return true;
+}
+
 static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
                            CCValAssign::LocInfo LocInfo,
                            ISD::ArgFlagsTy ArgFlags, CCState &State,
@@ -899,118 +910,74 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
 /// for each individual part is i8. We pass the memory type as LocVT to the
 /// calling convention analysis function and the register type (Ins[x].VT) as
 /// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
-  CCState &State,
-  const SmallVectorImpl<ISD::InputArg> &Ins) const {
-  const MachineFunction &MF = State.getMachineFunction();
-  const Function &Fn = MF.getFunction();
-  LLVMContext &Ctx = Fn.getParent()->getContext();
-  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
-  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
-
-  unsigned MaxAlign = 1;
-  uint64_t ExplicitArgOffset = 0;
-  const DataLayout &DL = Fn.getParent()->getDataLayout();
-
-  unsigned InIndex = 0;
-
-  for (const Argument &Arg : Fn.args()) {
-    Type *BaseArgTy = Arg.getType();
-    unsigned Align = DL.getABITypeAlignment(BaseArgTy);
-    MaxAlign = std::max(Align, MaxAlign);
-    unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
-
-    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
-    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
-
-    // We're basically throwing away everything passed into us and starting over
-    // to get accurate in-memory offsets. The "PartOffset" is completely useless
-    // to us as computed in Ins.
-    //
-    // We also need to figure out what type legalization is trying to do to get
-    // the correct memory offsets.
-
-    SmallVector<EVT, 16> ValueVTs;
-    SmallVector<uint64_t, 16> Offsets;
-    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
-
-    for (unsigned Value = 0, NumValues = ValueVTs.size();
-         Value != NumValues; ++Value) {
-      uint64_t BasePartOffset = Offsets[Value];
-
-      EVT ArgVT = ValueVTs[Value];
-      EVT MemVT = ArgVT;
-      MVT RegisterVT =
-        getRegisterTypeForCallingConv(Ctx, ArgVT);
-      unsigned NumRegs =
-        getNumRegistersForCallingConv(Ctx, ArgVT);
-
-      if (!Subtarget->isAmdHsaOS() &&
-          (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
-        // The ABI says the caller will extend these values to 32-bits.
-        MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
-      } else if (NumRegs == 1) {
-        // This argument is not split, so the IR type is the memory type.
-        if (ArgVT.isExtended()) {
-          // We have an extended type, like i24, so we should just use the
-          // register type.
-          MemVT = RegisterVT;
-        } else {
-          MemVT = ArgVT;
-        }
-      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
-                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
-        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
-        // We have a vector value which has been split into a vector with
-        // the same scalar type, but fewer elements. This should handle
-        // all the floating-point vector types.
-        MemVT = RegisterVT;
-      } else if (ArgVT.isVector() &&
-                 ArgVT.getVectorNumElements() == NumRegs) {
-        // This arg has been split so that each element is stored in a separate
-        // register.
-        MemVT = ArgVT.getScalarType();
-      } else if (ArgVT.isExtended()) {
-        // We have an extended type, like i65.
-        MemVT = RegisterVT;
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
+                           const SmallVectorImpl<ISD::InputArg> &Ins) const {
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    const ISD::InputArg &In = Ins[i];
+    EVT MemVT;
+
+    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
+
+    if (!Subtarget->isAmdHsaOS() &&
+        (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
+      // The ABI says the caller will extend these values to 32-bits.
+      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+    } else if (NumRegs == 1) {
+      // This argument is not split, so the IR type is the memory type.
+      assert(!In.Flags.isSplit());
+      if (In.ArgVT.isExtended()) {
+        // We have an extended type, like i24, so we should just use the register type
+        MemVT = In.VT;
       } else {
-        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
-        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
-        if (RegisterVT.isInteger()) {
-          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
-        } else if (RegisterVT.isVector()) {
-          assert(!RegisterVT.getScalarType().isFloatingPoint());
-          unsigned NumElements = RegisterVT.getVectorNumElements();
-          assert(MemoryBits % NumElements == 0);
-          // This vector type has been split into another vector type with
-          // a different elements size.
-          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
-                                           MemoryBits / NumElements);
-          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
-        } else {
-          llvm_unreachable("cannot deduce memory type.");
-        }
+        MemVT = In.ArgVT;
       }
-
-      // Convert one element vectors to scalar.
-      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
-        MemVT = MemVT.getScalarType();
-
-      if (MemVT.isExtended()) {
-        // This should really only happen if we have vec3 arguments
-        assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
-        MemVT = MemVT.getPow2VectorType(State.getContext());
+    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
+               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
+      assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
+      // We have a vector value which has been split into a vector with
+      // the same scalar type, but fewer elements. This should handle
+      // all the floating-point vector types.
+      MemVT = In.VT;
+    } else if (In.ArgVT.isVector() &&
+               In.ArgVT.getVectorNumElements() == NumRegs) {
+      // This arg has been split so that each element is stored in a separate
+      // register.
+      MemVT = In.ArgVT.getScalarType();
+    } else if (In.ArgVT.isExtended()) {
+      // We have an extended type, like i65.
+      MemVT = In.VT;
+    } else {
+      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
+      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
+      if (In.VT.isInteger()) {
+        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+      } else if (In.VT.isVector()) {
+        assert(!In.VT.getScalarType().isFloatingPoint());
+        unsigned NumElements = In.VT.getVectorNumElements();
+        assert(MemoryBits % NumElements == 0);
+        // This vector type has been split into another vector type with
+        // a different elements size.
+        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+                                         MemoryBits / NumElements);
+        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+      } else {
+        llvm_unreachable("cannot deduce memory type.");
       }
+    }
-      unsigned PartOffset = 0;
-      for (unsigned i = 0; i != NumRegs; ++i) {
-        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
-                                               BasePartOffset + PartOffset,
-                                               MemVT.getSimpleVT(),
-                                               CCValAssign::Full));
-        PartOffset += MemVT.getStoreSize();
-      }
+    // Convert one element vectors to scalar.
+    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+      MemVT = MemVT.getScalarType();
+
+    if (MemVT.isExtended()) {
+      // This should really only happen if we have vec3 arguments
+      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+      MemVT = MemVT.getPow2VectorType(State.getContext());
     }
+
+    assert(MemVT.isSimple());
+    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
+                    State);
   }
 }
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 096e40230c6f..1e027dd67124 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -122,11 +122,8 @@ class AMDGPUTargetLowering : public TargetLowering {
   SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
   void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
                       SmallVectorImpl<SDValue> &Results) const;
-
-  void analyzeFormalArgumentsCompute(
-    CCState &State,
-    const SmallVectorImpl<ISD::InputArg> &Ins) const;
-
+  void analyzeFormalArgumentsCompute(CCState &State,
+                              const SmallVectorImpl<ISD::InputArg> &Ins) const;
 public:
   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 8cc7e38f7b29..3c5760804b35 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -77,9 +77,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
   const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
   const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
-  unsigned MaxAlign; // FIXME: Alignment is broken broken with explicit arg offset.;
-  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
+  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F);
   if (TotalKernArgSize == 0)
     return false;
@@ -92,11 +91,13 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
                           Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
   unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
+  unsigned MaxAlign = 1;
   uint64_t ExplicitArgOffset = 0;
   for (Argument &Arg : F.args()) {
     Type *ArgTy = Arg.getType();
     unsigned Align = DL.getABITypeAlignment(ArgTy);
+    MaxAlign = std::max(Align, MaxAlign);
     unsigned Size = DL.getTypeSizeInBits(ArgTy);
     unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 13b4b50149ce..0574c991ee6e 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -24,23 +24,16 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
   MemoryBound(false),
   WaveLimiter(false) {
-  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
-
   // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
   // except reserved size is not correctly aligned.
-  const Function &F = MF.getFunction();
   if (auto *Resolver = MF.getMMI().getResolver()) {
     if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>(
           Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
-      MemoryBound = PHA->isMemoryBound(&F);
-      WaveLimiter = PHA->needsWaveLimiter(&F);
+      MemoryBound = PHA->isMemoryBound(&MF.getFunction());
+      WaveLimiter = PHA->needsWaveLimiter(&MF.getFunction());
     }
   }
-
-  CallingConv::ID CC = F.getCallingConv();
-  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
-    ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
 }
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 8d6b871bc03e..2c4bf328008e 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -23,8 +23,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
 protected:
-  uint64_t ExplicitKernArgSize; // Cache for this.
-  unsigned MaxKernArgAlign; // Cache for this.
+  uint64_t ExplicitKernArgSize;
+  unsigned MaxKernArgAlign;
   /// Number of bytes in the LDS that are being used.
   unsigned LDSSize;
@@ -44,6 +44,17 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);
+  uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
+    assert(isPowerOf2_32(Align));
+    ExplicitKernArgSize = alignTo(ExplicitKernArgSize, Align);
+
+    uint64_t Result = ExplicitKernArgSize;
+    ExplicitKernArgSize += Size;
+
+    MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
+    return Result;
+  }
+
   uint64_t getExplicitKernArgSize() const {
     return ExplicitKernArgSize;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 98b49070fa99..3efc564c8559 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -209,7 +209,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     FeatureDisable(false),
     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
-    TLInfo(TM, *this),
+    TLInfo(TM, *this),
     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
   AS = AMDGPU::getAMDGPUAS(TT);
   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
@@ -406,44 +406,6 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
   return true;
 }
-uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
-                                                 unsigned &MaxAlign) const {
-  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
-         F.getCallingConv() == CallingConv::SPIR_KERNEL);
-
-  const DataLayout &DL = F.getParent()->getDataLayout();
-  uint64_t ExplicitArgBytes = 0;
-  MaxAlign = 1;
-
-  for (const Argument &Arg : F.args()) {
-    Type *ArgTy = Arg.getType();
-
-    unsigned Align = DL.getABITypeAlignment(ArgTy);
-    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
-    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
-    MaxAlign = std::max(MaxAlign, Align);
-  }
-
-  return ExplicitArgBytes;
-}
-
-unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
-                                                unsigned &MaxAlign) const {
-  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
-
-  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
-
-  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
-  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
-  if (ImplicitBytes != 0) {
-    unsigned Alignment = getAlignmentForImplicitArgPtr();
-    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
-  }
-
-  // Being able to dereference past the end is useful for emitting scalar loads.
-  return alignTo(TotalSize, 4);
-}
-
 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                              const TargetMachine &TM) :
   R600GenSubtargetInfo(TT, GPU, FS),
@@ -484,6 +446,40 @@ bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
 }
+uint64_t GCNSubtarget::getExplicitKernArgSize(const Function &F) const {
+  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL);
+
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  uint64_t ExplicitArgBytes = 0;
+  for (const Argument &Arg : F.args()) {
+    Type *ArgTy = Arg.getType();
+
+    unsigned Align = DL.getABITypeAlignment(ArgTy);
+    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
+  }
+
+  return ExplicitArgBytes;
+}
+
+unsigned GCNSubtarget::getKernArgSegmentSize(const Function &F,
+                                             int64_t ExplicitArgBytes) const {
+  if (ExplicitArgBytes == -1)
+    ExplicitArgBytes = getExplicitKernArgSize(F);
+
+  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
+
+  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
+  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
+  if (ImplicitBytes != 0) {
+    unsigned Alignment = getAlignmentForImplicitArgPtr();
+    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+  }
+
+  // Being able to dereference past the end is useful for emitting scalar loads.
+  return alignTo(TotalSize, 4);
+}
+
 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
     if (SGPRs <= 80)
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 623109733651..d9806d6133c6 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -51,7 +51,7 @@ class AMDGPUSubtarget {
   enum Generation {
     R600 = 0,
     R700 = 1,
-    EVERGREEN = 2,
+    EVERGREEN = 2,
     NORTHERN_ISLANDS = 3,
     SOUTHERN_ISLANDS = 4,
     SEA_ISLANDS = 5,
@@ -82,7 +82,7 @@ class AMDGPUSubtarget {
   static const AMDGPUSubtarget &get(const MachineFunction &MF);
   static const AMDGPUSubtarget &get(const TargetMachine &TM,
-                                    const Function &F);
+                                    const Function &F);
   /// \returns Default range flat work group size for a calling convention.
   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
@@ -231,18 +231,6 @@ class AMDGPUSubtarget {
   /// Creates value range metadata on an workitemid.* inrinsic call or load.
   bool makeLIDRangeMetadata(Instruction *I) const;
-  /// \returns Number of bytes of arguments that are passed to a shader or
-  /// kernel in addition to the explicit ones declared for the function.
-  unsigned getImplicitArgNumBytes(const Function &F) const {
-    if (isMesaKernel(F))
-      return 16;
-    return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
-  }
-  uint64_t getExplicitKernArgSize(const Function &F,
-                                  unsigned &MaxAlign) const;
-  unsigned getKernArgSegmentSize(const Function &F,
-                                 unsigned &MaxAlign) const;
-
   virtual ~AMDGPUSubtarget() {}
 };
@@ -681,6 +669,14 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
     return D16PreservesUnusedBits;
   }
+  /// \returns Number of bytes of arguments that are passed to a shader or
+  /// kernel in addition to the explicit ones declared for the function.
+  unsigned getImplicitArgNumBytes(const Function &F) const {
+    if (isMesaKernel(F))
+      return 16;
+    return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
+  }
+
   // Scratch is allocated in 256 dword per wave blocks for the entire
   // wavefront. When viewed from the perspecive of an arbitrary workitem, this
   // is 4-byte aligned.
@@ -829,6 +825,10 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
   }
+  uint64_t getExplicitKernArgSize(const Function &F) const;
+  unsigned getKernArgSegmentSize(const Function &F,
+                                 int64_t ExplicitArgBytes = -1) const;
+
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
   /// SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td
index 5c9c1c1ed504..ff96928211cc 100644
--- a/lib/Target/AMDGPU/R600.td
+++ b/lib/Target/AMDGPU/R600.td
@@ -52,3 +52,8 @@ def CC_R600 : CallingConv<[
      T30_XYZW, T31_XYZW, T32_XYZW
   ]>>>
 ]>;
+
+// Calling convention for compute kernels
+def CC_R600_Kernel : CallingConv<[
+  CCCustom<"allocateKernArg">
+]>;
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 113d6249fa60..4110e6a28d66 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -50,6 +50,18 @@
 using namespace llvm;
+static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                            CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  MachineFunction &MF = State.getMachineFunction();
+  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+
+  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
+                                         ArgFlags.getOrigAlign());
+  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  return true;
+}
+
 #include "R600GenCallingConv.inc"
 R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
@@ -222,7 +234,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FMA, MVT::f32, Expand);
   setOperationAction(ISD::FMA, MVT::f64, Expand);
   }
-
+
   // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
   // need it for R600.
   if (!Subtarget->hasFP32Denormals())
@@ -1571,7 +1583,7 @@ CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::Cold:
-    llvm_unreachable("kernels should not be handled here");
+    return CC_R600_Kernel;
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_GS:
   case CallingConv::AMDGPU_PS:
@@ -1646,12 +1658,13 @@ SDValue R600TargetLowering::LowerFormalArguments(
     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
     unsigned PartOffset = VA.getLocMemOffset();
+    unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF.getFunction()) +
+                      VA.getLocMemOffset();
     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
     SDValue Arg = DAG.getLoad(
         ISD::UNINDEXED, Ext, VT, DL, Chain,
-        DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
-        PtrInfo,
+        DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
         MemVT, /* Alignment = */ 4,
         MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
             MachineMemOperand::MOInvariant);
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index ea8578fb19dd..5caf03e909b8 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1164,8 +1164,8 @@ SDValue SITargetLowering::lowerKernargMemParameter(
   // Try to avoid using an extload by loading earlier than the argument address,
   // and extracting the relevant bits. The load should hopefully be merged with
   // the previous argument.
-  if (MemVT.getStoreSize() < 4 && Align < 4) {
-    // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
+  if (Align < 4) {
+    assert(MemVT.getStoreSize() < 4);
     int64_t AlignDownOffset = alignDown(Offset, 4);
     int64_t OffsetDiff = Offset - AlignDownOffset;
@@ -1796,6 +1796,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
   // kern arg offset.
   const unsigned KernelArgBaseAlign = 16;
+  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn);
   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
@@ -1811,9 +1812,11 @@ SDValue SITargetLowering::LowerFormalArguments(
         VT = Ins[i].VT;
       EVT MemVT = VA.getLocVT();
-      const uint64_t Offset = VA.getLocMemOffset();
+      const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();
       unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
+      // The first 36 bytes of the input buffer contains information about
+      // thread group and global sizes for clover.
       SDValue Arg = lowerKernargMemParameter(
         DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
       Chains.push_back(Arg.getValue(1));
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 0d5ff75e37ed..7c5bc7431e42 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -54,16 +54,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   Occupancy = getMaxWavesPerEU();
   limitOccupancy(MF);
-  CallingConv::ID CC = F.getCallingConv();
-
-  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
-    if (!F.arg_empty())
-      KernargSegmentPtr = true;
-    WorkGroupIDX = true;
-    WorkItemIDX = true;
-  } else if (CC == CallingConv::AMDGPU_PS) {
-    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
-  }
   if (!isEntryFunction()) {
     // Non-entry functions have no special inputs for now, other registers
@@ -83,11 +73,21 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   } else {
     if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
       KernargSegmentPtr = true;
-      MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
-                                 MaxKernArgAlign);
+      assert(MaxKernArgAlign == 0);
+      MaxKernArgAlign = ST.getAlignmentForImplicitArgPtr();
     }
   }
+  CallingConv::ID CC = F.getCallingConv();
+  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
+    if (!F.arg_empty())
+      KernargSegmentPtr = true;
+    WorkGroupIDX = true;
+    WorkItemIDX = true;
+  } else if (CC == CallingConv::AMDGPU_PS) {
+    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
+  }
+
   if (ST.debuggerEmitPrologue()) {
     // Enable everything.
     WorkGroupIDX = true;
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index 9492b710d13e..5c2c868476ba 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -589,17 +589,6 @@ entry:
 ;   ret void
 ; }
-; FUNC-LABEL: {{^}}i65_arg:
-; HSA-VI: kernarg_segment_byte_size = 24
-; HSA-VI: kernarg_segment_alignment = 4
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
-entry:
-  store i65 %in, i65 addrspace(1)* %out, align 4
-  ret void
-}
-
 ; FUNC-LABEL: {{^}}i1_arg:
 ; HSA-VI: kernarg_segment_byte_size = 12
 ; HSA-VI: kernarg_segment_alignment = 4
@@ -662,7 +651,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
 }
 ; FUNC-LABEL: {{^}}empty_struct_arg:
-; HSA-VI: kernarg_segment_byte_size = 0
+; HSA: kernarg_segment_byte_size = 0
 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
   ret void
 }
@@ -678,11 +667,11 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
 ; FIXME: Total argument size is computed wrong
 ; FUNC-LABEL: {{^}}struct_argument_alignment:
-; HSA-VI: kernarg_segment_byte_size = 40
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA: kernarg_segment_byte_size = 40
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
   %val0 = extractvalue {i32, i64} %arg0, 0
   %val1 = extractvalue {i32, i64} %arg0, 1
@@ -698,11 +687,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
 ; No padding between i8 and next struct, but round up at end to 4 byte
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
-; HSA-VI: kernarg_segment_byte_size = 28
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
+; HSA: kernarg_segment_byte_size = 28
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1
@@ -714,47 +703,3 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1
   %val2 = extractvalue <{i32, i64}> %arg1, 0
   %val3 = extractvalue <{i32, i64}> %arg1, 1
   store volatile i32 %val0, i32 addrspace(1)* null
   store volatile i64 %val1, i64 addrspace(1)* null
   store volatile i32 %val2, i32 addrspace(1)* null
   store volatile i64 %val3, i64 addrspace(1)* null
   ret void
 }
-
-; GCN-LABEL: {{^}}struct_argument_alignment_after:
-; HSA-VI: kernarg_segment_byte_size = 64
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
-; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
-define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
-  %val0 = extractvalue {i32, i64} %arg0, 0
-  %val1 = extractvalue {i32, i64} %arg0, 1
-  %val2 = extractvalue {i32, i64} %arg2, 0
-  %val3 = extractvalue {i32, i64} %arg2, 1
-  store volatile i32 %val0, i32 addrspace(1)* null
-  store volatile i64 %val1, i64 addrspace(1)* null
-  store volatile i32 %val2, i32 addrspace(1)* null
-  store volatile i64 %val3, i64 addrspace(1)* null
-  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
-  ret void
-}
-
-; GCN-LABEL: {{^}}array_3xi32:
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
-define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
-  store volatile i16 %arg0, i16 addrspace(1)* undef
-  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
-  ret void
-}
-
-; FIXME: Why not all scalar loads?
-; GCN-LABEL: {{^}}array_3xi16:
-; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
-; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
-; HSA-VI: flat_load_ushort
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
-define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
-  store volatile i8 %arg0, i8 addrspace(1)* undef
-  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
-  ret void
-}
diff --git a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
deleted file mode 100644
index a1bb6c28e740..000000000000
--- a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ /dev/null
@@ -1,132 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
-
-; Repeat of some problematic tests in kernel-args.ll, with the IR
-; argument lowering pass disabled. Struct padding needs to be
-; accounted for, as well as legalization of types changing offsets.
-
-; FUNC-LABEL: {{^}}i1_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
-
-; GCN: s_load_dword s
-; GCN: s_and_b32
-define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
-  store i1 %x, i1 addrspace(1)* %out, align 1
-  ret void
-}
-
-; FUNC-LABEL: {{^}}v3i8_arg:
-; HSA-VI: kernarg_segment_byte_size = 12
-; HSA-VI: kernarg_segment_alignment = 4
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
-define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
-entry:
-  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}i65_arg:
-; HSA-VI: kernarg_segment_byte_size = 24
-; HSA-VI: kernarg_segment_alignment = 4
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
-entry:
-  store i65 %in, i65 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}empty_struct_arg:
-; HSA-VI: kernarg_segment_byte_size = 0
-define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
-  ret void
-}
-
-; The correct load offsets for these:
-; load 4 from 0,
-; load 8 from 8
-; load 4 from 24
-; load 8 from 32
-
-; With the SelectionDAG argument lowering, the alignments for the
-; struct members is not properly considered, making these wrong.
-
-; FIXME: Total argument size is computed wrong
-; FUNC-LABEL: {{^}}struct_argument_alignment:
-; HSA-VI: kernarg_segment_byte_size = 40
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
-define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
-  %val0 = extractvalue {i32, i64} %arg0, 0
-  %val1 = extractvalue {i32, i64} %arg0, 1
-  %val2 = extractvalue {i32, i64} %arg1, 0
-  %val3 = extractvalue {i32, i64} %arg1, 1
-  store volatile i32 %val0, i32 addrspace(1)* null
-  store volatile i64 %val1, i64 addrspace(1)* null
-  store volatile i32 %val2, i32 addrspace(1)* null
-  store volatile i64 %val3, i64 addrspace(1)* null
-  ret void
-}
-
-; No padding between i8 and next struct, but round up at end to 4 byte
-; multiple.
-; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
-; HSA-VI: kernarg_segment_byte_size = 28
-; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
-; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
-define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
-  %val0 = extractvalue <{i32, i64}> %arg0, 0
-  %val1 = extractvalue <{i32, i64}> %arg0, 1
-  %val2 = extractvalue <{i32, i64}> %arg1, 0
-  %val3 = extractvalue <{i32, i64}> %arg1, 1
-  store volatile i32 %val0, i32 addrspace(1)* null
-  store volatile i64 %val1, i64 addrspace(1)* null
-  store volatile i32 %val2, i32 addrspace(1)* null
-  store volatile i64 %val3, i64 addrspace(1)* null
-  ret void
-}
-
-; GCN-LABEL: {{^}}struct_argument_alignment_after:
-; HSA-VI: kernarg_segment_byte_size = 64
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
-; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
-; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
-define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
-  %val0 = extractvalue {i32, i64} %arg0, 0
-  %val1 = extractvalue {i32, i64} %arg0, 1
-  %val2 = extractvalue {i32, i64} %arg2, 0
-  %val3 = extractvalue {i32, i64} %arg2, 1
-  store volatile i32 %val0, i32 addrspace(1)* null
-  store volatile i64 %val1, i64 addrspace(1)* null
-  store volatile i32 %val2, i32 addrspace(1)* null
-  store volatile i64 %val3, i64 addrspace(1)* null
-  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
-  ret void
-}
-
-; GCN-LABEL: {{^}}array_3xi32:
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
-define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
-  store volatile i16 %arg0, i16 addrspace(1)* undef
-  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
-  ret void
-}
-
-; GCN-LABEL: {{^}}array_3xi16:
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
-define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
-  store volatile i8 %arg0, i8 addrspace(1)* undef
-  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
-  ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
index 6a9191e7dcbd..f860a122a88f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -33,7 +33,7 @@ define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 112
-; MESA: kernarg_segment_byte_size = 128
+; MESA: kernarg_segment_byte_size = 464
 ; HSA: s_load_dword s0, s[4:5], 0x1c
 define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
@@ -47,7 +47,7 @@ define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 160
-; MESA: kernarg_segment_byte_size = 128
+; MESA: kernarg_segment_byte_size = 464
 ; HSA: s_load_dword s0, s[4:5], 0x1c
 define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
@@ -118,10 +118,10 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
 ; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 112
-; MESA: kernarg_segment_byte_size = 128
+; MESA: kernarg_segment_byte_size = 464
 ; HSA: s_add_u32 s6, s4, 0x70
-; MESA: s_add_u32 s6, s4, 0x70
+; MESA: s_add_u32 s6, s4, 0x1c0
 ; GCN: s_addc_u32 s7, s5, 0{{$}}
 ; GCN: s_swappc_b64
@@ -133,9 +133,10 @@ define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
 ; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 160
-; MESA: kernarg_segment_byte_size = 128
+; MESA: kernarg_segment_byte_size = 464
-; GCN: s_add_u32 s6, s4, 0x70
+; HSA: s_add_u32 s6, s4, 0x70
+; MESA: s_add_u32 s6, s4, 0x1c0
 ; GCN: s_addc_u32 s7, s5, 0{{$}}
 ; GCN: s_swappc_b64
@@ -218,7 +219,8 @@ define void @opencl_func_kernarg_implicitarg_ptr() #0 {
 ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
 ; GCN: s_mov_b64 s[6:7], s[4:5]
-; GCN: s_add_u32 s8, s6, 0x70
+; HSA: s_add_u32 s8, s6, 0x70
+; MESA: s_add_u32 s8, s6, 0x1c0
 ; GCN: s_addc_u32 s9, s7, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 5853d8d8e4e1..6c1bc9eaa762 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -79,7 +79,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
 ; CO-V2: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 0
 ; OS-MESA3D: kernarg_segment_byte_size = 16
-; CO-V2: kernarg_segment_alignment = 4
+; CO-V2: kernarg_segment_alignment = 32
 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5]
 define amdgpu_kernel void @test_no_kernargs() #1 {