Skip to content

Commit

Permalink
Merge branch 'NVIDIA:main' into bik
Browse files Browse the repository at this point in the history
  • Loading branch information
aartbik authored Jan 16, 2025
2 parents 8d623a8 + d71f0dd commit a2de7f4
Show file tree
Hide file tree
Showing 82 changed files with 288 additions and 373 deletions.
22 changes: 15 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ endif()
if(NOT DEFINED rapids-cmake-dir)
include(FetchContent)
# Tell FetchContent to just use the local copy of rapids-cmake:
set(FETCHCONTENT_SOURCE_DIR_RAPIDS_CMAKE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/rapids-cmake")
FetchContent_Declare(rapids-cmake URL /~https://github.com/rapidsai/rapids-cmake/archive/refs/heads/branch-24.12.zip)
FetchContent_Declare(rapids-cmake SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/cmake/rapids-cmake")

# Tell FetchContent to download remote copy of rapids-cmake:
#FetchContent_Declare(rapids-cmake URL /~https://github.com/rapidsai/rapids-cmake/archive/refs/heads/branch-24.12.zip)
FetchContent_MakeAvailable(rapids-cmake)
else()
# The include() commands below search the module path for the corresponding .cmake files
Expand Down Expand Up @@ -132,8 +134,10 @@ rapids_cpm_cccl(

target_link_libraries(matx INTERFACE CCCL::CCCL)

# Set flags for compiling tests faster
set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
# Set flags for compiling tests faster (only for nvcc)
if (NOT CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
endif()

# Hack because CMake doesn't have short circult evaluation
if (NOT CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
Expand Down Expand Up @@ -163,9 +167,13 @@ if (NOT ${IS_NVCPP} GREATER -1)
endif()
endif()



set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Werror all-warnings>)
if (CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
message((STATUS "Using Clang compiler"))
# Workaround for clang bug: /~https://github.com/llvm/llvm-project/issues/58491
set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Wno-unused-command-line-argument>)
else()
set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Werror all-warnings>)
endif()
set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CXX>:-Werror>)

# CUTLASS slows down compile times when used, so leave it as optional for now
Expand Down
2 changes: 1 addition & 1 deletion examples/black_scholes.cu
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ private:

public:
BlackScholes(O out, I1 K, I1 V, I1 S, I1 r, I1 T)
: out_(out), K_(K), V_(V), S_(S), r_(r), T_(T) {}
: out_(out), V_(V), S_(S), K_(K), r_(r), T_(T) {}

__device__ inline void operator()(index_t idx)
{
Expand Down
1 change: 0 additions & 1 deletion examples/convolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ using namespace matx;
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();
typedef cuda::std::complex<float> complex;

uint32_t iterations = 10;
constexpr index_t numSamples = 1638400;
Expand Down
6 changes: 3 additions & 3 deletions examples/mvdr_beamformer.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,9 @@ class MVDRBeamformer {
auto GetCovMatInvView() { return invCovMatView; }

private:
index_t num_beams_;
index_t num_el_;
index_t data_len_;
[[maybe_unused]] index_t num_beams_;
[[maybe_unused]] index_t num_el_;
[[maybe_unused]] index_t data_len_;
index_t snap_len_;
cuda::std::complex<float> load_coeff_ = {0.1f, 0.f};

Expand Down
3 changes: 0 additions & 3 deletions examples/recursive_filter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ using namespace matx;
int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();
using complex = cuda::std::complex<float>;

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);

Expand Down Expand Up @@ -70,7 +68,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
cudaEventCreate(&start);
cudaEventCreate(&stop);

using OutType = float;
using InType = float;
using FilterType = float;

Expand Down
4 changes: 2 additions & 2 deletions include/matx/core/half.h
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool operator==(const T &lhs,
const matxHalf<T> &rhs)
{
matxHalf<T> tmp{lhs};
return lhs == tmp;
return rhs == tmp;
}

/**
Expand Down Expand Up @@ -464,7 +464,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool operator!=(const T &lhs,
const matxHalf<T> &rhs)
{
matxHalf<T> tmp{lhs};
return !(lhs == tmp);
return !(rhs == tmp);
}

/**
Expand Down
6 changes: 3 additions & 3 deletions include/matx/core/half_complex.h
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool
operator==(const T &lhs, const matxHalfComplex<T> &rhs)
{
matxHalfComplex<T> tmp{lhs};
return lhs == tmp;
return rhs == tmp;
}

/**
Expand Down Expand Up @@ -562,7 +562,7 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ bool
operator!=(const T &lhs, const matxHalfComplex<T> &rhs)
{
matxHalfComplex<T> tmp{lhs};
return !(lhs == tmp);
return !(rhs == tmp);
}


Expand Down Expand Up @@ -853,7 +853,7 @@ pow(const T &x, const matxHalfComplex<T> &y)
{
cuda::std::complex<float> tmp{static_cast<float>(y.real()),
static_cast<float>(y.imag())};
tmp = cuda::std::pow(y, pow);
tmp = cuda::std::pow(x, pow);
return {static_cast<T>(tmp.real()), static_cast<T>(tmp.imag())};
}

Expand Down
2 changes: 0 additions & 2 deletions include/matx/core/operator_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,6 @@ namespace matx {

template <typename Op, typename ValidFunc>
__MATX_INLINE__ auto GetSupportedTensor(const Op &in, const ValidFunc &fn, matxMemorySpace_t space, cudaStream_t stream = 0) {
constexpr int RANK = Op::Rank();

if constexpr (is_matx_transform_op<Op>()) {
// We can assume that if a transform is passed to the input then PreRun has already completed
// on the transform and we can use the internal pointer
Expand Down
2 changes: 1 addition & 1 deletion include/matx/core/print.h
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,7 @@ namespace matx {
*/
template <typename Op, typename... Args,
std::enable_if_t<(Op::Rank() > 0 && sizeof...(Args) == 0), bool> = true>
void fprint(FILE* fp, const Op &op, Args... dims) {
void fprint(FILE* fp, const Op &op, [[maybe_unused]] Args... dims) {
cuda::std::array<int, Op::Rank()> arr = {0};
auto tp = cuda::std::tuple_cat(arr);
cuda::std::apply([&](auto &&...args) { fprint(fp, op, args...); }, tp);
Expand Down
4 changes: 2 additions & 2 deletions include/matx/core/storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ namespace matx
*/
void SetData(T *const data) noexcept
{
data_.reset(data_, [](auto){});
data_.reset(data, [](auto){});
}

/**
Expand All @@ -423,7 +423,7 @@ namespace matx
*
* @param size Size in bytes to allocate
*/
__MATX_INLINE__ T* allocate(size_t size)
__MATX_INLINE__ T* allocate([[maybe_unused]] size_t size)
{
MATX_THROW(matxInvalidParameter, "Cannot call allocate on a smart pointer storage type");
}
Expand Down
6 changes: 3 additions & 3 deletions include/matx/core/tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
Reset(T *const data, T *const ldata) noexcept
{
storage_.SetData(data);
this->SetData(data);
this->SetData(ldata);
}


Expand Down Expand Up @@ -1074,7 +1074,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {

__MATX_INLINE__ __MATX_HOST__ bool IsManagedPointer() {
bool managed;
const CUresult retval = cuPointerGetAttribute(&managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)this->Data());
[[maybe_unused]] const CUresult retval = cuPointerGetAttribute(&managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)this->Data());
MATX_ASSERT(retval == CUDA_SUCCESS, matxNotSupported);
return managed;
}
Expand Down Expand Up @@ -1453,7 +1453,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {

// Determine where this memory resides
auto kind = GetPointerKind(this->Data());
auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->Data()));
[[maybe_unused]] auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast<CUdeviceptr>(this->Data()));
MATX_ASSERT_STR_EXP(mem_res, CUDA_SUCCESS, matxCudaError, "Error returned from cuPointerGetAttributes");
if (kind == MATX_INVALID_MEMORY) {
if (mem_type == CU_MEMORYTYPE_DEVICE) {
Expand Down
2 changes: 1 addition & 1 deletion include/matx/core/tensor_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ namespace matx
__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto BlockToIdx(const Op &op, index_t abs, int nb_dims) {
using l_stride_type = index_t;
using l_shape_type = index_t;
constexpr int RANK = op.Rank();
constexpr int RANK = Op::Rank();
cuda::std::array<l_shape_type, RANK> indices{0};

for (int idx = 0; idx < RANK - nb_dims; idx++) {
Expand Down
6 changes: 3 additions & 3 deletions include/matx/core/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 &
//__half2 Y = make_half2(y.real(), y.imag());
//__half2 Z = make_half2(z.real(), z.imag());

const __half2 &X = *reinterpret_cast<const __half2*>(&x);
const __half2 &Y = *reinterpret_cast<const __half2*>(&y);
const __half2 &Z = *reinterpret_cast<const __half2*>(&z);
[[maybe_unused]] const __half2 &X = *reinterpret_cast<const __half2*>(&x);
[[maybe_unused]] const __half2 &Y = *reinterpret_cast<const __half2*>(&y);
[[maybe_unused]] const __half2 &Z = *reinterpret_cast<const __half2*>(&z);

#if 1
#ifdef __CUDA_ARCH__
Expand Down
4 changes: 3 additions & 1 deletion include/matx/executors/host.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,17 @@ enum class ThreadsMode {

struct HostExecParams {
HostExecParams(int threads = 1) : threads_(threads) {}
HostExecParams(cpu_set_t cpu_set) : cpu_set_(cpu_set), threads_(1) {
HostExecParams(cpu_set_t cpu_set) : threads_(1), cpu_set_(cpu_set) {
MATX_ASSERT_STR(false, matxNotSupported, "CPU affinity not supported yet");
}

int GetNumThreads() const { return threads_; }

private:
int threads_;
MATX_IGNORE_WARNING_PUSH_CLANG("-Wunused-private-field")
cpu_set_t cpu_set_ {0};
MATX_IGNORE_WARNING_POP_CLANG
};

/**
Expand Down
4 changes: 2 additions & 2 deletions include/matx/generators/chirp.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ namespace matx
inline __MATX_HOST__ __MATX_DEVICE__ Chirp(SpaceOp sop, FreqType f0, space_type t1, FreqType f1, ChirpMethod method) :
sop_(sop),
f0_(f0),
f1_(f1),
t1_(t1),
f1_(f1),
method_(method)
{}

Expand Down Expand Up @@ -109,8 +109,8 @@ namespace matx
inline __MATX_HOST__ __MATX_DEVICE__ ComplexChirp(SpaceOp sop, FreqType f0, space_type t1, FreqType f1, ChirpMethod method) :
sop_(sop),
f0_(f0),
t1_(t1),
f1_(f1),
t1_(t1),
method_(method)
{}

Expand Down
10 changes: 5 additions & 5 deletions include/matx/kernels/channelize_poly.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -290,22 +290,22 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
if (outdims[OutElemRank] <= last_elem) {
const filter_t *h = h_start;
output_t accum { 0 };
const int first_end = cuda::std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
const uint32_t first_end = cuda::std::min(cached_input_ind_tail + filter_phase_len - 1, smem_input_height - 1);
// The footprint of samples involved in the convolution may wrap from the end
// to the beginning of smem_input. The prologue below handles the samples from
// the current tail to the end of smem_input and the epilogue starts back at the
// beginning of smem_input.
const int prologue_count = (first_end - cached_input_ind_tail + 1);
const int epilogue_count = (prologue_count < filter_phase_len) ? filter_phase_len - prologue_count : 0;
const uint32_t prologue_count = (first_end - cached_input_ind_tail + 1);
const uint32_t epilogue_count = (prologue_count < filter_phase_len) ? filter_phase_len - prologue_count : 0;
const input_t *sample = smem_input + cached_input_ind_tail * num_channels + (num_channels - 1 - chan);
// Apply the filter h in reverse order below to flip the filter for convolution
for (int k = 0; k < prologue_count; k++) {
for (uint32_t k = 0; k < prologue_count; k++) {
accum += (*h) * (*sample);
sample += num_channels;
h -= num_channels;
}
sample = smem_input + (num_channels - 1 - chan);
for (int k = 0; k < epilogue_count; k++) {
for (uint32_t k = 0; k < epilogue_count; k++) {
accum += (*h) * (*sample);
sample += num_channels;
h -= num_channels;
Expand Down
Loading

0 comments on commit a2de7f4

Please sign in to comment.