Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Multi_sum_sq review, AtomicAdd removal #17002

Merged
merged 7 commits into from
Dec 14, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions src/operator/contrib/multi_sum_sq-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* Copyright (c) 2019 by Contributors
* \file multi_l2_norm-inl.h
* \brief vectorized L2 norm over multiple arrays operators
* \author Clement Fuji Tsang, Andrei Ivanov
* \author Clement Fuji Tsang, Andrei Ivanov, Moises Hernandez
*/


Expand All @@ -32,6 +32,10 @@
#include <vector>
#include "../operator_common.h"

namespace multi_sum_sq {
// Index into ctx.requested[] used to obtain the operator's temporary
// workspace (the GPU path stores per-block partial reductions there).
enum MultiSumSqUpdateResource {kTempSpace};
} // namespace multi_sum_sq

namespace mxnet {
namespace op {

Expand Down Expand Up @@ -80,7 +84,7 @@ inline bool MultiSumSqType(const NodeAttrs& attrs,

template<typename xpu>
void MultiSumSqRun(const std::vector<TBlob> &inputs, int nInputs,
float *out_ptr, mshadow::Stream<xpu> *s);
float *out_ptr, const OpContext &ctx);

template<typename xpu>
void MultiSumSq(const nnvm::NodeAttrs& attrs,
Expand All @@ -91,7 +95,7 @@ void MultiSumSq(const nnvm::NodeAttrs& attrs,
auto s = ctx.get_stream<xpu>();
const auto& p = dmlc::get<MultiSumSqParam>(attrs.parsed);
float* out_ptr = outputs[0].FlatTo2D<xpu, float>(s).dptr_;
MultiSumSqRun<xpu>(inputs, p.num_arrays, out_ptr, s);
MultiSumSqRun<xpu>(inputs, p.num_arrays, out_ptr, ctx);
}

} // namespace op
Expand Down
10 changes: 7 additions & 3 deletions src/operator/contrib/multi_sum_sq.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* Copyright (c) 2019 by Contributors
* \file multi_sum_sq.cc
* \brief vectorized sum of squares over multiple arrays operators
* \author Clement Fuji Tsang, Andrei Ivanov
* \author Clement Fuji Tsang, Andrei Ivanov, Moises Hernandez
*/

#include "./multi_sum_sq-inl.h"
Expand Down Expand Up @@ -52,6 +52,10 @@ NNVM_REGISTER_OP(multi_sum_sq)
return ret;
})
.set_attr<FCompute>("FCompute<cpu>", MultiSumSq<cpu>)
.set_attr<FResourceRequest>("FResourceRequest",
[](const NodeAttrs& attrs) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.add_argument("data", "NDArray-or-Symbol[]", "Arrays")
.add_arguments(MultiSumSqParam::__FIELDS__());

Expand All @@ -74,9 +78,9 @@ inline void CalcSumSq(const std::vector<TBlob> &inputs, int nInputs,

/*!
 * \brief CPU specialization: compute the sum of squares of each input array,
 *        writing one float per array into out_ptr.
 * \param inputs  arrays to reduce; the dtype switch is driven by inputs[0]
 *                (assumes all inputs share that dtype — TODO confirm)
 * \param nInputs number of arrays in inputs
 * \param out_ptr output buffer holding nInputs floats
 * \param ctx     operator context; supplies the CPU stream for CalcSumSq
 */
template<>
void MultiSumSqRun<cpu>(const std::vector<TBlob> &inputs, int nInputs,
                        float *out_ptr, const OpContext &ctx) {
  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType,
    CalcSumSq<DType>(inputs, nInputs, out_ptr, ctx.get_stream<cpu>());
  )
}

Expand Down
87 changes: 57 additions & 30 deletions src/operator/contrib/multi_sum_sq.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
* Copyright (c) 2019 by Contributors
* \file multi_sum_sq.cu
* \brief vectorized sums of squares norm over multiple arrays operators
* \author Clement Fuji Tsang, Andrei Ivanov
* \author Clement Fuji Tsang, Andrei Ivanov, Moises Hernandez
*/
#include "./multi_sum_sq-inl.h"
#include <cub/cub.cuh>
Expand All @@ -43,15 +43,14 @@ struct MultiSumSqKernelParam {
int sizes[ARRAY_LIMIT];
unsigned char block_to_tensor[BLOCK_LIMIT];
int block_to_chunk[BLOCK_LIMIT];
int max_chunks_per_tensor = -1;
};

template<typename DType>
__device__ __forceinline__ DType reduce_block_into_lanes(DType* x,
DType val,
int lanes = 1,
bool share_result = false) {
int tid = threadIdx.x + threadIdx.y * blockDim.x;
int blockSize = blockDim.x * blockDim.y; // blockSize is intended to be a multiple of 32.
DType val) {
int tid = threadIdx.x;
int blockSize = blockDim.x;
eric-haibin-lin marked this conversation as resolved.
Show resolved Hide resolved

if (blockSize >= 64) {
x[tid] = val;
Expand All @@ -72,27 +71,19 @@ __device__ __forceinline__ DType reduce_block_into_lanes(DType* x,
final = x[tid] + x[tid+32];
else
final = val;
// __SYNCWARP();

#pragma unroll
for (int i = 16; i >= lanes; i >>= 1)
for (int i = 16; i >= 1; i >>= 1)
final = final + __shfl_down_sync(0xffffffff, final, i);
}

if (share_result) {
if (tid < lanes)
x[tid] = final; // EpilogueOp
// Make sure the smem result is visible to all warps.
__syncthreads();
}

return final;
}

template<typename DType>
__global__ void MultiSumSqKernel(int chunk_size,
MultiSumSqKernelParam<DType> param,
float* output) {
float* block_reductions,
int start_tensor_id) {
const int tensor_loc = param.block_to_tensor[blockIdx.x];
const int chunk_len = param.block_to_chunk[blockIdx.x] * chunk_size;
const int n = param.sizes[tensor_loc] - chunk_len;
Expand All @@ -106,32 +97,65 @@ __global__ void MultiSumSqKernel(int chunk_size,
i_start < iMax;
i_start += blockDim.x * ILP) {
int i = i_start + threadIdx.x;
// #pragma unroll
#pragma unroll
for (int ii = 0; ii < ILP && i < iMax; ++ii, i += blockDim.x) {
const auto incoming_val = static_cast<float>(x[i]);
val += incoming_val * incoming_val;
}
}

const float final = reduce_block_into_lanes(vals, val);
if (threadIdx.x == 0)
atomicAdd(output + tensor_loc, final);

if (threadIdx.x == 0){
block_reductions[(start_tensor_id + tensor_loc) * param.max_chunks_per_tensor +
param.block_to_chunk[blockIdx.x]] = final;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should change the variable name here? `final` specifies that a virtual function cannot be overridden in a derived class.

}
}

/*!
 * \brief Second-stage reduction: one block per input array sums that array's
 *        per-chunk partial results into a single output value.
 *        Shared scratch is sized for blockDim.x <= 512; the host launches
 *        this kernel with block_size == 512 and gridDim.x == nInputs.
 * \param param            kernel parameters; only max_chunks_per_tensor is read
 * \param block_reductions partial sums laid out as [tensor][chunk] with a
 *                         row stride of max_chunks_per_tensor
 * \param output           one float per input array
 */
template<typename DType>
__global__ void GlobalReductionKernel(MultiSumSqKernelParam<DType> param,
                                      float* block_reductions,
                                      float* output) {
  __shared__ float vals[512];  // scratch for reduce_block_into_lanes
  float* reductions_this_tensor = block_reductions + blockIdx.x * param.max_chunks_per_tensor;
  // Each thread accumulates chunk partials strided by blockDim.x.
  float val = 0.f;
  for (int i = threadIdx.x; i < param.max_chunks_per_tensor; i += blockDim.x)
    val += reductions_this_tensor[i];

  // Renamed from 'final': that identifier is a C++11 contextual keyword and
  // was flagged in review as confusing.
  const float total = reduce_block_into_lanes(vals, val);

  if (threadIdx.x == 0)
    output[blockIdx.x] = total;
}

template<>
void MultiSumSqRun<gpu>(const std::vector<TBlob> &inputs, int nInputs,
float *out_ptr, mshadow::Stream<gpu> *s) {
float *out_ptr, const OpContext &ctx) {
const int chunk_size = 32768;
const int block_size = 512;
using namespace mxnet_op;
auto s = ctx.get_stream<gpu>();
auto stream = mshadow::Stream<gpu>::GetStream(s);
CUDA_CALL(cudaMemsetAsync(out_ptr, 0, nInputs * sizeof(float), stream));

MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
MultiSumSqKernelParam<DType> param;
// find max num of chunks in tensors
for (int t = 0; t < nInputs; t++) {
int chunks_this_tensor = (inputs[t].shape_.Size() + chunk_size - 1) / chunk_size;
if(chunks_this_tensor > param.max_chunks_per_tensor)
param.max_chunks_per_tensor = chunks_this_tensor;
}
// temporary storage for the reduction of each block
size_t workspace_size = nInputs * param.max_chunks_per_tensor * sizeof(float);
Tensor<gpu, 1, char> workspace =
ctx.requested[multi_sum_sq::kTempSpace].get_space_typed<gpu, 1, char>(
Shape1(workspace_size), s);
Tensor<gpu, 1, float> block_reductions(reinterpret_cast<float*>(&workspace[0]),
Shape1(nInputs * param.max_chunks_per_tensor), s);
eric-haibin-lin marked this conversation as resolved.
Show resolved Hide resolved
CUDA_CALL(cudaMemsetAsync(block_reductions.dptr_, 0, nInputs * param.max_chunks_per_tensor* sizeof(float), stream));

int loc_block_info = 0; // position in param.block_to_tensor and param.block_to_chunck
int loc_tensor_info = 0; // position in param.sizes and param.addresses
int output_offset = 0; // array index of the first block pointed on by param.addresses
int start_tensor_id = 0;
for (int t = 0; t < nInputs; t++, loc_tensor_info++) { // array index in inputs
param.sizes[loc_tensor_info] = inputs[t].shape_.Size();
param.addresses[loc_tensor_info] = inputs[t].FlatTo2D<gpu, DType>(s).dptr_;
Expand All @@ -142,27 +166,30 @@ void MultiSumSqRun<gpu>(const std::vector<TBlob> &inputs, int nInputs,
loc_block_info++;

const bool last_curr_chunk = chunk == chunks_this_tensor;
const bool tensors_full = last_curr_chunk && loc_tensor_info == 109;
const bool blocks_full = (loc_block_info == 320);
const bool tensors_full = last_curr_chunk && loc_tensor_info == (ARRAY_LIMIT-1);
const bool blocks_full = (loc_block_info == BLOCK_LIMIT);
const bool last_chunk = last_curr_chunk && t == nInputs - 1;
if (!(tensors_full || blocks_full || last_chunk))
continue;

MultiSumSqKernel<<<loc_block_info, block_size, 0, stream>>>
(chunk_size, param, out_ptr + output_offset);
(chunk_size, param, block_reductions.dptr_, start_tensor_id);
MSHADOW_CUDA_POST_KERNEL_CHECK(MultiSumSqKernel);

loc_block_info = 0;
if (last_curr_chunk) { // if you start from a new tensor
loc_tensor_info = -1;
output_offset = t + 1;
start_tensor_id = t + 1;
} else { // if you start from the same tensor
param.sizes[0] = param.sizes[loc_tensor_info];
param.addresses[0] = param.addresses[loc_tensor_info];
loc_tensor_info = 0;
output_offset = t;
start_tensor_id = t;
}
}
}
// Global reduction
GlobalReductionKernel<<<nInputs, block_size, 0, stream>>>
(param, block_reductions.dptr_, out_ptr);
});
}

Expand Down
29 changes: 29 additions & 0 deletions tests/python/gpu/test_operator_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,35 @@ def check_fast_lars(w_dtype, g_dtype, shapes, ctx, tol1, tol2):
ref_new_lrs[i] = lrs[i]
assert_almost_equal(ref_new_lrs.asnumpy(), mx_new_lrs.asnumpy(), atol=tol2, rtol=tol2)

def check_multi_sum_sq(dtype, shapes, ctx, tol1, tol2):
    """Compare mx.nd.multi_sum_sq against a host-side float32 reference.

    Note: tol2 is accepted for signature parity with the sibling check_*
    helpers but is not used by this check.
    """
    host_arrays = [np.random.rand(*shape).astype(dtype) * 10. for shape in shapes]

    device_arrays = _make_ndarrays(host_arrays, ctx=ctx)
    sum_sq = mx.nd.multi_sum_sq(*device_arrays, num_arrays=len(shapes))

    # Reference: per-array sum of squares, accumulated in float32 on host.
    expected = [(arr.astype('float32') ** 2).sum() for arr in host_arrays]
    ref_sum_sq = mx.nd.array(expected, dtype='float32', ctx=ctx)

    assert_almost_equal(ref_sum_sq.asnumpy(), sum_sq.asnumpy(), atol=tol1, rtol=tol1)

@with_seed()
def test_multi_sum_sq():
    """Exercise multi_sum_sq on ~391-400 large 1-D random arrays per dtype."""
    min_nparam = 390
    max_nparam = 400
    mindim = 50000
    maxdim = 3200000
    maxndim = 1

    dtypes = ['float16', 'float32', 'float64']
    for ctx in [mx.gpu(0)]:
        for dtype in dtypes:
            nparam = np.random.randint(min_nparam + 1, max_nparam + 1)
            shapes = [np.random.randint(mindim, maxdim + 1, size=maxndim) for i in range(nparam)]
            # Looser tolerances are reserved for float16 on CPU; since only a
            # GPU context is iterated here, lowTol is currently always False
            # (presumably kept for future CPU coverage — TODO confirm).
            lowTol = ctx == mx.cpu(0) and ('float16' in [dtype])
            tol1 = 1e-3 if lowTol else 1e-5
            tol2 = 1e-6 if lowTol else 1e-7
            check_multi_sum_sq(dtype, shapes, ctx, tol1, tol2)
eric-haibin-lin marked this conversation as resolved.
Show resolved Hide resolved

@with_seed()
def test_fast_lars():
min_nparam = 50
Expand Down