Skip to content

Commit

Permalink
Optimize NMS (apache#14290)
Browse files Browse the repository at this point in the history
* Optimize NMS

* Fix lint
  • Loading branch information
ptrendx authored and vdantu committed Mar 31, 2019
1 parent 1346780 commit cf74099
Show file tree
Hide file tree
Showing 3 changed files with 368 additions and 88 deletions.
118 changes: 118 additions & 0 deletions src/operator/contrib/bounding_box-common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* \file bounding_box-common.h
* \brief bounding box util functions and operators commonly used by CPU and GPU implementations
* \author Joshua Zhang
*/
#ifndef MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_COMMON_H_
#define MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_COMMON_H_
#include "../mshadow_op.h"
#include "../mxnet_op.h"
#include "../operator_common.h"

namespace mxnet {
namespace op {
namespace box_common_enum {
enum BoxType {kCorner, kCenter};
}

// compute line intersect along either height or width
template<typename DType>
MSHADOW_XINLINE DType Intersect(const DType *a, const DType *b, int encode) {
DType a1 = a[0];
DType a2 = a[2];
DType b1 = b[0];
DType b2 = b[2];
DType w;
if (box_common_enum::kCorner == encode) {
DType left = a1 > b1 ? a1 : b1;
DType right = a2 < b2 ? a2 : b2;
w = right - left;
} else {
DType aw = a2 / 2;
DType bw = b2 / 2;
DType al = a1 - aw;
DType ar = a1 + aw;
DType bl = b1 - bw;
DType br = b1 + bw;
DType left = bl > al ? bl : al;
DType right = br < ar ? br : ar;
w = right - left;
}
return w > 0 ? w : DType(0);
}

/*!
* \brief Implementation of the non-maximum suppression operation
*
* \param i the launched thread index
* \param index sorted index in descending order
* \param batch_start map (b, k) to compact index by indices[batch_start[b] + k]
* \param input the input of nms op
* \param areas pre-computed box areas
* \param k nms topk number
* \param ref compare reference position
* \param num number of input boxes in each batch
* \param stride input stride, usually 6 (id-score-x1-y1-x2-y2)
* \param offset_box box offset, usually 2
* \param thresh nms threshold
* \param force force suppress regardless of class id
* \param offset_id class id offset, used when force == false, usually 0
* \param encode box encoding type, corner(0) or center(1)
* \param DType the data type
*/
struct nms_impl {
template<typename DType>
MSHADOW_XINLINE static void Map(int i, int32_t *index, const int32_t *batch_start,
const DType *input, const DType *areas,
int k, int ref, int num,
int stride, int offset_box, int offset_id,
float thresh, bool force, int encode) {
int b = i / k; // batch
int pos = i % k + ref + 1; // position
ref = static_cast<int>(batch_start[b]) + ref;
pos = static_cast<int>(batch_start[b]) + pos;
if (ref >= static_cast<int>(batch_start[b + 1])) return;
if (pos >= static_cast<int>(batch_start[b + 1])) return;
if (index[ref] < 0) return; // reference has been suppressed
if (index[pos] < 0) return; // self been suppressed
int ref_offset = static_cast<int>(index[ref]) * stride + offset_box;
int pos_offset = static_cast<int>(index[pos]) * stride + offset_box;
if (!force && offset_id >=0) {
int ref_id = static_cast<int>(input[ref_offset - offset_box + offset_id]);
int pos_id = static_cast<int>(input[pos_offset - offset_box + offset_id]);
if (ref_id != pos_id) return; // different class
}
DType intersect = Intersect(input + ref_offset, input + pos_offset, encode);
intersect *= Intersect(input + ref_offset + 1, input + pos_offset + 1, encode);
int ref_area_offset = static_cast<int>(index[ref]);
int pos_area_offset = static_cast<int>(index[pos]);
DType iou = intersect / (areas[ref_area_offset] + areas[pos_area_offset] - intersect);
if (iou > thresh) {
index[pos] = -1;
}
}
};

} // namespace op
} // namespace mxnet

#endif // MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_COMMON_H_
223 changes: 223 additions & 0 deletions src/operator/contrib/bounding_box-inl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,15 @@
*/
#ifndef MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_CUH_
#define MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_CUH_
#include <cmath>
#include <cstdio>
#include <mxnet/operator_util.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include "../mshadow_op.h"
#include "../mxnet_op.h"
#include "../operator_common.h"
#include "./bounding_box-common.h"

namespace mxnet {
namespace op {
Expand Down Expand Up @@ -57,6 +60,226 @@ int FilterScores(mshadow::Tensor<gpu, 1, DType> out_scores,
return end_scores - out_scores.dptr_;
}

// compute line intersect along either height or width
template<typename DType>
MSHADOW_XINLINE DType Intersect2(const DType *a, const DType b1, const DType b2, int encode) {
const DType a1 = a[0];
const DType a2 = a[2];
DType left, right;
if (box_common_enum::kCorner == encode) {
left = a1 > b1 ? a1 : b1;
right = a2 < b2 ? a2 : b2;
} else {
const DType aw = a2 / 2;
const DType bw = b2 / 2;
const DType al = a1 - aw;
const DType ar = a1 + aw;
const DType bl = b1 - bw;
const DType br = b1 + bw;
left = bl > al ? bl : al;
right = br < ar ? br : ar;
}
const DType w = right - left;
return w > 0 ? w : DType(0);
}

template<typename DType, int N, bool check_class>
__launch_bounds__(512)
__global__ void nms_apply_kernel(const int topk, int32_t *index,
const int32_t *batch_start,
const DType *input,
const DType *areas,
const int num, const int stride,
const int offset_box, const int offset_id,
const float thresh, const bool force,
const int encode, const int start_offset) {
constexpr int block_size = 512;
const int start = static_cast<int>(batch_start[blockIdx.x]) + start_offset;
const int size_of_batch = static_cast<int>(batch_start[blockIdx.x + 1]) - start;
const int end = min(min(size_of_batch, topk - start_offset), N * block_size);
__shared__ int s_index[N * block_size];

for (int i = threadIdx.x; i < end; i += block_size) {
s_index[i] = static_cast<int>(index[start + i]);
}

__syncthreads();
for (int ref = 0; ref < end; ++ref) {
const int ref_area_offset = static_cast<int>(s_index[ref]);
if (ref_area_offset >= 0) {
const int ref_offset = ref_area_offset * stride + offset_box;
int ref_id = 0;
if (check_class) {
ref_id = static_cast<int>(input[ref_offset - offset_box + offset_id]);
}
for (int i = 0; i < N; ++i) {
const int my_pos = threadIdx.x + i * block_size;
if (my_pos > ref && my_pos < end && s_index[my_pos] >= 0) {
const int pos_area_offset = static_cast<int>(s_index[my_pos]);
const int pos_offset = pos_area_offset * stride + offset_box;
if (check_class) {
const int pos_id = static_cast<int>(input[pos_offset - offset_box + offset_id]);
if (ref_id != pos_id) continue; // different class
}
DType intersect = Intersect(input + ref_offset, input + pos_offset, encode);
intersect *= Intersect(input + ref_offset + 1, input + pos_offset + 1, encode);
const DType iou = intersect /
(areas[ref_area_offset] + areas[pos_area_offset] - intersect);
if (iou > thresh) {
s_index[my_pos] = -1;
}
}
}
__syncthreads();
}
}

for (int i = threadIdx.x; i < end; i += block_size) {
index[start + i] = s_index[i];
}
}

template<typename DType, int N, bool check_class>
__launch_bounds__(512)
__global__ void nms_apply_kernel_rest(const int topk, int32_t *index,
const int32_t *batch_start,
const DType *input,
const DType *areas,
const int num, const int stride,
const int offset_box, const int offset_id,
const float thresh, const bool force,
const int encode, const int start_offset,
const int blocks_per_batch) {
constexpr int block_size = 512;
const int batch = blockIdx.x / blocks_per_batch;
const int start_ref = static_cast<int>(batch_start[batch]) + start_offset;
const int block_offset = (N + blockIdx.x % blocks_per_batch) * block_size;
const int start = start_ref + block_offset;

const int size_of_batch = static_cast<int>(batch_start[batch + 1]) - start;
const int end = min(size_of_batch, topk - start_offset - block_offset);
const int my_pos = start + threadIdx.x;
if (threadIdx.x < end && index[my_pos] >= 0) {
const int pos_area_offset = static_cast<int>(index[my_pos]);
const int pos_offset = pos_area_offset * stride + offset_box;
DType my_box[4];
#pragma unroll
for (int i = 0; i < 4; ++i) {
my_box[i] = input[pos_offset + i];
}
const DType my_area = areas[pos_area_offset];
int pos_id = 0;
if (check_class) {
pos_id = static_cast<int>(input[pos_offset - offset_box + offset_id]);
}

for (int ref = start_ref; ref < start_ref + N * block_size; ++ref) {
const int ref_area_offset = static_cast<int>(index[ref]);
if (ref_area_offset >= 0) {
const int ref_offset = ref_area_offset * stride + offset_box;
int ref_id = 0;
if (check_class) {
ref_id = static_cast<int>(input[ref_offset - offset_box + offset_id]);
if (ref_id != pos_id) continue; // different class
}
DType intersect = Intersect2(input + ref_offset, my_box[0], my_box[2], encode);
intersect *= Intersect2(input + ref_offset + 1, my_box[1], my_box[3], encode);
const DType iou = intersect /
(areas[ref_area_offset] + my_area - intersect);
if (iou > thresh) {
index[my_pos] = -1;
break;
}
}
}
}
}

template<typename DType>
void NMSApply(mshadow::Stream<gpu> *s,
int num_batch, int topk,
mshadow::Tensor<gpu, 1, int32_t>* sorted_index,
mshadow::Tensor<gpu, 1, int32_t>* batch_start,
mshadow::Tensor<gpu, 3, DType>* buffer,
mshadow::Tensor<gpu, 1, DType>* areas,
int num_elem, int width_elem,
int coord_start, int id_index,
float threshold, bool force_suppress,
int in_format) {
using namespace mxnet_op;
constexpr int THRESHOLD = 1024;
for (int ref = 0; ref < topk; ref += THRESHOLD) {
constexpr int block_size = 512;
constexpr int N = THRESHOLD / block_size;
auto stream = mshadow::Stream<gpu>::GetStream(s);
if (!force_suppress && id_index >= 0) {
nms_apply_kernel<DType, N, true><<<num_batch, block_size, 0, stream>>>(topk,
sorted_index->dptr_,
batch_start->dptr_,
buffer->dptr_,
areas->dptr_,
num_elem,
width_elem,
coord_start,
id_index,
threshold,
force_suppress,
in_format,
ref);
int blocks_per_batch = (topk - ref - THRESHOLD + block_size - 1)/block_size;
int blocks = blocks_per_batch * num_batch;
if (blocks > 0) {
nms_apply_kernel_rest<DType, N, true><<<blocks, block_size, 0, stream>>>(topk,
sorted_index->dptr_,
batch_start->dptr_,
buffer->dptr_,
areas->dptr_,
num_elem,
width_elem,
coord_start,
id_index,
threshold,
force_suppress,
in_format,
ref,
blocks_per_batch);
}
} else {
nms_apply_kernel<DType, N, false><<<num_batch, block_size, 0, stream>>>(topk,
sorted_index->dptr_,
batch_start->dptr_,
buffer->dptr_,
areas->dptr_,
num_elem,
width_elem,
coord_start,
id_index,
threshold,
force_suppress,
in_format,
ref);
int blocks_per_batch = (topk - ref - THRESHOLD + block_size - 1)/block_size;
int blocks = blocks_per_batch * num_batch;
if (blocks > 0) {
nms_apply_kernel_rest<DType, N, false><<<blocks, block_size, 0, stream>>>(topk,
sorted_index->dptr_,
batch_start->dptr_,
buffer->dptr_,
areas->dptr_,
num_elem,
width_elem,
coord_start,
id_index,
threshold,
force_suppress,
in_format,
ref,
blocks_per_batch);
}
}
}
}

} // namespace op
} // namespace mxnet

Expand Down
Loading

0 comments on commit cf74099

Please sign in to comment.