Refactor cast storage #109
Changes from 4 commits (of 6: ae11e6f, 430864d, cab5a8c, c6c3952, 0bd6493, 8a722e0)
src/operator/nn/cast_storage-inl.h

@@ -6,8 +6,11 @@
 #ifndef MXNET_OPERATOR_NN_CAST_STORAGE_INL_H_
 #define MXNET_OPERATOR_NN_CAST_STORAGE_INL_H_
 
+#include <numeric>
+#include <dmlc/timer.h>
 #include <mxnet/ndarray.h>
+#include <vector>
 #include "../mxnet_op.h"
 #include "../operator_common.h"
 #ifdef __CUDACC__
 #include "./cast_storage-inl.cuh"
 #endif  // __CUDACC__
@@ -54,23 +57,41 @@ inline void CastStorageDnsRspImpl(mshadow::Stream<cpu>* s, const TBlob& dns, NDA
   rsp->CheckAndAllocAuxData(rowsparse::kIdx, mshadow::Shape1(num_rows));
   TBlob row_idx_blob = rsp->aux_data(rowsparse::kIdx);
   RType* row_idx = row_idx_blob.dptr<RType>();
+
+  double start = dmlc::GetTime();
   mxnet_op::Kernel<MarkRspRowIdx, cpu>::Launch(s, num_rows, row_idx,
                                                dns.dptr<DType>(), num_cols);
+  double elapsed1 = dmlc::GetTime() - start;
+
   index_t nnr = 0;
-  nnr = std::accumulate(row_idx, row_idx+num_rows, nnr);
+  start = dmlc::GetTime();
+  nnr = mxnet::common::ParallelAccumulate(row_idx, num_rows, nnr);
+  // nnr = std::accumulate(row_idx, row_idx+num_rows, nnr);
+  double elapsed2 = dmlc::GetTime() - start;
+

Review comment: remove it?
Reply: Yes, I will push another commit to clean up everything. This is kept here for convenience for now.

   rsp->set_aux_shape(rowsparse::kIdx, mshadow::Shape1(nnr));
   if (0 == nnr) return;
   rsp->CheckAndAllocData(mshadow::Shape2(nnr, num_cols));
   mshadow::Tensor<cpu, 2, DType> dns_data = dns.FlatTo2D<cpu, DType>(s);
   mshadow::Tensor<cpu, 2, DType> rsp_data = rsp->data().FlatTo2D<cpu, DType>(s);
   size_t idx = 0;
+
+  start = dmlc::GetTime();
   for (index_t i = 0; i < num_rows; ++i) {
     if (row_idx[i] > 0) {
       row_idx[idx] = i;
       mshadow::Copy(rsp_data[idx], dns_data[i], s);
       ++idx;
     }
   }
+  double elapsed3 = dmlc::GetTime() - start;
+
+  double total = elapsed1 + elapsed2 + elapsed3;
+  LOG(INFO) << "shape = " << rsp->shape();
+  LOG(INFO) << "nnr = " << nnr;
+  LOG(INFO) << "MarkRspRowIdx cost " << elapsed1 * 1000 << " ms, " << static_cast<int>(elapsed1/total*100) << '%';
+  LOG(INFO) << "ParallelAccumulate cost " << elapsed2 * 1000 << " ms, " << static_cast<int>(elapsed2/total*100) << '%';
+  LOG(INFO) << "Copy rows cost " << elapsed3 * 1000 << " ms, " << static_cast<int>(elapsed3/total*100) << '%';
   });

Review comment: We're not committing these profiling logs, are we?
Reply: No, we are not. I will push another commit to clean up everything later.

   });
 }
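Taken together, the hunk above casts dense to row-sparse in three steps: mark every row that contains a nonzero, sum the marks to get the number of non-zero rows (nnr), then compact the marked rows and their indices into the output. Below is a minimal standalone sketch of the same three steps; all names are hypothetical, and plain std::vector stands in for the mshadow tensors:

#include <cstdint>
#include <numeric>
#include <vector>

// Hypothetical row-sparse container: row_idx holds the indices of the
// retained rows; data holds nnr x num_cols values, row-major.
struct RowSparse {
  std::vector<int64_t> row_idx;
  std::vector<float> data;
};

RowSparse CastDnsToRsp(const std::vector<float>& dns,
                       int64_t num_rows, int64_t num_cols) {
  // Step 1: mark each row that has at least one nonzero entry
  // (the job of the MarkRspRowIdx kernel above).
  std::vector<int64_t> mark(num_rows, 0);
  for (int64_t i = 0; i < num_rows; ++i) {
    for (int64_t j = 0; j < num_cols; ++j) {
      if (dns[i * num_cols + j] != 0.0f) { mark[i] = 1; break; }
    }
  }
  // Step 2: nnr is the sum of the marks (ParallelAccumulate above).
  const int64_t nnr = std::accumulate(mark.begin(), mark.end(), int64_t{0});
  // Step 3: compact the marked rows and their indices into the output
  // (the final copy loop above).
  RowSparse rsp;
  rsp.row_idx.reserve(nnr);
  rsp.data.reserve(nnr * num_cols);
  for (int64_t i = 0; i < num_rows; ++i) {
    if (mark[i]) {
      rsp.row_idx.push_back(i);
      rsp.data.insert(rsp.data.end(), dns.begin() + i * num_cols,
                      dns.begin() + (i + 1) * num_cols);
    }
  }
  return rsp;
}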
@@ -287,6 +308,45 @@ void CastStorageComputeImpl(mshadow::Stream<xpu>* s,
   }
 }
 
+struct CastStorageParam : public dmlc::Parameter<CastStorageParam> {
+  int storage_type;
+  DMLC_DECLARE_PARAMETER(CastStorageParam) {
+    DMLC_DECLARE_FIELD(storage_type)
+      .add_enum("default", kDefaultStorage)
+      .add_enum("row_sparse", kRowSparseStorage)
+      .add_enum("csr", kCSRStorage)
+      .describe("Output storage type.");
+  }
+};
+
+inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs,
+                                        std::vector<int> *in_attrs,
+                                        std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  CHECK_NE(in_attrs->at(0), kUndefinedStorage)
+    << "src ndarray's storage type must be specified";
+  const CastStorageParam& param = nnvm::get<CastStorageParam>(attrs.parsed);
+  CHECK_NE(param.storage_type, kUndefinedStorage)
+    << "dst ndarray's storage type must be specified";
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, param.storage_type);
+  return true;
+}
+
+template<typename xpu>
+void CastStorageComputeEx(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<NDArray>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<NDArray>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_EQ(inputs.size(), 1);
+  CHECK_EQ(outputs.size(), 1);
+  CastStorageComputeImpl<xpu>(s, inputs[0], outputs[0]);
+}
+
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_NN_CAST_STORAGE_INL_H_
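CastStorageInferStorageType above pins the single output slot to the storage type requested by the parameter, while requiring the input's type to already be known. Here is a small self-contained sketch of that assign-or-verify contract, assuming TYPE_ASSIGN_CHECK fills an undefined slot and otherwise requires a match (the harness and enum values are illustrative):

#include <cassert>
#include <vector>

enum StorageType {
  kUndefinedStorage = -1,
  kDefaultStorage,
  kRowSparseStorage,
  kCSRStorage
};

// Assign-or-verify, as TYPE_ASSIGN_CHECK is assumed to behave:
// fill an undefined slot, otherwise demand agreement.
bool AssignStorageType(std::vector<int>* attrs, size_t idx, int stype) {
  if ((*attrs)[idx] == kUndefinedStorage) {
    (*attrs)[idx] = stype;
    return true;
  }
  return (*attrs)[idx] == stype;
}

int main() {
  std::vector<int> out_attrs{kUndefinedStorage};  // dst starts undefined
  // storage_type parsed from "row_sparse":
  assert(AssignStorageType(&out_attrs, 0, kRowSparseStorage));
  assert(out_attrs[0] == kRowSparseStorage);
  // A later, conflicting request must fail rather than overwrite.
  assert(!AssignStorageType(&out_attrs, 0, kCSRStorage));
  return 0;
}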
src/operator/nn/cast_storage.cc (new file)

@@ -0,0 +1,31 @@
/*!
 * Copyright (c) 2017 by Contributors
 * \file cast_storage.cc
 * \brief CPU Implementation of cast_storage operator.
 */
#include "./cast_storage-inl.h"
#include "../elemwise_op_common.h"
#include "../tensor/elemwise_unary_op.h"

namespace mxnet {
namespace op {

// TODO(haibin) declare backward op for cast storage
DMLC_REGISTER_PARAMETER(CastStorageParam);

Review comment: Remove

NNVM_REGISTER_OP(cast_storage)
.describe(R"code(Casts tensor storage type to the new type.
)code" ADD_FILELINE)
.set_num_inputs(1)
.set_num_outputs(1)
.set_attr_parser(ParamParser<CastStorageParam>)
.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
.set_attr<nnvm::FInferStorageType>("FInferStorageType", CastStorageInferStorageType)
.set_attr<FCompute>("FCompute<cpu>", IdentityCompute<cpu>)
.set_attr<FComputeEx>("FComputeEx<cpu>", CastStorageComputeEx<cpu>)

Review comment: remove

.add_argument("data", "NDArray-or-Symbol", "The input.")
.add_arguments(CastStorageParam::__FIELDS__());

}  // namespace op
}  // namespace mxnet
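Note the dual registration above: FCompute<cpu> covers the case where input and output both use default (dense) storage, so an identity copy suffices, while FComputeEx<cpu> performs the actual conversion for sparse storage types. A hedged sketch of that dispatch decision (illustrative only, not the executor's real logic):

// Hypothetical predicate: dense-to-dense casts are identity copies;
// anything involving a sparse storage type needs the FComputeEx path.
inline bool NeedsStorageCast(int src_stype, int dst_stype) {
  return src_stype != kDefaultStorage || dst_stype != kDefaultStorage;
}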
src/operator/nn/cast_storage.cu (new file)

@@ -0,0 +1,17 @@
/*!
 * Copyright (c) 2017 by Contributors
 * \file cast_storage.cu
 * \brief GPU Implementation of cast_storage operator.
 */
#include "./cast_storage-inl.h"
#include "../tensor/elemwise_unary_op.h"

namespace mxnet {
namespace op {

NNVM_REGISTER_OP(cast_storage)
.set_attr<FCompute>("FCompute<gpu>", IdentityCompute<gpu>)
.set_attr<FComputeEx>("FComputeEx<gpu>", CastStorageComputeEx<gpu>);

}  // namespace op
}  // namespace mxnet
Review comment: why are there two types T and V?
Reply: T is for the element type of the array, and V is for the type of the accumulated sum.
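For context, a hedged sketch of what such a two-type helper might look like; the actual mxnet::common::ParallelAccumulate signature and implementation may differ:

#include <cstddef>

// T is the element type of the array, V the type of the running sum,
// so narrow elements (e.g. int32 row marks) can accumulate into a
// wider type without overflow. Sketch only; the real helper may differ.
template <typename T, typename V>
V ParallelAccumulate(const T* data, const size_t size, V init) {
  V sum = init;
  #pragma omp parallel for reduction(+:sum)
  for (int i = 0; i < static_cast<int>(size); ++i) {
    sum += data[i];
  }
  return sum;
}

// Usage mirroring the cast above: sum integer row marks into an
// index-typed accumulator.
//   index_t nnr = ParallelAccumulate(row_idx, num_rows, index_t(0));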