MXNet Extensions enhancements2 (#19016)
* initial commit

* split lib_api.h into lib_api.cc, updated examples for 2.0/gluon

* fixed licenses

* whitespace

* whitespace

* modernize

* fix modernize

* fix modernize

* fix modernize

* fixed move

* added lib_api.cc to CMakeLists.txt for example libs

* working example

* remove GLOBAL to fix protobuf issue

* fixed library unload

* added test target

* fixed sanity

* changed destructor to default

* added /LD option for customop_gpu_lib target

* moved /LD inside the <>

* diff compile flags for relu_lib.cu and lib_api.cc

* set CMAKE_VERBOSE_MAKEFILE for debug

* added -v to ninja

* added /MT

* another try

* changed /MT to -MT

* set flags for cxx separately

* split /LD /MT flags

* refactored cuda APIs into header file

* removed debugging stuff

* updated instructions for gpu build

* moved building into cmakelists

* moved build stuff into separate CMakeLists.txt

* fixed gpu example

* fixed license

* added dmlc library dependency

* added nnvm dependency

* removed nnvm dmlc dependencies, added WINDOWS_EXPORT_ALL_SYMBOLS option

* fixed WINDOWS_EXPORT_ALL_SYMBOLS

* changed nnvm to shared library

* backed out external ops changes

* split relu example into separate files to test separate lib_api.h/cc

* sanity

* addressed initial review items

Co-authored-by: Ubuntu <ubuntu@ip-172-31-6-220.us-west-2.compute.internal>
samskalicky and Ubuntu authored Sep 1, 2020
1 parent 9268f89 commit 8379740
Showing 23 changed files with 2,150 additions and 1,556 deletions.
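
The theme across the diffs below: lib_api.h was split so the implementations live in src/lib_api.cc, and every extension library now compiles that file alongside its own sources while including the header as mxnet/lib_api.h. A minimal sketch of an extension source under the new layout (my_ext.cc and its build line are hypothetical, mirroring init_lib.cc and the Makefiles below):

// my_ext.cc -- hypothetical minimal extension under the post-split layout.
// Build together with the implementation file, e.g. from the repo root:
//   g++ -std=c++11 -shared -fPIC my_ext.cc src/lib_api.cc -o libmy_ext.so -I include
#include <iostream>
#include "mxnet/lib_api.h"  // header now holds declarations only

using namespace mxnet::ext;

// called once when MXNet loads the library; same hook as in relu_lib.cc below
MXReturnValue initialize(int version) {
  if (version >= 20000) {  // this commit targets MXNet 2.0
    std::cout << "MXNet version " << version << " supported" << std::endl;
    return MX_SUCCESS;
  }
  MX_ERROR_MSG << "MXNet version " << version << " not supported";
  return MX_FAIL;
}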
CMakeLists.txt (19 changes: 11 additions & 8 deletions)
@@ -722,19 +722,20 @@ endif()
target_compile_definitions(mxnet PUBLIC DMLC_LOG_FATAL_THROW=$<BOOL:${LOG_FATAL_THROW}>)

# extension libraries (custom operators, custom subgraphs) are built by default
-add_library(customop_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/gemm_lib.cc)
-add_library(transposecsr_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposecsr_lib.cc)
-add_library(transposerowsp_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposerowsp_lib.cc)
-add_library(subgraph_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_subgraph/subgraph_lib.cc)
-add_library(pass_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_pass/pass_lib.cc)
+add_library(customop_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/gemm_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(transposecsr_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposecsr_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(transposerowsp_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/transposerowsp_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(subgraph_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_subgraph/subgraph_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+add_library(pass_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_pass/pass_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)

target_include_directories(customop_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
target_include_directories(transposecsr_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
target_include_directories(transposerowsp_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
target_include_directories(subgraph_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
target_include_directories(pass_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
if(USE_CUDA)
-add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu)
-target_include_directories(customop_gpu_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet)
+add_library(customop_gpu_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cu ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/relu_lib.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/lib_api.cc)
+target_include_directories(customop_gpu_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op)
endif()
if(UNIX)
if (USE_CUDA)
@@ -752,7 +753,9 @@ elseif(MSVC)
set_target_properties(subgraph_lib PROPERTIES PREFIX "lib")
set_target_properties(pass_lib PROPERTIES PREFIX "lib")
if(USE_CUDA)
-target_compile_options(customop_gpu_lib PUBLIC "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fPIC>")
+target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-LD -MT>")
+target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/LD>")
+target_compile_options(customop_gpu_lib PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/MT>")
set_target_properties(customop_gpu_lib PROPERTIES PREFIX "lib")
endif()
endif()
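The MSVC branch above is where the /LD and /MT experiments from the commit log landed: a target mixing CUDA and C++ sources needs its host flags delivered per language, through -Xcompiler for nvcc and directly for cl. A generalized sketch of that pattern (my_mixed_lib, kernel.cu, and host.cc are hypothetical names):

# Hypothetical mixed CUDA/C++ shared library on MSVC. nvcc only forwards
# host-compiler flags passed via -Xcompiler, so the same options are
# spelled differently per language.
add_library(my_mixed_lib SHARED kernel.cu host.cc)
target_compile_options(my_mixed_lib PRIVATE
  "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-LD -MT>"  # forwarded to cl by nvcc
  "$<$<COMPILE_LANGUAGE:CXX>:/LD>"                  # given to cl directly
  "$<$<COMPILE_LANGUAGE:CXX>:/MT>")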
config/linux_gpu.cmake (16 changes: 10 additions & 6 deletions)
@@ -24,7 +24,8 @@
#
# $ cp config/linux_gpu.cmake config.cmake
#
-# Next modify the according entries, and then compile by
+# Next modify the entries in the config.cmake like MXNET_CUDA_ARCH to set the specific
+# GPU architecture, and then compile by
#
# $ mkdir build; cd build
# $ cmake ..
@@ -42,15 +43,18 @@ set(USE_CUDA ON CACHE BOOL "Build with CUDA support")
set(USE_CUDNN ON CACHE BOOL "Build with cudnn support, if found")

# Target NVIDIA GPU achitecture.
-# Valid options are "Auto" for autodetection, "All" for all available
-# architectures or a list of architectures by compute capability number, such as
-# "7.0" or "7.0;7.5" as well as name, such as "Volta" or "Volta;Turing".
+# Valid options are:
+# - "Auto" for autodetection, will try and discover which GPU architecture to use by
+#   looking at the available GPUs on the machine that you're building on
+# - "All" for all available GPU architectures supported by the version of CUDA installed
+# - "specific GPU architectures" by giving the compute capability number such as
+#   "7.0" or "7.0;7.5" (ie. sm_70 or sm_75) or you can specify the name like:
+#   "Volta" or "Volta;Turing", be sure not to use quotes (ie. just set to 7.0)
# The value specified here is passed to cmake's CUDA_SELECT_NVCC_ARCH_FLAGS to
# obtain the compilation flags for nvcc.
#
# When compiling on a machine without GPU, autodetection will fail and you
-# should instead specify the target architecture manually to avoid excessive
-# compilation times.
+# should instead specify the target architecture manually.
set(MXNET_CUDA_ARCH "Auto" CACHE STRING "Target NVIDIA GPU achitecture")

#---------------------------------------------
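For example, pinning the architectures instead of autodetecting (useful on a GPU-less build machine, per the note above) is a one-line change in config.cmake; the "7.0;7.5" pair here is just illustrative:

# Volta + Turing, instead of the default "Auto"
set(MXNET_CUDA_ARCH "7.0;7.5" CACHE STRING "Target NVIDIA GPU architecture")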
example/extensions/lib_api/Makefile (2 changes: 1 addition & 1 deletion)
@@ -16,7 +16,7 @@
# under the License.

all:
-g++ -std=c++11 -shared -fPIC init_lib.cc -o libinit_lib.so -I ../../../include/mxnet
+g++ -std=c++11 -shared -fPIC init_lib.cc ../../../src/lib_api.cc -o libinit_lib.so -I ../../../include

test:
g++ -std=c++11 -O3 -o libtest libtest.cc -ldl -I ../../../include/mxnet
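The workflow here is unchanged; only the compile line gained lib_api.cc and the broader include root. A quick round trip (paths assume the repo layout above; the test harness links -ldl, so it loads the built library dynamically):

cd example/extensions/lib_api
make all    # builds libinit_lib.so, now with lib_api.cc compiled in
make test   # builds libtest, which loads the .so at runtime via libdl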
example/extensions/lib_api/init_lib.cc (2 changes: 1 addition & 1 deletion)
@@ -24,7 +24,7 @@
*/

#include <iostream>
#include "lib_api.h"
#include "mxnet/lib_api.h"

using namespace mxnet::ext;

example/extensions/lib_custom_op/Makefile (11 changes: 7 additions & 4 deletions)
@@ -18,16 +18,19 @@
all: gemm_lib relu_lib transposecsr_lib transposerowsp_lib

gemm_lib:
-g++ -shared -fPIC -std=c++11 gemm_lib.cc -o libgemm_lib.so -I ../../../include/mxnet
+g++ -shared -fPIC -std=c++11 gemm_lib.cc ../../../src/lib_api.cc -o libgemm_lib.so -I ../../../include

relu_lib:
-nvcc -shared -std=c++11 -Xcompiler -fPIC relu_lib.cu -o librelu_lib.so -I ../../../include/mxnet
+g++ -fPIC -c -std=c++11 relu_lib.cc -o relu_lib.cc.o -I ../../../include
+g++ -fPIC -c -std=c++11 ../../../src/lib_api.cc -o lib_api.cc.o -I ../../../include
+nvcc -c -std=c++11 -Xcompiler -fPIC relu_lib.cu -o relu_lib.cu.o -I ../../../include
+nvcc -shared relu_lib.cc.o lib_api.cc.o relu_lib.cu.o -o librelu_lib.so

transposecsr_lib:
-g++ -shared -fPIC -std=c++11 transposecsr_lib.cc -o libtransposecsr_lib.so -I ../../../include/mxnet
+g++ -shared -fPIC -std=c++11 transposecsr_lib.cc ../../../src/lib_api.cc -o libtransposecsr_lib.so -I ../../../include

transposerowsp_lib:
-g++ -shared -fPIC -std=c++11 transposerowsp_lib.cc -o libtransposerowsp_lib.so -I ../../../include/mxnet
+g++ -shared -fPIC -std=c++11 transposerowsp_lib.cc ../../../src/lib_api.cc -o libtransposerowsp_lib.so -I ../../../include

clean:
rm -rf libgemm_lib.so librelu_lib.so libtransposecsr_lib.so libtransposerowsp_lib.so
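The relu rule above is the pattern for any mixed CUDA/C++ extension: compile host C++ and device code to objects separately, then let nvcc link the shared library. This keeps lib_api.cc under the host compiler rather than nvcc, matching the "diff compile flags for relu_lib.cu and lib_api.cc" step in the commit log. A generalized sketch (the my_op.* names are hypothetical):

g++  -fPIC -c -std=c++11 my_op.cc -o my_op.cc.o -I ../../../include
g++  -fPIC -c -std=c++11 ../../../src/lib_api.cc -o lib_api.cc.o -I ../../../include
nvcc -c -std=c++11 -Xcompiler -fPIC my_op.cu -o my_op.cu.o -I ../../../include
nvcc -shared my_op.cc.o lib_api.cc.o my_op.cu.o -o libmy_op.so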
example/extensions/lib_custom_op/gemm_lib.cc (2 changes: 1 addition & 1 deletion)
@@ -25,7 +25,7 @@

#include <iostream>
#include <utility>
#include "lib_api.h"
#include "mxnet/lib_api.h"

using namespace mxnet::ext;

example/extensions/lib_custom_op/relu_lib.cc (171 changes: 171 additions & 0 deletions)
@@ -0,0 +1,171 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* Copyright (c) 2020 by Contributors
* \file relu_lib.cc
* \brief simple custom relu and noisy relu operators: CPU implementations and op registration (the CUDA kernels live in relu_lib.cu)
*/

#include <iostream>
#include "relu_lib.h"

using namespace mxnet::ext;

MXReturnValue parseAttrs(const std::unordered_map<std::string, std::string>& attrs,
int* num_in, int* num_out) {
*num_in = 1;
*num_out = 1;
return MX_SUCCESS;
}

MXReturnValue inferType(const std::unordered_map<std::string, std::string>& attrs,
std::vector<int>* intypes,
std::vector<int>* outtypes) {
outtypes->at(0) = intypes->at(0);
return MX_SUCCESS;
}

MXReturnValue inferShape(const std::unordered_map<std::string, std::string>& attrs,
std::vector<std::vector<unsigned int>>* inshapes,
std::vector<std::vector<unsigned int>>* outshapes) {
outshapes->at(0) = inshapes->at(0);
return MX_SUCCESS;
}

MXReturnValue forwardCPU(const std::unordered_map<std::string, std::string>& attrs,
std::vector<MXTensor>* inputs,
std::vector<MXTensor>* outputs,
const OpResource& res) {
float* in_data = inputs->at(0).data<float>();
float* out_data = outputs->at(0).data<float>();
for (int i=0; i<inputs->at(0).size(); i++) {
out_data[i] = in_data[i] > 0 ? in_data[i] : 0;
}
return MX_SUCCESS;
}

MXReturnValue backwardCPU(const std::unordered_map<std::string, std::string>& attrs,
std::vector<MXTensor>* inputs,
std::vector<MXTensor>* outputs,
const OpResource& res) {
float* out_grad = inputs->at(0).data<float>();
float* in_data = inputs->at(1).data<float>();
float* in_grad = outputs->at(0).data<float>();
for (int i=0; i<inputs->at(1).size(); i++) {
in_grad[i] = in_data[i] > 0 ? 1 * out_grad[i] : 0;
}
return MX_SUCCESS;
}

REGISTER_OP(my_relu)
.setParseAttrs(parseAttrs)
.setInferType(inferType)
.setInferShape(inferShape)
.setForward(forwardCPU, "cpu")
.setForward(forwardGPU, "gpu")
.setBackward(backwardCPU, "cpu")
.setBackward(backwardGPU, "gpu");


MyStatefulReluCPU::MyStatefulReluCPU(const std::unordered_map<std::string, std::string>& attrs)
: attrs_(attrs) {}

MXReturnValue MyStatefulReluCPU::Forward(std::vector<MXTensor>* inputs,
std::vector<MXTensor>* outputs,
const OpResource& op_res) {
return forwardCPU(attrs_, inputs, outputs, op_res);
}

MXReturnValue MyStatefulReluCPU::Backward(std::vector<MXTensor>* inputs,
std::vector<MXTensor>* outputs,
const OpResource& op_res) {
return backwardCPU(attrs_, inputs, outputs, op_res);
}

MyStatefulReluGPU::MyStatefulReluGPU(const std::unordered_map<std::string, std::string>& attrs)
: attrs_(attrs) {}

MXReturnValue MyStatefulReluGPU::Forward(std::vector<MXTensor>* inputs,
std::vector<MXTensor>* outputs,
const OpResource& op_res) {
return forwardGPU(attrs_, inputs, outputs, op_res);
}

MXReturnValue MyStatefulReluGPU::Backward(std::vector<MXTensor>* inputs,
std::vector<MXTensor>* outputs,
const OpResource& op_res) {
return backwardGPU(attrs_, inputs, outputs, op_res);
}


MXReturnValue createOpStateCPU(const std::unordered_map<std::string, std::string>& attrs,
CustomStatefulOp** op_inst) {
*op_inst = new MyStatefulReluCPU(attrs);
return MX_SUCCESS;
}

MXReturnValue createOpStateGPU(const std::unordered_map<std::string, std::string>& attrs,
CustomStatefulOp** op_inst) {
*op_inst = new MyStatefulReluGPU(attrs);
return MX_SUCCESS;
}

REGISTER_OP(my_state_relu)
.setParseAttrs(parseAttrs)
.setInferType(inferType)
.setInferShape(inferShape)
.setCreateOpState(createOpStateCPU, "cpu")
.setCreateOpState(createOpStateGPU, "gpu");

MXReturnValue noisyForwardCPU(const std::unordered_map<std::string, std::string>& attrs,
std::vector<MXTensor>* inputs,
std::vector<MXTensor>* outputs,
const OpResource& res) {
float* in_data = inputs->at(0).data<float>();
float* out_data = outputs->at(0).data<float>();

mx_cpu_rand_t* states = res.get_cpu_rand_states();
std::normal_distribution<float> dist_normal;

for (int i=0; i<inputs->at(0).size(); ++i) {
float noise = dist_normal(*states);
out_data[i] = in_data[i] + noise > 0 ? in_data[i] + noise : 0;
}
return MX_SUCCESS;
}

REGISTER_OP(my_noisy_relu)
.setParseAttrs(parseAttrs)
.setInferType(inferType)
.setInferShape(inferShape)
.setForward(noisyForwardCPU, "cpu")
.setForward(noisyForwardGPU, "gpu")
.setBackward(backwardCPU, "cpu")
.setBackward(backwardGPU, "gpu");

MXReturnValue initialize(int version) {
if (version >= 20000) {
std::cout << "MXNet version " << version << " supported" << std::endl;
return MX_SUCCESS;
} else {
MX_ERROR_MSG << "MXNet version " << version << " not supported";
return MX_FAIL;
}
}
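Once built, the operators registered above behave like ordinary MXNet ops after the library is loaded. A hedged usage sketch (mx.library.load is the standard extension-loading entry point; whether custom ops still surface under mx.nd in a 2.0/gluon setup is an assumption carried over from the 1.x examples):

import mxnet as mx

# runs initialize() and registers my_relu, my_state_relu, my_noisy_relu
mx.library.load('librelu_lib.so')

x = mx.nd.random.uniform(-1, 1, shape=(2, 3))
y = mx.nd.my_relu(x)  # dispatches to forwardCPU on the default context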
