sync to latest code #30

Merged · 2 commits · Apr 23, 2019
2 changes: 1 addition & 1 deletion 3rdparty/mshadow
8 changes: 8 additions & 0 deletions CMakeLists.txt
@@ -50,6 +50,7 @@ mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." OFF)
mxnet_option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF)
mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF)
mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF)
mxnet_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF)

message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}")
message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}")
@@ -295,6 +296,13 @@ else()
add_definitions(-DMXNET_USE_NCCL=0)
endif()

if (USE_INT64_TENSOR_SIZE)
  message(STATUS "Using 64-bit integer for tensor size")
  add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=1)
else()
  add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=0)
endif()

include(cmake/ChooseBlas.cmake)
if(USE_CUDA AND FIRST_CUDA)
include(3rdparty/mshadow/cmake/Utils.cmake)
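Both this CMake branch and the Makefile guard in the next hunk feed the same switch: they define MSHADOW_INT64_TENSOR_SIZE, which controls whether mshadow represents the total number of elements in a tensor with int64_t instead of a 32-bit index. The option defaults to OFF, so existing builds keep the 32-bit behaviour unless it is enabled explicitly (cmake -DUSE_INT64_TENSOR_SIZE=ON or make USE_INT64_TENSOR_SIZE=1). A minimal Python sketch of the intended user-visible effect, assuming a library compiled with the flag on (with a default build an element count this large is not representable):

import mxnet as mx

# Assumes a build compiled with USE_INT64_TENSOR_SIZE=ON and roughly 9 GB of
# free memory: 2**31 + 1 float32 elements overflow a signed 32-bit element count.
x = mx.nd.ones(shape=(2**31 + 1,))
print(x.shape)  # (2147483649,)
print(x.size)   # 2147483649 -- representable only when sizes are 64-bit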
5 changes: 5 additions & 0 deletions Makefile
@@ -189,6 +189,11 @@ ifeq ($(USE_OPERATOR_TUNING), 1)
CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1
endif

ifeq ($(USE_INT64_TENSOR_SIZE), 1)
CFLAGS += -DMSHADOW_INT64_TENSOR_SIZE=1
else
CFLAGS += -DMSHADOW_INT64_TENSOR_SIZE=0
endif
# verify existence of separate lapack library when using blas/openblas/atlas
# switch off lapack support in case it can't be found
# issue covered with this
54 changes: 54 additions & 0 deletions ci/docker/runtime_functions.sh
@@ -755,6 +755,53 @@ build_ubuntu_gpu_cmake() {
    ninja -v
}

build_ubuntu_cpu_large_tensor() {
    set -ex
    cd /work/build
    build_ccache_wrappers
    cmake \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
        -DUSE_SIGNAL_HANDLER=ON \
        -DENABLE_TESTCOVERAGE=ON \
        -DUSE_CUDA=OFF \
        -DUSE_CUDNN=OFF \
        -DUSE_MKLDNN=OFF \
        -DCMAKE_BUILD_TYPE=Release \
        -DUSE_INT64_TENSOR_SIZE=ON \
        -G Ninja \
        /work/mxnet

    ninja -v
}

build_ubuntu_gpu_large_tensor() {
    set -ex
    cd /work/build
    build_ccache_wrappers
    cmake \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
        -DUSE_SIGNAL_HANDLER=ON \
        -DENABLE_TESTCOVERAGE=ON \
        -DUSE_CUDA=ON \
        -DUSE_CUDNN=ON \
        -DUSE_MKL_IF_AVAILABLE=OFF \
        -DUSE_MKLML_MKL=OFF \
        -DUSE_MKLDNN=OFF \
        -DUSE_DIST_KVSTORE=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DCUDA_ARCH_NAME=Manual \
        -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \
        -DUSE_INT64_TENSOR_SIZE=ON \
        -G Ninja \
        /work/mxnet

    ninja -v
}

build_ubuntu_blc() {
    echo "pass"
}
@@ -1183,6 +1230,13 @@ nightly_test_KVStore_singleNode() {
    python tests/nightly/test_kvstore.py
}

#Test Large Tensor Size
nightly_test_large_tensor() {
    set -ex
    export PYTHONPATH=./python/
    nosetests-3.4 tests/nightly/test_large_array.py
}
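The nightly hook above only points nosetests at tests/nightly/test_large_array.py; the test file itself is not part of this diff. A hedged sketch of the kind of check such a suite would contain, with illustrative constants and an illustrative operator rather than the real file's contents:

# Illustrative sketch only -- not the contents of tests/nightly/test_large_array.py.
# Needs a library built with USE_INT64_TENSOR_SIZE=ON and tens of GB of RAM.
import mxnet as mx
import numpy as np

LARGE_X = 100000000  # 1e8 rows
SMALL_Y = 50         # 1e8 * 50 = 5e9 elements, well past the 2**31 limit


def test_elementwise_add_large():
    a = mx.nd.ones(shape=(LARGE_X, SMALL_Y))
    b = mx.nd.ones(shape=(LARGE_X, SMALL_Y))
    res = a + b
    assert res.shape == (LARGE_X, SMALL_Y)
    # Spot-check a row past the 2**31 element boundary instead of materialising
    # the whole 5-billion-element result on the Python side.
    assert np.allclose(res[LARGE_X - 1].asnumpy(), 2.0)


if __name__ == '__main__':
    import nose
    nose.runmodule()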

#Tests Amalgamation Build with 5 different sets of flags
nightly_test_amalgamation() {
set -ex
28 changes: 28 additions & 0 deletions ci/jenkins/Jenkins_steps.groovy
@@ -119,6 +119,34 @@ def compile_unix_openblas_debug_cpu() {
  }]
}

def compile_unix_int64_cpu() {
  return ['CPU: USE_INT64_TENSOR_SIZE': {
    node(NODE_LINUX_CPU) {
      ws('workspace/build-cpu-int64') {
        timeout(time: max_time, unit: 'MINUTES') {
          utils.init_git()
          utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_large_tensor', false)
          utils.pack_lib('ubuntu_cpu_int64', mx_cmake_lib, true)
        }
      }
    }
  }]
}

def compile_unix_int64_gpu() {
  return ['GPU: USE_INT64_TENSOR_SIZE': {
    node(NODE_LINUX_GPU) {
      ws('workspace/build-gpu-int64') {
        timeout(time: max_time, unit: 'MINUTES') {
          utils.init_git()
          utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_large_tensor', false)
          utils.pack_lib('ubuntu_gpu_int64', mx_cmake_lib, true)
        }
      }
    }
  }]
}

def compile_unix_mkl_cpu() {
  return ['CPU: MKL': {
    node(NODE_LINUX_CPU) {
3 changes: 2 additions & 1 deletion ci/jenkins/Jenkinsfile_unix_cpu
@@ -38,7 +38,8 @@ core_logic: {
custom_steps.compile_unix_openblas_debug_cpu(),
custom_steps.compile_unix_mkl_cpu(),
custom_steps.compile_unix_mkldnn_cpu(),
custom_steps.compile_unix_mkldnn_mkl_cpu()
custom_steps.compile_unix_mkldnn_mkl_cpu(),
custom_steps.compile_unix_int64_cpu()
])

utils.parallel_stage('Tests', [
1 change: 1 addition & 0 deletions ci/jenkins/Jenkinsfile_unix_gpu
@@ -40,6 +40,7 @@ core_logic: {
custom_steps.compile_unix_cmake_mkldnn_gpu(),
custom_steps.compile_unix_cmake_gpu(),
custom_steps.compile_unix_tensorrt_gpu(),
custom_steps.compile_unix_int64_gpu()
])

utils.parallel_stage('Tests', [
16 changes: 13 additions & 3 deletions contrib/clojure-package/src/dev/generator.clj
@@ -123,7 +123,11 @@
(.write w "\n\n")
(.write w "\n\n")
(doseq [f functions]
(clojure.pprint/pprint f w)
(let [fstr (-> f
clojure.pprint/pprint
with-out-str
(clojure.string/replace #"\\n\\n" "\n"))]
(.write w fstr))
(.write w "\n"))))

;;;;;;; Common operations
@@ -447,7 +451,10 @@
:type "Map[String, String]"
:optional? true
:description "Attributes of the symbol"}))
doc (gen-symbol-api-doc fn-description params)
doc (clojure.string/join
"\n\n "
(-> (gen-symbol-api-doc fn-description params)
(clojure.string/split #"\n")))
default-call (gen-symbol-api-default-arity op-name params)]
`(~'defn ~(symbol fn-name)
~doc
@@ -520,7 +527,10 @@
:type "NDArray-or-Symbol"
:optional? true
:description "Output array."}))
doc (gen-ndarray-api-doc fn-description params)
doc (clojure.string/join
"\n\n "
(-> (gen-ndarray-api-doc fn-description params)
(clojure.string/split #"\n")))
opt-params (filter :optional? params)
req-params (remove :optional? params)
req-call (gen-ndarray-api-required-arity fn-name req-params)
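The generator change is easy to miss in the Clojure above: write-to-file now rewrites the doubled "\n" escape sequences that clojure.pprint leaves in the emitted docstrings into real line breaks, and the symbol/ndarray API generators pre-split the doc text on newlines and rejoin it with a doubled newline escape plus indentation, so each line of an operator description lands on its own line in the generated defn. A rough Python analogue of the unescaping step, purely to illustrate the string handling (the actual implementation is the Clojure shown above):

# Rough analogue of the docstring unescaping in generator.clj -- illustration
# only, not a port of the Clojure implementation.
def unescape_doc(pprinted: str) -> str:
    # pprint emits the docstring's newlines as the literal two-character
    # sequence backslash + n; turn the doubled ones into real line breaks.
    return pprinted.replace("\\n\\n", "\n")


emitted = "Applies an activation function element-wise to the input.\\n\\nThe following activation functions are supported:"
print(unescape_doc(emitted))
# Applies an activation function element-wise to the input.
# The following activation functions are supported:

The effect is visible in the regenerated test fixture below, where the activation and batch-norm docstrings now span readable lines instead of a single escaped string.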
85 changes: 83 additions & 2 deletions contrib/clojure-package/test/good-test-ndarray-api.clj
@@ -31,15 +31,96 @@

(defn
activation
"Applies an activation function element-wise to the input.\n\nThe following activation functions are supported:\n\n- `relu`: Rectified Linear Unit, :math:`y = max(x, 0)`\n- `sigmoid`: :math:`y = \\frac{1}{1 + exp(-x)}`\n- `tanh`: Hyperbolic tangent, :math:`y = \\frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}`\n- `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`\n- `softsign`: :math:`y = \\frac{x}{1 + abs(x)}`\n\n\n\nDefined in src/operator/nn/activation.cc:L167\n\n`data`: The input array.\n`act-type`: Activation function to be applied.\n`out`: Output array. (optional)\n"
"Applies an activation function element-wise to the input.

The following activation functions are supported:

- `relu`: Rectified Linear Unit, :math:`y = max(x, 0)`
- `sigmoid`: :math:`y = \\frac{1}{1 + exp(-x)}`
- `tanh`: Hyperbolic tangent, :math:`y = \\frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}`
- `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))`
- `softsign`: :math:`y = \\frac{x}{1 + abs(x)}`



Defined in src/operator/nn/activation.cc:L167

`data`: The input array.
`act-type`: Activation function to be applied.
`out`: Output array. (optional)"
([data act-type] (activation {:data data, :act-type act-type}))
([{:keys [data act-type out], :or {out nil}, :as opts}]
(util/coerce-return
(NDArrayAPI/Activation data act-type (util/->option out)))))

(defn
batch-norm
"Batch normalization.\n\nNormalizes a data batch by mean and variance, and applies a scale ``gamma`` as\nwell as offset ``beta``.\n\nAssume the input has more than one dimension and we normalize along axis 1.\nWe first compute the mean and variance along this axis:\n\n.. math::\n\n data\\_mean[i] = mean(data[:,i,:,...]) \\\\\n data\\_var[i] = var(data[:,i,:,...])\n\nThen compute the normalized output, which has the same shape as input, as following:\n\n.. math::\n\n out[:,i,:,...] = \\frac{data[:,i,:,...] - data\\_mean[i]}{\\sqrt{data\\_var[i]+\\epsilon}} * gamma[i] + beta[i]\n\nBoth *mean* and *var* returns a scalar by treating the input as a vector.\n\nAssume the input has size *k* on axis 1, then both ``gamma`` and ``beta``\nhave shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and\nthe inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these\ntwo outputs are blocked.\n\nBesides the inputs and the outputs, this operator accepts two auxiliary\nstates, ``moving_mean`` and ``moving_var``, which are *k*-length\nvectors. They are global statistics for the whole dataset, which are updated\nby::\n\n moving_mean = moving_mean * momentum + data_mean * (1 - momentum)\n moving_var = moving_var * momentum + data_var * (1 - momentum)\n\nIf ``use_global_stats`` is set to be true, then ``moving_mean`` and\n``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute\nthe output. It is often used during inference.\n\nThe parameter ``axis`` specifies which axis of the input shape denotes\nthe 'channel' (separately normalized groups). The default is 1. Specifying -1 sets the channel\naxis to be the last item in the input shape.\n\nBoth ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true,\nthen set ``gamma`` to 1 and its gradient to 0.\n\n.. Note::\n When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False,\n the sparse tensors will fallback.\n\n\n\nDefined in src/operator/nn/batch_norm.cc:L574\n\n`data`: Input data to batch normalization\n`gamma`: gamma array\n`beta`: beta array\n`moving-mean`: running mean of input\n`moving-var`: running variance of input\n`eps`: Epsilon to prevent div 0. Must be no less than CUDNN_BN_MIN_EPSILON defined in cudnn.h when using cudnn (usually 1e-5) (optional)\n`momentum`: Momentum for moving average (optional)\n`fix-gamma`: Fix gamma while training (optional)\n`use-global-stats`: Whether use global moving statistics instead of local batch-norm. This will force change batch-norm into a scale shift operator. (optional)\n`output-mean-var`: Output the mean and inverse std (optional)\n`axis`: Specify which shape axis the channel is specified (optional)\n`cudnn-off`: Do not select CUDNN operator, if available (optional)\n`out`: Output array. (optional)\n"
"Batch normalization.

Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
well as offset ``beta``.

Assume the input has more than one dimension and we normalize along axis 1.
We first compute the mean and variance along this axis:

.. math::

data\\_mean[i] = mean(data[:,i,:,...]) \\\\
data\\_var[i] = var(data[:,i,:,...])

Then compute the normalized output, which has the same shape as input, as following:

.. math::

out[:,i,:,...] = \\frac{data[:,i,:,...] - data\\_mean[i]}{\\sqrt{data\\_var[i]+\\epsilon}} * gamma[i] + beta[i]

Both *mean* and *var* returns a scalar by treating the input as a vector.

Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta``
have shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and
the inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these
two outputs are blocked.

Besides the inputs and the outputs, this operator accepts two auxiliary
states, ``moving_mean`` and ``moving_var``, which are *k*-length
vectors. They are global statistics for the whole dataset, which are updated
by::

moving_mean = moving_mean * momentum + data_mean * (1 - momentum)
moving_var = moving_var * momentum + data_var * (1 - momentum)

If ``use_global_stats`` is set to be true, then ``moving_mean`` and
``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute
the output. It is often used during inference.

The parameter ``axis`` specifies which axis of the input shape denotes
the 'channel' (separately normalized groups). The default is 1. Specifying -1 sets the channel
axis to be the last item in the input shape.

Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true,
then set ``gamma`` to 1 and its gradient to 0.

.. Note::
When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False,
the sparse tensors will fallback.



Defined in src/operator/nn/batch_norm.cc:L574

`data`: Input data to batch normalization
`gamma`: gamma array
`beta`: beta array
`moving-mean`: running mean of input
`moving-var`: running variance of input
`eps`: Epsilon to prevent div 0. Must be no less than CUDNN_BN_MIN_EPSILON defined in cudnn.h when using cudnn (usually 1e-5) (optional)
`momentum`: Momentum for moving average (optional)
`fix-gamma`: Fix gamma while training (optional)
`use-global-stats`: Whether use global moving statistics instead of local batch-norm. This will force change batch-norm into a scale shift operator. (optional)
`output-mean-var`: Output the mean and inverse std (optional)
`axis`: Specify which shape axis the channel is specified (optional)
`cudnn-off`: Do not select CUDNN operator, if available (optional)
`out`: Output array. (optional)"
([data gamma beta moving-mean moving-var]
(batch-norm
{:data data,