Upgrade the auto-downloaded cuTENSOR / cuTensorNet archives and port the
einsum handle to the cuTensorNet workspace-descriptor API.

  * cmake/FindcuTENSOR.cmake: fallback download moves from 1.4.0.6 to 1.5.0.3.
  * cmake/FindcuTensorNet.cmake: fallback download moves from 0.1.0.30 to
    22.03.0.40 and is fetched from redist/cuquantum/linux-x86_64/.
  * include/matx_einsum.h: sets the optimizer graph imbalance factor to 30,
    creates a workspace descriptor (workDesc_), queries the minimum device
    workspace size, asserts that workSize_ is at least that large (equality
    is accepted), allocates and registers the workspace, and passes
    workDesc_ to plan creation, autotuning and contraction in place of the
    raw workspace pointer/size pair.
  * test/00_tensor/EinsumTests.cu: the guard macro becomes
    MATX_ENABLE_CUTENSOR.

NOTE(review): indentation inside the hunks was reconstructed from a
whitespace-collapsed copy of this patch; apply with --ignore-whitespace if
context lines do not match byte-for-byte.

diff --git a/cmake/FindcuTENSOR.cmake b/cmake/FindcuTENSOR.cmake
index 5e441ef8..76182d6d 100644
--- a/cmake/FindcuTENSOR.cmake
+++ b/cmake/FindcuTENSOR.cmake
@@ -79,7 +79,7 @@ else()
 endif()
 
 if(NOT cuTENSOR_FOUND)
-  set(CUTENSOR_VERSION 1.4.0.6)
+  set(CUTENSOR_VERSION 1.5.0.3)
   set(CUTENSOR_FILENAME libcutensor-linux-x86_64-${CUTENSOR_VERSION}-archive)
 
   message(STATUS "cuTENSOR not found. Downloading library. By continuing this download you accept to the license terms of cuTENSOR")
diff --git a/cmake/FindcuTensorNet.cmake b/cmake/FindcuTensorNet.cmake
index 34a462e1..c49a856f 100644
--- a/cmake/FindcuTensorNet.cmake
+++ b/cmake/FindcuTensorNet.cmake
@@ -81,10 +81,10 @@ endif()
 
 if(NOT cuTensorNet_FOUND)
   message(STATUS "cuTensorNet not found. Downloading library. By continuing this download you accept to the license terms of cuQuantum SDK")
-  set(CUTENSORNET_VERSION 0.1.0.30)
+  set(CUTENSORNET_VERSION 22.03.0.40)
   set(CUTENSORNET_FILENAME cuquantum-linux-x86_64-${CUTENSORNET_VERSION}-archive)
-  
-    file(DOWNLOAD https://developer.download.nvidia.com/compute/cuquantum/redist/linux-x86_64/${CUTENSORNET_FILENAME}.tar.xz
+
+  file(DOWNLOAD https://developer.download.nvidia.com/compute/cuquantum/redist/cuquantum/linux-x86_64/${CUTENSORNET_FILENAME}.tar.xz
        ${CMAKE_BINARY_DIR}/${CUTENSORNET_FILENAME}.tar.xz)
 
   file(ARCHIVE_EXTRACT INPUT ${CMAKE_BINARY_DIR}/${CUTENSORNET_FILENAME}.tar.xz DESTINATION ${CMAKE_BINARY_DIR}/cutensornet/)
diff --git a/include/matx_einsum.h b/include/matx_einsum.h
index 77a9da33..ffeef9d3 100644
--- a/include/matx_einsum.h
+++ b/include/matx_einsum.h
@@ -127,6 +127,15 @@ class matxEinsumHandle_t {
       MATX_ASSERT_STR(cutensornetCreateContractionOptimizerInfo(handle_, descNet_, &optimizerInfo) == CUTENSORNET_STATUS_SUCCESS,
         matxcuTensorError, "Failed to create cuTensorNet contraction optimizer info");
 
+      int imbalance_factor = 30;
+      MATX_ASSERT_STR(cutensornetContractionOptimizerConfigSetAttribute(
+                          handle_,
+                          optimizerConfig,
+                          CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_IMBALANCE_FACTOR,
+                          &imbalance_factor,
+                          sizeof(imbalance_factor)) == CUTENSORNET_STATUS_SUCCESS,
+        matxcuTensorError, "Failed to set contraction optimizer imbalance factor");
+
       size_t freeMem, totalMem;
       MATX_ASSERT(cudaMemGetInfo(&freeMem, &totalMem) == cudaSuccess, matxCudaError);
 
@@ -150,13 +159,37 @@ class matxEinsumHandle_t {
 
       MATX_ASSERT(params_.num_slices_ > 0, matxcuTensorError);
 
+      MATX_ASSERT(cutensornetCreateWorkspaceDescriptor(handle_, &workDesc_) == CUTENSORNET_STATUS_SUCCESS, matxcuTensorError);
+
+      uint64_t requiredWorkspaceSize = 0;
+      MATX_ASSERT(cutensornetWorkspaceComputeSizes(handle_,
+                                                   descNet_,
+                                                   optimizerInfo,
+                                                   workDesc_) == CUTENSORNET_STATUS_SUCCESS, matxcuTensorError);
+
+      MATX_ASSERT(cutensornetWorkspaceGetSize(handle_,
+                                              workDesc_,
+                                              CUTENSORNET_WORKSIZE_PREF_MIN,
+                                              CUTENSORNET_MEMSPACE_DEVICE,
+                                              &requiredWorkspaceSize) == CUTENSORNET_STATUS_SUCCESS, matxcuTensorError);
+
+      MATX_ASSERT_STR(workSize_ >= requiredWorkspaceSize, matxOutOfMemory, "Not enough workspace memory is available.");
+
+      matxAlloc(&workspace_, workSize_, MATX_ASYNC_DEVICE_MEMORY, stream);
+
+      MATX_ASSERT (cutensornetWorkspaceSet(handle_,
+                                           workDesc_,
+                                           CUTENSORNET_MEMSPACE_DEVICE,
+                                           workspace_,
+                                           workSize_) == CUTENSORNET_STATUS_SUCCESS, matxcuTensorError);
+
       /*******************************
        * Initialize all pair-wise contraction plans (for cuTENSOR)
        *******************************/
       MATX_ASSERT_STR(cutensornetCreateContractionPlan(handle_,
                                                        descNet_,
                                                        optimizerInfo,
-                                                       workSize_,
+                                                       workDesc_,
                                                        &plan_) == CUTENSORNET_STATUS_SUCCESS,
         matxcuTensorError, "cutensornetCreateContractionPlan failed");
 
@@ -169,12 +202,6 @@ class matxEinsumHandle_t {
                                                        &autotunePref) == CUTENSORNET_STATUS_SUCCESS,
         matxcuTensorError, "cutensornetCreateContractionAutotunePreference failed");
 
-      // Allocate the real amount needed and free the old amount
-      MATX_ASSERT_STR(cutensornetContractionGetWorkspaceSize(handle_, descNet_, optimizerInfo, &workSize_) == CUTENSORNET_STATUS_SUCCESS,
-        matxcuTensorError, "cutensornetContractionGetWorkspaceSize failed");
-
-      matxAlloc(&workspace_, workSize_, MATX_ASYNC_DEVICE_MEMORY, stream);
-
       const int numAutotuningIterations = 5; // may be 0
       MATX_ASSERT_STR(cutensornetContractionAutotunePreferenceSetAttribute(
                               handle_,
@@ -189,8 +216,7 @@ class matxEinsumHandle_t {
                               plan_,
                               data_in,
                               out.Data(),
-                              workspace_,
-                              workSize_,
+                              workDesc_,
                               autotunePref,
                               stream) == CUTENSORNET_STATUS_SUCCESS,
         matxcuTensorError, "cutensornetContractionAutotune failed");
@@ -329,8 +355,7 @@ class matxEinsumHandle_t {
                               plan_,
                               data_in,
                               out.Data(),
-                              workspace_,
-                              workSize_,
+                              workDesc_,
                               slice,
                               stream) == CUTENSORNET_STATUS_SUCCESS,
         matxcuTensorError, "cutensornetContraction failed");
@@ -348,6 +373,7 @@ class matxEinsumHandle_t {
     cutensornetContractionPlan_t plan_;
     uint64_t workSize_;
     void *workspace_;
+    cutensornetWorkspaceDescriptor_t workDesc_;
     cutensornetHandle_t handle_;
     cutensornetNetworkDescriptor_t descNet_;
     EinsumParams_t params_;
diff --git a/test/00_tensor/EinsumTests.cu b/test/00_tensor/EinsumTests.cu
index 3b8eab1f..c1d9895b 100644
--- a/test/00_tensor/EinsumTests.cu
+++ b/test/00_tensor/EinsumTests.cu
@@ -98,7 +98,7 @@ TYPED_TEST_SUITE(EinsumTestsIntegral, MatXAllIntegralTypes);
 TYPED_TEST_SUITE(EinsumTestsNumericNonComplex, MatXNumericNonComplexTypes);
 TYPED_TEST_SUITE(EinsumTestsBoolean, MatXBoolTypes);
 
-#if ENABLE_CUTENSOR
+#if MATX_ENABLE_CUTENSOR
 TYPED_TEST(EinsumTestsFloatNonComplexNonHalfTypes, Contraction3D)
 {
   MATX_ENTER_HANDLER();