RT-TDDFT GPU Acceleration: RT-TD now fully supports GPU computation (#5773)

* Phase 1 of RT-TDDFT GPU Acceleration: Rewriting existing code using Tensor

* [pre-commit.ci lite] apply automatic fixes

* Initialize int info in bandenergy.cpp

* Initialize double aa, bb in bandenergy.cpp

* Fix a bug where CopyFrom caused tensors to share underlying data; use the overloaded assignment operator (=) instead (see the sketch after this list)

* RT-TDDFT GPU Acceleration (Phase 2): Adding needed BLAS and LAPACK support for Tensor on CPU and refactoring linear algebra operations in TDDFT

* LAPACK wrapper functions: change const basic-type input parameters from pass-by-reference to pass-by-value

* Formatting-only change to esolver.cpp; no functional modification

* Core algorithm: RT-TD now has preliminary support for GPU computation

* Fix GitHub CI CUDA build bug due to deleted variable

* Refactor some files

* Prepare for gathering data across MPI processes

* MPI multi-process compatibility

* Fix GitHub CI MPI compilation bug

* Minor fix and refactor

* Initialize double aa and bb; declare one variable per line

* Rename bandenergy.cpp to band_energy.cpp and make the corresponding adjustments

* Fix compile error and change CMakeLists accordingly

* Initialize int naroc

* Initialize MPI-related variables: myid, num_procs, and root_proc

* Refactor Propagator class implementation into multiple files for better code organization

* Remove all GlobalV::ofs_running from RT-TDDFT core algorithms and pass it as an input parameter instead

* Add assert in some places and optimize redundant index calculations in nested loops

---------
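
A minimal sketch of the aliasing pitfall behind the CopyFrom fix above. The exact container::Tensor semantics are assumed from the commit description, not verified against the module_container sources; copy_example is a hypothetical helper.

// Before the fix: CopyFrom could leave dst referring to src's buffer,
// so later writes through dst silently mutated src.
// After the fix: the overloaded assignment operator gives dst its own storage.
void copy_example(const container::Tensor& src, container::Tensor& dst)
{
    // dst.CopyFrom(src);   // bug: dst may alias src's data
    dst = src;              // fix: independent copy via operator=
}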

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
AsTonyshment and pre-commit-ci-lite[bot] authored Jan 22, 2025
1 parent 0098171 commit 3f8fe4f
Showing 47 changed files with 4,131 additions and 1,104 deletions.
5 changes: 4 additions & 1 deletion source/Makefile.Objects
@@ -557,10 +557,13 @@ OBJS_IO_LCAO=cal_r_overlap_R.o\

OBJS_LCAO=evolve_elec.o\
evolve_psi.o\
-bandenergy.o\
+band_energy.o\
middle_hamilt.o\
norm_psi.o\
propagator.o\
+propagator_cn2.o\
+propagator_taylor.o\
+propagator_etrs.o\
td_velocity.o\
td_current.o\
snap_psibeta_half_tddft.o\
4 changes: 2 additions & 2 deletions source/module_base/lapack_connector.h
@@ -133,8 +133,8 @@ extern "C"

// zgetrf computes the LU factorization of a general matrix
// while zgetri takes its output to perform matrix inversion
-void zgetrf_(const int* m, const int *n, const std::complex<double> *A, const int *lda, int *ipiv, const int* info);
-void zgetri_(const int* n, std::complex<double> *A, const int *lda, int *ipiv, std::complex<double> *work, int *lwork, const int *info);
+void zgetrf_(const int* m, const int *n, std::complex<double> *A, const int *lda, int *ipiv, int* info);
+void zgetri_(const int* n, std::complex<double>* A, const int* lda, const int* ipiv, std::complex<double>* work, const int* lwork, int* info);

// if trans=='N': C = alpha * A * A.H + beta * C
// if trans=='C': C = alpha * A.H * A + beta * C
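
A minimal usage sketch for the corrected prototypes: LU-factorize a 2x2 complex matrix with zgetrf_ and invert it with zgetri_. It assumes the extern "C" declarations above are in scope and the program is linked against LAPACK; the workspace size is a simple heuristic, and invert_2x2 is a hypothetical helper.

#include <complex>
#include <vector>

int invert_2x2()
{
    const int n = 2;
    // column-major storage: A = [[4, 1], [2, 3]]
    std::vector<std::complex<double>> A = {{4.0, 0.0}, {2.0, 0.0}, {1.0, 0.0}, {3.0, 0.0}};
    std::vector<int> ipiv(n);
    int info = 0;
    zgetrf_(&n, &n, A.data(), &n, ipiv.data(), &info); // LU factorization in place
    if (info != 0) { return info; }
    const int lwork = 64 * n; // heuristic; a workspace query is also possible
    std::vector<std::complex<double>> work(lwork);
    zgetri_(&n, A.data(), &n, ipiv.data(), work.data(), &lwork, &info); // A now holds its inverse
    return info; // 0 on success
}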
58 changes: 58 additions & 0 deletions source/module_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -117,6 +117,49 @@ struct lapack_dngvd<T, DEVICE_GPU> {
}
};

template <typename T>
struct lapack_getrf<T, DEVICE_GPU> {
void operator()(
const int& m,
const int& n,
T* Mat,
const int& lda,
int* ipiv)
{
cuSolverConnector::getrf(cusolver_handle, m, n, Mat, lda, ipiv);
}
};

template <typename T>
struct lapack_getri<T, DEVICE_GPU> {
void operator()(
const int& n,
T* Mat,
const int& lda,
const int* ipiv,
T* work,
const int& lwork)
{
throw std::runtime_error("cuSOLVER does not provide LU-based matrix inversion interface (getri). To compute the inverse on GPU, use getrs instead.");
}
};

template <typename T>
struct lapack_getrs<T, DEVICE_GPU> {
void operator()(
const char& trans,
const int& n,
const int& nrhs,
T* A,
const int& lda,
const int* ipiv,
T* B,
const int& ldb)
{
cuSolverConnector::getrs(cusolver_handle, trans, n, nrhs, A, lda, ipiv, B, ldb);
}
};

template struct set_matrix<float, DEVICE_GPU>;
template struct set_matrix<double, DEVICE_GPU>;
template struct set_matrix<std::complex<float>, DEVICE_GPU>;
@@ -142,5 +185,20 @@ template struct lapack_dngvd<double, DEVICE_GPU>;
template struct lapack_dngvd<std::complex<float>, DEVICE_GPU>;
template struct lapack_dngvd<std::complex<double>, DEVICE_GPU>;

template struct lapack_getrf<float, DEVICE_GPU>;
template struct lapack_getrf<double, DEVICE_GPU>;
template struct lapack_getrf<std::complex<float>, DEVICE_GPU>;
template struct lapack_getrf<std::complex<double>, DEVICE_GPU>;

template struct lapack_getri<float, DEVICE_GPU>;
template struct lapack_getri<double, DEVICE_GPU>;
template struct lapack_getri<std::complex<float>, DEVICE_GPU>;
template struct lapack_getri<std::complex<double>, DEVICE_GPU>;

template struct lapack_getrs<float, DEVICE_GPU>;
template struct lapack_getrs<double, DEVICE_GPU>;
template struct lapack_getrs<std::complex<float>, DEVICE_GPU>;
template struct lapack_getrs<std::complex<double>, DEVICE_GPU>;

} // namespace kernels
} // namespace container
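
A sketch (not part of the patch) of the getrs-based inversion that the exception message in lapack_getri above points to: factorize A once, then solve A * X = I so that X = A^{-1}. invert_on_gpu and the column-major layout are assumptions for illustration.

template <typename T>
void invert_on_gpu(T* A, T* inv, int* ipiv, const int n)
{
    // LU-factorize A in place on the GPU
    container::kernels::lapack_getrf<T, container::DEVICE_GPU>()(n, n, A, n, ipiv);
    // inv must hold the identity on entry; it is overwritten with A^{-1}
    container::kernels::lapack_getrs<T, container::DEVICE_GPU>()('N', n, n, A, n, ipiv, inv, n);
}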
70 changes: 70 additions & 0 deletions source/module_base/module_container/ATen/kernels/lapack.cpp
@@ -124,6 +124,61 @@ struct lapack_dngvd<T, DEVICE_CPU> {
}
};

template <typename T>
struct lapack_getrf<T, DEVICE_CPU> {
void operator()(
const int& m,
const int& n,
T* Mat,
const int& lda,
int* ipiv)
{
int info = 0;
lapackConnector::getrf(m, n, Mat, lda, ipiv, info);
if (info != 0) {
throw std::runtime_error("getrf failed with info = " + std::to_string(info));
}
}
};

template <typename T>
struct lapack_getri<T, DEVICE_CPU> {
void operator()(
const int& n,
T* Mat,
const int& lda,
const int* ipiv,
T* work,
const int& lwork)
{
int info = 0;
lapackConnector::getri(n, Mat, lda, ipiv, work, lwork, info);
if (info != 0) {
throw std::runtime_error("getri failed with info = " + std::to_string(info));
}
}
};

template <typename T>
struct lapack_getrs<T, DEVICE_CPU> {
void operator()(
const char& trans,
const int& n,
const int& nrhs,
T* A,
const int& lda,
const int* ipiv,
T* B,
const int& ldb)
{
int info = 0;
lapackConnector::getrs(trans, n, nrhs, A, lda, ipiv, B, ldb, info);
if (info != 0) {
throw std::runtime_error("getrs failed with info = " + std::to_string(info));
}
}
};

template struct set_matrix<float, DEVICE_CPU>;
template struct set_matrix<double, DEVICE_CPU>;
template struct set_matrix<std::complex<float>, DEVICE_CPU>;
@@ -149,5 +204,20 @@ template struct lapack_dngvd<double, DEVICE_CPU>;
template struct lapack_dngvd<std::complex<float>, DEVICE_CPU>;
template struct lapack_dngvd<std::complex<double>, DEVICE_CPU>;

template struct lapack_getrf<float, DEVICE_CPU>;
template struct lapack_getrf<double, DEVICE_CPU>;
template struct lapack_getrf<std::complex<float>, DEVICE_CPU>;
template struct lapack_getrf<std::complex<double>, DEVICE_CPU>;

template struct lapack_getri<float, DEVICE_CPU>;
template struct lapack_getri<double, DEVICE_CPU>;
template struct lapack_getri<std::complex<float>, DEVICE_CPU>;
template struct lapack_getri<std::complex<double>, DEVICE_CPU>;

template struct lapack_getrs<float, DEVICE_CPU>;
template struct lapack_getrs<double, DEVICE_CPU>;
template struct lapack_getrs<std::complex<float>, DEVICE_CPU>;
template struct lapack_getrs<std::complex<double>, DEVICE_CPU>;

} // namespace kernels
} // namespace container
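
A CPU-side usage sketch for the functors above: factorize a column-major n x n matrix in place, then invert it; the info checks inside the kernels turn LAPACK failures into exceptions. invert_on_cpu and the workspace heuristic are assumptions for illustration.

#include <complex>
#include <vector>

void invert_on_cpu(std::complex<double>* mat, const int n)
{
    std::vector<int> ipiv(n);
    container::kernels::lapack_getrf<std::complex<double>, container::DEVICE_CPU>()(n, n, mat, n, ipiv.data());
    const int lwork = 64 * n; // heuristic workspace size
    std::vector<std::complex<double>> work(lwork);
    container::kernels::lapack_getri<std::complex<double>, container::DEVICE_CPU>()(n, mat, n, ipiv.data(), work.data(), lwork);
}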
36 changes: 36 additions & 0 deletions source/module_base/module_container/ATen/kernels/lapack.h
@@ -65,6 +65,42 @@ struct lapack_dngvd {
Real* eigen_val);
};


template <typename T, typename Device>
struct lapack_getrf {
void operator()(
const int& m,
const int& n,
T* Mat,
const int& lda,
int* ipiv);
};


template <typename T, typename Device>
struct lapack_getri {
void operator()(
const int& n,
T* Mat,
const int& lda,
const int* ipiv,
T* work,
const int& lwork);
};

template <typename T, typename Device>
struct lapack_getrs {
void operator()(
const char& trans,
const int& n,
const int& nrhs,
T* A,
const int& lda,
const int* ipiv,
T* B,
const int& ldb);
};

#if defined(__CUDA) || defined(__ROCM)
// TODO: Use C++ singleton to manage the GPU handles
void createGpuSolverHandle(); // create cusolver handle
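
These device-agnostic declarations are the hooks the new propagator translation units (e.g. propagator_cn2.o) can dispatch through. Below is a hedged sketch of how a Crank-Nicolson-type step psi(t+dt) = (S - i*dt/2*H)^{-1} (S + i*dt/2*H) psi(t) could use them; the scheme and all names here are inferred from the new file list, not taken from the actual propagator code.

template <typename T, typename Device>
void cn_propagate(T* A_minus, T* psi, int* ipiv, const int nlocal, const int nbands)
{
    // A_minus = S - i*(dt/2)*H, factorized in place;
    // psi holds (S + i*(dt/2)*H)*psi(t) on entry and psi(t+dt) on exit
    container::kernels::lapack_getrf<T, Device>()(nlocal, nlocal, A_minus, nlocal, ipiv);
    container::kernels::lapack_getrs<T, Device>()('N', nlocal, nbands, A_minus, nlocal, ipiv, psi, nlocal);
}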