RT-TDDFT GPU Acceleration: RT-TD now fully supports GPU computation (#5773)

* Phase 1 of RT-TDDFT GPU Acceleration: Rewriting existing code using Tensor

* [pre-commit.ci lite] apply automatic fixes

* Initialize int info in bandenergy.cpp

* Initialize double aa, bb in bandenergy.cpp

* Fix a bug where CopyFrom caused tensors to share underlying data; use the overloaded assignment operator (=) instead (see the sketch after this list)

* RT-TDDFT GPU Acceleration (Phase 2): Adding needed BLAS and LAPACK support for Tensor on CPU and refactoring linear algebra operations in TDDFT

* LAPACK wrapper functions: change const basic-type input parameters from pass-by-reference to pass-by-value

* Formatting-only change to esolver.cpp; no functional modification

* Core algorithm: RT-TD now has preliminary support for GPU computation

* Fix GitHub CI CUDA build bug due to deleted variable

* Refactor some files

* Prepare for gathering data across MPI processes

* MPI multi-process compatibility

* Fix GitHub CI MPI compilation bug

* Minor fix and refactor

* Initialize double aa and bb; declare one variable per line

* Rename bandenergy.cpp to band_energy.cpp and make the corresponding adjustments

* Fix compile error and change CMakeLists accordingly

* Initialize int naroc

* Initialize MPI-related variables: myid, num_procs, and root_proc

* Refactor Propagator class implementation into multiple files for better code organization

* Remove all GlobalV::ofs_running from RT-TDDFT core algorithms and pass it as an input parameter instead

* Add assert in some places and optimize redundant index calculations in nested loops

---------
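
A minimal sketch of the aliasing pitfall behind the CopyFrom fix above. The exact container::Tensor semantics are assumed from the commit description, not verified against the module_container sources; copy_example is a hypothetical helper.

// Before the fix: CopyFrom could leave dst referring to src's buffer,
// so later writes through dst silently mutated src.
// After the fix: the overloaded assignment operator gives dst its own storage.
void copy_example(const container::Tensor& src, container::Tensor& dst)
{
    // dst.CopyFrom(src);   // bug: dst may alias src's data
    dst = src;              // fix: independent copy via operator=
}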

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
AsTonyshment and pre-commit-ci-lite[bot] authored Jan 22, 2025
1 parent 0098171 commit 3f8fe4f
Showing 47 changed files with 4,131 additions and 1,104 deletions.
5 changes: 4 additions & 1 deletion source/Makefile.Objects
@@ -557,10 +557,13 @@ OBJS_IO_LCAO=cal_r_overlap_R.o\

OBJS_LCAO=evolve_elec.o\
evolve_psi.o\
-bandenergy.o\
+band_energy.o\
middle_hamilt.o\
norm_psi.o\
propagator.o\
+propagator_cn2.o\
+propagator_taylor.o\
+propagator_etrs.o\
td_velocity.o\
td_current.o\
snap_psibeta_half_tddft.o\
4 changes: 2 additions & 2 deletions source/module_base/lapack_connector.h
@@ -133,8 +133,8 @@ extern "C"

// zgetrf computes the LU factorization of a general matrix
// while zgetri takes its output to perform matrix inversion
-void zgetrf_(const int* m, const int *n, const std::complex<double> *A, const int *lda, int *ipiv, const int* info);
-void zgetri_(const int* n, std::complex<double> *A, const int *lda, int *ipiv, std::complex<double> *work, int *lwork, const int *info);
+void zgetrf_(const int* m, const int *n, std::complex<double> *A, const int *lda, int *ipiv, int* info);
+void zgetri_(const int* n, std::complex<double>* A, const int* lda, const int* ipiv, std::complex<double>* work, const int* lwork, int* info);

// if trans=='N': C = alpha * A * A.H + beta * C
// if trans=='C': C = alpha * A.H * A + beta * C
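
A minimal usage sketch for the corrected prototypes: LU-factorize a 2x2 complex matrix with zgetrf_ and invert it with zgetri_. It assumes the extern "C" declarations above are in scope and the program is linked against LAPACK; the workspace size is a simple heuristic, and invert_2x2 is a hypothetical helper.

#include <complex>
#include <vector>

int invert_2x2()
{
    const int n = 2;
    // column-major storage: A = [[4, 1], [2, 3]]
    std::vector<std::complex<double>> A = {{4.0, 0.0}, {2.0, 0.0}, {1.0, 0.0}, {3.0, 0.0}};
    std::vector<int> ipiv(n);
    int info = 0;
    zgetrf_(&n, &n, A.data(), &n, ipiv.data(), &info); // LU factorization in place
    if (info != 0) { return info; }
    const int lwork = 64 * n; // heuristic; a workspace query is also possible
    std::vector<std::complex<double>> work(lwork);
    zgetri_(&n, A.data(), &n, ipiv.data(), work.data(), &lwork, &info); // A now holds its inverse
    return info; // 0 on success
}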
58 changes: 58 additions & 0 deletions source/module_base/module_container/ATen/kernels/cuda/lapack.cu
@@ -117,6 +117,49 @@ struct lapack_dngvd<T, DEVICE_GPU> {
}
};

template <typename T>
struct lapack_getrf<T, DEVICE_GPU> {
void operator()(
const int& m,
const int& n,
T* Mat,
const int& lda,
int* ipiv)
{
cuSolverConnector::getrf(cusolver_handle, m, n, Mat, lda, ipiv);
}
};

template <typename T>
struct lapack_getri<T, DEVICE_GPU> {
void operator()(
const int& n,
T* Mat,
const int& lda,
const int* ipiv,
T* work,
const int& lwork)
{
throw std::runtime_error("cuSOLVER does not provide LU-based matrix inversion interface (getri). To compute the inverse on GPU, use getrs instead.");
}
};

template <typename T>
struct lapack_getrs<T, DEVICE_GPU> {
void operator()(
const char& trans,
const int& n,
const int& nrhs,
T* A,
const int& lda,
const int* ipiv,
T* B,
const int& ldb)
{
cuSolverConnector::getrs(cusolver_handle, trans, n, nrhs, A, lda, ipiv, B, ldb);
}
};

template struct set_matrix<float, DEVICE_GPU>;
template struct set_matrix<double, DEVICE_GPU>;
template struct set_matrix<std::complex<float>, DEVICE_GPU>;
@@ -142,5 +185,20 @@ template struct lapack_dngvd<double, DEVICE_GPU>;
template struct lapack_dngvd<std::complex<float>, DEVICE_GPU>;
template struct lapack_dngvd<std::complex<double>, DEVICE_GPU>;

template struct lapack_getrf<float, DEVICE_GPU>;
template struct lapack_getrf<double, DEVICE_GPU>;
template struct lapack_getrf<std::complex<float>, DEVICE_GPU>;
template struct lapack_getrf<std::complex<double>, DEVICE_GPU>;

template struct lapack_getri<float, DEVICE_GPU>;
template struct lapack_getri<double, DEVICE_GPU>;
template struct lapack_getri<std::complex<float>, DEVICE_GPU>;
template struct lapack_getri<std::complex<double>, DEVICE_GPU>;

template struct lapack_getrs<float, DEVICE_GPU>;
template struct lapack_getrs<double, DEVICE_GPU>;
template struct lapack_getrs<std::complex<float>, DEVICE_GPU>;
template struct lapack_getrs<std::complex<double>, DEVICE_GPU>;

} // namespace kernels
} // namespace container
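
A sketch (not part of the patch) of the getrs-based inversion that the exception message in lapack_getri above points to: factorize A once, then solve A * X = I so that X = A^{-1}. invert_on_gpu and the column-major layout are assumptions for illustration.

template <typename T>
void invert_on_gpu(T* A, T* inv, int* ipiv, const int n)
{
    // LU-factorize A in place on the GPU
    container::kernels::lapack_getrf<T, container::DEVICE_GPU>()(n, n, A, n, ipiv);
    // inv must hold the identity on entry; it is overwritten with A^{-1}
    container::kernels::lapack_getrs<T, container::DEVICE_GPU>()('N', n, n, A, n, ipiv, inv, n);
}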
70 changes: 70 additions & 0 deletions source/module_base/module_container/ATen/kernels/lapack.cpp
@@ -124,6 +124,61 @@ struct lapack_dngvd<T, DEVICE_CPU> {
}
};

template <typename T>
struct lapack_getrf<T, DEVICE_CPU> {
void operator()(
const int& m,
const int& n,
T* Mat,
const int& lda,
int* ipiv)
{
int info = 0;
lapackConnector::getrf(m, n, Mat, lda, ipiv, info);
if (info != 0) {
throw std::runtime_error("getrf failed with info = " + std::to_string(info));
}
}
};

template <typename T>
struct lapack_getri<T, DEVICE_CPU> {
void operator()(
const int& n,
T* Mat,
const int& lda,
const int* ipiv,
T* work,
const int& lwork)
{
int info = 0;
lapackConnector::getri(n, Mat, lda, ipiv, work, lwork, info);
if (info != 0) {
throw std::runtime_error("getri failed with info = " + std::to_string(info));
}
}
};

template <typename T>
struct lapack_getrs<T, DEVICE_CPU> {
void operator()(
const char& trans,
const int& n,
const int& nrhs,
T* A,
const int& lda,
const int* ipiv,
T* B,
const int& ldb)
{
int info = 0;
lapackConnector::getrs(trans, n, nrhs, A, lda, ipiv, B, ldb, info);
if (info != 0) {
throw std::runtime_error("getrs failed with info = " + std::to_string(info));
}
}
};

template struct set_matrix<float, DEVICE_CPU>;
template struct set_matrix<double, DEVICE_CPU>;
template struct set_matrix<std::complex<float>, DEVICE_CPU>;
@@ -149,5 +204,20 @@ template struct lapack_dngvd<double, DEVICE_CPU>;
template struct lapack_dngvd<std::complex<float>, DEVICE_CPU>;
template struct lapack_dngvd<std::complex<double>, DEVICE_CPU>;

template struct lapack_getrf<float, DEVICE_CPU>;
template struct lapack_getrf<double, DEVICE_CPU>;
template struct lapack_getrf<std::complex<float>, DEVICE_CPU>;
template struct lapack_getrf<std::complex<double>, DEVICE_CPU>;

template struct lapack_getri<float, DEVICE_CPU>;
template struct lapack_getri<double, DEVICE_CPU>;
template struct lapack_getri<std::complex<float>, DEVICE_CPU>;
template struct lapack_getri<std::complex<double>, DEVICE_CPU>;

template struct lapack_getrs<float, DEVICE_CPU>;
template struct lapack_getrs<double, DEVICE_CPU>;
template struct lapack_getrs<std::complex<float>, DEVICE_CPU>;
template struct lapack_getrs<std::complex<double>, DEVICE_CPU>;

} // namespace kernels
} // namespace container
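
A CPU-side usage sketch for the functors above: factorize a column-major n x n matrix in place, then invert it; the info checks inside the kernels turn LAPACK failures into exceptions. invert_on_cpu and the workspace heuristic are assumptions for illustration.

#include <complex>
#include <vector>

void invert_on_cpu(std::complex<double>* mat, const int n)
{
    std::vector<int> ipiv(n);
    container::kernels::lapack_getrf<std::complex<double>, container::DEVICE_CPU>()(n, n, mat, n, ipiv.data());
    const int lwork = 64 * n; // heuristic workspace size
    std::vector<std::complex<double>> work(lwork);
    container::kernels::lapack_getri<std::complex<double>, container::DEVICE_CPU>()(n, mat, n, ipiv.data(), work.data(), lwork);
}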
36 changes: 36 additions & 0 deletions source/module_base/module_container/ATen/kernels/lapack.h
@@ -65,6 +65,42 @@ struct lapack_dngvd {
Real* eigen_val);
};


template <typename T, typename Device>
struct lapack_getrf {
void operator()(
const int& m,
const int& n,
T* Mat,
const int& lda,
int* ipiv);
};


template <typename T, typename Device>
struct lapack_getri {
void operator()(
const int& n,
T* Mat,
const int& lda,
const int* ipiv,
T* work,
const int& lwork);
};

template <typename T, typename Device>
struct lapack_getrs {
void operator()(
const char& trans,
const int& n,
const int& nrhs,
T* A,
const int& lda,
const int* ipiv,
T* B,
const int& ldb);
};

#if defined(__CUDA) || defined(__ROCM)
// TODO: Use C++ singleton to manage the GPU handles
void createGpuSolverHandle(); // create cusolver handle
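
These device-agnostic declarations are the hooks the new propagator translation units (e.g. propagator_cn2.o) can dispatch through. Below is a hedged sketch of how a Crank-Nicolson-type step psi(t+dt) = (S - i*dt/2*H)^{-1} (S + i*dt/2*H) psi(t) could use them; the scheme and all names here are inferred from the new file list, not taken from the actual propagator code.

template <typename T, typename Device>
void cn_propagate(T* A_minus, T* psi, int* ipiv, const int nlocal, const int nbands)
{
    // A_minus = S - i*(dt/2)*H, factorized in place;
    // psi holds (S + i*(dt/2)*H)*psi(t) on entry and psi(t+dt) on exit
    container::kernels::lapack_getrf<T, Device>()(nlocal, nlocal, A_minus, nlocal, ipiv);
    container::kernels::lapack_getrs<T, Device>()('N', nlocal, nbands, A_minus, nlocal, ipiv, psi, nlocal);
}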