diff --git a/source/source_hsolver/CMakeLists.txt b/source/source_hsolver/CMakeLists.txt index b115d6d4cd..6b364562a0 100644 --- a/source/source_hsolver/CMakeLists.txt +++ b/source/source_hsolver/CMakeLists.txt @@ -13,6 +13,7 @@ list(APPEND objects diago_pxxxgvx.cpp diag_hs_para.cpp diago_params.cpp + diago_ppcg.cpp ) diff --git a/source/source_hsolver/diago_ppcg.cpp b/source/source_hsolver/diago_ppcg.cpp new file mode 100644 index 0000000000..fc49be8367 --- /dev/null +++ b/source/source_hsolver/diago_ppcg.cpp @@ -0,0 +1,542 @@ +#include "source_hsolver/diago_ppcg.h" + +#include "diago_iter_assist.h" +#include "para_linear_transform.h" +#include "source_base/global_function.h" +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/parallel_comm.h" + +#include +#include +#include +#include + +namespace hsolver +{ + +template +DiagoPPCG::DiagoPPCG(const Real* precondition_in) +{ + this->r_type = ct::DataTypeToEnum::value; + this->t_type = ct::DataTypeToEnum::value; + this->device_type = ct::DeviceTypeToEnum::value; + + this->h_prec = std::move(ct::TensorMap((void*)precondition_in, r_type, ct::DeviceType::CpuDevice, {this->n_basis})); + + this->one = &one_; + this->zero = &zero_; + this->neg_one = &neg_one_; +} + +template +DiagoPPCG::~DiagoPPCG() +{ + // h_prec is a ref to outside data, do not free. +} + +template +void DiagoPPCG::init_iter(const int nband, const int nband_l, const int nbasis, const int ndim) +{ + this->n_band = nband; + this->n_band_l = nband_l; + this->n_basis = nbasis; + this->n_dim = ndim; + + this->eigen = std::move(ct::Tensor(r_type, device_type, {this->n_band})); + this->err_st = std::move(ct::Tensor(r_type, device_type, {this->n_band_l})); + + this->psi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->hpsi = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->w = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->hw = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->p = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->hp = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + this->work = std::move(ct::Tensor(t_type, device_type, {this->n_band_l, this->n_basis})); + + this->prec = std::move(ct::Tensor(r_type, device_type, {this->n_basis})); + + this->nlocked = 0; + this->eigen_locked.resize(this->n_band, static_cast(0.0)); + +#ifdef __MPI + this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD, n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band); + this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, BP_WORLD, false); + + this->all_n_band_l.resize(this->plintrans.nproc_col); + MPI_Allgather(&this->n_band_l, 1, MPI_INT, this->all_n_band_l.data(), 1, MPI_INT, BP_WORLD); + this->band_displs.resize(this->plintrans.nproc_col); + this->band_displs[0] = 0; + for (int i = 1; i < this->plintrans.nproc_col; ++i) + { + this->band_displs[i] = this->band_displs[i - 1] + this->all_n_band_l[i - 1]; + } +#else + this->pmmcn.set_dimension(n_band_l, n_basis, n_band_l, n_basis, n_dim, n_band); + this->plintrans.set_dimension(n_dim, nband_l, n_band_l, n_basis, false); + this->all_n_band_l = {this->n_band_l}; + this->band_displs = {0}; +#endif +} + +template +void DiagoPPCG::calc_prec() +{ + syncmem_var_h2d_op()(this->prec.template data(), this->h_prec.template data(), this->n_basis); +} + +template +void DiagoPPCG::calc_hpsi(const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out) +{ + hpsi_func(psi_in, hpsi_out.data(), this->n_basis, this->n_band_l); +} + +template +void DiagoPPCG::calc_grad(const ct::Tensor& prec_in, + ct::Tensor& err_out, + ct::Tensor& psi_in, + ct::Tensor& hpsi_in, + ct::Tensor& grad_out, + const int nlocked_in) +{ + int start_nband = 0; +#ifdef __MPI + if (this->plintrans.nproc_col > 1) + { + start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP]; + } +#endif + int local_nlocked = std::max(0, nlocked_in - start_nband); + local_nlocked = std::min(local_nlocked, this->n_band_l); + + // Zero out locked bands + for (int ib = 0; ib < local_nlocked; ++ib) + { + setmem_complex_op()(grad_out.data() + ib * this->n_basis, 0, this->n_basis); + err_out.data()[ib] = static_cast(0.0); + } + + for (int ib = local_nlocked; ib < this->n_band_l; ++ib) + { + T* psi_ptr = psi_in.data() + ib * this->n_basis; + T* hpsi_ptr = hpsi_in.data() + ib * this->n_basis; + T* grad_ptr = grad_out.data() + ib * this->n_basis; + + // 1. Normalize psi (and hpsi consistently) + Real norm = ModuleBase::dot_real_op()(this->n_dim, psi_ptr, psi_ptr, true); + norm = 1.0 / sqrt(norm); + ModuleBase::vector_div_constant_op()(this->n_dim, psi_ptr, psi_ptr, norm); + ModuleBase::vector_div_constant_op()(this->n_dim, hpsi_ptr, hpsi_ptr, norm); + + // 2. Rayleigh quotient: epsilo = + Real epsilo = ModuleBase::dot_real_op()(this->n_dim, psi_ptr, hpsi_ptr, true); + + // 3. Residual: grad = hpsi - epsilo * psi + ModuleBase::vector_add_vector_op()(this->n_dim, grad_ptr, hpsi_ptr, 1.0, psi_ptr, -epsilo); + + // 4. Error = ||raw residual|| + Real err = ModuleBase::dot_real_op()(this->n_dim, grad_ptr, grad_ptr, true); + err_out.data()[ib] = sqrt(err); + + // 5. Apply preconditioner: grad = grad / prec + ModuleBase::vector_div_vector_op()(this->n_dim, grad_ptr, grad_ptr, prec_in.data()); + } +} + +template +void DiagoPPCG::update_locking(const ct::Tensor& err_in, const std::vector& ethr_band) +{ + // Gather local errors to global array + std::vector local_err(this->n_band_l); + if (err_in.device_type() == ct::DeviceType::GpuDevice) + { + syncmem_var_d2h_op()(local_err.data(), err_in.data(), this->n_band_l); + } + else + { + std::memcpy(local_err.data(), err_in.data(), this->n_band_l * sizeof(Real)); + } + + std::vector global_err(this->n_band, static_cast(0.0)); + std::vector global_ethr(this->n_band, 0.0); + +#ifdef __MPI + MPI_Datatype mpi_real_type = (sizeof(Real) == sizeof(float)) ? MPI_FLOAT : MPI_DOUBLE; + MPI_Allgatherv(local_err.data(), + this->n_band_l, + mpi_real_type, + global_err.data(), + this->all_n_band_l.data(), + this->band_displs.data(), + mpi_real_type, + BP_WORLD); + + std::vector local_ethr_double(ethr_band.begin(), ethr_band.end()); + MPI_Allgatherv(local_ethr_double.data(), + this->n_band_l, + MPI_DOUBLE, + global_ethr.data(), + this->all_n_band_l.data(), + this->band_displs.data(), + MPI_DOUBLE, + BP_WORLD); +#else + for (int i = 0; i < this->n_band_l; ++i) + { + global_err[i] = local_err[i]; + global_ethr[i] = ethr_band[i]; + } +#endif + + // Gather current eigenvalues from device + std::vector current_eigen(this->n_band, static_cast(0.0)); + syncmem_var_d2h_op()(current_eigen.data(), this->eigen.data(), this->n_band); + + // Scan from current nlocked forward and lock converged bands + while (this->nlocked < this->n_band) + { + if (global_err[this->nlocked] <= global_ethr[this->nlocked]) + { + this->eigen_locked[this->nlocked] = current_eigen[this->nlocked]; + this->nlocked++; + } + else + { + break; + } + } +} + +template +void DiagoPPCG::orth_projection(const ct::Tensor& psi_in, ct::Tensor& hsub_tmp, ct::Tensor& grad_out) +{ + // hsub_tmp = psi^H * grad (n_band x n_band global) + this->pmmcn.multiply(1.0, psi_in.data(), grad_out.data(), 0.0, hsub_tmp.data()); + + // grad = grad - psi * hsub_tmp + this->plintrans.act(-1.0, psi_in.data(), hsub_tmp.data(), 1.0, grad_out.data()); +} + +template +void DiagoPPCG::orth_cholesky(ct::Tensor& workspace_in, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& hsub_out) +{ + // hsub_out = psi_out^H * psi_out + this->pmmcn.multiply(1.0, psi_out.data(), psi_out.data(), 0.0, hsub_out.data()); + + ct::kernels::set_matrix()('L', hsub_out.data(), this->n_band); + + ct::kernels::lapack_potrf()('U', this->n_band, hsub_out.data(), this->n_band); + ct::kernels::lapack_trtri()('U', 'N', this->n_band, hsub_out.data(), this->n_band); + + // Rotate psi and hpsi + this->plintrans.act(1.0, psi_out.data(), hsub_out.data(), 0.0, workspace_in.data()); + syncmem_complex_op()(psi_out.data(), workspace_in.data(), this->n_band_l * this->n_basis); + + this->plintrans.act(1.0, hpsi_out.data(), hsub_out.data(), 0.0, workspace_in.data()); + syncmem_complex_op()(hpsi_out.data(), workspace_in.data(), this->n_band_l * this->n_basis); +} + +template +bool DiagoPPCG::test_error(const ct::Tensor& err_in, const std::vector& ethr_band) +{ + Real* _err_st = err_in.data(); + bool not_conv = false; + std::vector tmp_cpu; + if (err_in.device_type() == ct::DeviceType::GpuDevice) + { + tmp_cpu.resize(this->n_band_l); + _err_st = tmp_cpu.data(); + syncmem_var_d2h_op()(_err_st, err_in.data(), this->n_band_l); + } + for (int ii = 0; ii < this->n_band_l; ++ii) + { + if (_err_st[ii] > ethr_band[ii]) + { + not_conv = true; + } + } +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, ¬_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD); +#endif + return not_conv; +} + +template +void DiagoPPCG::diag_subspace(const HPsiFunc& hpsi_func, + const bool has_p, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& p_out, + ct::Tensor& hp_out, + const int nlocked_in) +{ + const int n_sub = has_p ? 3 * this->n_band : 2 * this->n_band; + + // 1. Compute H|w> + this->calc_hpsi(hpsi_func, this->w.data(), this->hw); + + // 2. Compute overlap (S) and Hamiltonian (H) projection blocks. + // Only upper-triangular blocks are computed explicitly; + // lower-triangular parts are filled by Hermitian conjugate. + ct::Tensor b_00(t_type, device_type, {this->n_band, this->n_band}); + ct::Tensor b_01(t_type, device_type, {this->n_band, this->n_band}); + ct::Tensor b_11(t_type, device_type, {this->n_band, this->n_band}); + + this->pmmcn.multiply(one_, psi_out.data(), psi_out.data(), zero_, b_00.data()); + this->pmmcn.multiply(one_, psi_out.data(), this->w.data(), zero_, b_01.data()); + this->pmmcn.multiply(one_, this->w.data(), this->w.data(), zero_, b_11.data()); + + ct::Tensor bh_00(t_type, device_type, {this->n_band, this->n_band}); + ct::Tensor bh_01(t_type, device_type, {this->n_band, this->n_band}); + ct::Tensor bh_11(t_type, device_type, {this->n_band, this->n_band}); + + this->pmmcn.multiply(one_, psi_out.data(), hpsi_out.data(), zero_, bh_00.data()); + this->pmmcn.multiply(one_, psi_out.data(), this->hw.data(), zero_, bh_01.data()); + this->pmmcn.multiply(one_, this->w.data(), this->hw.data(), zero_, bh_11.data()); + + ct::Tensor b_02, b_12, b_22, bh_02, bh_12, bh_22; + if (has_p) + { + b_02 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + b_12 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + b_22 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + bh_02 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + bh_12 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + bh_22 = ct::Tensor(t_type, device_type, {this->n_band, this->n_band}); + + this->pmmcn.multiply(one_, psi_out.data(), p_out.data(), zero_, b_02.data()); + this->pmmcn.multiply(one_, this->w.data(), p_out.data(), zero_, b_12.data()); + this->pmmcn.multiply(one_, p_out.data(), p_out.data(), zero_, b_22.data()); + + this->pmmcn.multiply(one_, psi_out.data(), hp_out.data(), zero_, bh_02.data()); + this->pmmcn.multiply(one_, this->w.data(), hp_out.data(), zero_, bh_12.data()); + this->pmmcn.multiply(one_, p_out.data(), hp_out.data(), zero_, bh_22.data()); + } + + // 3. Assemble projected matrices on CPU + ct::Tensor hsub_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub}); + ct::Tensor ssub_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub}); + ct::Tensor vcc_cpu(t_type, ct::DeviceType::CpuDevice, {n_sub, n_sub}); + ct::Tensor eigen_cpu(r_type, ct::DeviceType::CpuDevice, {n_sub}); + + // Helper to copy block and optionally Hermitian-conjugate transpose + auto copy_block = [&](const ct::Tensor& dev_block, int row_off, int col_off, bool to_h, bool hc) { + std::vector tmp(this->n_band * this->n_band); + syncmem_complex_d2h_op()(tmp.data(), dev_block.data(), this->n_band * this->n_band); + T* dest = to_h ? hsub_cpu.data() : ssub_cpu.data(); + for (int j = 0; j < this->n_band; ++j) + { + for (int i = 0; i < this->n_band; ++i) + { + T val = hc ? std::conj(tmp[j + i * this->n_band]) : tmp[i + j * this->n_band]; + dest[(row_off + i) + (col_off + j) * n_sub] = val; + } + } + }; + + // S_sub assembly + copy_block(b_00, 0, 0, false, false); + copy_block(b_01, 0, this->n_band, false, false); + copy_block(b_01, this->n_band, 0, false, true); // b_10 = b_01^H + copy_block(b_11, this->n_band, this->n_band, false, false); + + // H_sub assembly + copy_block(bh_00, 0, 0, true, false); + copy_block(bh_01, 0, this->n_band, true, false); + copy_block(bh_01, this->n_band, 0, true, true); // bh_10 = bh_01^H + copy_block(bh_11, this->n_band, this->n_band, true, false); + + if (has_p) + { + copy_block(b_02, 0, 2 * this->n_band, false, false); + copy_block(b_02, 2 * this->n_band, 0, false, true); + copy_block(b_12, this->n_band, 2 * this->n_band, false, false); + copy_block(b_12, 2 * this->n_band, this->n_band, false, true); + copy_block(b_22, 2 * this->n_band, 2 * this->n_band, false, false); + + copy_block(bh_02, 0, 2 * this->n_band, true, false); + copy_block(bh_02, 2 * this->n_band, 0, true, true); + copy_block(bh_12, this->n_band, 2 * this->n_band, true, false); + copy_block(bh_12, 2 * this->n_band, this->n_band, true, true); + copy_block(bh_22, 2 * this->n_band, 2 * this->n_band, true, false); + } + + // 4. Freeze locked bands: force their rows/columns to diagonal standard basis + if (nlocked_in > 0) + { + for (int i = 0; i < nlocked_in; ++i) + { + for (int j = 0; j < n_sub; ++j) + { + T s_val = (j == i) ? one_ : zero_; + T h_val = (j == i) ? static_cast(this->eigen_locked[i]) : zero_; + hsub_cpu.data()[i + j * n_sub] = h_val; + hsub_cpu.data()[j + i * n_sub] = h_val; + ssub_cpu.data()[i + j * n_sub] = s_val; + ssub_cpu.data()[j + i * n_sub] = s_val; + } + } + } + + // 5. Solve generalized eigenvalue problem H_sub * v = lambda * S_sub * v + hsolver::hegvd_op()(nullptr, + n_sub, + n_sub, + hsub_cpu.data(), + ssub_cpu.data(), + eigen_cpu.data(), + vcc_cpu.data()); + + // Ensure locked eigenvalues remain unchanged (overwrite in case of numerical drift) + for (int i = 0; i < nlocked_in && i < this->n_band; ++i) + { + eigen_cpu.data()[i] = this->eigen_locked[i]; + } + + // 6. Move eigenvectors back to device + ct::Tensor vcc_dev = vcc_cpu.to_device(); + + // 7. Update psi = X*C_X + W*C_W + (P*C_P) + setmem_complex_op()(this->work.data(), 0, this->n_band_l * this->n_basis); + this->plintrans.act(1.0, psi_out.data(), vcc_dev.data() + 0, 0.0, this->work.data()); + this->plintrans.act(1.0, this->w.data(), vcc_dev.data() + this->n_band, 1.0, this->work.data()); + if (has_p) + { + this->plintrans.act(1.0, p_out.data(), vcc_dev.data() + 2 * this->n_band, 1.0, this->work.data()); + } + syncmem_complex_op()(psi_out.data(), this->work.data(), this->n_band_l * this->n_basis); + + // 8. Update hpsi = HX*C_X + HW*C_W + (HP*C_P) + setmem_complex_op()(this->work.data(), 0, this->n_band_l * this->n_basis); + this->plintrans.act(1.0, hpsi_out.data(), vcc_dev.data() + 0, 0.0, this->work.data()); + this->plintrans.act(1.0, this->hw.data(), vcc_dev.data() + this->n_band, 1.0, this->work.data()); + if (has_p) + { + this->plintrans.act(1.0, hp_out.data(), vcc_dev.data() + 2 * this->n_band, 1.0, this->work.data()); + } + syncmem_complex_op()(hpsi_out.data(), this->work.data(), this->n_band_l * this->n_basis); + + // 9. Update p = W*C_W + (P*C_P) -- LOBPCG style, no X component + setmem_complex_op()(this->work.data(), 0, this->n_band_l * this->n_basis); + this->plintrans.act(1.0, this->w.data(), vcc_dev.data() + this->n_band, 0.0, this->work.data()); + if (has_p) + { + this->plintrans.act(1.0, p_out.data(), vcc_dev.data() + 2 * this->n_band, 1.0, this->work.data()); + } + syncmem_complex_op()(p_out.data(), this->work.data(), this->n_band_l * this->n_basis); + + // 10. Update hp = HW*C_W + (HP*C_P) + setmem_complex_op()(this->work.data(), 0, this->n_band_l * this->n_basis); + this->plintrans.act(1.0, this->hw.data(), vcc_dev.data() + this->n_band, 0.0, this->work.data()); + if (has_p) + { + this->plintrans.act(1.0, hp_out.data(), vcc_dev.data() + 2 * this->n_band, 1.0, this->work.data()); + } + syncmem_complex_op()(hp_out.data(), this->work.data(), this->n_band_l * this->n_basis); + + // 11. Update eigenvalues with the lowest n_band eigenvalues from subspace diagonalization + syncmem_var_h2d_op()(this->eigen.data(), eigen_cpu.data(), this->n_band); +} + +template +void DiagoPPCG::diag(const HPsiFunc& hpsi_func, + T* psi_in, + Real* eigenvalue_in, + const std::vector& ethr_band) +{ + const int current_scf_iter = hsolver::DiagoIterAssist::SCF_ITER; + + // Map the input psi pointer + this->psi = std::move(ct::TensorMap(psi_in, t_type, device_type, {this->n_band_l, this->n_basis})); + + // Update precondition array + this->calc_prec(); + + // Initial subspace diagonalization to improve the initial guess + this->calc_hpsi(hpsi_func, psi_in, this->hpsi); + + // Build and diagonalize the subspace Hamiltonian in the psi basis + ct::Tensor hsub_init(t_type, device_type, {this->n_band, this->n_band}); + this->pmmcn.multiply(one_, this->hpsi.data(), this->psi.data(), zero_, hsub_init.data()); + ct::kernels::lapack_heevd()(this->n_band, + hsub_init.data(), + this->n_band, + this->eigen.data()); + + // Rotate psi and hpsi with the eigenvectors of the subspace problem + this->plintrans.act(1.0, this->psi.data(), hsub_init.data(), 0.0, this->work.data()); + syncmem_complex_op()(this->psi.data(), this->work.data(), this->n_band_l * this->n_basis); + this->plintrans.act(1.0, this->hpsi.data(), hsub_init.data(), 0.0, this->work.data()); + syncmem_complex_op()(this->hpsi.data(), this->work.data(), this->n_band_l * this->n_basis); + + // Initialize search direction to zero + setmem_complex_op()(this->p.data(), 0, this->n_band_l * this->n_basis); + setmem_complex_op()(this->hp.data(), 0, this->n_band_l * this->n_basis); + + // Allocate a reusable tensor for projection overlap + ct::Tensor hsub_orth(t_type, device_type, {this->n_band, this->n_band}); + + int ntry = 0; + int max_iter = current_scf_iter > 1 ? this->nline : this->nline * 6; + this->nlocked = 0; + + do + { + ++ntry; + + // 1. Calculate preconditioned residual w and error for active bands only + this->calc_grad(this->prec, this->err_st, this->psi, this->hpsi, this->w, this->nlocked); + + // 2. Update locking status: scan from current nlocked forward + this->update_locking(this->err_st, ethr_band); + + // 3. Exit if all bands have converged + if (this->nlocked >= this->n_band) + { + break; + } + + // 4. Project active residual to orthogonal complement of psi + this->orth_projection(this->psi, hsub_orth, this->w); + + // 5. Expanded subspace diagonalization with locking + // Locked bands are frozen in the subspace problem + this->diag_subspace(hpsi_func, ntry > 1, this->psi, this->hpsi, this->p, this->hp, this->nlocked); + + // Note: orth_cholesky is intentionally skipped here. + // The Rayleigh-Ritz step already provides orthonormal vectors (within numerical precision). + // Global Cholesky would destroy the locking by remixing all bands. + + } while (ntry < max_iter && this->nlocked < this->n_band); + + // Final subspace diagonalization to obtain accurate eigenvalues + this->pmmcn.multiply(one_, this->hpsi.data(), this->psi.data(), zero_, hsub_orth.data()); + ct::kernels::lapack_heevd()(this->n_band, + hsub_orth.data(), + this->n_band, + this->eigen.data()); + this->plintrans.act(1.0, this->psi.data(), hsub_orth.data(), 0.0, this->work.data()); + syncmem_complex_op()(this->psi.data(), this->work.data(), this->n_band_l * this->n_basis); + + // Copy eigenvalues to output + int start_nband = 0; +#ifdef __MPI + if (this->plintrans.nproc_col > 1) + { + start_nband = this->plintrans.start_colB[GlobalV::MY_BNDGROUP]; + } +#endif + syncmem_var_d2h_op()(eigenvalue_in, this->eigen.data() + start_nband, this->n_band_l); +} + +// Explicit template instantiations +template class DiagoPPCG, base_device::DEVICE_CPU>; +template class DiagoPPCG, base_device::DEVICE_CPU>; +#if ((defined __CUDA) || (defined __ROCM)) +template class DiagoPPCG, base_device::DEVICE_GPU>; +template class DiagoPPCG, base_device::DEVICE_GPU>; +#endif + +} // namespace hsolver diff --git a/source/source_hsolver/diago_ppcg.h b/source/source_hsolver/diago_ppcg.h new file mode 100644 index 0000000000..d0ca5a2ebb --- /dev/null +++ b/source/source_hsolver/diago_ppcg.h @@ -0,0 +1,201 @@ +#ifndef DIAGO_PPCG_H_ +#define DIAGO_PPCG_H_ + +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/module_device/memory_op.h" +#include "source_base/module_device/types.h" +#include "source_base/para_gemm.h" +#include "source_hsolver/kernels/hegvd_op.h" +#include "source_hsolver/para_linear_transform.h" + +#include +#include +#include + +namespace hsolver +{ + +/** + * @class DiagoPPCG + * @brief A class for diagonalization using the Projected Preconditioned Conjugate Gradient (PPCG) method. + * + * The DiagoPPCG class implements a block LOBPCG-like algorithm for solving generalized eigenvalue problems. + * It uses an expanded subspace [X, W, P] where X is the current eigenvector approximation, + * W is the preconditioned residual, and P is the conjugate search direction from previous steps. + * + * @tparam T The floating-point type used for calculations. + * @tparam Device The device used for calculations (e.g., cpu or gpu). + */ +template , typename Device = base_device::DEVICE_CPU> +class DiagoPPCG +{ + private: + using Real = typename GetTypeReal::type; + + public: + /** + * @brief Constructor for DiagoPPCG class. + * + * @param precondition_in Pointer to the host precondition array. + */ + explicit DiagoPPCG(const Real* precondition_in); + + /** + * @brief Destructor for DiagoPPCG class. + */ + ~DiagoPPCG(); + + /** + * @brief Initialize the class before diagonalization. + * + * @param nband The number of bands of all processes. + * @param nband_l The number of bands of current process. + * @param nbasis The number of basis functions. Leading dimension of psi. + * @param ndim The number of valid dimension of psi. + */ + void init_iter(const int nband, const int nband_l, const int nbasis, const int ndim); + + using HPsiFunc = std::function; + + /** + * @brief Diagonalize the Hamiltonian using the PPCG method. + * + * @param hpsi_func A function computing the product of the Hamiltonian matrix H + * and a wavefunction blockvector X. + * @param psi_in Pointer to input wavefunction psi matrix with [dim: n_basis x n_band, column major]. + * @param eigenvalue_in Pointer to the eigen array with [dim: n_band]. + * @param ethr_band Convergence threshold for each band. + */ + void diag(const HPsiFunc& hpsi_func, T* psi_in, Real* eigenvalue_in, const std::vector& ethr_band); + + private: + /// the number of bands of all processes + int n_band = 0; + /// the number of bands of current process + int n_band_l = 0; + /// the number of cols of the input psi + int n_basis = 0; + /// valid dimension of psi + int n_dim = 0; + /// max iter steps for ppcg loop + int nline = 4; + + /// parallel matrix multiplication + ModuleBase::PGemmCN pmmcn; + PLinearTransform plintrans; + + ct::DataType r_type = ct::DataType::DT_INVALID; + ct::DataType t_type = ct::DataType::DT_INVALID; + ct::DeviceType device_type = ct::DeviceType::UnKnown; + + ct::Tensor h_prec = {}; + ct::Tensor prec = {}; + ct::Tensor eigen = {}; + + /// Number of globally converged (locked) bands + int nlocked = 0; + /// Locked eigenvalues on CPU + std::vector eigen_locked; + /// MPI band distribution for error gathering + std::vector all_n_band_l; + std::vector band_displs; + ct::Tensor err_st = {}; + + ct::Tensor psi = {}, hpsi = {}; + ct::Tensor w = {}, hw = {}; + ct::Tensor p = {}, hp = {}; + ct::Tensor work = {}; + + Device* ctx = {}; + const T *one = nullptr, *zero = nullptr, *neg_one = nullptr; + const T one_ = static_cast(1.0), zero_ = static_cast(0.0), neg_one_ = static_cast(-1.0); + + /** + * @brief Update the precondition array from host to device. + */ + void calc_prec(); + + /** + * @brief Apply the H operator to psi and obtain the hpsi matrix. + */ + void calc_hpsi(const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out); + + /** + * @brief Calculate the preconditioned residual (gradient) and error. + * + * @param prec_in Input preconditioner. + * @param err_out Output error for each local band. + * @param psi_in Input wavefunction. + * @param hpsi_in H|psi> matrix. + * @param grad_out Output preconditioned residual. + */ + void calc_grad(const ct::Tensor& prec_in, + ct::Tensor& err_out, + ct::Tensor& psi_in, + ct::Tensor& hpsi_in, + ct::Tensor& grad_out, + const int nlocked_in = 0); + + /** + * @brief Orthogonalize grad to psi using S-inner product (S=I for norm-conserving). + * + * @param psi_in Input wavefunction. + * @param hsub_tmp Workspace for overlap matrix. + * @param grad_out Input/Output gradient. + */ + void orth_projection(const ct::Tensor& psi_in, ct::Tensor& hsub_tmp, ct::Tensor& grad_out); + + /** + * @brief Perform expanded subspace diagonalization and update X, P, HX, HP. + * + * @param hpsi_func Hamiltonian application function. + * @param has_p If true, use 3-block [X, W, P]; otherwise use 2-block [X, W]. + * @param psi_out Input/Output wavefunction. + * @param hpsi_out Input/Output H|psi>. + * @param p_out Input/Output search direction. + * @param hp_out Input/Output H|p>. + */ + void diag_subspace(const HPsiFunc& hpsi_func, + const bool has_p, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& p_out, + ct::Tensor& hp_out, + const int nlocked_in = 0); + + /** + * @brief Orthogonalize and normalize psi using Cholesky decomposition. + */ + void orth_cholesky(ct::Tensor& workspace_in, ct::Tensor& psi_out, ct::Tensor& hpsi_out, ct::Tensor& hsub_out); + + /** + * @brief Update locking status: scan errors from current nlocked forward + * and lock bands that have converged. + */ + void update_locking(const ct::Tensor& err_in, const std::vector& ethr_band); + + /** + * @brief Check if all bands have converged. + */ + bool test_error(const ct::Tensor& err_in, const std::vector& ethr_band); + + using ct_Device = typename ct::PsiToContainer::type; + using setmem_var_op = ct::kernels::set_memory; + using resmem_var_op = ct::kernels::resize_memory; + using delmem_var_op = ct::kernels::delete_memory; + using syncmem_var_h2d_op = ct::kernels::synchronize_memory; + using syncmem_var_d2h_op = ct::kernels::synchronize_memory; + + using setmem_complex_op = ct::kernels::set_memory; + using delmem_complex_op = ct::kernels::delete_memory; + using resmem_complex_op = ct::kernels::resize_memory; + using syncmem_complex_op = ct::kernels::synchronize_memory; + using syncmem_complex_h2d_op = ct::kernels::synchronize_memory; + using syncmem_complex_d2h_op = ct::kernels::synchronize_memory; + + using gemm_op = ModuleBase::gemm_op; +}; + +} // namespace hsolver + +#endif // DIAGO_PPCG_H_ diff --git a/source/source_hsolver/test/CMakeLists.txt b/source/source_hsolver/test/CMakeLists.txt index 1b1529adb4..13a0b0032d 100644 --- a/source/source_hsolver/test/CMakeLists.txt +++ b/source/source_hsolver/test/CMakeLists.txt @@ -16,6 +16,17 @@ if (ENABLE_MPI) ../../source_hamilt/operator.cpp ../../source_pw/module_pwdft/op_pw.cpp ) + AddTest( + TARGET MODULE_HSOLVER_ppcg_perf + LIBS parameter ${math_libs} base psi device container + SOURCES diago_ppcg_perf_test.cpp + ../diago_ppcg.cpp ../diago_bpcg.cpp ../diago_cg.cpp ../diago_david.cpp + ../para_linear_transform.cpp ../diago_iter_assist.cpp ../diag_const_nums.cpp + ../kernels/hegvd_op.cpp + ../../source_basis/module_pw/test/test_tool.cpp + ../../source_hamilt/operator.cpp + ../../source_pw/module_pwdft/op_pw.cpp + ) AddTest( TARGET MODULE_HSOLVER_cg LIBS parameter ${math_libs} base psi device container diff --git a/source/source_hsolver/test/diago_ppcg_perf_test.cpp b/source/source_hsolver/test/diago_ppcg_perf_test.cpp new file mode 100644 index 0000000000..6983d2eacc --- /dev/null +++ b/source/source_hsolver/test/diago_ppcg_perf_test.cpp @@ -0,0 +1,284 @@ +#include "../diag_comm_info.h" +#include "../diago_bpcg.h" +#include "../diago_cg.h" +#include "../diago_david.h" +#include "../diago_iter_assist.h" +#include "../diago_ppcg.h" +#include "diago_mock.h" +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/module_external/lapack_connector.h" +#include "source_psi/psi.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __MPI +#include "source_base/parallel_comm.h" +#endif + +using T = std::complex; + +// LAPACK reference eigenvalues (values only) +void lapack_eigenvalues(int npw, const std::vector& hm, double* e) +{ + std::vector tmp = hm; + int lwork = 2 * npw; + std::vector work(lwork); + std::vector rwork(3 * npw - 2); + int info = 0; + char jobz = 'N', uplo = 'U'; + zheev_(&jobz, &uplo, &npw, tmp.data(), &npw, e, work.data(), &lwork, rwork.data(), &info); +} + +// Unified H|psi> via gemm +auto make_hpsi_func(const std::vector& hmat, int dim) +{ + return [hmat, dim](T* psi_in, T* hpsi_out, const int ld_psi, const int nvec) { + T one = T(1.0); + T zero = T(0.0); + base_device::DEVICE_CPU* ctx = {}; + ModuleBase::gemm_op()('N', + 'N', + dim, + nvec, + dim, + &one, + hmat.data(), + dim, + psi_in, + ld_psi, + &zero, + hpsi_out, + ld_psi); + }; +} + +// S|psi> = |psi> (identity, norm-conserving) +auto spsi_identity = [](T* psi_in, T* spsi_out, const int ld_psi, const int nvec) { + for (int i = 0; i < ld_psi * nvec; ++i) + { + spsi_out[i] = psi_in[i]; + } +}; + +struct PerfResult +{ + std::string name; + double time = 0.0; + double max_err = 0.0; + bool converged = false; +}; + +// -------------------- PPCG -------------------- +PerfResult test_ppcg(int nband, + int npw, + double ethr, + const psi::Psi& psi0, + const std::function& hpsi_func, + double* precondition, + const std::vector& e_ref) +{ + psi::Psi psi(psi0); + psi.fix_k(0); + std::vector en(nband, 0.0); + + hsolver::DiagoPPCG ppcg(precondition); + ppcg.init_iter(nband, nband, npw, npw); + hsolver::DiagoIterAssist::SCF_ITER = 1; // first SCF step + std::vector ethr_band(nband, ethr); + + double t1 = MPI_Wtime(); + ppcg.diag(hpsi_func, psi.get_pointer(), en.data(), ethr_band); + double t2 = MPI_Wtime(); + + double err = 0.0; + for (int i = 0; i < nband; ++i) + { + err = std::max(err, std::abs(en[i] - e_ref[i])); + } + return {"PPCG", t2 - t1, err, err < 1e-2}; +} + +// -------------------- BPCG -------------------- +PerfResult test_bpcg(int nband, + int npw, + double ethr, + const psi::Psi& psi0, + const std::function& hpsi_func, + double* precondition, + const std::vector& e_ref) +{ + psi::Psi psi(psi0); + psi.fix_k(0); + std::vector en(nband, 0.0); + + hsolver::DiagoBPCG bpcg(precondition); + bpcg.init_iter(nband, nband, npw, npw); + hsolver::DiagoIterAssist::SCF_ITER = 1; + std::vector ethr_band(nband, ethr); + + double t1 = MPI_Wtime(); + bpcg.diag(hpsi_func, psi.get_pointer(), en.data(), ethr_band); + double t2 = MPI_Wtime(); + + double err = 0.0; + for (int i = 0; i < nband; ++i) + { + err = std::max(err, std::abs(en[i] - e_ref[i])); + } + return {"BPCG", t2 - t1, err, err < 1e-2}; +} + +// -------------------- CG -------------------- +PerfResult test_cg(int nband, + int npw, + double ethr, + int maxiter, + const psi::Psi& psi0, + const std::function& hpsi_func, + double* precondition, + const std::vector& e_ref) +{ + psi::Psi psi(psi0); + psi.fix_k(0); + std::vector en(nband, 0.0); + + hsolver::DiagoCG cg("pw", "scf"); + hsolver::DiagoIterAssist::PW_DIAG_NMAX = maxiter; + hsolver::DiagoIterAssist::PW_DIAG_THR = ethr; + std::vector ethr_band(nband, ethr); + + double t1 = MPI_Wtime(); + cg.diag(hpsi_func, spsi_identity, npw, nband, npw, psi.get_pointer(), en.data(), ethr_band, precondition); + double t2 = MPI_Wtime(); + + double err = 0.0; + for (int i = 0; i < nband; ++i) + { + err = std::max(err, std::abs(en[i] - e_ref[i])); + } + return {"CG", t2 - t1, err, err < 1e-2}; +} + +// -------------------- Davidson -------------------- +PerfResult test_david(int nband, + int npw, + double ethr, + int maxiter, + const psi::Psi& psi0, + const std::function& hpsi_func, + double* precondition, + const std::vector& e_ref) +{ + psi::Psi psi(psi0); + psi.fix_k(0); + std::vector en(nband, 0.0); + +#ifdef __MPI + int rank = 0, nproc = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + const hsolver::diag_comm_info comm_info(MPI_COMM_WORLD, rank, nproc); +#else + const hsolver::diag_comm_info comm_info(0, 1); +#endif + + hsolver::DiagoDavid david(precondition, nband, npw, 4, false, comm_info); + std::vector ethr_band(nband, ethr); + + double t1 = MPI_Wtime(); + david.diag(hpsi_func, spsi_identity, npw, psi.get_pointer(), en.data(), ethr_band, maxiter); + double t2 = MPI_Wtime(); + + double err = 0.0; + for (int i = 0; i < nband; ++i) + { + err = std::max(err, std::abs(en[i] - e_ref[i])); + } + return {"Davidson", t2 - t1, err, err < 1e-2}; +} + +// ============================================================ +int main(int argc, char** argv) +{ + MPI_Init(&argc, &argv); + int rank = 0, nproc = 1; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nproc); + +#ifdef __MPI + BP_WORLD = MPI_COMM_WORLD; +#endif + + // ---------- test parameters ---------- + int nband = 20; + int npw = 500; + int sparsity = 0; // 0 = dense + double ethr = 1e-5; + int maxiter = 300; + // ------------------------------------- + + // generate Hamiltonian, precondition and initial guess + HPsi hpsi_gen(nband, npw, sparsity); + DIAGOTEST::hmatrix = hpsi_gen.hamilt(); + DIAGOTEST::npw = npw; + DIAGOTEST::npw_local = new int[1]; + DIAGOTEST::npw_local[0] = npw; + DIAGOTEST::hmatrix_local = DIAGOTEST::hmatrix; + + double* precondition = hpsi_gen.precond(); + + // LAPACK reference + std::vector e_ref(npw); + lapack_eigenvalues(npw, DIAGOTEST::hmatrix, e_ref.data()); + + // initial psi guess (perturbed eigenvectors) + psi::Psi psi0(1, nband, npw, npw, true); + std::default_random_engine p(1); + std::uniform_int_distribution u(1, 10); + for (int i = 0; i < nband; ++i) + { + for (int j = 0; j < npw; ++j) + { + double r = static_cast(u(p)) / 10.0; + psi0(0, i, j) = DIAGOTEST::hmatrix[j * npw + i] * r; + } + } + + auto hpsi_func = make_hpsi_func(DIAGOTEST::hmatrix_local, npw); + + // run benchmarks + PerfResult r_ppcg = test_ppcg(nband, npw, ethr, psi0, hpsi_func, precondition, e_ref); + PerfResult r_bpcg = test_bpcg(nband, npw, ethr, psi0, hpsi_func, precondition, e_ref); + PerfResult r_cg = test_cg(nband, npw, ethr, maxiter, psi0, hpsi_func, precondition, e_ref); + PerfResult r_david = test_david(nband, npw, ethr, maxiter, psi0, hpsi_func, precondition, e_ref); + + if (rank == 0) + { + std::cout << "\n========================================\n"; + std::cout << " Diagonalization Performance Comparison\n"; + std::cout << " nband=" << nband << ", npw=" << npw << ", sparsity=" << sparsity << "\n"; + std::cout << "========================================\n"; + std::cout << std::setw(10) << "Method" << std::setw(14) << "Time(s)" << std::setw(14) << "MaxError" + << std::setw(8) << "OK" << "\n"; + std::cout << "----------------------------------------\n"; + auto print = [](const PerfResult& r) { + std::cout << std::setw(10) << r.name << std::setw(14) << std::scientific << std::setprecision(3) << r.time + << std::setw(14) << r.max_err << std::setw(8) << (r.converged ? "Yes" : "No") << "\n"; + }; + print(r_ppcg); + print(r_bpcg); + print(r_cg); + print(r_david); + std::cout << "========================================\n\n"; + } + + delete[] DIAGOTEST::npw_local; + MPI_Finalize(); + return 0; +}