diff --git a/source/source_estate/elecstate_print.cpp b/source/source_estate/elecstate_print.cpp index 517c002dd6d..df2bb9b4821 100644 --- a/source/source_estate/elecstate_print.cpp +++ b/source/source_estate/elecstate_print.cpp @@ -58,6 +58,7 @@ void print_scf_iterinfo(const std::string& ks_solver, {"scalapack_gvx", "GV"}, {"cusolver", "CU"}, {"bpcg", "BP"}, + {"lobpcg", "LB"}, {"pexsi", "PE"}, {"cusolvermp", "CM"}, {"sdft", "CT"}}; // CT = Chebyshev Trace, for pure SDFT (nbands=0) where no H diagonalization is performed diff --git a/source/source_hsolver/CMakeLists.txt b/source/source_hsolver/CMakeLists.txt index b115d6d4cd2..2e30217a6e5 100644 --- a/source/source_hsolver/CMakeLists.txt +++ b/source/source_hsolver/CMakeLists.txt @@ -4,6 +4,7 @@ list(APPEND objects diago_david.cpp diago_dav_subspace.cpp diago_bpcg.cpp + diago_lobpcg.cpp para_linear_transform.cpp hsolver_pw.cpp hsolver_lcaopw.cpp diff --git a/source/source_hsolver/diago_lobpcg.cpp b/source/source_hsolver/diago_lobpcg.cpp new file mode 100644 index 00000000000..8ae513ea080 --- /dev/null +++ b/source/source_hsolver/diago_lobpcg.cpp @@ -0,0 +1,694 @@ +#include "source_hsolver/diago_lobpcg.h" + +#include "diago_iter_assist.h" +#include "source_base/global_function.h" +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/parallel_comm.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace hsolver { + +// ============================================================================ +// Band-major helpers (CPU, n_band_l == n_band required) +// +// Psi is stored band-major: psi_data[ib * n_basis + ig], +// shape [n_band_l, n_basis]. The BLAS view is n_basis rows by n_band_l cols. +// +// Subspace matrices (C, V) are column-major for direct LAPACK use: +// C[col * ld + row] = C[j * nb + i] (nb = leading dimension). +// ============================================================================ + +// ============================================================================ +// File-static helpers +// ============================================================================ + +template +static void mirror_lower(T* mat, int ld, int active_sub) +{ + for (int c = 0; c < active_sub; c++) + for (int r = c + 1; r < active_sub; r++) + mat[c * ld + r] = std::conj(mat[r * ld + c]); +} + +template +static void clean_hermitian_diag(T* mat, int ld, int active_sub) +{ + using Real = typename GetTypeReal::type; + for (int i = 0; i < active_sub; i++) { + int idx = i * ld + i; + mat[idx] = T(std::real(mat[idx]), static_cast(0)); + } +} + +template +static bool append_orthonormal_block( + const int nvec, const int nbs, const typename GetTypeReal::type thresh, + const T* block, const T* hblock, + std::vector& basis, std::vector& hbasis) +{ + using Real = typename GetTypeReal::type; + bool appended = false; + const Real thresh2 = thresh * thresh; + + for (int ib = 0; ib < nvec; ++ib) { + std::vector q(nbs); + std::vector hq(nbs); + for (int ig = 0; ig < nbs; ++ig) { + q[ig] = block[ib * nbs + ig]; + hq[ig] = hblock[ib * nbs + ig]; + } + + const int nold = static_cast(basis.size() / nbs); + for (int jq = 0; jq < nold; ++jq) { + T dot = static_cast(0.0); + for (int ig = 0; ig < nbs; ++ig) { + dot += std::conj(basis[jq * nbs + ig]) * q[ig]; + } +#ifdef __MPI + Parallel_Reduce::reduce_pool(&dot, 1); +#endif + for (int ig = 0; ig < nbs; ++ig) { + q[ig] -= dot * basis[jq * nbs + ig]; + hq[ig] -= dot * hbasis[jq * nbs + ig]; + } + } + + Real norm2 = static_cast(0.0); + for (int ig = 0; ig < nbs; ++ig) { + norm2 += std::norm(q[ig]); + } +#ifdef __MPI + Parallel_Reduce::reduce_pool(&norm2, 1); +#endif + if (norm2 <= thresh2) { + continue; + } + + const Real inv_norm = static_cast(1.0) / std::sqrt(norm2); + for (int ig = 0; ig < nbs; ++ig) { + basis.push_back(q[ig] * inv_norm); + hbasis.push_back(hq[ig] * inv_norm); + } + appended = true; + } + + return appended; +} + +// ============================================================================ +// Constructor / Destructor +// ============================================================================ + +template +DiagoLobpcg::DiagoLobpcg(const Real* precondition) +{ + this->r_type = ct::DataTypeToEnum::value; + this->t_type = ct::DataTypeToEnum::value; + this->dev_type = ct::DeviceTypeToEnum::value; + this->h_prec_ptr = precondition; + this->one = &one_; + this->zero = &zero_; + this->neg_one = &neg_one_; +} + +template +DiagoLobpcg::~DiagoLobpcg() {} + +// ============================================================================ +// init_iter +// ============================================================================ + +template +void DiagoLobpcg::init_iter(int nband, int nband_l, + int nbasis, int ndim) +{ + this->n_band = nband; + this->n_band_l = nband_l; + this->n_basis = nbasis; + this->n_dim = ndim; + this->nsub = 3 * n_band; + this->has_pdir = false; + + this->eigen = ct::Tensor(r_type, dev_type, {this->n_band}); + this->sub_eigen = ct::Tensor(r_type, dev_type, {this->nsub}); + this->err_st = ct::Tensor(r_type, dev_type, {this->n_band_l}); + + this->hsub = ct::Tensor(t_type, dev_type, {this->nsub, this->nsub}); + this->tmp_hsub = ct::Tensor(t_type, dev_type, {this->n_band, this->n_band}); + + this->hpsi = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis}); + this->spsi = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis}); + this->grad = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis}); + this->hgrad = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis}); + this->pdir = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis}); + this->hpdir = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis}); + + this->work = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis}); + this->hwork = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis}); + this->pwork = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis}); + this->hpwork = ct::Tensor(t_type, dev_type, {this->n_band_l, this->n_basis}); + + this->prec = ct::Tensor(r_type, dev_type, {this->n_basis}); + this->h_prec = ct::TensorMap( + (void*)this->h_prec_ptr, this->r_type, + ct::DeviceType::CpuDevice, {this->n_basis}); + +#ifdef __MPI + this->pmmcn.set_dimension(BP_WORLD, POOL_WORLD, + this->n_band_l, this->n_basis, + this->n_band_l, this->n_basis, + this->n_dim, this->n_band); + this->plintrans.set_dimension(this->n_dim, this->n_band_l, + this->n_band_l, this->n_basis, + BP_WORLD, false); +#else + this->pmmcn.set_dimension(this->n_band_l, this->n_basis, + this->n_band_l, this->n_basis, + this->n_dim, this->n_band); + this->plintrans.set_dimension(this->n_dim, this->n_band_l, + this->n_band_l, this->n_basis, + false); +#endif +} + +// ============================================================================ +// calc_prec +// ============================================================================ + +template +void DiagoLobpcg::calc_prec() +{ + syncmem_var_h2d_op()(this->prec.template data(), + this->h_prec.template data(), + this->n_basis); +} + +// ============================================================================ +// calc_hpsi_with_block / calc_spsi_with_block +// ============================================================================ + +template +void DiagoLobpcg::calc_hpsi_with_block( + const HPsiFunc& hpsi_func, T* psi_in, ct::Tensor& hpsi_out) +{ + hpsi_func(psi_in, hpsi_out.data(), this->n_basis, this->n_band_l); +} + +template +void DiagoLobpcg::calc_spsi_with_block( + const SPsiFunc& spsi_func, const T* psi_in, ct::Tensor& spsi_out) +{ + spsi_func(psi_in, spsi_out.data(), this->n_basis, this->n_band_l); +} + +// ============================================================================ +// rayleigh_ritz (NC, S=I) +// +// psi_in is assumed orthonormal. H_sub = , heevd, rotate. +// ============================================================================ + +template +void DiagoLobpcg::rayleigh_ritz( + ct::Tensor& psi_inout, ct::Tensor& hpsi_inout, ct::Tensor& eigen_out) +{ + const int nb = this->n_band; + const int nbs = this->n_basis; + const int local_sz = this->n_band_l * nbs; + + // Zero-fill H_sub before inner_product to eliminate stale data + for (int ii = 0; ii < nb * nb; ++ii) + this->tmp_hsub.data()[ii] = static_cast(0.0); + + this->pmmcn.multiply(this->one_, psi_inout.data(), hpsi_inout.data(), + this->zero_, this->tmp_hsub.data()); + mirror_lower(this->tmp_hsub.data(), nb, nb); + clean_hermitian_diag(this->tmp_hsub.data(), nb, nb); + + // Force exact Hermitian symmetrization. + { + T* hsub = this->tmp_hsub.data(); + for (int jj = 0; jj < nb; ++jj) { + hsub[jj * nb + jj] = T(std::real(hsub[jj * nb + jj]), 0.0); + for (int ii = jj + 1; ii < nb; ++ii) { + T a = hsub[jj * nb + ii]; + T b = std::conj(hsub[ii * nb + jj]); + T avg = static_cast(0.5) * (a + b); + hsub[jj * nb + ii] = avg; + hsub[ii * nb + jj] = std::conj(avg); + } + } + } + + ct::kernels::lapack_heevd()( + nb, this->tmp_hsub.data(), nb, eigen_out.data()); + + setmem_complex_op()(this->work.data(), static_cast(0.0), local_sz); + this->plintrans.act(this->one_, psi_inout.data(), + this->tmp_hsub.data(), + this->zero_, this->work.data()); + syncmem_complex_op()(psi_inout.data(), this->work.data(), local_sz); + + setmem_complex_op()(this->work.data(), static_cast(0.0), local_sz); + this->plintrans.act(this->one_, hpsi_inout.data(), + this->tmp_hsub.data(), + this->zero_, this->work.data()); + syncmem_complex_op()(hpsi_inout.data(), this->work.data(), local_sz); +} + +template +void DiagoLobpcg::generalized_rayleigh_ritz( + ct::Tensor&, ct::Tensor&, ct::Tensor&, ct::Tensor&) +{ + ModuleBase::WARNING_QUIT("DiagoLobpcg", + "Generalized R-R (USPP) is not implemented yet."); +} + +// ============================================================================ +// compute_residual — NC: R = HX - lambda*X, grad = R ./ prec +// ============================================================================ + +template +void DiagoLobpcg::compute_residual( + const ct::Tensor& psi_in, const ct::Tensor& hpsi_in, + const ct::Tensor& eigen_in, const ct::Tensor& prec_in, + ct::Tensor& grad_out, ct::Tensor& err_out) +{ + const Real* _prec = prec_in.data(); + const Real* _eigen = eigen_in.data(); + const T* _psi = psi_in.data(); + const T* _hpsi = hpsi_in.data(); + T* _grad = grad_out.data(); + Real* _err = err_out.data(); + + for (int ib = 0; ib < this->n_band_l; ib++) { + const int ioff = ib * this->n_basis; + const Real lambda = _eigen[ib]; + Real err_j = 0.0; + for (int ig = 0; ig < this->n_dim; ig++) { + const int idx = ioff + ig; + const T r = _hpsi[idx] - lambda * _psi[idx]; + _grad[idx] = r / std::max(_prec[ig], + static_cast(1e-8)); + err_j += std::norm(r); + } + for (int ig = this->n_dim; ig < this->n_basis; ig++) + _grad[ioff + ig] = static_cast(0.0); + _err[ib] = err_j; + } +#ifdef __MPI + Parallel_Reduce::reduce_pool(_err, this->n_band_l); +#endif + for (int ib = 0; ib < this->n_band_l; ib++) + _err[ib] = std::sqrt(_err[ib]); +} + +template +void DiagoLobpcg::compute_residual_s( + const ct::Tensor&, const ct::Tensor&, const ct::Tensor&, + const ct::Tensor&, const ct::Tensor&, ct::Tensor&, ct::Tensor&) +{ + ModuleBase::WARNING_QUIT("DiagoLobpcg", + "compute_residual_s (USPP) is not implemented yet."); +} + +// ============================================================================ +// orth_projection — grad -= psi * [S=I] +// inner(i,j) = (col-major) +// grad_j -= sum_i psi_i * inner(i,j) +// ============================================================================ + +template +void DiagoLobpcg::orth_projection( + const ct::Tensor& psi_in, ct::Tensor& hsub_work, ct::Tensor& grad_out) +{ + this->pmmcn.multiply(this->one_, psi_in.data(), grad_out.data(), + this->zero_, hsub_work.data()); + + this->plintrans.act(this->neg_one_, psi_in.data(), + hsub_work.data(), + this->one_, grad_out.data()); +} + +template +void DiagoLobpcg::orth_projection_s( + const ct::Tensor&, const ct::Tensor&, ct::Tensor&, ct::Tensor&) +{ + ModuleBase::WARNING_QUIT("DiagoLobpcg", + "orth_projection_s (USPP) is not implemented yet."); +} + +// ============================================================================ +// rotate_wf +// ============================================================================ + +template +void DiagoLobpcg::rotate_wf( + const ct::Tensor& hsub_in, ct::Tensor& psi_out, ct::Tensor& workspace_in) +{ + const int nbs = this->n_basis; + setmem_complex_op()(workspace_in.data(), static_cast(0.0), + this->n_band_l * nbs); + this->plintrans.act(this->one_, psi_out.data(), + hsub_in.data(), + this->zero_, workspace_in.data()); + syncmem_complex_op()(psi_out.data(), workspace_in.data(), + this->n_band_l * nbs); +} + +// ============================================================================ +// orth_cholesky — S=I Cholesky orthonormalization +// ============================================================================ + +template +void DiagoLobpcg::orth_cholesky( + ct::Tensor& workspace_in, ct::Tensor& psi_out, + ct::Tensor& hpsi_out, ct::Tensor& hsub_out) +{ + const int nb = this->n_band; + const int nbs = this->n_basis; + + T* psi_d = psi_out.data(); + T* hpsi_d = hpsi_out.data(); + const Real eps = static_cast(100) + * std::numeric_limits::epsilon(); + + for (int ib = 0; ib < nb; ++ib) { + for (int jb = 0; jb < ib; ++jb) { + T dot = static_cast(0.0); + for (int ig = 0; ig < nbs; ++ig) { + dot += std::conj(psi_d[jb * nbs + ig]) + * psi_d[ib * nbs + ig]; + } +#ifdef __MPI + Parallel_Reduce::reduce_pool(&dot, 1); +#endif + for (int ig = 0; ig < nbs; ++ig) { + psi_d[ib * nbs + ig] -= dot * psi_d[jb * nbs + ig]; + hpsi_d[ib * nbs + ig] -= dot * hpsi_d[jb * nbs + ig]; + } + } + + Real norm2 = static_cast(0.0); + for (int ig = 0; ig < nbs; ++ig) { + norm2 += std::norm(psi_d[ib * nbs + ig]); + } +#ifdef __MPI + Parallel_Reduce::reduce_pool(&norm2, 1); +#endif + if (!(norm2 > eps)) { + throw std::runtime_error("orth_cholesky failed: dependent vectors"); + } + const Real inv_norm = static_cast(1.0) / std::sqrt(norm2); + for (int ig = 0; ig < nbs; ++ig) { + psi_d[ib * nbs + ig] *= inv_norm; + hpsi_d[ib * nbs + ig] *= inv_norm; + } + } +} + +template +void DiagoLobpcg::orth_cholesky_s( + ct::Tensor&, ct::Tensor&, ct::Tensor&, ct::Tensor&, ct::Tensor&) +{ + ModuleBase::WARNING_QUIT("DiagoLobpcg", + "orth_cholesky_s (USPP) is not implemented yet."); +} + +// ============================================================================ +// test_error +// ============================================================================ + +template +bool DiagoLobpcg::test_error( + const ct::Tensor& err_in, const std::vector& ethr_band) +{ + Real* _err_st = err_in.data(); + bool not_conv = false; + std::vector tmp_cpu; + if (err_in.device_type() == ct::DeviceType::GpuDevice) { + tmp_cpu.resize(this->n_band_l); + _err_st = tmp_cpu.data(); + syncmem_var_d2h_op()(_err_st, err_in.data(), this->n_band_l); + } + for (int ii = 0; ii < this->n_band_l; ii++) + if (_err_st[ii] > ethr_band[ii]) not_conv = true; +#ifdef __MPI + MPI_Allreduce(MPI_IN_PLACE, ¬_conv, 1, MPI_C_BOOL, MPI_LOR, BP_WORLD); +#endif + return not_conv; +} + +// ============================================================================ +// lobpcg_update — generalized R-R on subspace W = [X, Z, P] +// +// H_sub = , S_sub = +// H_sub C = S_sub C Lambda (hegvd) +// X_new = V_XX*X + V_ZX*Z + V_PX*P +// P_new = V_ZX*Z + V_PX*P (soft restart) +// ============================================================================ + +template +void DiagoLobpcg::lobpcg_update( + ct::Tensor& psi, ct::Tensor& hpsi, + ct::Tensor& grad, ct::Tensor& hgrad, + ct::Tensor& pdir, ct::Tensor& hpdir, + ct::Tensor& eigen) +{ + const int n = this->n_band; + const int nbs = this->n_basis; + const int local_sz = this->n_band_l * nbs; + const Real eps = static_cast(100) + * std::numeric_limits::epsilon(); + + std::vector basis; + std::vector hbasis; + basis.reserve((this->has_pdir ? 3 : 2) * local_sz); + hbasis.reserve((this->has_pdir ? 3 : 2) * local_sz); + + append_orthonormal_block(n, nbs, eps, psi.data(), hpsi.data(), + basis, hbasis); + append_orthonormal_block(n, nbs, eps, grad.data(), hgrad.data(), + basis, hbasis); + if (this->has_pdir) { + append_orthonormal_block(n, nbs, eps, pdir.data(), hpdir.data(), + basis, hbasis); + } + + const int m = static_cast(basis.size() / nbs); + if (m < n) { + throw std::runtime_error("LOBPCG subspace lost rank"); + } + + T* hsub_d = this->hsub.data(); + setmem_complex_op()(hsub_d, static_cast(0.0), this->nsub * this->nsub); + + for (int jc = 0; jc < m; ++jc) { + for (int ir = 0; ir < m; ++ir) { + T sum = static_cast(0.0); + for (int ig = 0; ig < nbs; ++ig) { + sum += std::conj(basis[ir * nbs + ig]) + * hbasis[jc * nbs + ig]; + } + hsub_d[jc * this->nsub + ir] = sum; + } + } +#ifdef __MPI + for (int jc = 0; jc < m; ++jc) { + Parallel_Reduce::reduce_pool(hsub_d + jc * this->nsub, m); + } +#endif + clean_hermitian_diag(hsub_d, this->nsub, m); + for (int jc = 0; jc < m; ++jc) { + for (int ir = jc + 1; ir < m; ++ir) { + const T avg = static_cast(0.5) + * (hsub_d[jc * this->nsub + ir] + + std::conj(hsub_d[ir * this->nsub + jc])); + hsub_d[jc * this->nsub + ir] = avg; + hsub_d[ir * this->nsub + jc] = std::conj(avg); + } + } + + ct::kernels::lapack_heevd()( + m, hsub_d, this->nsub, this->sub_eigen.data()); + + const Real* sub = this->sub_eigen.data(); + Real* eig = eigen.data(); + for (int ib = 0; ib < n; ++ib) { + eig[ib] = sub[ib]; + } + + T* x_new = this->work.data(); + T* hx_new = this->hwork.data(); + T* p_new = this->pwork.data(); + T* hp_new = this->hpwork.data(); + for (int i = 0; i < local_sz; ++i) { + x_new[i] = static_cast(0.0); + hx_new[i] = static_cast(0.0); + p_new[i] = static_cast(0.0); + hp_new[i] = static_cast(0.0); + } + + for (int ib = 0; ib < n; ++ib) { + for (int iq = 0; iq < m; ++iq) { + const T coeff = hsub_d[ib * this->nsub + iq]; + for (int ig = 0; ig < nbs; ++ig) { + const int dst = ib * nbs + ig; + const int src = iq * nbs + ig; + x_new[dst] += coeff * basis[src]; + hx_new[dst] += coeff * hbasis[src]; + } + if (iq >= n) { + for (int ig = 0; ig < nbs; ++ig) { + const int dst = ib * nbs + ig; + const int src = iq * nbs + ig; + p_new[dst] += coeff * basis[src]; + hp_new[dst] += coeff * hbasis[src]; + } + } + } + } + + syncmem_complex_op()(psi.data(), x_new, local_sz); + syncmem_complex_op()(hpsi.data(), hx_new, local_sz); + syncmem_complex_op()(pdir.data(), p_new, local_sz); + syncmem_complex_op()(hpdir.data(), hp_new, local_sz); + + this->has_pdir = true; +} + +// ============================================================================ +// diag — main LOBPCG loop (generalized interface, S=I currently supported) +// ============================================================================ + +template +void DiagoLobpcg::diag( + const HPsiFunc& hpsi_func, const SPsiFunc& spsi_func, + T* psi_in, Real* eigenvalue_in, const std::vector& ethr_band) +{ + // ---- runtime guard ----------------------------------------------------- + if (this->n_band_l != this->n_band) { + ModuleBase::WARNING_QUIT("DiagoLobpcg", + "Explicit-loop LOBPCG requires n_band_l == n_band. " + "Band-parallel MPI is not supported in this implementation."); + } + + this->has_pdir = false; + const int scf_iter = DiagoIterAssist::SCF_ITER; + + this->psi = ct::TensorMap(psi_in, t_type, dev_type, + {this->n_band_l, this->n_basis}); + + this->calc_prec(); + + this->calc_hpsi_with_block(hpsi_func, psi_in, this->hpsi); + this->calc_spsi_with_block(spsi_func, psi_in, this->spsi); + for (int ib = 0; ib < this->n_band_l; ++ib) { + for (int ig = 0; ig < this->n_dim; ++ig) { + const int idx = ib * this->n_basis + ig; + if (std::abs(this->spsi.data()[idx] - this->psi.data()[idx]) + > static_cast(100) * std::numeric_limits::epsilon()) { + ModuleBase::WARNING_QUIT("DiagoLobpcg", + "Generalized LOBPCG for S != I is not implemented yet."); + } + } + } + // Re-orthonormalize before initial R-R so H_sub is well-conditioned + this->orth_cholesky(this->work, this->psi, this->hpsi, this->tmp_hsub); + this->rayleigh_ritz(this->psi, this->hpsi, this->eigen); + + setmem_complex_op()(this->pdir.data(), static_cast(0.0), + this->n_basis * this->n_band_l); + setmem_complex_op()(this->hpdir.data(), static_cast(0.0), + this->n_basis * this->n_band_l); + + const int max_iter = (scf_iter > 1) ? this->nline : (this->nline * 20); + + for (int ntry = 0; ntry < max_iter; ++ntry) { + this->compute_residual(this->psi, this->hpsi, this->eigen, + this->prec, this->grad, this->err_st); + if (!this->test_error(this->err_st, ethr_band)) + break; + + const int psi_sz = this->n_basis * this->n_band_l; + const int eig_sz = this->n_band_l; + + this->orth_projection(this->psi, this->tmp_hsub, this->grad); + + this->calc_hpsi_with_block(hpsi_func, this->grad.data(), this->hgrad); + + // Backup stable state in case lobpcg_update corrupts psi/hpsi + std::vector psi_bak(psi_sz), hpsi_bak(psi_sz); + std::vector eigen_bak(eig_sz); + std::copy(this->psi.data(), this->psi.data() + psi_sz, psi_bak.data()); + std::copy(this->hpsi.data(), this->hpsi.data() + psi_sz, hpsi_bak.data()); + std::copy(this->eigen.data(), this->eigen.data() + eig_sz, eigen_bak.data()); + + try { + this->lobpcg_update(this->psi, this->hpsi, + this->grad, this->hgrad, + this->pdir, this->hpdir, + this->eigen); + } catch (const std::exception& e1) { + std::copy(psi_bak.data(), psi_bak.data() + psi_sz, this->psi.data()); + std::copy(hpsi_bak.data(), hpsi_bak.data() + psi_sz, this->hpsi.data()); + std::copy(eigen_bak.data(), eigen_bak.data() + eig_sz, this->eigen.data()); + + setmem_complex_op()(this->pdir.data(), static_cast(0.0), psi_sz); + setmem_complex_op()(this->hpdir.data(), static_cast(0.0), psi_sz); + this->has_pdir = false; + + try { + this->lobpcg_update(this->psi, this->hpsi, + this->grad, this->hgrad, + this->pdir, this->hpdir, + this->eigen); + } catch (const std::exception& e2) { + std::copy(psi_bak.data(), psi_bak.data() + psi_sz, this->psi.data()); + std::copy(hpsi_bak.data(), hpsi_bak.data() + psi_sz, this->hpsi.data()); + std::copy(eigen_bak.data(), eigen_bak.data() + eig_sz, this->eigen.data()); + + this->calc_hpsi_with_block(hpsi_func, this->psi.data(), this->hpsi); + this->orth_cholesky(this->work, this->psi, this->hpsi, this->tmp_hsub); + this->rayleigh_ritz(this->psi, this->hpsi, this->eigen); + } + } + + this->calc_hpsi_with_block(hpsi_func, this->psi.data(), this->hpsi); + this->orth_cholesky(this->work, this->psi, this->hpsi, this->tmp_hsub); + this->rayleigh_ritz(this->psi, this->hpsi, this->eigen); + this->orth_projection(this->psi, this->tmp_hsub, this->pdir); + this->calc_hpsi_with_block(hpsi_func, this->pdir.data(), this->hpdir); + + if (scf_iter == 1 && ((ntry + 1) % this->nline == 0)) { + this->calc_hpsi_with_block(hpsi_func, this->psi.data(), this->hpsi); + this->orth_cholesky(this->work, this->psi, this->hpsi, this->tmp_hsub); + this->rayleigh_ritz(this->psi, this->hpsi, this->eigen); + setmem_complex_op()(this->pdir.data(), static_cast(0.0), + this->n_basis * this->n_band_l); + setmem_complex_op()(this->hpdir.data(), static_cast(0.0), + this->n_basis * this->n_band_l); + this->has_pdir = false; + } + } + + this->calc_hpsi_with_block(hpsi_func, this->psi.data(), this->hpsi); + this->orth_cholesky(this->work, this->psi, this->hpsi, this->tmp_hsub); + this->rayleigh_ritz(this->psi, this->hpsi, this->eigen); + // Ensure hpsi = H*psi is consistent for external residual checks + this->calc_hpsi_with_block(hpsi_func, this->psi.data(), this->hpsi); + + syncmem_var_d2h_op()(eigenvalue_in, + this->eigen.data(), + this->n_band_l); +} + +template class DiagoLobpcg, base_device::DEVICE_CPU>; + +} // namespace hsolver diff --git a/source/source_hsolver/diago_lobpcg.h b/source/source_hsolver/diago_lobpcg.h new file mode 100644 index 00000000000..cbd07798a7b --- /dev/null +++ b/source/source_hsolver/diago_lobpcg.h @@ -0,0 +1,232 @@ +#ifndef DIAGO_LOBPCG_H_ +#define DIAGO_LOBPCG_H_ + +#include +#include +#include +#include + +#include "source_base/kernels/math_kernel_op.h" +#include "source_base/module_device/memory_op.h" +#include "source_base/module_device/types.h" +#include "source_base/para_gemm.h" +#include "source_hamilt/hamilt.h" +#include "source_hsolver/kernels/hegvd_op.h" +#include "source_hsolver/para_linear_transform.h" + +#include +#include +#include + +namespace hsolver { + +/** + * @class DiagoLobpcg + * @brief Locally Optimal Block Preconditioned Conjugate Gradient eigensolver. + * + * LOBPCG maintains a block Rayleigh-Ritz subspace: + * - First iteration: W = [X, Z] (2-block, no valid P yet) + * - Subsequent: W = [X, Z, P] (3-block) + * where X = current eigenvectors, Z = preconditioned residual, P = search directions. + * + * @note Currently supports NC (S=I) + CPU only. + * GPU and USPP support are planned for subsequent phases. + * + * @tparam T Complex floating-point type. + * @tparam Device Must be base_device::DEVICE_CPU (GPU not yet supported). + */ +template , typename Device = base_device::DEVICE_CPU> +class DiagoLobpcg +{ + private: + static_assert(std::is_same::value, + "DiagoLobpcg currently supports CPU only."); + + using Real = typename GetTypeReal::type; + + public: + /// @brief H * psi -> hpsi. + using HPsiFunc = std::function; + + /// @brief S * psi -> spsi. + using SPsiFunc = std::function; + + /// Constructor — stores host preconditioner pointer. + explicit DiagoLobpcg(const Real* precondition); + + ~DiagoLobpcg(); + + /// Allocate workspace and bind h_prec TensorMap (after n_basis is known). + void init_iter(const int nband, const int nband_l, const int nbasis, const int ndim); + + /// Set max inner iterations per SCF step (default 4). + void set_nline(const int n) { this->nline = n; } + + /// Generalized diagonalization interface. Currently supports S = I only. + void diag(const HPsiFunc& hpsi_func, + const SPsiFunc& spsi_func, + T* psi_in, + Real* eigenvalue_in, + const std::vector& ethr_band); + + private: + // ---- dimensions ---- + int n_band = 0; ///< total bands (global) + int n_band_l = 0; ///< local bands + int n_basis = 0; ///< basis functions (lda of psi) + int n_dim = 0; ///< valid dimension (= current_ngk) + int nline = 4; ///< max inner iterations per SCF step + int nsub = 0; ///< physical leading dim of hsub (= 3*n_band) + bool has_pdir = false; ///< true when P block holds valid directions + + // ---- parallel ops ---- + ModuleBase::PGemmCN pmmcn; + PLinearTransform plintrans; + + // ---- type traits ---- + ct::DataType r_type = ct::DataType::DT_INVALID; + ct::DataType t_type = ct::DataType::DT_INVALID; + ct::DeviceType dev_type = ct::DeviceType::UnKnown; + + // ---- preconditioner ---- + ct::Tensor prec = {}; ///< device copy [n_basis] + const Real* h_prec_ptr = nullptr; ///< host pointer (saved in ctor) + ct::Tensor h_prec = {}; ///< host TensorMap [n_basis] (bound in init_iter) + + // ---- eigenvalues & convergence ---- + ct::Tensor eigen = {}; ///< output eigenvalues [n_band] + ct::Tensor sub_eigen = {}; ///< subspace eigenvalues [nsub] = [3*n_band] + ct::Tensor err_st = {}; ///< residual norm per band [n_band_l] + + // ---- core blocks ---- + // Layout for {n_band_l, n_basis} tensors: + // data[ib * n_basis + ig] — band-major contiguous. + // BLAS view: n_basis rows × n_band_l cols, column-major. + ct::Tensor psi = {}; ///< X (TensorMap → psi_in, no ownership) + ct::Tensor hpsi = {}; ///< HX + ct::Tensor spsi = {}; ///< SX (Phase 2) + ct::Tensor grad = {}; ///< Z = T(R) + ct::Tensor hgrad = {}; ///< HZ + ct::Tensor pdir = {}; ///< P + ct::Tensor hpdir = {}; ///< HP + + // ---- subspace matrices ---- + ct::Tensor hsub = {}; ///< H_sub [nsub × nsub] + + // ---- workspace ---- + ct::Tensor work = {}; ///< [n_band_l, n_basis] + ct::Tensor hwork = {}; ///< [n_band_l, n_basis] + ct::Tensor pwork = {}; ///< [n_band_l, n_basis] (P update) + ct::Tensor hpwork = {}; ///< [n_band_l, n_basis] (HP update) + ct::Tensor tmp_hsub = {}; ///< scratch [n_band, n_band] + + // ---- GEMM constants (following BPCG pattern) ---- + Device* ctx = {}; + const T one_ = static_cast(1.0); + const T zero_ = static_cast(0.0); + const T neg_one_ = static_cast(-1.0); + const T* one = nullptr; + const T* zero = nullptr; + const T* neg_one = nullptr; + + // ---- helpers ---- + + void calc_prec(); + + void calc_hpsi_with_block(const HPsiFunc& hpsi_func, + T* psi_in, + ct::Tensor& hpsi_out); + + void calc_spsi_with_block(const SPsiFunc& spsi_func, + const T* psi_in, + ct::Tensor& spsi_out); + + /// Standard R-R: H_sub = psi^H * hpsi → heevd → rotate. + /// multiply(hpsi, psi) = psi^H * hpsi. + void rayleigh_ritz(ct::Tensor& psi_inout, + ct::Tensor& hpsi_inout, + ct::Tensor& eigen_out); + + /// Generalized R-R. NOT IMPLEMENTED — aborts. + void generalized_rayleigh_ritz(ct::Tensor& psi_inout, + ct::Tensor& hpsi_inout, + ct::Tensor& spsi_inout, + ct::Tensor& eigen_out); + + /// NC residual: R = HX - X*Lambda, Z = R ./ prec. + /// CPU-only: direct loops. + void compute_residual(const ct::Tensor& psi_in, + const ct::Tensor& hpsi_in, + const ct::Tensor& eigen_in, + const ct::Tensor& prec_in, + ct::Tensor& grad_out, + ct::Tensor& err_out); + + /// USPP residual. NOT IMPLEMENTED — aborts. + void compute_residual_s(const ct::Tensor& psi_in, + const ct::Tensor& hpsi_in, + const ct::Tensor& spsi_in, + const ct::Tensor& eigen_in, + const ct::Tensor& prec_in, + ct::Tensor& grad_out, + ct::Tensor& err_out); + + /// grad -= psi * (psi^H * grad). multiply(grad, psi) = psi^H * grad. + void orth_projection(const ct::Tensor& psi_in, + ct::Tensor& hsub_work, + ct::Tensor& grad_out); + + /// S-orthogonalize. NOT IMPLEMENTED — aborts. + void orth_projection_s(const ct::Tensor& psi_in, + const ct::Tensor& spsi_in, + ct::Tensor& hsub_work, + ct::Tensor& grad_out); + + /// Core subspace update (2-block first, then 3-block). + /// Orthonormalizes W = [X, Z, P] before Rayleigh-Ritz for stability. + void lobpcg_update(ct::Tensor& psi, + ct::Tensor& hpsi, + ct::Tensor& grad, + ct::Tensor& hgrad, + ct::Tensor& pdir, + ct::Tensor& hpdir, + ct::Tensor& eigen); + + /// psi = psi * U (via plintrans). + void rotate_wf(const ct::Tensor& hsub_in, + ct::Tensor& psi_out, + ct::Tensor& workspace_in); + + /// S=I Cholesky: psi^H*psi → potrf(U) → trtri → psi *= U^{-1}, hpsi *= U^{-1}. + void orth_cholesky(ct::Tensor& workspace_in, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& hsub_out); + + /// S-Cholesky. NOT IMPLEMENTED — aborts. + void orth_cholesky_s(ct::Tensor& workspace_in, + ct::Tensor& psi_out, + ct::Tensor& hpsi_out, + ct::Tensor& spsi_out, + ct::Tensor& hsub_out); + + bool test_error(const ct::Tensor& err_in, const std::vector& ethr_band); + + // ---- memory-op aliases ---- + using ct_Device = typename ct::PsiToContainer::type; + using setmem_var_op = ct::kernels::set_memory; + using resmem_var_op = ct::kernels::resize_memory; + using delmem_var_op = ct::kernels::delete_memory; + using syncmem_var_h2d_op = ct::kernels::synchronize_memory; + using syncmem_var_d2h_op = ct::kernels::synchronize_memory; + + using setmem_complex_op = ct::kernels::set_memory; + using delmem_complex_op = ct::kernels::delete_memory; + using resmem_complex_op = ct::kernels::resize_memory; + using syncmem_complex_op = ct::kernels::synchronize_memory; + using syncmem_complex_h2d_op = ct::kernels::synchronize_memory; + using syncmem_complex_d2h_op = ct::kernels::synchronize_memory; +}; + +} // namespace hsolver +#endif // DIAGO_LOBPCG_H_ diff --git a/source/source_hsolver/hsolver_pw.cpp b/source/source_hsolver/hsolver_pw.cpp index b88bc3b90dd..618ee316406 100644 --- a/source/source_hsolver/hsolver_pw.cpp +++ b/source/source_hsolver/hsolver_pw.cpp @@ -11,6 +11,7 @@ #include "source_hsolver/diago_cg.h" #include "source_hsolver/diago_dav_subspace.h" #include "source_hsolver/diago_david.h" +#include "source_hsolver/diago_lobpcg.h" #include "source_hsolver/diago_iter_assist.h" #include "source_io/module_parameter/parameter.h" #include "source_psi/psi.h" @@ -18,11 +19,46 @@ #include +#include #include namespace hsolver { +template +typename std::enable_if>::value + && std::is_same::value, + void>::type +run_lobpcg_pw(const HPsiFunc& hpsi_func, + const SPsiFunc& spsi_func, + psi::Psi& psi, + const std::vector::type>& pre_condition, + typename GetTypeReal::type* eigenvalue, + const std::vector& ethr_band) +{ + const int nband_l = psi.get_nbands(); + const int nbasis = psi.get_nbasis(); + const int ndim = psi.get_current_ngk(); + DiagoLobpcg lobpcg(pre_condition.data()); + lobpcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim); + lobpcg.diag(hpsi_func, spsi_func, psi.get_pointer(), eigenvalue, ethr_band); +} + +template +typename std::enable_if>::value + || !std::is_same::value, + void>::type +run_lobpcg_pw(const HPsiFunc&, + const SPsiFunc&, + psi::Psi&, + const std::vector::type>&, + typename GetTypeReal::type*, + const std::vector&) +{ + ModuleBase::WARNING_QUIT("HSolverPW", + "LOBPCG is currently implemented only for CPU complex PW calculations."); +} + template void HSolverPW::cal_smooth_ethr(const double& wk, const double* wg, @@ -83,7 +119,7 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, this->nproc_in_pool = nproc_in_pool_in; // report if the specified diagonalization method is not supported - const std::initializer_list _methods = {"cg", "dav", "dav_subspace", "bpcg"}; + const std::initializer_list _methods = {"cg", "dav", "dav_subspace", "bpcg", "lobpcg"}; if (std::find(std::begin(_methods), std::end(_methods), this->method) == std::end(_methods)) { ModuleBase::WARNING_QUIT("HSolverPW::solve", "This type of eigensolver is not supported!"); @@ -273,8 +309,8 @@ void HSolverPW::hamiltSolvePsiK(hamilt::Hamilt* hm, hpsi_info info(&psi_wrapper, bands_range, hpsi_out); hm->ops->hPsi(info); }; - auto spsi_func = [hm](const T* psi_in, T* spsi_out, const int ld_psi, const int nvec) { - hm->sPsi(psi_in, spsi_out, ld_psi, ld_psi, nvec); + auto spsi_func = [hm, cur_nbasis](const T* psi_in, T* spsi_out, const int ld_psi, const int nvec) { + hm->sPsi(psi_in, spsi_out, ld_psi, cur_nbasis, nvec); }; if (this->method == "cg") @@ -323,6 +359,10 @@ void HSolverPW::hamiltSolvePsiK(hamilt::Hamilt* hm, bpcg.init_iter(PARAM.inp.nbands, nband_l, nbasis, ndim); bpcg.diag(hpsi_func, psi.get_pointer(), eigenvalue, this->ethr_band); } + else if (this->method == "lobpcg") + { + run_lobpcg_pw(hpsi_func, spsi_func, psi, pre_condition, eigenvalue, this->ethr_band); + } else if (this->method == "dav_subspace") { bool scf = this->calculation_type == "nscf" ? false : true; diff --git a/source/source_hsolver/test/CMakeLists.txt b/source/source_hsolver/test/CMakeLists.txt index 1b1529adb4a..14d7c7dacc2 100644 --- a/source/source_hsolver/test/CMakeLists.txt +++ b/source/source_hsolver/test/CMakeLists.txt @@ -11,7 +11,15 @@ if (ENABLE_MPI) AddTest( TARGET MODULE_HSOLVER_bpcg LIBS parameter ${math_libs} base psi device container - SOURCES diago_bpcg_test.cpp ../diago_bpcg.cpp ../para_linear_transform.cpp ../diago_iter_assist.cpp + SOURCES diago_bpcg_test.cpp ../diago_bpcg.cpp ../para_linear_transform.cpp ../diago_iter_assist.cpp + ../../source_basis/module_pw/test/test_tool.cpp + ../../source_hamilt/operator.cpp + ../../source_pw/module_pwdft/op_pw.cpp + ) + AddTest( + TARGET MODULE_HSOLVER_lobpcg + LIBS parameter ${math_libs} base psi device container + SOURCES diago_lobpcg_test.cpp ../diago_lobpcg.cpp ../para_linear_transform.cpp ../diago_iter_assist.cpp ../../source_basis/module_pw/test/test_tool.cpp ../../source_hamilt/operator.cpp ../../source_pw/module_pwdft/op_pw.cpp @@ -76,14 +84,14 @@ if (ENABLE_MPI) AddTest( TARGET MODULE_HSOLVER_pw LIBS parameter ${math_libs} psi device base container - SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../hsolver_lcaopw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp + SOURCES test_hsolver_pw.cpp ../hsolver_pw.cpp ../hsolver_lcaopw.cpp ../diago_bpcg.cpp ../diago_lobpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp ../../source_estate/elecstate_tools.cpp ../../source_estate/occupy.cpp ../../source_base/module_fft/fft_bundle.cpp ../../source_base/module_fft/fft_cpu.cpp ) AddTest( TARGET MODULE_HSOLVER_sdft LIBS parameter ${math_libs} psi device base container - SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp + SOURCES test_hsolver_sdft.cpp ../hsolver_pw_sdft.cpp ../hsolver_pw.cpp ../diago_bpcg.cpp ../diago_lobpcg.cpp ../diago_dav_subspace.cpp ../diag_const_nums.cpp ../diago_iter_assist.cpp ../para_linear_transform.cpp ../../source_estate/elecstate_tools.cpp ../../source_estate/occupy.cpp ../../source_base/module_fft/fft_bundle.cpp ../../source_base/module_fft/fft_cpu.cpp ) @@ -197,4 +205,4 @@ if (ENABLE_MPI) ) endif() endif() -endif() \ No newline at end of file +endif() diff --git a/source/source_hsolver/test/diago_lobpcg_test.cpp b/source/source_hsolver/test/diago_lobpcg_test.cpp new file mode 100644 index 00000000000..1cddaa6cfba --- /dev/null +++ b/source/source_hsolver/test/diago_lobpcg_test.cpp @@ -0,0 +1,287 @@ +#include "../diago_iter_assist.h" +#include "../diago_lobpcg.h" +#include "source_base/global_variable.h" +#include "source_base/module_external/lapack_connector.h" +#include "source_base/parallel_comm.h" +#include "source_basis/module_pw/test/test_tool.h" + +#ifdef __MPI +#include "mpi.h" +#endif + +#include +#include +#include +#include +#include +#include + +/************************************************ + * unit test of DiagoLobpcg (NC, CPU-only) + * + * Validates eigenvalues, orthonormality, and + * residual against LAPACK zheev for random + * well-conditioned Hermitian matrices. + ***********************************************/ + +using TestT = std::complex; +using TestDevice = base_device::DEVICE_CPU; +using TestReal = double; + +/// Reference eigenvalues via LAPACK zheev (eigenvalues only). +static int lapackEigen(int npw, std::vector& hm, TestReal* e) +{ + int lwork = std::max(1, 2 * npw - 1); + std::vector work(lwork); + std::vector rwork(std::max(1, 3 * npw - 2)); + int info = 0; + char jobz = 'N', uplo = 'U'; + zheev_(&jobz, &uplo, &npw, hm.data(), &npw, e, + work.data(), &lwork, rwork.data(), &info); + return info; +} + +static void run_generalized_lobpcg() +{ + const int npw = 8, nband = 3; + std::vector psi(nband * npw, {0.0, 0.0}); + for (int ib = 0; ib < nband; ++ib) + psi[ib * npw + ib] = {1.0, 0.0}; + std::vector eigens(nband, 0.0); + std::vector prec(npw, 1.0); + std::vector ethr(nband, 1e-6); + + auto hpsi_func = [](TestT* psi_in, TestT* hpsi_out, + int ld_psi, int nvec) { + std::copy(psi_in, psi_in + ld_psi * nvec, hpsi_out); + }; + auto spsi_func = [](const TestT* psi_in, TestT* spsi_out, + int ld_psi, int nvec) { + std::copy(psi_in, psi_in + ld_psi * nvec, spsi_out); + spsi_out[0] *= 2.0; + }; + + hsolver::DiagoLobpcg lobpcg(prec.data()); + lobpcg.init_iter(nband, nband, npw, npw); + lobpcg.diag(hpsi_func, spsi_func, psi.data(), eigens.data(), ethr); +} + +class DiagoLobpcgTest : public ::testing::Test +{ + protected: + static int idx(int row, int col, int ld) { return col * ld + row; } + + void run_and_validate(int npw, int nband, + const std::vector& hmat, + const std::vector& prec, + const std::vector& e_ref, + double eig_tol, double orth_tol, double res_tol) + { + const int ld = npw; + + // ---- random orthonormal initial guess ---- + std::vector psi(nband * npw, {0.0, 0.0}); + { + std::mt19937 gen(123); + std::uniform_real_distribution dist(-1.0, 1.0); + for (int ib = 0; ib < nband; ib++) + for (int ig = 0; ig < npw; ig++) + psi[ib * npw + ig] = {dist(gen), dist(gen)}; + + for (int ib = 0; ib < nband; ib++) + { + for (int jb = 0; jb < ib; jb++) + { + TestT dot = {0.0, 0.0}; + for (int ig = 0; ig < npw; ig++) + dot += std::conj(psi[jb * npw + ig]) * psi[ib * npw + ig]; + for (int ig = 0; ig < npw; ig++) + psi[ib * npw + ig] -= dot * psi[jb * npw + ig]; + } + TestReal norm = 0.0; + for (int ig = 0; ig < npw; ig++) + norm += std::norm(psi[ib * npw + ig]); + norm = std::sqrt(norm); + for (int ig = 0; ig < npw; ig++) + psi[ib * npw + ig] /= norm; + } + } + + auto hpsi_func = [&](TestT* psi_in, TestT* hpsi_out, + int ld_psi, int nvec) { + for (int iv = 0; iv < nvec; iv++) + for (int i = 0; i < npw; i++) + { + TestT sum = {0.0, 0.0}; + for (int j = 0; j < npw; j++) + sum += hmat[idx(i, j, ld)] * psi_in[iv * ld_psi + j]; + hpsi_out[iv * ld_psi + i] = sum; + } + }; + + // ---- run LOBPCG ---- + std::vector eigens(nband, 0.0); + std::vector ethr(nband, 1e-6); + + // SCF_ITER = 1 triggers periodic R-R restart that clears P, which + // naturally limits subspace ill-conditioning. Use moderate nline. + const int old_scf = hsolver::DiagoIterAssist::SCF_ITER; + hsolver::DiagoIterAssist::SCF_ITER = 1; + + hsolver::DiagoLobpcg lobpcg(prec.data()); + lobpcg.init_iter(nband, nband, npw, npw); + lobpcg.set_nline(4); + auto spsi_func = [](const TestT* psi_in, TestT* spsi_out, + int ld_psi, int nvec) { + std::copy(psi_in, psi_in + ld_psi * nvec, spsi_out); + }; + lobpcg.diag(hpsi_func, spsi_func, psi.data(), eigens.data(), ethr); + + hsolver::DiagoIterAssist::SCF_ITER = old_scf; + + // ---- validate eigenvalues ---- + for (int ib = 0; ib < nband; ib++) + ASSERT_NEAR(eigens[ib], e_ref[ib], eig_tol) + << "eigenvalue[" << ib << "] mismatch: " + << eigens[ib] << " vs ref " << e_ref[ib]; + + // ---- validate orthonormality ---- + for (int i = 0; i < nband; i++) + for (int j = 0; j < nband; j++) + { + TestT dot = {0.0, 0.0}; + for (int ig = 0; ig < npw; ig++) + dot += std::conj(psi[i * npw + ig]) * psi[j * npw + ig]; + if (i == j) + { + EXPECT_NEAR(std::real(dot), 1.0, orth_tol) + << "psi^H psi diag[" << i << "] = " << std::real(dot); + EXPECT_NEAR(std::imag(dot), 0.0, orth_tol) + << "psi^H psi diag[" << i << "] imag = " << std::imag(dot); + } + else + EXPECT_NEAR(std::abs(dot), 0.0, orth_tol) + << "psi^H psi (" << i << "," << j << ") = " << std::abs(dot); + } + + // ---- validate residual: ||H*psi_i - eig_i * psi_i|| ---- + for (int ib = 0; ib < nband; ib++) + { + TestReal res2 = 0.0; + for (int i = 0; i < npw; i++) + { + TestT hxi = {0.0, 0.0}; + for (int j = 0; j < npw; j++) + hxi += hmat[idx(i, j, ld)] * psi[ib * npw + j]; + const auto r = hxi - eigens[ib] * psi[ib * npw + i]; + res2 += std::norm(r); + } + EXPECT_LT(std::sqrt(res2), res_tol) + << "residual[" << ib << "] = " << std::sqrt(res2); + } + } + + /// Build a strongly diagonally-dominant random Hermitian matrix. + static void build_well_conditioned(int npw, std::vector& hmat, + std::vector& prec, + std::vector& e_ref) + { + const int ld = npw; + hmat.assign(npw * npw, {0.0, 0.0}); + + std::mt19937 gen(42); + std::uniform_real_distribution dist(-1.0, 1.0); + + for (int i = 0; i < npw; i++) + { + for (int j = i; j < npw; j++) + { + TestReal re = dist(gen) * 0.5; + TestReal im = (i != j) ? dist(gen) * 0.5 : 0.0; + hmat[idx(i, j, ld)] = {re, im}; + hmat[idx(j, i, ld)] = {re, -im}; + } + hmat[idx(i, i, ld)] += TestT( + static_cast(2.0 * (i + 1) * (i + 1)), 0.0); + } + + // Reference + auto hcopy = hmat; + e_ref.resize(npw); + ASSERT_EQ(lapackEigen(npw, hcopy, e_ref.data()), 0); + + // Preconditioner + prec.resize(npw); + for (int i = 0; i < npw; i++) + prec[i] = std::max(static_cast(1.0), + std::real(hmat[idx(i, i, ld)])); + } +}; + +// ============================================================================ +// Test cases: various matrix sizes and band counts +// ============================================================================ + +TEST_F(DiagoLobpcgTest, SmallMatrix) +{ + const int npw = 50, nband = 10; + std::vector hmat; + std::vector prec, e_ref; + build_well_conditioned(npw, hmat, prec, e_ref); + run_and_validate(npw, nband, hmat, prec, e_ref, 1e-5, 1e-8, 1e-4); +} + +TEST_F(DiagoLobpcgTest, MediumMatrix) +{ + const int npw = 200, nband = 20; + std::vector hmat; + std::vector prec, e_ref; + build_well_conditioned(npw, hmat, prec, e_ref); + run_and_validate(npw, nband, hmat, prec, e_ref, 1e-4, 1e-6, 2e-4); +} + +TEST_F(DiagoLobpcgTest, LargerMatrixFewBands) +{ + const int npw = 400, nband = 12; + std::vector hmat; + std::vector prec, e_ref; + build_well_conditioned(npw, hmat, prec, e_ref); + run_and_validate(npw, nband, hmat, prec, e_ref, 1e-4, 1e-6, 3e-4); +} + +TEST_F(DiagoLobpcgTest, NonIdentityOverlapNotImplemented) +{ + EXPECT_EXIT( + run_generalized_lobpcg(), + ::testing::ExitedWithCode(1), + ".*" + ); +} + +int main(int argc, char** argv) +{ + int nproc = 1, myrank = 0; + +#ifdef __MPI + int nproc_in_pool, kpar = 1, mypool, rank_in_pool; + setupmpi(argc, argv, nproc, myrank); + divide_pools(nproc, myrank, nproc_in_pool, kpar, mypool, rank_in_pool); + MPI_Comm_split(MPI_COMM_WORLD, myrank, 0, &BP_WORLD); + GlobalV::NPROC_IN_POOL = nproc; +#else + MPI_Init(&argc, &argv); +#endif + + testing::InitGoogleTest(&argc, argv); + ::testing::TestEventListeners& listeners = + ::testing::UnitTest::GetInstance()->listeners(); + if (myrank != 0) + delete listeners.Release(listeners.default_result_printer()); + + int result = RUN_ALL_TESTS(); + if (myrank == 0 && result != 0) + std::cout << "ERROR: some tests are not passed" << std::endl; + + MPI_Finalize(); + return result; +} diff --git a/source/source_io/module_parameter/read_input_item_elec_stru.cpp b/source/source_io/module_parameter/read_input_item_elec_stru.cpp index 39f37febc54..7fdb3fa5417 100644 --- a/source/source_io/module_parameter/read_input_item_elec_stru.cpp +++ b/source/source_io/module_parameter/read_input_item_elec_stru.cpp @@ -131,7 +131,7 @@ Then the user has to correct the input file and restart the calculation.)"; }; item.check_value = [](const Input_Item& item, const Parameter& para) { const std::string& ks_solver = para.input.ks_solver; - const std::vector pw_solvers = {"cg", "dav", "bpcg", "dav_subspace"}; + const std::vector pw_solvers = {"cg", "dav", "bpcg", "dav_subspace", "lobpcg"}; const std::vector lcao_solvers = { "genelpa", "elpa", @@ -1040,7 +1040,7 @@ Use case: When experimental or high-level theoretical results suggest that the S item.annotation = "threshold for eigenvalues is cg electron iterations"; item.category = "Plane wave related variables"; item.type = "Real"; - item.description = "Only used when you use ks_solver = cg/dav/dav_subspace/bpcg. It indicates the threshold for the first electronic iteration, from the second iteration the pw_diag_thr will be updated automatically. For nscf calculations with planewave basis set, pw_diag_thr should be <= 1e-3."; + item.description = "Only used when you use ks_solver = cg/dav/dav_subspace/bpcg/lobpcg. It indicates the threshold for the first electronic iteration, from the second iteration the pw_diag_thr will be updated automatically. For nscf calculations with planewave basis set, pw_diag_thr should be <= 1e-3."; item.default_value = "0.01"; item.unit = ""; item.availability = ""; @@ -1099,10 +1099,10 @@ Use case: When experimental or high-level theoretical results suggest that the S item.annotation = "max iteration number for cg"; item.category = "Plane wave related variables"; item.type = "Integer"; - item.description = "Only useful when you use ks_solver = cg/dav/dav_subspace/bpcg. It indicates the maximal iteration number for cg/david/dav_subspace/bpcg method."; + item.description = "Only useful when you use ks_solver = cg/dav/dav_subspace/bpcg/lobpcg. It indicates the maximal iteration number for cg/david/dav_subspace/bpcg/lobpcg method."; item.default_value = "50"; item.unit = ""; - item.availability = "basis_type==pw, ks_solver==cg/dav/dav_subspace/bpcg"; + item.availability = "basis_type==pw, ks_solver==cg/dav/dav_subspace/bpcg/lobpcg"; read_sync_int(input.pw_diag_nmax); this->add_item(item); }